Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
cleaning-portalmec
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Richard Fernando Heise Ferreira
cleaning-portalmec
Commits
bbccc139
Commit
bbccc139
authored
7 years ago
by
Marcela Ribeiro de Oliveira
Browse files
Options
Downloads
Plain Diff
Merge branch 'master' of gitlab.c3sl.ufpr.br:portalmec/portalmec into user-profiles
parents
cbf39f97
3953e4bc
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
lib/tasks/tag_clustering.rake
+440
-0
440 additions, 0 deletions
lib/tasks/tag_clustering.rake
with
440 additions
and
0 deletions
lib/tasks/tag_clustering.rake
0 → 100644
+
440
−
0
View file @
bbccc139
require
'json'
namespace
:tag
do
desc 'Generate tag clusters'
# Usage: rails tag:generate_clusters[<tag_name>,<n_los>,<infomap>]
#
# Searches learning objects related to <tag_name> three ways — by tag
# cluster (infomap), by plain Elasticsearch, and by a merge of both —
# and prints each ranking. n_los (default 50) caps the result count;
# infomap (default false) forces regeneration of the cluster file.
task :generate_clusters, [:tag_name, :n_los, :infomap] => [:environment] do |_t, args|
  args.with_defaults(n_los: 50, infomap: false)
  if args.tag_name.blank?
    puts "\nERROR: You must pass a tag name as argument."
    puts "Example usage for searching with the tag test, result with 50 learning objects and generating a new cluster from infomap:"
    puts "$ rails tag:generate_clusters[test,50,true]"
    abort
  end
  # searched tag
  $TAG = Tag.find_by_name(args.tag_name)
  if $TAG.blank?
    abort("\nERROR: Tag #{args.tag_name} not found.")
  end
  # Padding on tag distance normalization
  $PAD = 1 # if == 1, min tag dist will be 0.5 (1 - (max/max*(1+$PAD)))
  # Padding on merge min maxing
  $PADM = 0.05
  # number of results to return
  $N_LOS = args.n_los.to_i
  outDIR = "tmp"
  fileName = "tags"
  path = Rails.root.join(outDIR, fileName).to_s
  # (re)build the co-occurrence graph and run infomap when the cluster
  # file is missing or regeneration was explicitly requested
  if !File.file?(path + ".ftree") || args.infomap
    graphPath = path + ".net"
    hash = create_hash()
    create_pajek_net_graph(hash, graphPath)
    puts "\nCalling infomap with the pajek graph\n\n"
    system("infomap --ftree #{graphPath} #{Rails.root.join(outDIR)}")
  end
  # create cluster tree from ftree
  tags = parseTree(path)
  puts "\nStarted Calculating relevant results for tag #{$TAG.name}\n"
  relevant_los = find_relevant_results($TAG.id, tags)
  puts "\n============ Learning Objects - Cluster Search ===============\n"
  puts "ID: Ranking | Name | Tags\n\n"
  relevant_los.each do |id, rank|
    lo = LearningObject.find(id)
    puts "-----"
    p lo.id.to_s + ": " + rank.to_s + " | " + lo.try(:name)
    lo.tags.each { |t| print t.name + " | " }
    puts ""
  end
  # plain Elasticsearch ranking, for comparison
  search_los = LearningObject.search $TAG.name, limit: $N_LOS, explain: true
  # print search_los.response["hits"]["hits"].to_json
  # p wdfSearch("Educação Infatil")
  # p tagWdfSorted()
  puts "\n============ Learning Objects - Elasticsearch ===============\n"
  search_los.each do |lo|
    puts "#{lo.id}: #{lo.search_hit['_score']}"
  end
  puts "ID: Ranking | Name | Tags\n\n"
  search_los.each do |lo|
    puts "-----"
    p lo.id.to_s + ": " + lo.search_hit["_score"].to_s + " | " + lo.try(:name)
    lo.tags.each { |t| print t.name + " | " }
    puts ""
  end
  # combined ranking of both result sets
  merged_los = merge(search_los, relevant_los)
  puts "\n============ Learning Objects - Merged ===============\n"
  merged_los.each do |id, rank|
    puts "#{id}: #{rank}"
  end
  puts "ID: Ranking | Name | Tags\n\n"
  merged_los.each do |id, rank|
    lo = LearningObject.find(id)
    puts "-----"
    p lo.id.to_s + ": " + rank.to_s + " | " + lo.try(:name)
    lo.tags.each { |t| print t.name + " | " }
    puts ""
  end
end # task
private
# Ranks every tag by its wdf score (see wdfSearch).
#
# Builds [tag_id, score, tag_name] triples for every tag whose search
# produced hits; wdfSearch returns -1 on an empty result, which is
# reported with an ERROR line and skipped. Returns the triples sorted
# by score, ascending.
def tagWdfSorted()
  ranked = []
  Tag.all.each do |tag|
    score = wdfSearch(tag.name)
    if score == -1
      # this tag produced no search hits — report and skip it
      print "ERROR #{tag.name}\n"
    else
      ranked << [tag.id, score, tag.name]
    end
  end
  return ranked.sort_by { |triple| triple[1] }
end
# Recursively extracts a docFreq sum from an Elasticsearch
# _explanation node (a nested Hash of "description"/"value"/"details").
#
# A node whose description matches "weight(<field>.word_start:" is a
# scoring leaf: if its first detail is a boost of exactly 10, its
# docFreq value (under the second detail) is returned, otherwise 0.
# Any other node contributes the sum of its children.
#
# Fix: the boost lookup previously used the key "description " (with a
# trailing space), which can never match — the same function reads
# "description" without the space for the docFreq check — so the boost
# branch always returned 0.
def findScores(obj)
  if !obj["description"].match(/weight\(\w*\.word_start:/).nil?
    if obj["details"][0]["details"][0]["description"] == "boost" && obj["details"][0]["details"][0]["value"] == 10
      if obj["details"][0]["details"][1]["details"][0]["description"] == "docFreq"
        return obj["details"][0]["details"][1]["details"][0]["value"]
      else
        return 0
      end
    else
      return 0
    end
  end
  sum = 0
  # non-weight nodes: recurse over children (tolerate a missing "details")
  (obj["details"] || []).each do |detail|
    sum += findScores(detail)
  end
  return sum
end
# Computes the wdf score for +tname+: runs a single-hit Elasticsearch
# query with scoring explanation enabled and sums the docFreq values
# found in the explanation tree (via findScores).
#
# Returns -1 when the search yields no hits.
def wdfSearch(tname)
  hits = LearningObject.search tname, limit: 1, explain: true
  return -1 if hits.blank?
  explanation = hits.response['hits']['hits'][0]['_explanation']
  return findScores(explanation)
end
# Merges the Elasticsearch results (search_los) with the cluster-ranked
# results (relevant_los, [id, rank] pairs) into a single list of
# [id, score] pairs sorted by score descending.
#
# Both rankings are min-max normalized (padded by $PADM) onto a common
# scale before being combined; LOs present in both lists get the two
# scores added. Mutates search_los scores and relevant_los in place.
def merge(search_los, relevant_los)
  merged_los = []
  # search_los arrives ordered by score, so first/last hold max/min
  max = search_los.first.search_hit['_score']
  min = search_los[search_los.size - 1].search_hit['_score']
  # min = 0
  max_boost = 0
  response = search_los.response['hits']['hits']
  search_los.each_with_index do |slo, i|
    # walk down the explanation tree until the "boost" node is found
    detail = response[i]['_explanation']['details'][0]
    while detail['description'] != "boost"
      detail = detail['details'][0]
    end
    boost = detail['value']
    max_boost = boost if boost > max_boost
    # min-max normalize the score (padded by $PADM), re-scaled by boost
    slo.search_hit['_score'] = boost * (slo.search_hit['_score'] * (1 + $PADM) - min) / (max - min)
  end
  # normalize the cluster ranking the same way, scaled by the largest
  # boost seen above so both rankings share a comparable range
  max = relevant_los.first[1]
  min = relevant_los.last[1]
  relevant_los.each do |rlo|
    rlo[1] = max_boost * (rlo[1] * (1 + $PADM) - min) / (max - min)
  end
  # LOs appearing in both lists: add the cluster rank onto the search
  # score and drop the cluster entry so it is not re-added below.
  # NOTE(review): relevant_los is mutated (delete_at) while being
  # iterated — appears safe only because each slo matches at most one
  # entry; confirm.
  search_los.each do |slo|
    relevant_los.each_with_index do |rlo, index|
      if slo.id == rlo[0]
        slo.search_hit['_score'] = slo.search_hit['_score'] + rlo[1]
        relevant_los.delete_at(index)
      end
    end
    merged_los << [slo.id, slo.search_hit['_score']]
  end
  # remaining cluster-only results join the list as-is
  merged_los.push(*relevant_los)
  # sort descending by score
  merged_los = merged_los.sort_by { |lo| lo[1] * -1 }
  # NOTE(review): caps the merged list at 50 regardless of $N_LOS —
  # confirm whether $N_LOS was intended here.
  return merged_los.first(50)
end
# hash[id1][id2] will equal how many times tags with id1 and id2 appear together on a LO
# (pairs are stored with id1 < id2; the diagonal hash[id][id] counts how
# many learning objects carry the tag at all).
#
# Fix: the previous version ordered a pair by swapping the block
# parameters themselves (t, t2 = t2, t), which leaked the swap into the
# remaining iterations of the inner loop and attributed co-occurrence
# counts to the wrong tag pairs. Ordering is now done on local id
# copies, leaving the loop variables untouched.
def create_hash()
  puts "\nCreating hash of tag concurrences\n"
  hash = {}
  LearningObject.all.each do |lo|
    # for each lo, count tags and tag pairs and add to hash
    lo.tags.each.with_index do |t, i|
      # diagonal entry: how many LOs this tag appears on
      hash[t.id] = {} if hash[t.id].nil?
      hash[t.id][t.id] = 0 if hash[t.id][t.id].nil?
      hash[t.id][t.id] += 1
      # pair this tag with every tag that follows it on the same LO
      lo.tags.drop(i + 1).each do |t2|
        # [id1][id2], id1 should always be lower — order local copies
        id1, id2 = t.id, t2.id
        id1, id2 = id2, id1 if id1 > id2
        hash[id1] = {} if hash[id1].nil?
        hash[id1][id2] = 0 if hash[id1][id2].nil?
        hash[id1][id2] += 1
      end
    end
  end
  return hash
end
# Writes the tag co-occurrence +hash+ (as built by create_hash) to
# +path+ in Pajek .net format: one vertex per tag (labelled with the
# tag name) followed by one weighted edge per co-occurring tag pair.
def create_pajek_net_graph(hash, path)
  puts "\nCreating pajek net graph on path #{path}\n"
  File.open(path, "w+") do |f|
    f << "*Vertices #{Tag.all.size}\n"
    # pajek vertex ids are 1-based positions, not tag ids — remember
    # the tag id -> vertex position mapping for the edge section
    tag_index = {}
    Tag.all.each_with_index do |t, i|
      f << "#{i + 1} \"#{t.name}\"\n"
      tag_index[t.id] = i + 1
    end
    f << "*Edges\n"
    hash.each do |id1, ids2Hash|
      ids2Hash.each do |id2, value|
        # diagonal entries are per-tag totals, not edges — skip them
        if id1 != id2
          f << "#{tag_index[id1]} #{tag_index[id2]} \
#{hash[id1][id2].to_f / (Math.sqrt(hash[id1][id1] * hash[id2][id2]))}\n"
          # weight of the edge is the cos distance
        end
      end
    end
  end
end
# ranking #
# Collects the sibling leaves of +tagId+ (every child of its cluster
# parent, the searched tag included) and ranks each by how close its
# flow value is to the searched tag's flow on a log2 scale — 0 means
# identical flow, so the searched tag ranks itself at distance 0.
# The raw distances are then normalized/complemented by
# normalize_complement_close (closest -> 1, farthest -> near 0).
def ranked_close_tags(tagId, tags)
  reference = Math.log2(tags[tagId][:flow])
  siblings = tags[tagId][:parent][:childs]
  ranked = siblings.map do |leaf|
    distance = (reference - Math.log2(leaf[:flow])).abs
    { id: leaf[:id], rank: distance }
  end
  return normalize_complement_close(ranked)
end
# normalizes and complements, 0 distance will be 1,
# max dist will be closest to 0
# Mutates the {id:, rank:} entries in place and returns the same array;
# the divisor is padded by $PAD so even the farthest entry keeps a
# rank strictly above 0.
def normalize_complement_close(tags)
  # find the largest rank (min is always 0)
  peak = tags.reduce(0) { |best, entry| entry[:rank] > best ? entry[:rank] : best }
  divisor = peak * (1 + $PAD)
  tags.each { |entry| entry[:rank] = 1 - (entry[:rank] / divisor) }
  return tags
end
# Ranks learning objects by the cluster neighbourhood of +tagId+.
#
# Finds the tags sharing a cluster with the searched tag (each with a
# normalized closeness rank), scores every LearningObject tagged with
# any of them, and returns the top $N_LOS as [id, rank] pairs, highest
# rank first. Prints the close-tag table and the final ranking.
def find_relevant_results(tagId, tags)
  los_ranked = {}
  puts "\nGetting tags from the same cluster\n"
  puts "Normalization with padding = #{$PAD}\n"
  close_tags = ranked_close_tags(tagId, tags)
  # Uncomment the next line if you want to sort by global tag frequency
  # freq_cluster = cluster_frequency(close_tags)
  puts "\nStarted Ranking LOs...\n"
  puts "\n====== Close Tags =========\n"
  puts "Name | ID | Normalized Ranking\n\n"
  close_tags.each do |ct|
    tag = Tag.find(ct[:id])
    # NOTE(review): tags[ct[:id]] leaves carry :id/:flow/:name/:parent
    # but no :rank, so the last column prints empty — confirm intent.
    p tag.name + " | " + ct[:id].to_s + " | " + ct[:rank].to_s + " | " + tags[ct[:id]][:rank].to_s
    # every LO tagged with this close tag is a candidate result
    tag.taggings.where(taggable_type: "LearningObject").each do |tagging|
      lo = tagging.taggable
      # compute each LO's rank only once, the first time it is reached
      if los_ranked[lo.id].nil?
        # Uncomment the next line if you want to sort by local tag frequency
        # los_ranked[lo.id] = relevance_frequency_rank(lo, close_tags)
        # Uncomment the next line if you want to sort by global tag frequency
        # los_ranked[lo.id] = relevance_frequency_rank_global(lo, close_tags, freq_cluster)
        # Uncomment the next line if you want to sort by tag cluster rank
        los_ranked[lo.id] = relevance_raw_rank(lo, close_tags)
      end
    end
  end
  puts "============\n"
  puts "\nSorting LOs...\n"
  # sorts by its ranking
  los_ranked = los_ranked.sort_by { |id, rank| rank }
  # get highest ranks
  los_ranked = los_ranked.reverse.first($N_LOS)
  los_ranked.each do |key, value|
    puts "#{key}: #{value}"
  end
  return los_ranked
end
# Scores +lo+ with a wdf * itf weighting: wdf dampens objects carrying
# many tags, while each cluster tag found on the object contributes its
# closeness rank boosted by how globally rare the tag is.
# Returns 0 when the object has no tags.
def relevance_frequency_rank(lo, close_tags)
  tag_total = lo.tags.size
  wdf = tag_total != 0 ? 1 / (Math.log2(tag_total) + 1) : 0
  itf_sum = 0
  lo.tags.each do |lo_tag|
    close_tags.each do |entry|
      next unless entry[:id] == lo_tag.id
      # NOTE(review): Tag.all.size / lo_tag.taggings.size is integer
      # division — confirm whether a float ratio was intended.
      itf_sum += entry[:rank] * (Math.log2(Tag.all.size / lo_tag.taggings.size) + 1)
    end
  end
  return wdf * itf_sum
end
# returns the sum of how many times each tag in cluster appears in space
# (i.e. the total number of taggings across all tags of the cluster)
def cluster_frequency(cluster)
  return cluster.reduce(0) { |total, entry| total + Tag.find(entry[:id]).taggings.size }
end
# Scores +lo+ against the cluster using the cluster-wide tagging
# frequency (+freq_cluster+, from cluster_frequency) as the itf term.
def relevance_frequency_rank_global(lo, close_tags, freq_cluster)
  # for each tag in LO that is in the cluster, accumulate its rank;
  # start at 1 so Math.log2 below never sees 0
  rank_sum = 1
  lo.tags.each do |lo_tag|
    close_tags.each do |entry|
      rank_sum += entry[:rank] if entry[:id] == lo_tag.id
    end
  end
  tag_total = lo.tags.size
  wdf = tag_total != 0 ? (Math.log2(rank_sum) / (Math.log2(tag_total) + 1)) : 0
  # NOTE(review): Tag.all.size / freq_cluster is integer division —
  # confirm whether a float ratio was intended.
  itf = Math.log2(Tag.all.size / freq_cluster) + 1
  return wdf * itf
end
# returns the rank sum of the tags in the LO
# For every tag on +lo+ that also appears in +close_tags+, accumulates
# that cluster entry's closeness rank; tags outside the cluster add 0.
def relevance_raw_rank(lo, close_tags)
  total = 0
  lo.tags.each do |lo_tag|
    close_tags.each do |entry|
      total += entry[:rank] if entry[:id] == lo_tag.id
    end
  end
  return total
end
# parse .ftree into a 'tree', leafs are tags with flow number, branches are the clustes
# create tags list, tags[tagId] == the tag leaf inside the cluster tree
#
# Reads "<path>.ftree" (infomap output) and returns a hash mapping
# tagId -> leaf ({id:, flow:, name:, parent:}); the cluster tree is
# reachable through each leaf's :parent links.
#
# Fix: the previous version used the (mutable) cluster Hash itself as a
# countClusters key; inserting the key and then mutating the cluster
# changes its #hash, which corrupts the lookup and can count the same
# cluster more than once. We key by object_id instead, which is stable
# under mutation. Only the printed count is affected; the return value
# is unchanged.
def parseTree(path)
  puts "\nParsing .ftree output into a 'tree'\n"
  clusters = { childs: [], parent: nil }
  tags = {}
  countClusters = {}
  File.open(path + ".ftree", "r") do |f|
    # skip the two header lines
    f.gets
    f.gets
    while line = f.gets
      # the tree section ends at the first line without a ':' path
      break if !line.include? ':'
      tmp = line.split(' ')
      # tmp[0] format like: 2:7:5:4 — 2 and 7 address nested clusters,
      # 5 is the cluster holding the leafs, 4 is the leaf itself (tag)
      ftree = tmp[0].split(':')[0..-2]
      # last number of the sequence is the leaf Id
      leafId = tmp[0].split(':')[-1].to_i
      # last number on the line is the tag Id
      tagId = tmp[-1].to_i
      # second number on the line is the flow value
      flow = tmp[1].to_f
      # between the third and second to last fields lies the quoted name
      name = tmp[2..-2].join(' ')[1..-2]
      # walk down the tree, creating missing cluster nodes on the way
      it = clusters
      ftree.each do |clusterId|
        clusterId = clusterId.to_i - 1 # on the file they start at 1, we want 0
        if it[:childs][clusterId].nil?
          it[:childs][clusterId] = { childs: [], parent: it }
        end
        it = it[:childs][clusterId]
      end
      # remember this cluster for counting, keyed by object identity so
      # later mutation of the cluster hash cannot corrupt the count
      countClusters[it.object_id] = 1
      # 'it' is the cluster leafId is a child of, so add the leaf to it
      it[:childs][leafId - 1] = { id: tagId, flow: flow, name: name, parent: it }
      # index the leaf by tagId for easy access
      tags[tagId] = it[:childs][leafId - 1]
    end
  end
  puts "\nNumber of clusters found: #{countClusters.size}\n"
  return tags
end
end
# namespace
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment