From 6505ee6850f6d57e5b491619f1b4abe54c80708f Mon Sep 17 00:00:00 2001
From: bfs15 <bruno.serbena@gmail.com>
Date: Fri, 7 Jul 2017 11:24:47 -0300
Subject: [PATCH] tag clustering task now only creates .ftree

---
 lib/tasks/tag_clustering.rake | 380 +---------------------------------
 1 file changed, 10 insertions(+), 370 deletions(-)

diff --git a/lib/tasks/tag_clustering.rake b/lib/tasks/tag_clustering.rake
index c26721a5..bb024434 100644
--- a/lib/tasks/tag_clustering.rake
+++ b/lib/tasks/tag_clustering.rake
@@ -19,187 +19,24 @@
 require 'json'
 namespace :tag do
   desc 'Generate tag clusters'
-  task :generate_clusters, [:tag_name, :n_los, :infomap] => [:environment] do |_t, args|
+  task :generate_clusters => [:environment] do
+    # DEBUG = true
 
-    args.with_defaults(n_los: 50, infomap: false)
+    graph_path = TagSearchService.file_path+".net"
+    hash = create_hash()
+    create_pajek_net_graph(hash, graph_path)
 
-    if args.tag_name.blank?
-      puts "\nERROR: You must pass a tag name as argument."
-      puts "Example usage for searchig with the tag test, result with 50 learning objects and generating a new cluster from infomap:"
-      puts "$ rails tag:generate_clusters[test,50,true]"
-      abort
-    end
-
-    # searched tag
-    $TAG = Tag.find_by_name(args.tag_name)
-
-    if $TAG.blank?
-      abort("\nERROR: Tag #{args.tag_name} not found.")
-    end
-
-    # Padding on tag distance normalization
-    $PAD = 1 # if == 1, min tag dist will be 0.5  (1 - (max/max*(1+$PAD)))
-
-    # Padding on merge min maxing
-    $PADM = 0.05
-
-    # number of results to return
-    $N_LOS = args.n_los.to_i
-
-    outDIR = "tmp"
-    fileName = "tags"
-    path = Rails.root.join(outDIR, fileName).to_s
-
-    if !File.file?(path+".ftree") || args.infomap
-      graphPath = path+".net"
-      hash = create_hash()
-      create_pajek_net_graph(hash, graphPath)
-
-      puts "\nCalling infomap with the pajek graph\n\n"
-      system("infomap --ftree #{graphPath} #{Rails.root.join(outDIR)}")
-    end
-
-    # create cluster tree from ftree
-    tags = parseTree(path)
-
-    puts "\nStarted Calculating relevant results for tag #{$TAG.name}\n"
-
-    relevant_los = find_relevant_results($TAG.id, tags)
-
-    puts "\n============ Learning Objects - Cluster Search ===============\n"
-    puts "ID: Ranking | Name | Tags\n\n"
-    relevant_los.each do |id, rank|
-      lo = LearningObject.find(id)
-      puts "-----"
-      p lo.id.to_s+": "+rank.to_s+" | "+lo.try(:name)
-      lo.tags.each {|t| print t.name+" | "}
-      puts ""
-    end
-
-    search_los = LearningObject.search $TAG.name, limit: $N_LOS, explain:true
-
-    # print  search_los.response["hits"]["hits"].to_json
-    # p wdfSearch("EducaÃ§Ã£o Infatil")
-    # p tagWdfSorted()
-
-    puts "\n============ Learning Objects - Elasticsearch ===============\n"
-    search_los.each do |lo|
-      puts "#{lo.id}: #{lo.search_hit['_score']}"
-    end
-    puts "ID: Ranking | Name | Tags\n\n"
-    search_los.each do |lo|
-      puts "-----"
-      p lo.id.to_s+": "+lo.search_hit["_score"].to_s+" | "+lo.try(:name)
-      lo.tags.each {|t| print t.name+" | "}
-      puts ""
-    end
-
-    merged_los = merge(search_los, relevant_los)
-
-    puts "\n============ Learning Objects - Merged ===============\n"
-    merged_los.each do |id, rank|
-      puts "#{id}: #{rank}"
-    end
-    puts "ID: Ranking | Name | Tags\n\n"
-    merged_los.each do |id, rank|
-      lo = LearningObject.find(id)
-      puts "-----"
-      p lo.id.to_s+": "+rank.to_s+" | "+lo.try(:name)
-      lo.tags.each {|t| print t.name+" | "}
-      puts ""
-    end
+    # puts "\nCalling infomap with the pajek graph\n\n" if DEBUG
+    system("infomap", "--ftree", graph_path.to_s, TagSearchService.root_dir.to_s) # argv form bypasses the shell (safe for paths with spaces)
 
+    Rails.cache.delete(TagSearchService::CACHE_KEY)
   end # task
 
   private
 
-  def tagWdfSorted()
-    tagWdf = []
-
-    Tag.all.each_with_index do |t,i|
-      df = wdfSearch(t.name)
-      print "ERROR #{t.name}\n" if df == -1
-      tagWdf << [t.id, df, t.name] if df != -1
-    end
-
-    tagWdf = tagWdf.sort_by { |t| t[1] }
-
-    return tagWdf
-  end
-
-  def findScores(obj)
-    if !obj["description"].match(/weight\(\w*\.word_start:/).nil?
-      if obj["details"][0]["details"][0]["description "] == "boost" && obj["details"][0]["details"][0]["value"] == 10
-        if obj["details"][0]["details"][1]["details"][0]["description"] == "docFreq"
-          return obj["details"][0]["details"][1]["details"][0]["value"]
-        else
-          return 0
-        end
-      else
-        return 0
-      end
-    end
-    sum = 0
-    obj["details"].each do |detail|
-      sum += findScores(detail)
-    end
-    return sum
-  end
-
-  def wdfSearch(tname)
-    search_los = LearningObject.search tname, limit: 1, explain:true
-    return -1 if search_los.blank?
-    details = search_los.response['hits']['hits'][0]['_explanation']
-
-    sum = findScores(details)
-    return sum
-  end
-
-  def merge(search_los, relevant_los)
-    puts "\n---------------------- MERGING -----------------------"
-    merged_los = []
-
-    max = search_los.first.search_hit['_score']
-    min = search_los[search_los.size-1].search_hit['_score']
-    # min = 0
-    max_boost = 0
-    response = search_los.response['hits']['hits']
-    search_los.each_with_index do |slo, i|
-      detail = response[i]['_explanation']['details'][0]
-      while detail['description'] != "boost"
-        detail = detail['details'][0]
-      end
-      boost = detail['value']
-      max_boost = boost if boost > max_boost
-
-      slo.search_hit['_score'] = boost*(slo.search_hit['_score']*(1+$PADM)-min)/(max-min)
-    end
-
-    max = relevant_los.first[1]
-    min = relevant_los.last[1]
-    puts "\nMax boost found: "+max_boost.to_s
-    relevant_los.each do |rlo|
-      rlo[1] = max_boost*(rlo[1]*(1+$PADM)-min)/(max-min)
-    end
-
-    search_los.each do |slo|
-      relevant_los.each_with_index do |rlo, index|
-        if slo.id == rlo[0]
-          slo.search_hit['_score'] = slo.search_hit['_score'] + rlo[1]
-          relevant_los.delete_at(index)
-        end
-      end
-      merged_los << [slo.id, slo.search_hit['_score']]
-    end
-
-    merged_los.push(*relevant_los)
-    merged_los = merged_los.sort_by { |lo| lo[1]*-1 }
-    return merged_los.first(50)
-  end
-
   # hash[id1][id2] will equal how many times tags with id1 and id2 appear together on a LO
   def create_hash()
-    puts "\nCreating hash of tag concurrences\n"
+    # puts "\nCreating hash of tag concurrences\n" if DEBUG
     hash = {}
 
     LearningObject.all.each do |lo|
@@ -235,9 +72,8 @@ namespace :tag do
     return hash
   end
 
-
   def create_pajek_net_graph(hash, path)
-    puts "\nCreating pajek net graph on path #{path}\n"
+    # puts "\nCreating pajek net graph on path #{path}\n" if DEBUG
     File.open(path, "w+") do |f|
       f << "*Vertices #{Tag.all.size}\n"
 
@@ -260,200 +96,4 @@ namespace :tag do
       end
     end
   end
-
-  # ranking #
-
-  def ranked_close_tags(tagId, tags)
-    close_tags = []
-
-    tags[tagId][:parent][:childs].each do |t1|
-      # calculate logarithmic distance between tag flows
-      # lower value, closer, more relevant
-      # the tag you are searching for will be at distance 0 of itself
-      lg_dist = ( Math.log2(tags[tagId][:flow]) - Math.log2(t1[:flow]) ).abs
-
-      close_tags << { id: t1[:id], rank: lg_dist}
-    end
-
-    return normalize_complement_close(close_tags)
-  end
-
-  # normalizes and complements, 0 distance will be 1,
-  # max dist will be closest to 0
-  def normalize_complement_close(tags)
-    max = 0
-
-    # find max rank
-    tags.each do |t|
-      max = t[:rank] if t[:rank] > max
-    end
-
-    # normalize, min will always be 0
-    tags.each do |t|
-      # increase max by $PAD so its rank isn't 0
-      t[:rank] = 1 - (t[:rank]/(max*(1 + $PAD)))
-    end
-
-    return tags
-  end
-
-  def find_relevant_results(tagId, tags)
-    los_ranked = {}
-
-    puts "\nGetting tags from the same cluster\n"
-    puts "Normalization with padding = #{$PAD}\n"
-    close_tags = ranked_close_tags(tagId, tags)
-
-    # Uncomment the next line if you want to sort by global tag frequency
-    # freq_cluster = cluster_frequency(close_tags)
-
-    puts "\nStarted Ranking LOs...\n"
-
-    puts "\n====== Close Tags =========\n"
-    puts "Name | ID | Normalized Ranking\n\n"
-    close_tags.each do |ct|
-      tag = Tag.find(ct[:id])
-      p tag.name+" | "+ct[:id].to_s+" | "+ct[:rank].to_s+" | "+tags[ct[:id]][:rank].to_s
-      tag.taggings.where(taggable_type: "LearningObject").each do |tagging|
-        lo = tagging.taggable
-        if los_ranked[lo.id].nil?
-          # Uncomment the next line if you want to sort by local tag frequency
-          # los_ranked[lo.id] = relevance_frequency_rank(lo, close_tags)
-
-          # Uncomment the next line if you want to sort by global tag frequency
-          # los_ranked[lo.id] = relevance_frequency_rank_global(lo, close_tags, freq_cluster)
-
-          # Uncomment the next line if you want to sort by tag cluster rank
-          los_ranked[lo.id] = relevance_raw_rank(lo, close_tags)
-        end
-      end
-    end
-    puts "============\n"
-
-    puts "\nSorting LOs...\n"
-    # sorts by its ranking
-    los_ranked = los_ranked.sort_by { |id, rank| rank }
-    # get highest ranks
-    los_ranked = los_ranked.reverse.first($N_LOS)
-
-    los_ranked.each do |key, value|
-      puts "#{key}: #{value}"
-    end
-
-    return los_ranked
-  end
-
-  def relevance_frequency_rank(lo, close_tags)
-    itf_sum = 0
-
-    wdf = 0
-    wdf = 1/(Math.log2(lo.tags.size)+1) if lo.tags.size != 0
-
-    lo.tags.each do |t|
-      close_tags.each do |tag|
-        if tag[:id] == t.id
-          itf_sum += tag[:rank]*(Math.log2(Tag.all.size/t.taggings.size)+1)
-        end
-      end
-    end
-
-    return wdf*itf_sum
-  end
-
-  # returns the sum of how many times each tag in cluster appears in space
-  def cluster_frequency(cluster)
-    freq_cluster = 0
-    cluster.each do |t|
-      freq_cluster += Tag.find(t[:id]).taggings.size
-    end
-    return freq_cluster
-  end
-
-  def relevance_frequency_rank_global(lo, close_tags, freq_cluster)
-    # for each tag in LO that is in the cluster, accumulate it's rank
-    rank_sum = 1
-    lo.tags.each do |t|
-      close_tags.each do |tag|
-        if tag[:id] == t.id
-          rank_sum += tag[:rank]
-        end
-      end
-    end
-
-    wdf = 0
-    wdf = (Math.log2(rank_sum)/(Math.log2(lo.tags.size)+1)) if lo.tags.size != 0
-
-    itf = Math.log2(Tag.all.size/freq_cluster)+1
-
-    return wdf*itf
-  end
-
-  # returns the rank sum of the tags in the LO
-  def relevance_raw_rank(lo, close_tags)
-    # for each tag in LO that is in the cluster, accumulate it's rank
-    rank_sum = 0
-    lo.tags.each do |t|
-      close_tags.each do |tag|
-        if tag[:id] == t.id
-          rank_sum += tag[:rank]
-        end
-      end
-    end
-    return rank_sum
-  end
-
-  def parseTree(path)
-    # parse .ftree into a 'tree', leafs are tags with flow number, branches are the clustes
-    # create tags list, tags[tagId] == the tag leaf inside the cluster tree
-
-    puts "\nParsing .ftree output into a 'tree'\n"
-
-    clusters = {childs: [], parent: nil}
-    tags = {}
-    countClusters = {}
-
-    File.open(path+".ftree", "r") do |f|
-      f.gets
-      f.gets
-
-      while line = f.gets
-        break if !line.include? ':'
-
-        tmp = line.split(' ')
-        # tmp[0] format like: 2:7:5:4, 2 an 7 have clusters as childs, 5 has leafs, 4 is one of them (tag)
-        # get id of each level of the tree
-        ftree = tmp[0].split(':')[0..-2]
-        # last number of the sequence is the leaf Id
-        leafId = tmp[0].split(':')[-1].to_i
-
-        # last number on the line is the tag Id
-        tagId = tmp[-1].to_i
-        # second number on the line
-        flow = tmp[1].to_f
-        # between the third and second to last is where the name lies
-        name = tmp[2..-2].join(' ')[1..-2]
-
-        # iterate through the levels of the tree
-        it = clusters # start at the root
-        ftree.each do |clusterId|
-          clusterId = clusterId.to_i - 1 # on the file they start at 1, we want 0
-          if it[:childs][clusterId].nil? # if this id doesn't exist, create it as child of 'it'
-            it[:childs][clusterId] = {childs: [], parent: nil}
-            it[:childs][clusterId][:parent] = it
-          end
-          # go down a level
-          it = it[:childs][clusterId]
-        end
-        countClusters[it] = 1 # set this cluster in this hash, for counting purposes
-        # 'it' is the cluster leafId is a child of, so add it
-        it[:childs][leafId-1] = {id: tagId, flow: flow, name: name, parent: it}
-        # put the leaf on this hash for easy acess by the tagId
-        tags[tagId] = it[:childs][leafId-1]
-      end
-    end
-
-    puts "\nNumber of clusters found: #{countClusters.size}\n"
-    return tags
-  end
-
 end # namespace
-- 
GitLab