From 4a76d518b798b88d2f67a33aee8859c97589fb4c Mon Sep 17 00:00:00 2001
From: bfs15 <bruno.serbena@gmail.com>
Date: Fri, 14 Jul 2017 12:15:36 -0300
Subject: [PATCH] Added new way of finding LOs in the same cluster & changed
 names

---
 app/services/tag_search_service.rb | 123 ++++++++++++++++++++++++-----
 lib/tasks/tag_clustering.rake      |   6 +-
 2 files changed, 104 insertions(+), 25 deletions(-)

diff --git a/app/services/tag_search_service.rb b/app/services/tag_search_service.rb
index 0a1dafd1..55252ed1 100644
--- a/app/services/tag_search_service.rb
+++ b/app/services/tag_search_service.rb
@@ -19,7 +19,8 @@
 
 module TagSearchService
   # DEBUG = true
-  CACHE_KEY = "tag_clusters".freeze
+  FTREE_CACHE_KEY = "ftree".freeze
+  LO_CLUSTER_CACHE_KEY = "lo_cluster".freeze
   PAD = 1
   PADM = 0.05
 
@@ -31,19 +32,32 @@ module TagSearchService
     Rails.root.join(root_dir, "tags").to_s
   end
 
-  def tags_cluster
-    Rails.cache.fetch(CACHE_KEY) do
-      parseFtree(file_path)
+  # Returns the parsed .ftree data from the Rails cache; on a cache miss
+  # parse_ftree(file_path) is run and its result is stored under FTREE_CACHE_KEY.
+  def parse_ftree_cache
+    Rails.cache.fetch(FTREE_CACHE_KEY) { parse_ftree(file_path) }
+  end
+
+  def lo_cluster_cache(tag_clusters) # cached tag id => LO ids hash (see lo_clusters)
+    Rails.cache.fetch(LO_CLUSTER_CACHE_KEY) do # NOTE(review): key is not cleared by tag_clustering.rake
+      lo_clusters(tag_clusters) # runs only on a cache miss
     end
   end
 
   def search(tag, limit = -1)
     # Create clusters from ftree
-    tags = tags_cluster()
+    ftree = parse_ftree_cache()
+    tag_cluster_hash = ftree[:tag_cluster_hash]
+
+    # NEW (experimental): uncomment the next line to try the LO-cluster lookup and
+    # measure whether it is worth its cost; if so, precompute it in tag_clustering.rake
+    # lo_cluster = lo_cluster_cache(ftree[:tag_clusters])
 
     # puts "\nStarted Calculating relevant results for tag #{tag.name}\n" if DEBUG
 
-    relevant_los = find_relevant_results(tag.id, tags, limit)
+    relevant_los = find_relevant_results(tag.id, tag_cluster_hash, limit)
+    # relevant_los = find_relevant_results_v2(tag.id, tag_cluster_hash, lo_cluster, limit) ##
+    # NEW (experimental): uncomment the line above to test v2 and check whether it is worth its cost
 
     # puts "\n============ Learning Objects - Cluster Search ===============\n"
     # puts "ID: Ranking | Name | Tags\n\n"
@@ -138,12 +152,12 @@ module TagSearchService
     return merged_los
   end
 
-  def find_relevant_results(tagId, tags, limit)
+  def find_relevant_results(tagId, tag_cluster_hash, limit)
     los_ranked = {}
 
     # puts "\nGetting tags from the same cluster\n" if DEBUG
     # puts "Normalization with padding = #{PAD}\n" if DEBUG
-    close_tags = ranked_close_tags(tagId, tags)
+    close_tags = ranked_close_tags(tagId, tag_cluster_hash)
 
     # Uncomment the next line if you want to sort by global tag frequency
     # freq_cluster = cluster_frequency(close_tags)
@@ -154,7 +168,7 @@ module TagSearchService
     # puts "Name | ID | Normalized Ranking\n\n" if DEBUG
     close_tags.each do |ct|
       tag = Tag.find(ct[:id])
-      # puts tag.name+" | "+ct[:id].to_s+" | "+ct[:rank].to_s+" | "+tags[ct[:id]][:rank].to_s if DEBUG
+      # puts tag.name+" | "+ct[:id].to_s+" | "+ct[:rank].to_s+" | "+tag_cluster_hash[ct[:id]][:rank].to_s if DEBUG
       tag.taggings.where(taggable_type: "LearningObject").each do |tagging|
         lo = tagging.taggable
         if los_ranked[lo].nil?
@@ -179,15 +193,15 @@ module TagSearchService
   end
 
   # ranking #
-  def ranked_close_tags(tagId, tags)
+  def ranked_close_tags(tagId, tag_cluster_hash)
     # puts "Rank close tags" if DEBUG
     close_tags = []
 
-    tags[tagId][:parent][:childs].each do |t|
+    tag_cluster_hash[tagId][:parent][:childs].each do |t|
       # calculate logarithmic distance between tag flows
       # lower value, closer, more relevant
       # the tag you are searching for will be at distance 0 of itself
-      lg_dist = ( Math.log2(tags[tagId][:flow]) - Math.log2(t[:flow]) ).abs
+      lg_dist = ( Math.log2(tag_cluster_hash[tagId][:flow]) - Math.log2(t[:flow]) ).abs
 
       close_tags << { id: t[:id], rank: lg_dist}
       # puts "Rank for tag_id=#{close_tags[close_tags.length - 1][:id]}: #{close_tags[close_tags.length - 1][:rank]}" if DEBUG
@@ -281,15 +295,15 @@ module TagSearchService
     return rank_sum
   end
 
-  def parseFtree(path)
+  def parse_ftree(path)
     # parse .ftree into a 'tree', leafs are tags with flow number, branches are the clustes
     # create tags list, tags[tagId] == the tag leaf inside the cluster tree
 
     # puts "\nParsing .ftree output into a 'tree'\n" if DEBUG
 
-    clusters = {childs: [], parent: nil}
-    tags = {}
-    countClusters = {}
+    root = {childs: [], parent: nil}
+    tag_cluster_hash = {}
+    tag_clusters = []
 
     File.open(path+".ftree", "r") do |f|
       f.gets
@@ -313,7 +327,7 @@ module TagSearchService
         name = tmp[2..-2].join(' ')[1..-2]
 
         # iterate through the levels of the tree
-        it = clusters # start at the root
+        it = root # restart at the root
         ftree.each do |clusterId|
           clusterId = clusterId.to_i - 1 # on the file they start at 1, we want 0
           if it[:childs][clusterId].nil? # if this id doesn't exist, create it as child of 'it'
@@ -323,16 +337,83 @@ module TagSearchService
           # go down a level
           it = it[:childs][clusterId]
         end
-        countClusters[it] = 1 # set this cluster in this hash, for counting purposes
+        tag_clusters << it
         # 'it' is the cluster leafId is a child of, so add it
         it[:childs][leafId-1] = {id: tagId, flow: flow, name: name, parent: it}
         # put the leaf on this hash for easy acess by the tagId
-        tags[tagId] = it[:childs][leafId-1]
+        tag_cluster_hash[tagId] = it[:childs][leafId-1]
       end
     end
-    # puts "\nNumber of clusters found: #{countClusters.size}\n" if DEBUG
+    # puts "\nNumber of clusters found: #{tag_clusters.size}\n" if DEBUG
 
-    return tags
+    return {tag_cluster_hash: tag_cluster_hash, tag_clusters: tag_clusters}
+  end
+
+  # NEW (experimental): benchmark this to see whether it is too slow for the benefit it brings
+
+  # Experimental alternative to find_relevant_results: ranks the learning objects
+  # (LOs) already collected for the searched tag's cluster.
+  #
+  # tagId            - id of the searched tag; used as key in both hashes below
+  # tag_cluster_hash - tag id => leaf hash, the :tag_cluster_hash part of parse_ftree
+  # lo_cluster       - tag id => array of LO ids, as built by lo_clusters
+  # limit            - maximum number of results; values <= 0 return everything
+  #
+  # Returns an array of [lo, rank] pairs, highest rank first.
+  def find_relevant_results_v2(tagId, tag_cluster_hash, lo_cluster, limit)
+    # puts "\nGetting tags from the same cluster\n" if DEBUG
+    # puts "Normalization with padding = #{PAD}\n" if DEBUG
+    close_tags = ranked_close_tags(tagId, tag_cluster_hash)
+
+    # Uncomment the next line if you want to sort by global tag frequency
+    # freq_cluster = cluster_frequency(close_tags)
+
+    # puts "\nStarted Ranking LOs...\n" if DEBUG
+
+    # Accumulate one rank per LO, keyed by the loaded record itself
+    ranks = lo_cluster[tagId].each_with_object({}) do |lo_id, acc|
+      lo = LearningObject.find(lo_id)
+      # To sort by local tag frequency use instead:
+      #   acc[lo] = relevance_frequency_rank(lo, close_tags)
+      # To sort by global tag frequency use instead (needs freq_cluster above):
+      #   acc[lo] = relevance_frequency_rank_global(lo, close_tags, freq_cluster)
+      # Default: sort by tag cluster rank
+      acc[lo] = relevance_raw_rank(lo, close_tags)
+      # puts "Found lo of id=#{lo.id} with rank=#{acc[lo]}" if DEBUG
+    end
+
+    # puts "Sorting LOs...\n" if DEBUG
+    ordered = ranks.sort_by { |_lo, rank| -rank } # highest ranks first
+    limit > 0 ? ordered.first(limit) : ordered
   end
 
+
+  # Builds the LO (learning object) lookup used by find_relevant_results_v2.
+  #
+  # tag_clusters - array of cluster hashes from parse_ftree; the same cluster may
+  #                appear several times (parse_ftree appends it once per leaf)
+  #
+  # Returns a hash keyed by tag id. Every tag of a cluster maps to the same array,
+  # which holds the ids of all LOs tagged with any tag of that cluster (no repeats).
+  def lo_clusters(tag_clusters)
+    result = {}
+    # Visit each distinct cluster object once; without this the queries below
+    # are repeated for every leaf of the cluster.
+    tag_clusters.uniq(&:object_id).each do |tag_cluster|
+      lo_ids = []
+      seen = {}
+      tag_cluster[:childs].each do |leaf|
+        tag_id = leaf[:id]
+        # all tags of the cluster share one array reference
+        result[tag_id] = lo_ids
+        tag = Tag.find(tag_id)
+        tag.taggings.where(taggable_type: "LearningObject").each do |tagging|
+          lo_id = tagging.taggable.id
+          lo_ids << lo_id unless seen.key?(lo_id)
+          seen[lo_id] = true
+        end
+      end
+    end
+    result
+  end
 end
diff --git a/lib/tasks/tag_clustering.rake b/lib/tasks/tag_clustering.rake
index a234d809..3755acb5 100644
--- a/lib/tasks/tag_clustering.rake
+++ b/lib/tasks/tag_clustering.rake
@@ -33,10 +33,8 @@ namespace :tag do
     infomap_ftree(graph_path, TagSearchService.root_dir)
 
     # Cluster needs to be read from disk again, so clear cache of TagSearchService
-    Rails.cache.delete(TagSearchService::CACHE_KEY)
-    Rails.cache.fetch(TagSearchService::CACHE_KEY) do
-      parseFtree(file_path)
-    end
+    Rails.cache.delete(TagSearchService::FTREE_CACHE_KEY)
+    TagSearchService::parse_ftree_cache()
   end # task
 
   private
-- 
GitLab