From 4a76d518b798b88d2f67a33aee8859c97589fb4c Mon Sep 17 00:00:00 2001 From: bfs15 <bruno.serbena@gmail.com> Date: Fri, 14 Jul 2017 12:15:36 -0300 Subject: [PATCH] Added new way of finding LOs in the same cluster & changed names --- app/services/tag_search_service.rb | 123 ++++++++++++++++++++++++----- lib/tasks/tag_clustering.rake | 6 +- 2 files changed, 104 insertions(+), 25 deletions(-) diff --git a/app/services/tag_search_service.rb b/app/services/tag_search_service.rb index 0a1dafd1..55252ed1 100644 --- a/app/services/tag_search_service.rb +++ b/app/services/tag_search_service.rb @@ -19,7 +19,8 @@ module TagSearchService # DEBUG = true - CACHE_KEY = "tag_clusters".freeze + FTREE_CACHE_KEY = "ftree".freeze + LO_CLUSTER_CACHE_KEY = "lo_cluster".freeze PAD = 1 PADM = 0.05 @@ -31,19 +32,32 @@ module TagSearchService Rails.root.join(root_dir, "tags").to_s end - def tags_cluster - Rails.cache.fetch(CACHE_KEY) do - parseFtree(file_path) + def parse_ftree_cache + Rails.cache.fetch(FTREE_CACHE_KEY) do + parse_ftree(file_path) + end + end + + def lo_cluster_cache(tag_clusters) + Rails.cache.fetch(LO_CLUSTER_CACHE_KEY) do + lo_clusters(tag_clusters) end end def search(tag, limit = -1) # Create clusters from ftree - tags = tags_cluster() + ftree = parse_ftree_cache() + tag_cluster_hash = ftree[:tag_cluster_hash] + + # NEW: Uncomment to test this, see if it takes too long for its benefits + # if it's good, put it in the tag_clustering.rake + # lo_cluster = lo_cluster_cache(ftree[:tag_clusters]) # puts "\nStarted Calculating relevant results for tag #{tag.name}\n" if DEBUG - relevant_los = find_relevant_results(tag.id, tags, limit) + relevant_los = find_relevant_results(tag.id, tag_cluster_hash, limit) + # relevant_los = find_relevant_results_v2(tag.id, tag_cluster_hash, lo_cluster, limit) ## + # NEW: Uncomment to test this, see if it takes too long for its benefits # puts "\n============ Learning Objects - Cluster Search ===============\n" # puts "ID: Ranking | Name | 
Tags\n\n" @@ -138,12 +152,12 @@ module TagSearchService return merged_los end - def find_relevant_results(tagId, tags, limit) + def find_relevant_results(tagId, tag_cluster_hash, limit) los_ranked = {} # puts "\nGetting tags from the same cluster\n" if DEBUG # puts "Normalization with padding = #{PAD}\n" if DEBUG - close_tags = ranked_close_tags(tagId, tags) + close_tags = ranked_close_tags(tagId, tag_cluster_hash) # Uncomment the next line if you want to sort by global tag frequency # freq_cluster = cluster_frequency(close_tags) @@ -154,7 +168,7 @@ module TagSearchService # puts "Name | ID | Normalized Ranking\n\n" if DEBUG close_tags.each do |ct| tag = Tag.find(ct[:id]) - # puts tag.name+" | "+ct[:id].to_s+" | "+ct[:rank].to_s+" | "+tags[ct[:id]][:rank].to_s if DEBUG + # puts tag.name+" | "+ct[:id].to_s+" | "+ct[:rank].to_s+" | "+tag_cluster_hash[ct[:id]][:rank].to_s if DEBUG tag.taggings.where(taggable_type: "LearningObject").each do |tagging| lo = tagging.taggable if los_ranked[lo].nil? 
@@ -179,15 +193,15 @@ module TagSearchService end # ranking # - def ranked_close_tags(tagId, tags) + def ranked_close_tags(tagId, tag_cluster_hash) # puts "Rank close tags" if DEBUG close_tags = [] - tags[tagId][:parent][:childs].each do |t| + tag_cluster_hash[tagId][:parent][:childs].each do |t| # calculate logarithmic distance between tag flows # lower value, closer, more relevant # the tag you are searching for will be at distance 0 of itself - lg_dist = ( Math.log2(tags[tagId][:flow]) - Math.log2(t[:flow]) ).abs + lg_dist = ( Math.log2(tag_cluster_hash[tagId][:flow]) - Math.log2(t[:flow]) ).abs close_tags << { id: t[:id], rank: lg_dist} # puts "Rank for tag_id=#{close_tags[close_tags.length - 1][:id]}: #{close_tags[close_tags.length - 1][:rank]}" if DEBUG @@ -281,15 +295,15 @@ module TagSearchService return rank_sum end - def parseFtree(path) + def parse_ftree(path) # parse .ftree into a 'tree', leafs are tags with flow number, branches are the clustes # create tags list, tags[tagId] == the tag leaf inside the cluster tree # puts "\nParsing .ftree output into a 'tree'\n" if DEBUG - clusters = {childs: [], parent: nil} - tags = {} - countClusters = {} + root = {childs: [], parent: nil} + tag_cluster_hash = {} + tag_clusters = [] File.open(path+".ftree", "r") do |f| f.gets @@ -313,7 +327,7 @@ module TagSearchService name = tmp[2..-2].join(' ')[1..-2] # iterate through the levels of the tree - it = clusters # start at the root + it = root # restart at the root ftree.each do |clusterId| clusterId = clusterId.to_i - 1 # on the file they start at 1, we want 0 if it[:childs][clusterId].nil? 
# if this id doesn't exist, create it as child of 'it' @@ -323,16 +337,83 @@ module TagSearchService # go down a level it = it[:childs][clusterId] end - countClusters[it] = 1 # set this cluster in this hash, for counting purposes + tag_clusters << it # 'it' is the cluster leafId is a child of, so add it it[:childs][leafId-1] = {id: tagId, flow: flow, name: name, parent: it} # put the leaf on this hash for easy acess by the tagId - tags[tagId] = it[:childs][leafId-1] + tag_cluster_hash[tagId] = it[:childs][leafId-1] end end - # puts "\nNumber of clusters found: #{countClusters.size}\n" if DEBUG + # puts "\nNumber of clusters found: #{tag_clusters.size}\n" if DEBUG - return tags + return {tag_cluster_hash: tag_cluster_hash, tag_clusters: tag_clusters} + end + + # NEW: test this, see if it takes too long for its benefits + + def find_relevant_results_v2(tagId, tag_cluster_hash, lo_cluster, limit) + los_ranked = {} + + # puts "\nGetting tags from the same cluster\n" if DEBUG + # puts "Normalization with padding = #{PAD}\n" if DEBUG + close_tags = ranked_close_tags(tagId, tag_cluster_hash) + + # Uncomment the next line if you want to sort by global tag frequency + # freq_cluster = cluster_frequency(close_tags) + + # puts "\nStarted Ranking LOs...\n" if DEBUG + + # puts "\n====== Close Tags =========\n" if DEBUG + # puts "Name | ID | Normalized Ranking\n\n" if DEBUG + + # NEW: for each LO in the cluster of this tag + lo_cluster[tagId].each do |lo_id| + lo = LearningObject.find(lo_id) + # Uncomment the next line if you want to sort by local tag frequency + # los_ranked[lo.id] = relevance_frequency_rank(lo, close_tags) + + # Uncomment the next line if you want to sort by global tag frequency + # los_ranked[lo.id] = relevance_frequency_rank_global(lo, close_tags, freq_cluster) + + # Uncomment the next line if you want to sort by tag cluster rank + los_ranked[lo] = relevance_raw_rank(lo, close_tags) + # puts "Found lo of id=#{lo.id} with rank=#{los_ranked[lo]}" if DEBUG + 
end + + # puts "Sorting LOs...\n" if DEBUG + # highest ranks first + los_ranked = los_ranked.sort_by { |lo, rank| rank*-1 } + return los_ranked.first(limit) if limit > 0 + return los_ranked end + + # returns a hash keyed by tag id, + # contains an array with all learning object ids that the cluster contains + def lo_clusters(tag_clusters) + lo_clusters = {} + + tag_clusters.uniq(&:object_id).each do |tag_cluster| + # for each distinct cluster (parse_ftree lists a cluster once per leaf), find all LOs tagged by the tags in the cluster + # insert their ids in lo_cluster array + lo_cluster = [] + aux = {} + # for each tag in the cluster + tag_cluster[:childs].each do |leaf| + tag_id = leaf[:id] + # save the reference in the tag_id hash + lo_clusters[tag_id] = lo_cluster + tag = Tag.find(tag_id) + # find LOs with this tag + tag.taggings.where(taggable_type: "LearningObject").each do |tagging| + lo = tagging.taggable + if aux[lo.id].nil? + aux[lo.id] = 1 + lo_cluster << lo.id + end + end + end + end + return lo_clusters + end end diff --git a/lib/tasks/tag_clustering.rake b/lib/tasks/tag_clustering.rake index a234d809..3755acb5 100644 --- a/lib/tasks/tag_clustering.rake +++ b/lib/tasks/tag_clustering.rake @@ -33,10 +33,8 @@ namespace :tag do infomap_ftree(graph_path, TagSearchService.root_dir) # Cluster needs to be read from disk again, so clear cache of TagSearchService - Rails.cache.delete(TagSearchService::CACHE_KEY) - Rails.cache.fetch(TagSearchService::CACHE_KEY) do - parseFtree(file_path) - end + Rails.cache.delete(TagSearchService::FTREE_CACHE_KEY) + TagSearchService::parse_ftree_cache() end # task private -- GitLab