diff --git a/app/services/tag_search_service.rb b/app/services/tag_search_service.rb index 2d973fb9132e57cbc9d2880c82fb3d7c6864b775..2744f5f4b35291d512abe43a3d9c83145d69a164 100644 --- a/app/services/tag_search_service.rb +++ b/app/services/tag_search_service.rb @@ -20,7 +20,6 @@ module TagSearchService # DEBUG = true FTREE_CACHE_KEY = "ftree".freeze - LO_CLUSTER_CACHE_KEY = "lo_cluster".freeze PAD = 1 PADM = 0.05 @@ -38,77 +37,25 @@ module TagSearchService end end - def lo_cluster_cache(tag_clusters) - Rails.cache.fetch(LO_CLUSTER_CACHE_KEY) do - lo_clusters(tag_clusters) - end - end - def search(tag, limit = -1) # Create clusters from ftree ftree = parse_ftree_cache() tag_cluster_hash = ftree[:tag_cluster_hash] - # NEW: Uncomment to test this, see if it takes too long for its benefits - # if it's good, put it in the tag_clustering.rake - # lo_cluster = lo_cluster_cache(ftree[:tag_clusters]) - - # puts "\nStarted Calculating relevant results for tag #{tag.name}\n" if DEBUG - relevant_los = find_relevant_results(tag.id, tag_cluster_hash, limit) - # relevant_los = find_relevant_results_v2(tag.id, tag_cluster_hash, lo_cluster, limit) ## - # NEW: Uncomment to test this, see if it takes too long for its benefits - - # puts "\n============ Learning Objects - Cluster Search ===============\n" - # puts "ID: Ranking | Name | Tags\n\n" - # relevant_los.each do |id, rank| - # lo = LearningObject.find(id) - # # puts "-----" - # p lo.id.to_s+": "+rank.to_s+" | "+lo.try(:name) - # lo.tags.each {|t| print t.name+" | "} - # # puts "" - # end search_los = LearningObject.search tag.name, limit: limit, explain:true - # puts "\n============ Learning Objects - Elasticsearch ===============\n" - # search_los.each do |lo| - # # puts "#{lo.id}: #{lo.search_hit['_score']}" - # end - # puts "ID: Ranking | Name | Tags\n\n" - # search_los.each do |lo| - # # puts "-----" - # p lo.id.to_s+": "+lo.search_hit["_score"].to_s+" | "+lo.try(:name) - # lo.tags.each {|t| print t.name+" | "} - # # puts "" - # end - merged_los = merge(search_los, relevant_los) - # puts "\n============ Learning Objects - Merged ===============\n" - # merged_los.each do |id, rank| - # # puts "#{id}: #{rank}" - # end - # # puts "ID: Ranking | Name | Tags\n\n" - # merged_los.each do |id, rank| - # lo = LearningObject.find(id) - # # puts "-----" - # p lo.id.to_s+": "+rank.to_s+" | "+lo.try(:name) - # lo.tags.each {|t| print t.name+" | "} - # # puts "" - # end - return merged_los.map { |lc| lc[0] } end def merge(search_los, relevant_los) - # puts "\n---------------------- MERGING -----------------------" if DEBUG merged_los = [] - # puts "Normalize search scores" if DEBUG max = search_los.first.search_hit['_score'] min = search_los[search_los.size-1].search_hit['_score'] - # puts "Max=#{max} Min=#{min}" if DEBUG max_boost = 0 response = search_los.response['hits']['hits'] search_los.each_with_index do |slo, i| @@ -120,20 +67,15 @@ module TagSearchService max_boost = boost if boost > max_boost slo.search_hit['_score'] = boost*(slo.search_hit['_score']*(1+PADM)-min)/(max-min) - # puts "#{slo.search_hit['_score']}" if DEBUG end - # puts "\nNormalize relevant scores" if DEBUG max = relevant_los.first[1] min = relevant_los.last[1] - # puts "Max=#{max} Min=#{min}" if DEBUG relevant_los.each do |rlc| rlc[1] = max_boost*(rlc[1]*(1+PADM)-min)/(max-min) - # puts "#{rlc[1]}" if DEBUG end if max != 0 - # puts "Changing search array format" if DEBUG search_los.each do |slo| relevant_los.each_with_index do |rlc, index| if slo.id == rlc[0].id @@ -142,11 +84,9 @@ module TagSearchService end end merged_los << [slo, slo.search_hit['_score']] - # puts "#{merged_los[merged_los.length - 1][0]}: #{merged_los[merged_los.length - 1][1]}" if DEBUG end merged_los.push(*relevant_los) - # puts "Sorting LOs...\n" if DEBUG # highest ranks first merged_los = merged_los.sort_by { |lc| lc[1]*-1 } return merged_los @@ -155,20 +95,13 @@ module TagSearchService def find_relevant_results(tagId, tag_cluster_hash, limit) los_ranked = {} - # puts "\nGetting tags from the same cluster\n" if DEBUG - # puts "Normalization with padding = #{PAD}\n" if DEBUG close_tags = ranked_close_tags(tagId, tag_cluster_hash) # Uncomment the next line if you want to sort by global tag frequency # freq_cluster = cluster_frequency(close_tags) - # puts "\nStarted Ranking LOs...\n" if DEBUG - - # puts "\n====== Close Tags =========\n" if DEBUG - # puts "Name | ID | Normalized Ranking\n\n" if DEBUG close_tags.each do |ct| tag = Tag.find(ct[:id]) - # puts tag.name+" | "+ct[:id].to_s+" | "+ct[:rank].to_s+" | "+tag_cluster_hash[ct[:id]][:rank].to_s if DEBUG tag.taggings.where(taggable_type: "LearningObject").each do |tagging| lo = tagging.taggable if los_ranked[lo].nil? @@ -180,12 +113,10 @@ module TagSearchService # Uncomment the next line if you want to sort by tag cluster rank los_ranked[lo] = relevance_raw_rank(lo, close_tags) - # puts "Found lo of id=#{lo.id} with rank=#{los_ranked[lo.id]}" if DEBUG end end end - # puts "Sorting LOs...\n" if DEBUG # highest ranks first los_ranked = los_ranked.sort_by { |lo, rank| rank*-1 } return los_ranked.first(limit) if limit > 0 @@ -194,7 +125,6 @@ module TagSearchService # ranking # def ranked_close_tags(tagId, tag_cluster_hash) - # puts "Rank close tags" if DEBUG close_tags = [] tag_cluster_hash[tagId][:parent][:childs].each do |t| @@ -204,7 +134,6 @@ module TagSearchService lg_dist = ( Math.log2(tag_cluster_hash[tagId][:flow]) - Math.log2(t[:flow]) ).abs close_tags << { id: t[:id], rank: lg_dist} - # puts "Rank for tag_id=#{close_tags[close_tags.length - 1][:id]}: #{close_tags[close_tags.length - 1][:rank]}" if DEBUG end return normalize_complement_close(close_tags) @@ -213,20 +142,17 @@ module TagSearchService # normalizes and complements, 0 distance will be 1, # max dist will be closest to 0 def normalize_complement_close(tags) - # puts "Normalize close tags" if DEBUG max = 0 # find max rank tags.each do |t| max = t[:rank] if t[:rank] > max end - # puts "Max rank found = #{max}" if DEBUG # normalize, min will always be 0 tags.each do |t| # increase max by $PAD so its rank isn't 0 t[:rank] = 1 - (t[:rank]/(max*(1 + PAD))) - # puts "Rank for tag_id=#{t[:id]}: #{t[:rank] }" if DEBUG end if max != 0 return tags @@ -281,12 +207,10 @@ module TagSearchService # returns the rank sum of the tags in the LO def relevance_raw_rank(lo, close_tags) # for each tag in LO that is in the cluster, accumulate it's rank - # puts "Calculate relevance_raw_rank for lo if id=#{lo.id}" if DEBUG rank_sum = 0 lo.tags.each do |t| close_tags.each do |tag| if tag[:id] == t.id - # puts rank_sum if DEBUG rank_sum += tag[:rank] end end @@ -299,8 +223,6 @@ module TagSearchService # parse .ftree into a 'tree', leafs are tags with flow number, branches are the clustes # create tags list, tags[tagId] == the tag leaf inside the cluster tree - # puts "\nParsing .ftree output into a 'tree'\n" if DEBUG - root = {childs: [], parent: nil} tag_cluster_hash = {} tag_clusters = [] @@ -344,76 +266,8 @@ module TagSearchService tag_cluster_hash[tagId] = it[:childs][leafId-1] end end - # puts "\nNumber of clusters found: #{tag_clusters.size}\n" if DEBUG return {tag_cluster_hash: tag_cluster_hash, tag_clusters: tag_clusters} end - # NEW: test this, see if it takes too long for its benefits - - def find_relevant_results_v2(tagId, tag_cluster_hash, lo_cluster, limit) - los_ranked = {} - - # puts "\nGetting tags from the same cluster\n" if DEBUG - # puts "Normalization with padding = #{PAD}\n" if DEBUG - close_tags = ranked_close_tags(tagId, tag_cluster_hash) - - # Uncomment the next line if you want to sort by global tag frequency - # freq_cluster = cluster_frequency(close_tags) - - # puts "\nStarted Ranking LOs...\n" if DEBUG - - # puts "\n====== Close Tags =========\n" if DEBUG - # puts "Name | ID | Normalized Ranking\n\n" if DEBUG - - # NEW: for each LO in the cluster of this tag - lo_cluster[tagId].each do |lo_id| - lo = LearningObject.find(lo_id) - # Uncomment the next line if you want to sort by local tag frequency - # los_ranked[lo.id] = relevance_frequency_rank(lo, close_tags) - - # Uncomment the next line if you want to sort by global tag frequency - # los_ranked[lo.id] = relevance_frequency_rank_global(lo, close_tags, freq_cluster) - - # Uncomment the next line if you want to sort by tag cluster rank - los_ranked[lo] = relevance_raw_rank(lo, close_tags) - # puts "Found lo of id=#{lo.id} with rank=#{los_ranked[lo.id]}" if DEBUG - end - - # puts "Sorting LOs...\n" if DEBUG - # highest ranks first - los_ranked = los_ranked.sort_by { |lo, rank| rank*-1 } - return los_ranked.first(limit) if limit > 0 - return los_ranked - end - - - # returns a hash keyed by tag id, - # contains an array with all learning object ids that the cluster contains - def lo_clusters(tag_clusters) - lo_clusters = {} - - tag_clusters.each do |tag_cluster| - # for each cluster, find all LOs tagged by the tags in the cluster - # insert their ids in lo_cluster array - lo_cluster = [] - aux = {} - # for each tag in the cluster - tag_cluster[:childs].each do |leaf| - tag_id = leaf[:id] - # save the reference in the tag_id hash - lo_clusters[tag_id] = lo_cluster - tag = Tag.find(tag_id) - # find LOs with this tag - tag.taggings.where(taggable_type: "LearningObject").each do |tagging| - lo = tagging.taggable - if aux[lo.id].nil? - aux[lo.id] = 1 - lo_cluster << lo.id - end - end - end - end - return lo_clusters - end end diff --git a/lib/tasks/tag_clustering.rake b/lib/tasks/tag_clustering.rake index 3755acb58395e89312623f95bae1317f4b2a914e..83a036205ab55c348f7863e328100f3bda400ca3 100644 --- a/lib/tasks/tag_clustering.rake +++ b/lib/tasks/tag_clustering.rake @@ -22,7 +22,6 @@ namespace :tag do desc 'Generate tag clusters' task :generate_clusters => [:environment] do include TagSearchService - # DEBUG = true graph_path = TagSearchService.file_path+".net" # Create hash of tag co occurrence @@ -41,7 +40,6 @@ namespace :tag do # hash[id1][id2] will equal how many times tags with id1 and id2 appear together on a LO def create_hash() - # puts "\nCreating hash of tag concurrences\n" if DEBUG hash = {} LearningObject.all.each do |lo| @@ -75,7 +73,6 @@ namespace :tag do end def create_pajek_net_graph(hash, path) - # puts "\nCreating pajek net graph on path #{path}\n" if DEBUG File.open(path, "w+") do |f| f << "*Vertices #{Tag.all.size}\n" @@ -100,8 +97,7 @@ namespace :tag do end def infomap_ftree(graph_path, out_dir) - # puts "\nCalling infomap with the pajek graph\n\n" if DEBUG - system("Infomap --ftree #{graph_path} #{TagSearchService.root_dir}") + system("infomap --ftree #{graph_path} #{TagSearchService.root_dir}") if $?.exitstatus != 0 puts "-- Error on Infomap call" puts "-- Make sure you have infomap in your executable path"