From 6505ee6850f6d57e5b491619f1b4abe54c80708f Mon Sep 17 00:00:00 2001 From: bfs15 <bruno.serbena@gmail.com> Date: Fri, 7 Jul 2017 11:24:47 -0300 Subject: [PATCH] tag clustering task now only creates .ftree --- lib/tasks/tag_clustering.rake | 380 +--------------------------------- 1 file changed, 10 insertions(+), 370 deletions(-) diff --git a/lib/tasks/tag_clustering.rake b/lib/tasks/tag_clustering.rake index c26721a5..bb024434 100644 --- a/lib/tasks/tag_clustering.rake +++ b/lib/tasks/tag_clustering.rake @@ -19,187 +19,24 @@ require 'json' namespace :tag do desc 'Generate tag clusters' - task :generate_clusters, [:tag_name, :n_los, :infomap] => [:environment] do |_t, args| + task :generate_clusters => [:environment] do + # DEBUG = true - args.with_defaults(n_los: 50, infomap: false) + graph_path = TagSearchService.file_path+".net" + hash = create_hash() + create_pajek_net_graph(hash, graph_path) - if args.tag_name.blank? - puts "\nERROR: You must pass a tag name as argument." - puts "Example usage for searchig with the tag test, result with 50 learning objects and generating a new cluster from infomap:" - puts "$ rails tag:generate_clusters[test,50,true]" - abort - end - - # searched tag - $TAG = Tag.find_by_name(args.tag_name) - - if $TAG.blank? - abort("\nERROR: Tag #{args.tag_name} not found.") - end - - # Padding on tag distance normalization - $PAD = 1 # if == 1, min tag dist will be 0.5 (1 - (max/max*(1+$PAD))) - - # Padding on merge min maxing - $PADM = 0.05 - - # number of results to return - $N_LOS = args.n_los.to_i - - outDIR = "tmp" - fileName = "tags" - path = Rails.root.join(outDIR, fileName).to_s - - if !File.file?(path+".ftree") || args.infomap - graphPath = path+".net" - hash = create_hash() - create_pajek_net_graph(hash, graphPath) - - puts "\nCalling infomap with the pajek graph\n\n" - system("infomap --ftree #{graphPath} #{Rails.root.join(outDIR)}") - end - - # create cluster tree from ftree - tags = parseTree(path) - - puts "\nStarted Calculating relevant results for tag #{$TAG.name}\n" - - relevant_los = find_relevant_results($TAG.id, tags) - - puts "\n============ Learning Objects - Cluster Search ===============\n" - puts "ID: Ranking | Name | Tags\n\n" - relevant_los.each do |id, rank| - lo = LearningObject.find(id) - puts "-----" - p lo.id.to_s+": "+rank.to_s+" | "+lo.try(:name) - lo.tags.each {|t| print t.name+" | "} - puts "" - end - - search_los = LearningObject.search $TAG.name, limit: $N_LOS, explain:true - - # print search_los.response["hits"]["hits"].to_json - # p wdfSearch("Educação Infatil") - # p tagWdfSorted() - - puts "\n============ Learning Objects - Elasticsearch ===============\n" - search_los.each do |lo| - puts "#{lo.id}: #{lo.search_hit['_score']}" - end - puts "ID: Ranking | Name | Tags\n\n" - search_los.each do |lo| - puts "-----" - p lo.id.to_s+": "+lo.search_hit["_score"].to_s+" | "+lo.try(:name) - lo.tags.each {|t| print t.name+" | "} - puts "" - end - - merged_los = merge(search_los, relevant_los) - - puts "\n============ Learning Objects - Merged ===============\n" - merged_los.each do |id, rank| - puts "#{id}: #{rank}" - end - puts "ID: Ranking | Name | Tags\n\n" - merged_los.each do |id, rank| - lo = LearningObject.find(id) - puts "-----" - p lo.id.to_s+": "+rank.to_s+" | "+lo.try(:name) - lo.tags.each {|t| print t.name+" | "} - puts "" - end + # puts "\nCalling infomap with the pajek graph\n\n" if DEBUG + system("infomap --ftree #{graph_path} #{TagSearchService.root_dir}") + Rails.cache.delete(TagSearchService::CACHE_KEY) end # task private - def tagWdfSorted() - tagWdf = [] - - Tag.all.each_with_index do |t,i| - df = wdfSearch(t.name) - print "ERROR #{t.name}\n" if df == -1 - tagWdf << [t.id, df, t.name] if df != -1 - end - - tagWdf = tagWdf.sort_by { |t| t[1] } - - return tagWdf - end - - def findScores(obj) - if !obj["description"].match(/weight\(\w*\.word_start:/).nil? - if obj["details"][0]["details"][0]["description "] == "boost" && obj["details"][0]["details"][0]["value"] == 10 - if obj["details"][0]["details"][1]["details"][0]["description"] == "docFreq" - return obj["details"][0]["details"][1]["details"][0]["value"] - else - return 0 - end - else - return 0 - end - end - sum = 0 - obj["details"].each do |detail| - sum += findScores(detail) - end - return sum - end - - def wdfSearch(tname) - search_los = LearningObject.search tname, limit: 1, explain:true - return -1 if search_los.blank? - details = search_los.response['hits']['hits'][0]['_explanation'] - - sum = findScores(details) - return sum - end - - def merge(search_los, relevant_los) - puts "\n---------------------- MERGING -----------------------" - merged_los = [] - - max = search_los.first.search_hit['_score'] - min = search_los[search_los.size-1].search_hit['_score'] - # min = 0 - max_boost = 0 - response = search_los.response['hits']['hits'] - search_los.each_with_index do |slo, i| - detail = response[i]['_explanation']['details'][0] - while detail['description'] != "boost" - detail = detail['details'][0] - end - boost = detail['value'] - max_boost = boost if boost > max_boost - - slo.search_hit['_score'] = boost*(slo.search_hit['_score']*(1+$PADM)-min)/(max-min) - end - - max = relevant_los.first[1] - min = relevant_los.last[1] - puts "\nMax boost found: "+max_boost.to_s - relevant_los.each do |rlo| - rlo[1] = max_boost*(rlo[1]*(1+$PADM)-min)/(max-min) - end - - search_los.each do |slo| - relevant_los.each_with_index do |rlo, index| - if slo.id == rlo[0] - slo.search_hit['_score'] = slo.search_hit['_score'] + rlo[1] - relevant_los.delete_at(index) - end - end - merged_los << [slo.id, slo.search_hit['_score']] - end - - merged_los.push(*relevant_los) - merged_los = merged_los.sort_by { |lo| lo[1]*-1 } - return merged_los.first(50) - end - # hash[id1][id2] will equal how many times tags with id1 and id2 appear together on a LO def create_hash() - puts "\nCreating hash of tag concurrences\n" + # puts "\nCreating hash of tag concurrences\n" if DEBUG hash = {} LearningObject.all.each do |lo| @@ -235,9 +72,8 @@ namespace :tag do return hash end - def create_pajek_net_graph(hash, path) - puts "\nCreating pajek net graph on path #{path}\n" + # puts "\nCreating pajek net graph on path #{path}\n" if DEBUG File.open(path, "w+") do |f| f << "*Vertices #{Tag.all.size}\n" @@ -260,200 +96,4 @@ namespace :tag do end end end - - # ranking # - - def ranked_close_tags(tagId, tags) - close_tags = [] - - tags[tagId][:parent][:childs].each do |t1| - # calculate logarithmic distance between tag flows - # lower value, closer, more relevant - # the tag you are searching for will be at distance 0 of itself - lg_dist = ( Math.log2(tags[tagId][:flow]) - Math.log2(t1[:flow]) ).abs - - close_tags << { id: t1[:id], rank: lg_dist} - end - - return normalize_complement_close(close_tags) - end - - # normalizes and complements, 0 distance will be 1, - # max dist will be closest to 0 - def normalize_complement_close(tags) - max = 0 - - # find max rank - tags.each do |t| - max = t[:rank] if t[:rank] > max - end - - # normalize, min will always be 0 - tags.each do |t| - # increase max by $PAD so its rank isn't 0 - t[:rank] = 1 - (t[:rank]/(max*(1 + $PAD))) - end - - return tags - end - - def find_relevant_results(tagId, tags) - los_ranked = {} - - puts "\nGetting tags from the same cluster\n" - puts "Normalization with padding = #{$PAD}\n" - close_tags = ranked_close_tags(tagId, tags) - - # Uncomment the next line if you want to sort by global tag frequency - # freq_cluster = cluster_frequency(close_tags) - - puts "\nStarted Ranking LOs...\n" - - puts "\n====== Close Tags =========\n" - puts "Name | ID | Normalized Ranking\n\n" - close_tags.each do |ct| - tag = Tag.find(ct[:id]) - p tag.name+" | "+ct[:id].to_s+" | "+ct[:rank].to_s+" | "+tags[ct[:id]][:rank].to_s - tag.taggings.where(taggable_type: "LearningObject").each do |tagging| - lo = tagging.taggable - if los_ranked[lo.id].nil? - # Uncomment the next line if you want to sort by local tag frequency - # los_ranked[lo.id] = relevance_frequency_rank(lo, close_tags) - - # Uncomment the next line if you want to sort by global tag frequency - # los_ranked[lo.id] = relevance_frequency_rank_global(lo, close_tags, freq_cluster) - - # Uncomment the next line if you want to sort by tag cluster rank - los_ranked[lo.id] = relevance_raw_rank(lo, close_tags) - end - end - end - puts "============\n" - - puts "\nSorting LOs...\n" - # sorts by its ranking - los_ranked = los_ranked.sort_by { |id, rank| rank } - # get highest ranks - los_ranked = los_ranked.reverse.first($N_LOS) - - los_ranked.each do |key, value| - puts "#{key}: #{value}" - end - - return los_ranked - end - - def relevance_frequency_rank(lo, close_tags) - itf_sum = 0 - - wdf = 0 - wdf = 1/(Math.log2(lo.tags.size)+1) if lo.tags.size != 0 - - lo.tags.each do |t| - close_tags.each do |tag| - if tag[:id] == t.id - itf_sum += tag[:rank]*(Math.log2(Tag.all.size/t.taggings.size)+1) - end - end - end - - return wdf*itf_sum - end - - # returns the sum of how many times each tag in cluster appears in space - def cluster_frequency(cluster) - freq_cluster = 0 - cluster.each do |t| - freq_cluster += Tag.find(t[:id]).taggings.size - end - return freq_cluster - end - - def relevance_frequency_rank_global(lo, close_tags, freq_cluster) - # for each tag in LO that is in the cluster, accumulate it's rank - rank_sum = 1 - lo.tags.each do |t| - close_tags.each do |tag| - if tag[:id] == t.id - rank_sum += tag[:rank] - end - end - end - - wdf = 0 - wdf = (Math.log2(rank_sum)/(Math.log2(lo.tags.size)+1)) if lo.tags.size != 0 - - itf = Math.log2(Tag.all.size/freq_cluster)+1 - - return wdf*itf - end - - # returns the rank sum of the tags in the LO - def relevance_raw_rank(lo, close_tags) - # for each tag in LO that is in the cluster, accumulate it's rank - rank_sum = 0 - lo.tags.each do |t| - close_tags.each do |tag| - if tag[:id] == t.id - rank_sum += tag[:rank] - end - end - end - return rank_sum - end - - def parseTree(path) - # parse .ftree into a 'tree', leafs are tags with flow number, branches are the clustes - # create tags list, tags[tagId] == the tag leaf inside the cluster tree - - puts "\nParsing .ftree output into a 'tree'\n" - - clusters = {childs: [], parent: nil} - tags = {} - countClusters = {} - - File.open(path+".ftree", "r") do |f| - f.gets - f.gets - - while line = f.gets - break if !line.include? ':' - - tmp = line.split(' ') - # tmp[0] format like: 2:7:5:4, 2 an 7 have clusters as childs, 5 has leafs, 4 is one of them (tag) - # get id of each level of the tree - ftree = tmp[0].split(':')[0..-2] - # last number of the sequence is the leaf Id - leafId = tmp[0].split(':')[-1].to_i - - # last number on the line is the tag Id - tagId = tmp[-1].to_i - # second number on the line - flow = tmp[1].to_f - # between the third and second to last is where the name lies - name = tmp[2..-2].join(' ')[1..-2] - - # iterate through the levels of the tree - it = clusters # start at the root - ftree.each do |clusterId| - clusterId = clusterId.to_i - 1 # on the file they start at 1, we want 0 - if it[:childs][clusterId].nil? # if this id doesn't exist, create it as child of 'it' - it[:childs][clusterId] = {childs: [], parent: nil} - it[:childs][clusterId][:parent] = it - end - # go down a level - it = it[:childs][clusterId] - end - countClusters[it] = 1 # set this cluster in this hash, for counting purposes - # 'it' is the cluster leafId is a child of, so add it - it[:childs][leafId-1] = {id: tagId, flow: flow, name: name, parent: it} - # put the leaf on this hash for easy acess by the tagId - tags[tagId] = it[:childs][leafId-1] - end - end - - puts "\nNumber of clusters found: #{countClusters.size}\n" - return tags - end - end # namespace -- GitLab