diff --git a/app/controllers/v1/feed_controller.rb b/app/controllers/v1/feed_controller.rb
index 04eabf5f0a97f53ee5903d420c68651378eb06c5..98892dbb38a345b005d38689b62bf4651b4250bb 100644
--- a/app/controllers/v1/feed_controller.rb
+++ b/app/controllers/v1/feed_controller.rb
@@ -36,8 +36,10 @@ class V1::FeedController < ApplicationController
     query = ""
     values = [""]
 
+    # builds a query string to find all relevant activities
     current_user.watching.each do |watching|
       if !watching.respond_to?(:state) || watching.state == "published"
+        # Activities that are made by, owned by, or change the object you follow should be found
         query += " (trackable_type = ? and trackable_id = ?) or (owner_type = ? and owner_id = ?) or (recipient_type = ? and recipient_id = ?) or"
         values << watching.class.to_s
         values << watching.id
@@ -48,8 +50,8 @@ class V1::FeedController < ApplicationController
       end
     end
 
-    values[0] = query[0..-3]
-    PublicActivity::Activity.where(key: activities_filter).where(values).order(created_at: :desc)
+    values[0] = query[0..-3] # remove trailing "or" on the query
+    return PublicActivity::Activity.where(key: activities_filter).where(values).order(created_at: :desc)
   end
 
 end
diff --git a/lib/tasks/tag_clustering.rake b/lib/tasks/tag_clustering.rake
new file mode 100644
index 0000000000000000000000000000000000000000..414d2e53035605598c65c7d6e14432d6d4b8725f
--- /dev/null
+++ b/lib/tasks/tag_clustering.rake
@@ -0,0 +1,469 @@
+require 'json'
+namespace :tag do
+  desc 'Generate tag clusters'
+  # Usage: rails tag:generate_clusters[tag_name,n_los,infomap]
+  #   tag_name - name of the tag to search for (required)
+  #   n_los    - number of learning objects to return (default 50)
+  #   infomap  - "true" to rebuild the tag graph and run infomap again
+  task :generate_clusters, [:tag_name, :n_los, :infomap] => [:environment] do |_t, args|
+    args.with_defaults(n_los: 50, infomap: false)
+
+    if args.tag_name.blank?
+      puts "\nERROR: You must pass a tag name as argument."
+      puts "Example usage for searchig with the tag test, result with 50 learning objects and generating a new cluster from infomap:"
+      puts "$ rails tag:generate_clusters[test,50,true]"
+      abort
+    end
+
+    # searched tag
+    $TAG = Tag.find_by_name(args.tag_name)
+    abort("\nERROR: Tag #{args.tag_name} not found.") if $TAG.blank?
+
+    # Padding on tag distance normalization
+    $PAD = 1 # if == 1, min tag dist will be 0.5 (1 - (max/max*(1+$PAD)))
+
+    # Padding on merge min maxing
+    $PADM = 0.05
+
+    # number of results to return
+    $N_LOS = args.n_los.to_i
+
+    # Task arguments given on the command line arrive as strings, so "false"
+    # would be truthy; only true/"true" forces a new infomap run.
+    run_infomap = args.infomap.to_s == "true"
+
+    outDIR = "tmp"
+    fileName = "tags"
+    path = Rails.root.join(outDIR, fileName).to_s
+
+    if !File.file?(path + ".ftree") || run_infomap
+      graphPath = path + ".net"
+      hash = create_hash
+      create_pajek_net_graph(hash, graphPath)
+
+      puts "\nCalling infomap with the pajek graph\n\n"
+      # argument-list form: the paths are never interpreted by a shell
+      unless system("infomap", "--ftree", graphPath, Rails.root.join(outDIR).to_s)
+        abort("\nERROR: infomap did not finish successfully.")
+      end
+    end
+
+    # create cluster tree from ftree
+    tags = parseTree(path)
+
+    puts "\nStarted Calculating relevant results for tag #{$TAG.name}\n"
+
+    relevant_los = find_relevant_results($TAG.id, tags)
+
+    puts "\n============ Learning Objects - Cluster Search ===============\n"
+    puts "ID: Ranking | Name | Tags\n\n"
+    relevant_los.each do |id, rank|
+      print_learning_object(LearningObject.find(id), rank)
+    end
+
+    search_los = LearningObject.search $TAG.name, limit: $N_LOS, explain: true
+
+    # print search_los.response["hits"]["hits"].to_json
+    # p wdfSearch("Educação Infatil")
+    # p tagWdfSorted()
+
+    puts "\n============ Learning Objects - Elasticsearch ===============\n"
+    search_los.each do |lo|
+      puts "#{lo.id}: #{lo.search_hit['_score']}"
+    end
+    puts "ID: Ranking | Name | Tags\n\n"
+    search_los.each do |lo|
+      print_learning_object(lo, lo.search_hit["_score"])
+    end
+
+    merged_los = merge(search_los, relevant_los)
+
+    puts "\n============ Learning Objects - Merged ===============\n"
+    merged_los.each do |id, rank|
+      puts "#{id}: #{rank}"
+    end
+    puts "ID: Ranking | Name | Tags\n\n"
+    merged_los.each do |id, rank|
+      print_learning_object(LearningObject.find(id), rank)
+    end
+  end # task
+
+  private
+
+  # Prints one result row: a separator, "id: rank | name" and the tag names of lo.
+  def print_learning_object(lo, rank)
+    puts "-----"
+    # interpolation (instead of String#+) also works when the name is nil
+    p "#{lo.id}: #{rank} | #{lo.try(:name)}"
+    lo.tags.each { |t| print "#{t.name} | " }
+    puts ""
+  end
+
+  # Returns [tag id, docFreq sum, tag name] for every tag wdfSearch could
+  # score, sorted by ascending docFreq sum. Tags without search results are
+  # reported on stdout and left out.
+  def tagWdfSorted
+    tagWdf = []
+
+    Tag.all.each do |t|
+      df = wdfSearch(t.name)
+      if df == -1
+        print "ERROR #{t.name}\n"
+      else
+        tagWdf << [t.id, df, t.name]
+      end
+    end
+
+    tagWdf.sort_by { |t| t[1] }
+  end
+
+  # Walks an Elasticsearch explanation node and sums the docFreq values of
+  # every "weight(<field>.word_start:" node whose first factor is a boost of 10.
+  def findScores(obj)
+    if obj["description"] =~ /weight\(\w*\.word_start:/
+      factors = obj["details"][0]["details"]
+      boost = factors[0]
+      # the key is "description", without trailing whitespace
+      return 0 unless boost["description"] == "boost" && boost["value"] == 10
+
+      doc_freq = factors[1]["details"][0]
+      return doc_freq["description"] == "docFreq" ? doc_freq["value"] : 0
+    end
+
+    (obj["details"] || []).inject(0) { |sum, detail| sum + findScores(detail) }
+  end
+
+  # Searches for tname and returns the findScores sum of the first hit's
+  # explanation, or -1 when the search has no results.
+  def wdfSearch(tname)
+    search_los = LearningObject.search tname, limit: 1, explain: true
+    return -1 if search_los.blank?
+
+    findScores(search_los.response['hits']['hits'][0]['_explanation'])
+  end
+
+  # Min-max scaling used by merge: (value * (1 + $PADM) - min) / (max - min).
+  # Returns 1.0 when max == min, where the formula would divide by zero.
+  def padded_min_max(value, min, max)
+    return 1.0 if max == min
+
+    (value * (1 + $PADM) - min) / (max - min)
+  end
+
+  # Follows the first child of each explanation node until a node described
+  # as "boost" is found and returns its value; 1 when there is none.
+  def explanation_boost(explanation)
+    detail = explanation['details'][0]
+    while detail && detail['description'] != "boost"
+      detail = (detail['details'] || [])[0]
+    end
+    detail ? detail['value'] : 1
+  end
+
+  # Merges the Elasticsearch results with the cluster results.
+  #
+  # search_los   - search results run with explain: true; the first and last
+  #                hits are taken as highest and lowest score
+  # relevant_los - [[learning object id, rank], ...]; the first and last
+  #                pairs are taken as highest and lowest rank
+  #
+  # Both lists are min-max scaled (see padded_min_max). Search scores are
+  # multiplied by their own boost, cluster ranks by the highest boost found;
+  # objects present in both lists get the sum of both values.
+  # Returns the $N_LOS best [id, score] pairs, best first, without modifying
+  # the arguments.
+  def merge(search_los, relevant_los)
+    puts "\n---------------------- MERGING -----------------------"
+
+    max_boost = 0
+    search_scores = []
+    unless search_los.blank?
+      max = search_los.first.search_hit['_score']
+      min = search_los[search_los.size - 1].search_hit['_score']
+      response = search_los.response['hits']['hits']
+      search_los.each_with_index do |slo, i|
+        boost = explanation_boost(response[i]['_explanation'])
+        max_boost = boost if boost > max_boost
+        search_scores << [slo.id, boost * padded_min_max(slo.search_hit['_score'], min, max)]
+      end
+    end
+    puts "\nMax boost found: " + max_boost.to_s
+
+    # without any boost from the search, keep the cluster ranks unscaled
+    cluster_boost = max_boost > 0 ? max_boost : 1
+    cluster_scores = {}
+    unless relevant_los.blank?
+      max = relevant_los.first[1]
+      min = relevant_los.last[1]
+      relevant_los.each do |id, rank|
+        cluster_scores[id] = cluster_boost * padded_min_max(rank, min, max)
+      end
+    end
+
+    # a lookup by id replaces deleting from relevant_los while iterating it,
+    # which made the loop skip the element after every match
+    merged_los = search_scores.map do |id, score|
+      [id, score + cluster_scores.delete(id).to_f]
+    end
+    merged_los.concat(cluster_scores.to_a)
+
+    merged_los.sort_by { |_id, score| -score }.first($N_LOS)
+  end
+
+  # hash[id1][id2] will equal how many times tags with id1 and id2 appear together on a LO
+  # (always with id1 <= id2); hash[id][id] counts the LOs that have the tag.
+  def create_hash
+    puts "\nCreating hash of tag concurrences\n"
+    hash = {}
+
+    LearningObject.find_each do |lo|
+      tag_ids = lo.tags.map(&:id)
+      tag_ids.each_with_index do |id1, i|
+        count_tag_pair(hash, id1, id1)
+
+        # for each next tag (with higher index)
+        tag_ids.drop(i + 1).each do |id2|
+          # order into new locals: swapping the outer loop variable would
+          # change the first tag of every following pair
+          low, high = [id1, id2].minmax
+          count_tag_pair(hash, low, high)
+        end
+      end
+    end
+
+    hash
+  end
+
+  # Adds one to hash[id1][id2], creating the entries when they are missing.
+  def count_tag_pair(hash, id1, id2)
+    row = (hash[id1] ||= {})
+    row[id2] = row.fetch(id2, 0) + 1
+  end
+
+  # Writes the tag graph in Pajek .net format: one vertex per tag, numbered
+  # from 1 in Tag.all order, and one edge per pair of tags that share a LO.
+  # NOTE(review): parseTree uses the last column of the .ftree file as a tag id;
+  # if infomap echoes these vertex numbers there, that only matches the
+  # database ids when they are exactly 1..N in this order - confirm.
+  def create_pajek_net_graph(hash, path)
+    puts "\nCreating pajek net graph on path #{path}\n"
+    all_tags = Tag.all.to_a # loaded once so count and listing agree
+
+    File.open(path, "w+") do |f|
+      f << "*Vertices #{all_tags.size}\n"
+
+      tag_index = {}
+      all_tags.each_with_index do |t, i|
+        f << "#{i + 1} \"#{t.name}\"\n"
+        tag_index[t.id] = i + 1
+      end
+
+      f << "*Edges\n"
+
+      hash.each do |id1, ids2Hash|
+        ids2Hash.each do |id2, value|
+          next if id1 == id2
+
+          # weight of the edge is the cosine similarity of the two tags
+          weight = value.to_f / Math.sqrt(hash[id1][id1] * hash[id2][id2])
+          f << "#{tag_index[id1]} #{tag_index[id2]} #{weight}\n"
+        end
+      end
+    end
+  end
+
+  # ranking #
+
+  # Ranks every tag in the same cluster as tagId by the distance between its
+  # flow and the flow of tagId, normalized by normalize_complement_close.
+  # Returns [{ id:, rank: }, ...], or [] when tagId is not in the tree.
+  def ranked_close_tags(tagId, tags)
+    leaf = tags[tagId]
+    return [] if leaf.nil?
+
+    target = Math.log2(leaf[:flow])
+    close_tags = leaf[:parent][:childs].compact.map do |t1|
+      # calculate logarithmic distance between tag flows
+      # lower value, closer, more relevant
+      # the tag you are searching for will be at distance 0 of itself
+      { id: t1[:id], rank: (target - Math.log2(t1[:flow])).abs }
+    end
+
+    normalize_complement_close(close_tags)
+  end
+
+  # normalizes and complements, 0 distance will be 1,
+  # max dist will be closest to 0
+  def normalize_complement_close(tags)
+    max = tags.map { |t| t[:rank] }.max.to_f
+
+    tags.each do |t|
+      # when every distance is 0 (e.g. a cluster with a single tag) there is
+      # nothing to scale and dividing would produce NaN
+      # increase max by $PAD so the farthest tag's rank isn't 0
+      t[:rank] = max.zero? ? 1.0 : 1 - (t[:rank] / (max * (1 + $PAD)))
+    end
+
+    tags
+  end
+
+  # Ranks the learning objects tagged with any tag of tagId's cluster.
+  # Returns the $N_LOS best [learning object id, rank] pairs, best first.
+  def find_relevant_results(tagId, tags)
+    los_ranked = {}
+
+    puts "\nGetting tags from the same cluster\n"
+    puts "Normalization with padding = #{$PAD}\n"
+    close_tags = ranked_close_tags(tagId, tags)
+
+    # Uncomment the next line if you want to sort by global tag frequency
+    # freq_cluster = cluster_frequency(close_tags)
+
+    puts "\nStarted Ranking LOs...\n"
+
+    puts "\n====== Close Tags =========\n"
+    puts "Name | ID | Normalized Ranking\n\n"
+    close_tags.each do |ct|
+      tag = Tag.find(ct[:id])
+      p "#{tag.name} | #{ct[:id]} | #{ct[:rank]}"
+      tag.taggings.where(taggable_type: "LearningObject").each do |tagging|
+        lo = tagging.taggable
+        # skip taggings without object and objects that are already ranked
+        next if lo.nil? || los_ranked.key?(lo.id)
+
+        # Use the next line instead if you want to sort by local tag frequency
+        # los_ranked[lo.id] = relevance_frequency_rank(lo, close_tags)
+
+        # Use the next line instead if you want to sort by global tag frequency
+        # los_ranked[lo.id] = relevance_frequency_rank_global(lo, close_tags, freq_cluster)
+
+        # sort by tag cluster rank
+        los_ranked[lo.id] = relevance_raw_rank(lo, close_tags)
+      end
+    end
+    puts "============\n"
+
+    puts "\nSorting LOs...\n"
+    # sorts by its ranking and gets the highest ranks
+    los_ranked = los_ranked.sort_by { |_id, rank| rank }.reverse.first($N_LOS)
+
+    los_ranked.each do |key, value|
+      puts "#{key}: #{value}"
+    end
+
+    los_ranked
+  end
+
+  # Sums tag[:rank] over every pair (tag of lo, close tag) with the same id.
+  def cluster_rank_sum(lo, close_tags)
+    lo.tags.inject(0) do |sum, t|
+      close_tags.inject(sum) { |acc, tag| tag[:id] == t.id ? acc + tag[:rank] : acc }
+    end
+  end
+
+  # Local frequency rank: the ranks of the LO's tags that are in the cluster,
+  # each weighted by log2(tag count / taggings of that tag) + 1, and the sum
+  # damped by the LO's own number of tags. Returns 0 for a LO without tags.
+  def relevance_frequency_rank(lo, close_tags)
+    lo_tags = lo.tags.to_a
+    return 0 if lo_tags.empty?
+
+    wdf = 1 / (Math.log2(lo_tags.size) + 1)
+    total_tags = Tag.all.size.to_f # float: integer division truncated the ratio
+
+    itf_sum = 0
+    lo_tags.each do |t|
+      close_tags.each do |tag|
+        next unless tag[:id] == t.id
+
+        itf_sum += tag[:rank] * (Math.log2(total_tags / t.taggings.size) + 1)
+      end
+    end
+
+    wdf * itf_sum
+  end
+
+  # returns the sum of how many times each tag in cluster appears in space
+  def cluster_frequency(cluster)
+    cluster.inject(0) { |sum, t| sum + Tag.find(t[:id]).taggings.size }
+  end
+
+  # Global frequency rank: log2 of (1 + rank sum of the LO's cluster tags),
+  # damped by the LO's number of tags and weighted by
+  # log2(tag count / freq_cluster) + 1. wdf is 0 for a LO without tags.
+  def relevance_frequency_rank_global(lo, close_tags, freq_cluster)
+    # for each tag in LO that is in the cluster, accumulate its rank
+    rank_sum = 1 + cluster_rank_sum(lo, close_tags)
+
+    tag_count = lo.tags.size
+    wdf = tag_count.zero? ? 0 : Math.log2(rank_sum) / (Math.log2(tag_count) + 1)
+
+    # float division: the integer ratio truncated to 0 and log2(0) is -Infinity
+    itf = Math.log2(Tag.all.size.to_f / freq_cluster) + 1
+
+    wdf * itf
+  end
+
+  # returns the rank sum of the tags in the LO
+  def relevance_raw_rank(lo, close_tags)
+    cluster_rank_sum(lo, close_tags)
+  end
+
+  # Parses the .ftree file at path + ".ftree" into a tree of clusters.
+  # Branches are { childs: [...], parent: } hashes, leafs are tags:
+  # { id:, flow:, name:, parent: }.
+  # Returns tags, where tags[tagId] is the leaf of that tag inside the tree.
+  def parseTree(path)
+    puts "\nParsing .ftree output into a 'tree'\n"
+
+    clusters = { childs: [], parent: nil }
+    tags = {}
+    # keyed by object identity: cluster hashes keep changing (and are
+    # recursive), so by-value keys would count the same cluster repeatedly
+    countClusters = {}.compare_by_identity
+
+    File.open(path + ".ftree", "r") do |f|
+      # skip the two header lines
+      f.gets
+      f.gets
+
+      while (line = f.gets)
+        break unless line.include?(':')
+
+        tmp = line.split(' ')
+        # tmp[0] format like: 2:7:5:4, 2 and 7 have clusters as childs, 5 has leafs, 4 is one of them (tag)
+        levels = tmp[0].split(':')
+        # ids of each level of the tree
+        ftree = levels[0..-2]
+        # last number of the sequence is the leaf Id
+        leafId = levels[-1].to_i
+
+        # last number on the line is the tag Id
+        tagId = tmp[-1].to_i
+        # second number on the line
+        flow = tmp[1].to_f
+        # between the third and second to last is where the name lies (first and last character dropped)
+        name = tmp[2..-2].join(' ')[1..-2]
+
+        # iterate through the levels of the tree
+        node = clusters # start at the root
+        ftree.each do |clusterId|
+          index = clusterId.to_i - 1 # on the file they start at 1, we want 0
+          # if this id doesn't exist, create it as child of 'node'
+          node[:childs][index] ||= { childs: [], parent: node }
+          # go down a level
+          node = node[:childs][index]
+        end
+        countClusters[node] = 1 # set this cluster in this hash, for counting purposes
+        # 'node' is the cluster leafId is a child of, so add it
+        leaf = { id: tagId, flow: flow, name: name, parent: node }
+        node[:childs][leafId - 1] = leaf
+        # put the leaf on this hash for easy access by the tagId
+        tags[tagId] = leaf
+      end
+    end
+
+    puts "\nNumber of clusters found: #{countClusters.size}\n"
+    tags
+  end
+end # namespace