Skip to content
Snippets Groups Projects
Commit bbccc139 authored by Marcela Ribeiro de Oliveira's avatar Marcela Ribeiro de Oliveira
Browse files

Merge branch 'master' of gitlab.c3sl.ufpr.br:portalmec/portalmec into user-profiles

parents cbf39f97 3953e4bc
No related branches found
No related tags found
No related merge requests found
require 'json'
namespace :tag do
desc 'Generate tag clusters'
# Usage: rails tag:generate_clusters[<tag_name>,<n_los>,<infomap>]
#   tag_name - required; the tag whose cluster neighborhood is searched
#   n_los    - number of learning objects to return (default 50)
#   infomap  - when truthy, force regeneration of the infomap .ftree file
# Builds (or reuses) an infomap cluster tree of tags, ranks learning objects
# by tag-cluster proximity, runs a plain Elasticsearch search for comparison,
# and prints both rankings plus their merged result.
task :generate_clusters, [:tag_name, :n_los, :infomap] => [:environment] do |_t, args|
args.with_defaults(n_los: 50, infomap: false)
if args.tag_name.blank?
puts "\nERROR: You must pass a tag name as argument."
puts "Example usage for searchig with the tag test, result with 50 learning objects and generating a new cluster from infomap:"
puts "$ rails tag:generate_clusters[test,50,true]"
abort
end
# searched tag
$TAG = Tag.find_by_name(args.tag_name)
if $TAG.blank?
abort("\nERROR: Tag #{args.tag_name} not found.")
end
# Padding on tag distance normalization
$PAD = 1 # if == 1, min tag dist will be 0.5 (1 - (max/max*(1+$PAD)))
# Padding on merge min maxing
$PADM = 0.05
# number of results to return
$N_LOS = args.n_los.to_i
# the pajek graph and infomap tree are cached under tmp/tags.{net,ftree}
outDIR = "tmp"
fileName = "tags"
path = Rails.root.join(outDIR, fileName).to_s
# regenerate the cluster tree when no .ftree exists or when explicitly asked to
if !File.file?(path+".ftree") || args.infomap
graphPath = path+".net"
hash = create_hash()
create_pajek_net_graph(hash, graphPath)
puts "\nCalling infomap with the pajek graph\n\n"
# NOTE(review): requires the `infomap` binary on PATH; writes tags.ftree into tmp/
system("infomap --ftree #{graphPath} #{Rails.root.join(outDIR)}")
end
# create cluster tree from ftree
tags = parseTree(path)
puts "\nStarted Calculating relevant results for tag #{$TAG.name}\n"
# relevant_los is an array of [learning_object_id, rank] pairs, ranked by cluster proximity
relevant_los = find_relevant_results($TAG.id, tags)
puts "\n============ Learning Objects - Cluster Search ===============\n"
puts "ID: Ranking | Name | Tags\n\n"
relevant_los.each do |id, rank|
lo = LearningObject.find(id)
puts "-----"
p lo.id.to_s+": "+rank.to_s+" | "+lo.try(:name)
lo.tags.each {|t| print t.name+" | "}
puts ""
end
# plain Elasticsearch ranking for the same tag name, with scoring explanation
search_los = LearningObject.search $TAG.name, limit: $N_LOS, explain:true
# print search_los.response["hits"]["hits"].to_json
# p wdfSearch("Educação Infatil")
# p tagWdfSorted()
puts "\n============ Learning Objects - Elasticsearch ===============\n"
search_los.each do |lo|
puts "#{lo.id}: #{lo.search_hit['_score']}"
end
puts "ID: Ranking | Name | Tags\n\n"
search_los.each do |lo|
puts "-----"
p lo.id.to_s+": "+lo.search_hit["_score"].to_s+" | "+lo.try(:name)
lo.tags.each {|t| print t.name+" | "}
puts ""
end
# combine both rankings (see merge below) and print the final list
merged_los = merge(search_los, relevant_los)
puts "\n============ Learning Objects - Merged ===============\n"
merged_los.each do |id, rank|
puts "#{id}: #{rank}"
end
puts "ID: Ranking | Name | Tags\n\n"
merged_los.each do |id, rank|
lo = LearningObject.find(id)
puts "-----"
p lo.id.to_s+": "+rank.to_s+" | "+lo.try(:name)
lo.tags.each {|t| print t.name+" | "}
puts ""
end
end # task
private
# Computes the wdf score (summed docFreq from the ES explanation, see
# wdfSearch) for every tag and returns [tag_id, wdf, tag_name] triples
# sorted ascending by wdf. Tags whose search returned no hits
# (wdfSearch == -1) are reported on stdout and excluded.
# (Fix: dropped the unused each_with_index counter from the original.)
def tagWdfSorted()
  tag_wdf = []
  Tag.all.each do |t|
    df = wdfSearch(t.name)
    if df == -1
      # -1 signals "no search results for this tag name"
      print "ERROR #{t.name}\n"
    else
      tag_wdf << [t.id, df, t.name]
    end
  end
  tag_wdf.sort_by { |_id, df, _name| df }
end
# Recursively walks an Elasticsearch "_explanation" tree and sums the
# docFreq values of scoring nodes produced by a boosted (value == 10)
# ".word_start" field weight.
#
# obj - a Hash node of the explanation tree with keys "description",
#       "value" and (optionally) "details".
#
# Returns the summed docFreq as a number; a weight node whose children
# don't match the expected boost/docFreq shape contributes 0.
def findScores(obj)
  if obj["description"] =~ /weight\(\w*\.word_start:/
    # FIX: the original compared obj[...]["description "] — note the
    # trailing space — a key that never exists, so every weight node
    # silently scored 0.
    boost_node = obj["details"][0]["details"][0]
    if boost_node["description"] == "boost" && boost_node["value"] == 10
      freq_node = obj["details"][0]["details"][1]["details"][0]
      return freq_node["value"] if freq_node["description"] == "docFreq"
    end
    return 0
  end
  # Not a weight node: recurse into the children (leaf nodes without
  # "details" contribute 0 instead of raising on nil).
  (obj["details"] || []).inject(0) { |sum, detail| sum + findScores(detail) }
end
# Runs a single-hit Elasticsearch query for +tname+ (with scoring
# explanation enabled) and returns the summed docFreq extracted from the
# hit's explanation tree, or -1 when the search produced no results.
def wdfSearch(tname)
  result = LearningObject.search tname, limit: 1, explain:true
  return -1 if result.blank?
  explanation = result.response['hits']['hits'][0]['_explanation']
  findScores(explanation)
end
# Merges the Elasticsearch ranking (search_los, score-sorted search results)
# with the cluster ranking (relevant_los, array of [lo_id, rank] pairs,
# sorted descending) into one list of [lo_id, score] pairs sorted descending.
# Both rankings are min-max normalized (padded by $PADM) and scaled by a
# boost taken from the ES explanation before being summed per LO.
# NOTE: mutates both arguments in place (scores and relevant_los entries).
def merge(search_los, relevant_los)
merged_los = []
# ES results come back sorted by score, so first/last give max/min
max = search_los.first.search_hit['_score']
min = search_los[search_los.size-1].search_hit['_score']
# min = 0
max_boost = 0
response = search_los.response['hits']['hits']
search_los.each_with_index do |slo, i|
# walk down the first-child chain of the explanation until the boost node
detail = response[i]['_explanation']['details'][0]
while detail['description'] != "boost"
detail = detail['details'][0]
end
boost = detail['value']
max_boost = boost if boost > max_boost
# min-max normalize the ES score (padded by $PADM), weighted by its boost
slo.search_hit['_score'] = boost*(slo.search_hit['_score']*(1+$PADM)-min)/(max-min)
end
# relevant_los is sorted descending, so first/last give max/min rank
max = relevant_los.first[1]
min = relevant_los.last[1]
relevant_los.each do |rlo|
# normalize cluster ranks the same way, scaled by the largest ES boost seen
rlo[1] = max_boost*(rlo[1]*(1+$PADM)-min)/(max-min)
end
# LOs present in both rankings get the sum of both scores
search_los.each do |slo|
relevant_los.each_with_index do |rlo, index|
if slo.id == rlo[0]
slo.search_hit['_score'] = slo.search_hit['_score'] + rlo[1]
# NOTE(review): delete_at while iterating relevant_los skips the element
# after each match — confirm that is acceptable (ids appear at most once)
relevant_los.delete_at(index)
end
end
merged_los << [slo.id, slo.search_hit['_score']]
end
# append the cluster-only LOs that never appeared in the ES results
merged_los.push(*relevant_los)
# sort descending by merged score
merged_los = merged_los.sort_by { |lo| lo[1]*-1 }
# NOTE(review): hard-coded 50 — presumably should use $N_LOS; verify
return merged_los.first(50)
end
# hash[id1][id2] will equal how many times tags with id1 and id2 appear
# together on a LO (keyed so id1 <= id2); the diagonal hash[id][id] counts
# how many LOs carry that tag at all.
def create_hash()
  puts "\nCreating hash of tag concurrences\n"
  hash = {}
  LearningObject.all.each do |lo|
    # for each LO, count single tags (diagonal) and unordered tag pairs
    lo.tags.each.with_index do |t, i|
      hash[t.id] ||= {}
      hash[t.id][t.id] ||= 0
      hash[t.id][t.id] += 1
      # pair this tag with every tag of higher index on the same LO
      lo.tags.drop(i + 1).each do |t2|
        # BUG FIX: the original swapped the loop variables in place
        # (t, t2 = t2, t), which leaked the swapped `t` into later
        # iterations of the inner loop and attributed pair counts to the
        # wrong tag. Use locals so [low][high] is ordered without mutation.
        low, high = t.id <= t2.id ? [t.id, t2.id] : [t2.id, t.id]
        hash[low] ||= {}
        hash[low][high] ||= 0
        hash[low][high] += 1
      end
    end
  end
  hash
end
# Writes the tag co-occurrence hash as a Pajek .net graph at +path+:
# one vertex per tag (1-based, labeled with the tag name) and one weighted
# edge per co-occurring tag pair.
def create_pajek_net_graph(hash, path)
  puts "\nCreating pajek net graph on path #{path}\n"
  File.open(path, "w+") do |f|
    f << "*Vertices #{Tag.all.size}\n"
    tag_index = {}
    Tag.all.each_with_index do |tag, idx|
      f << "#{idx+1} \"#{tag.name}\"\n"
      tag_index[tag.id] = idx + 1
    end
    f << "*Edges\n"
    hash.each do |id1, neighbours|
      neighbours.each do |id2, _count|
        next if id1 == id2
        # weight of the edge is the cos distance:
        # pair count normalized by the geometric mean of both tags' totals
        weight = hash[id1][id2].to_f / Math.sqrt(hash[id1][id1] * hash[id2][id2])
        f << "#{tag_index[id1]} #{tag_index[id2]} #{weight}\n"
      end
    end
  end
end
# ranking #
# Ranks every tag that shares the searched tag's cluster by how close its
# infomap flow is to the searched tag's flow (absolute log2 distance —
# lower is closer, and the searched tag sits at distance 0 of itself),
# then normalizes/complements the distances via normalize_complement_close.
def ranked_close_tags(tagId, tags)
  reference = Math.log2(tags[tagId][:flow])
  close_tags = tags[tagId][:parent][:childs].map do |leaf|
    { id: leaf[:id], rank: (reference - Math.log2(leaf[:flow])).abs }
  end
  normalize_complement_close(close_tags)
end
# normalizes and complements, 0 distance will be 1,
# max dist will be closest to 0 (never exactly 0, thanks to the $PAD
# stretch of the maximum).
# tags - array of {id:, rank:} hashes; mutated in place and returned.
def normalize_complement_close(tags)
  # find max rank (min is always 0: the searched tag's distance to itself)
  max = tags.map { |t| t[:rank] }.max || 0
  tags.each do |t|
    if max.zero?
      # FIX: when every distance is 0 (e.g. the cluster only contains the
      # searched tag) the original divided 0 by 0, yielding NaN ranks;
      # everything is "closest", so rank 1.
      t[:rank] = 1
    else
      # increase max by $PAD so even the furthest tag's rank isn't 0
      t[:rank] = 1 - (t[:rank] / (max * (1 + $PAD)))
    end
  end
  tags
end
# Ranks learning objects by cluster proximity to the searched tag.
# tagId - id of the searched tag; tags - the leaf hash built by parseTree.
# Prints the close tags and the top-ranked LOs, and returns an array of
# [learning_object_id, rank] pairs sorted descending, truncated to $N_LOS.
def find_relevant_results(tagId, tags)
los_ranked = {}
puts "\nGetting tags from the same cluster\n"
puts "Normalization with padding = #{$PAD}\n"
close_tags = ranked_close_tags(tagId, tags)
# Uncomment the next line if you want to sort by global tag frequency
# freq_cluster = cluster_frequency(close_tags)
puts "\nStarted Ranking LOs...\n"
puts "\n====== Close Tags =========\n"
puts "Name | ID | Normalized Ranking\n\n"
close_tags.each do |ct|
tag = Tag.find(ct[:id])
# NOTE(review): tags[ct[:id]][:rank] — parseTree leaves carry :flow, not
# :rank, so this last column presumably always prints ""; verify intent
p tag.name+" | "+ct[:id].to_s+" | "+ct[:rank].to_s+" | "+tags[ct[:id]][:rank].to_s
# every LO tagged with this close tag gets ranked (once)
tag.taggings.where(taggable_type: "LearningObject").each do |tagging|
lo = tagging.taggable
if los_ranked[lo.id].nil?
# Uncomment the next line if you want to sort by local tag frequency
# los_ranked[lo.id] = relevance_frequency_rank(lo, close_tags)
# Uncomment the next line if you want to sort by global tag frequency
# los_ranked[lo.id] = relevance_frequency_rank_global(lo, close_tags, freq_cluster)
# Uncomment the next line if you want to sort by tag cluster rank
los_ranked[lo.id] = relevance_raw_rank(lo, close_tags)
end
end
end
puts "============\n"
puts "\nSorting LOs...\n"
# sorts by its ranking (ascending; becomes [id, rank] pairs)
los_ranked = los_ranked.sort_by { |id, rank| rank }
# get highest ranks
los_ranked = los_ranked.reverse.first($N_LOS)
los_ranked.each do |key, value|
puts "#{key}: #{value}"
end
return los_ranked
end
# wdf*itf relevance: sums the ranks of the LO's tags that belong to the
# cluster, each weighted by an inverse-tagging-frequency term, and damps
# the sum by the LO's own tag count (its "within document frequency").
# Returns 0 for an LO with no tags.
def relevance_frequency_rank(lo, close_tags)
itf_sum = 0
wdf = 0
# damping: 1/(log2(#tags)+1); guarded because log2(0) is undefined
wdf = 1/(Math.log2(lo.tags.size)+1) if lo.tags.size != 0
lo.tags.each do |t|
close_tags.each do |tag|
if tag[:id] == t.id
# NOTE(review): Tag.all.size/t.taggings.size is integer division —
# confirm a float ratio wasn't intended before the log2
itf_sum += tag[:rank]*(Math.log2(Tag.all.size/t.taggings.size)+1)
end
end
end
return wdf*itf_sum
end
# returns the sum of how many times each tag in cluster appears in space
# cluster - array of {id:, rank:} hashes as produced by ranked_close_tags
def cluster_frequency(cluster)
  cluster.inject(0) { |total, t| total + Tag.find(t[:id]).taggings.size }
end
# wdf*itf relevance using the cluster's global frequency: accumulates the
# ranks of the LO's tags that are in the cluster, turns the sum into a
# wdf term damped by the LO's tag count, and multiplies by an inverse
# frequency term derived from the whole cluster's tagging count.
def relevance_frequency_rank_global(lo, close_tags, freq_cluster)
# for each tag in LO that is in the cluster, accumulate it's rank
# (starts at 1 so log2(rank_sum) below is never log2(0))
rank_sum = 1
lo.tags.each do |t|
close_tags.each do |tag|
if tag[:id] == t.id
rank_sum += tag[:rank]
end
end
end
wdf = 0
# guarded: log2(0) is undefined when the LO has no tags
wdf = (Math.log2(rank_sum)/(Math.log2(lo.tags.size)+1)) if lo.tags.size != 0
# NOTE(review): Tag.all.size/freq_cluster is integer division — confirm
# a float ratio wasn't intended before the log2
itf = Math.log2(Tag.all.size/freq_cluster)+1
return wdf*itf
end
# returns the rank sum of the tags in the LO
# (for each tag on the LO that appears in the cluster, accumulate its rank)
# lo         - object responding to #tags (elements respond to #id)
# close_tags - array of {id:, rank:} hashes
def relevance_raw_rank(lo, close_tags)
  lo.tags.reduce(0) do |acc, tag|
    close_tags.reduce(acc) do |inner, ct|
      ct[:id] == tag.id ? inner + ct[:rank] : inner
    end
  end
end
# Parses the infomap output at "#{path}.ftree" into a cluster tree.
# Branch nodes are {childs:, parent:}; leaf nodes are
# {id: tagId, flow:, name:, parent:}. Returns a hash mapping
# tagId => its leaf node, for direct access into the tree.
def parseTree(path)
# parse .ftree into a 'tree', leafs are tags with flow number, branches are the clustes
# create tags list, tags[tagId] == the tag leaf inside the cluster tree
puts "\nParsing .ftree output into a 'tree'\n"
clusters = {childs: [], parent: nil}
tags = {}
countClusters = {}
File.open(path+".ftree", "r") do |f|
# skip the two header lines of the .ftree file
f.gets
f.gets
while line = f.gets
# the leaf section ends at the first line without a ':' path
break if !line.include? ':'
tmp = line.split(' ')
# tmp[0] format like: 2:7:5:4, 2 an 7 have clusters as childs, 5 has leafs, 4 is one of them (tag)
# get id of each level of the tree
ftree = tmp[0].split(':')[0..-2]
# last number of the sequence is the leaf Id
leafId = tmp[0].split(':')[-1].to_i
# last number on the line is the tag Id
tagId = tmp[-1].to_i
# second number on the line
flow = tmp[1].to_f
# between the third and second to last is where the name lies
# ([1..-2] strips the surrounding quotes)
name = tmp[2..-2].join(' ')[1..-2]
# iterate through the levels of the tree
it = clusters # start at the root
ftree.each do |clusterId|
clusterId = clusterId.to_i - 1 # on the file they start at 1, we want 0
if it[:childs][clusterId].nil? # if this id doesn't exist, create it as child of 'it'
it[:childs][clusterId] = {childs: [], parent: nil}
it[:childs][clusterId][:parent] = it
end
# go down a level
it = it[:childs][clusterId]
end
countClusters[it] = 1 # set this cluster in this hash, for counting purposes
# 'it' is the cluster leafId is a child of, so add it
it[:childs][leafId-1] = {id: tagId, flow: flow, name: name, parent: it}
# put the leaf on this hash for easy acess by the tagId
tags[tagId] = it[:childs][leafId-1]
end
end
puts "\nNumber of clusters found: #{countClusters.size}\n"
return tags
end
end # namespace
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment