Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
cleaning-portalmec
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Richard Fernando Heise Ferreira
cleaning-portalmec
Commits
bbccc139
Commit
bbccc139
authored
7 years ago
by
Marcela Ribeiro de Oliveira
Browse files
Options
Downloads
Plain Diff
Merge branch 'master' of gitlab.c3sl.ufpr.br:portalmec/portalmec into user-profiles
parents
cbf39f97
3953e4bc
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
lib/tasks/tag_clustering.rake
+440
-0
440 additions, 0 deletions
lib/tasks/tag_clustering.rake
with
440 additions
and
0 deletions
lib/tasks/tag_clustering.rake
0 → 100644
+
440
−
0
View file @
bbccc139
require
'json'
namespace
:tag
do
desc 'Generate tag clusters'
# Usage: rails tag:generate_clusters[<tag_name>,<n_los>,<infomap>]
#
# Searches learning objects related to <tag_name> three ways — by tag
# cluster (infomap), by plain Elasticsearch, and by a merge of both —
# and prints each ranking. n_los (default 50) caps the result count;
# infomap (default false) forces regeneration of the cluster file.
task :generate_clusters, [:tag_name, :n_los, :infomap] => [:environment] do |_t, args|
  args.with_defaults(n_los: 50, infomap: false)
  if args.tag_name.blank?
    puts "\nERROR: You must pass a tag name as argument."
    puts "Example usage for searching with the tag test, result with 50 learning objects and generating a new cluster from infomap:"
    puts "$ rails tag:generate_clusters[test,50,true]"
    abort
  end
  # searched tag
  $TAG = Tag.find_by_name(args.tag_name)
  if $TAG.blank?
    abort("\nERROR: Tag #{args.tag_name} not found.")
  end
  # Padding on tag distance normalization
  $PAD = 1 # if == 1, min tag dist will be 0.5 (1 - (max/max*(1+$PAD)))
  # Padding on merge min maxing
  $PADM = 0.05
  # number of results to return
  $N_LOS = args.n_los.to_i
  outDIR = "tmp"
  fileName = "tags"
  path = Rails.root.join(outDIR, fileName).to_s
  # (re)build the co-occurrence graph and run infomap when the cluster
  # file is missing or regeneration was explicitly requested
  if !File.file?(path + ".ftree") || args.infomap
    graphPath = path + ".net"
    hash = create_hash()
    create_pajek_net_graph(hash, graphPath)
    puts "\nCalling infomap with the pajek graph\n\n"
    system("infomap --ftree #{graphPath} #{Rails.root.join(outDIR)}")
  end
  # create cluster tree from ftree
  tags = parseTree(path)
  puts "\nStarted Calculating relevant results for tag #{$TAG.name}\n"
  relevant_los = find_relevant_results($TAG.id, tags)
  puts "\n============ Learning Objects - Cluster Search ===============\n"
  puts "ID: Ranking | Name | Tags\n\n"
  relevant_los.each do |id, rank|
    lo = LearningObject.find(id)
    puts "-----"
    p lo.id.to_s + ": " + rank.to_s + " | " + lo.try(:name)
    lo.tags.each { |t| print t.name + " | " }
    puts ""
  end
  # plain Elasticsearch ranking, for comparison
  search_los = LearningObject.search $TAG.name, limit: $N_LOS, explain: true
  # print search_los.response["hits"]["hits"].to_json
  # p wdfSearch("Educação Infatil")
  # p tagWdfSorted()
  puts "\n============ Learning Objects - Elasticsearch ===============\n"
  search_los.each do |lo|
    puts "#{lo.id}: #{lo.search_hit['_score']}"
  end
  puts "ID: Ranking | Name | Tags\n\n"
  search_los.each do |lo|
    puts "-----"
    p lo.id.to_s + ": " + lo.search_hit["_score"].to_s + " | " + lo.try(:name)
    lo.tags.each { |t| print t.name + " | " }
    puts ""
  end
  # combined ranking of both result sets
  merged_los = merge(search_los, relevant_los)
  puts "\n============ Learning Objects - Merged ===============\n"
  merged_los.each do |id, rank|
    puts "#{id}: #{rank}"
  end
  puts "ID: Ranking | Name | Tags\n\n"
  merged_los.each do |id, rank|
    lo = LearningObject.find(id)
    puts "-----"
    p lo.id.to_s + ": " + rank.to_s + " | " + lo.try(:name)
    lo.tags.each { |t| print t.name + " | " }
    puts ""
  end
end # task
private
# Ranks every tag by its wdf score (see wdfSearch).
#
# Builds [tag_id, score, tag_name] triples for every tag whose search
# produced hits; wdfSearch returns -1 on an empty result, which is
# reported with an ERROR line and skipped. Returns the triples sorted
# by score, ascending.
def tagWdfSorted()
  ranked = []
  Tag.all.each do |tag|
    score = wdfSearch(tag.name)
    if score == -1
      # this tag produced no search hits — report and skip it
      print "ERROR #{tag.name}\n"
    else
      ranked << [tag.id, score, tag.name]
    end
  end
  return ranked.sort_by { |triple| triple[1] }
end
# Recursively extracts a docFreq sum from an Elasticsearch
# _explanation node (a nested Hash of "description"/"value"/"details").
#
# A node whose description matches "weight(<field>.word_start:" is a
# scoring leaf: if its first detail is a boost of exactly 10, its
# docFreq value (under the second detail) is returned, otherwise 0.
# Any other node contributes the sum of its children.
#
# Fix: the boost lookup previously used the key "description " (with a
# trailing space), which can never match — the same function reads
# "description" without the space for the docFreq check — so the boost
# branch always returned 0.
def findScores(obj)
  if !obj["description"].match(/weight\(\w*\.word_start:/).nil?
    if obj["details"][0]["details"][0]["description"] == "boost" && obj["details"][0]["details"][0]["value"] == 10
      if obj["details"][0]["details"][1]["details"][0]["description"] == "docFreq"
        return obj["details"][0]["details"][1]["details"][0]["value"]
      else
        return 0
      end
    else
      return 0
    end
  end
  sum = 0
  # non-weight nodes: recurse over children (tolerate a missing "details")
  (obj["details"] || []).each do |detail|
    sum += findScores(detail)
  end
  return sum
end
# Computes the wdf score for +tname+: runs a single-hit Elasticsearch
# query with scoring explanation enabled and sums the docFreq values
# found in the explanation tree (via findScores).
#
# Returns -1 when the search yields no hits.
def wdfSearch(tname)
  hits = LearningObject.search tname, limit: 1, explain: true
  return -1 if hits.blank?
  explanation = hits.response['hits']['hits'][0]['_explanation']
  return findScores(explanation)
end
# Merges the Elasticsearch results (search_los) with the cluster-ranked
# results (relevant_los, [id, rank] pairs) into a single list of
# [id, score] pairs sorted by score descending.
#
# Both rankings are min-max normalized (padded by $PADM) onto a common
# scale before being combined; LOs present in both lists get the two
# scores added. Mutates search_los scores and relevant_los in place.
def merge(search_los, relevant_los)
  merged_los = []
  # search_los arrives ordered by score, so first/last hold max/min
  max = search_los.first.search_hit['_score']
  min = search_los[search_los.size - 1].search_hit['_score']
  # min = 0
  max_boost = 0
  response = search_los.response['hits']['hits']
  search_los.each_with_index do |slo, i|
    # walk down the explanation tree until the "boost" node is found
    detail = response[i]['_explanation']['details'][0]
    while detail['description'] != "boost"
      detail = detail['details'][0]
    end
    boost = detail['value']
    max_boost = boost if boost > max_boost
    # min-max normalize the score (padded by $PADM), re-scaled by boost
    slo.search_hit['_score'] = boost * (slo.search_hit['_score'] * (1 + $PADM) - min) / (max - min)
  end
  # normalize the cluster ranking the same way, scaled by the largest
  # boost seen above so both rankings share a comparable range
  max = relevant_los.first[1]
  min = relevant_los.last[1]
  relevant_los.each do |rlo|
    rlo[1] = max_boost * (rlo[1] * (1 + $PADM) - min) / (max - min)
  end
  # LOs appearing in both lists: add the cluster rank onto the search
  # score and drop the cluster entry so it is not re-added below.
  # NOTE(review): relevant_los is mutated (delete_at) while being
  # iterated — appears safe only because each slo matches at most one
  # entry; confirm.
  search_los.each do |slo|
    relevant_los.each_with_index do |rlo, index|
      if slo.id == rlo[0]
        slo.search_hit['_score'] = slo.search_hit['_score'] + rlo[1]
        relevant_los.delete_at(index)
      end
    end
    merged_los << [slo.id, slo.search_hit['_score']]
  end
  # remaining cluster-only results join the list as-is
  merged_los.push(*relevant_los)
  # sort descending by score
  merged_los = merged_los.sort_by { |lo| lo[1] * -1 }
  # NOTE(review): caps the merged list at 50 regardless of $N_LOS —
  # confirm whether $N_LOS was intended here.
  return merged_los.first(50)
end
# hash[id1][id2] will equal how many times tags with id1 and id2 appear together on a LO
# (pairs are stored with id1 < id2; the diagonal hash[id][id] counts how
# many learning objects carry the tag at all).
#
# Fix: the previous version ordered a pair by swapping the block
# parameters themselves (t, t2 = t2, t), which leaked the swap into the
# remaining iterations of the inner loop and attributed co-occurrence
# counts to the wrong tag pairs. Ordering is now done on local id
# copies, leaving the loop variables untouched.
def create_hash()
  puts "\nCreating hash of tag concurrences\n"
  hash = {}
  LearningObject.all.each do |lo|
    # for each lo, count tags and tag pairs and add to hash
    lo.tags.each.with_index do |t, i|
      # diagonal entry: how many LOs this tag appears on
      hash[t.id] = {} if hash[t.id].nil?
      hash[t.id][t.id] = 0 if hash[t.id][t.id].nil?
      hash[t.id][t.id] += 1
      # pair this tag with every tag that follows it on the same LO
      lo.tags.drop(i + 1).each do |t2|
        # [id1][id2], id1 should always be lower — order local copies
        id1, id2 = t.id, t2.id
        id1, id2 = id2, id1 if id1 > id2
        hash[id1] = {} if hash[id1].nil?
        hash[id1][id2] = 0 if hash[id1][id2].nil?
        hash[id1][id2] += 1
      end
    end
  end
  return hash
end
# Writes the tag co-occurrence +hash+ (as built by create_hash) to
# +path+ in Pajek .net format: one vertex per tag (labelled with the
# tag name) followed by one weighted edge per co-occurring tag pair.
def create_pajek_net_graph(hash, path)
  puts "\nCreating pajek net graph on path #{path}\n"
  File.open(path, "w+") do |f|
    f << "*Vertices #{Tag.all.size}\n"
    # pajek vertex ids are 1-based positions, not tag ids — remember
    # the tag id -> vertex position mapping for the edge section
    tag_index = {}
    Tag.all.each_with_index do |t, i|
      f << "#{i + 1} \"#{t.name}\"\n"
      tag_index[t.id] = i + 1
    end
    f << "*Edges\n"
    hash.each do |id1, ids2Hash|
      ids2Hash.each do |id2, value|
        # diagonal entries are per-tag totals, not edges — skip them
        if id1 != id2
          f << "#{tag_index[id1]} #{tag_index[id2]} \
#{hash[id1][id2].to_f / (Math.sqrt(hash[id1][id1] * hash[id2][id2]))}\n"
          # weight of the edge is the cos distance
        end
      end
    end
  end
end
# ranking #
# Collects the sibling leaves of +tagId+ (every child of its cluster
# parent, the searched tag included) and ranks each by how close its
# flow value is to the searched tag's flow on a log2 scale — 0 means
# identical flow, so the searched tag ranks itself at distance 0.
# The raw distances are then normalized/complemented by
# normalize_complement_close (closest -> 1, farthest -> near 0).
def ranked_close_tags(tagId, tags)
  reference = Math.log2(tags[tagId][:flow])
  siblings = tags[tagId][:parent][:childs]
  ranked = siblings.map do |leaf|
    distance = (reference - Math.log2(leaf[:flow])).abs
    { id: leaf[:id], rank: distance }
  end
  return normalize_complement_close(ranked)
end
# normalizes and complements, 0 distance will be 1,
# max dist will be closest to 0
# Mutates the {id:, rank:} entries in place and returns the same array;
# the divisor is padded by $PAD so even the farthest entry keeps a
# rank strictly above 0.
def normalize_complement_close(tags)
  # find the largest rank (min is always 0)
  peak = tags.reduce(0) { |best, entry| entry[:rank] > best ? entry[:rank] : best }
  divisor = peak * (1 + $PAD)
  tags.each { |entry| entry[:rank] = 1 - (entry[:rank] / divisor) }
  return tags
end
# Ranks learning objects by the cluster neighbourhood of +tagId+.
#
# Finds the tags sharing a cluster with the searched tag (each with a
# normalized closeness rank), scores every LearningObject tagged with
# any of them, and returns the top $N_LOS as [id, rank] pairs, highest
# rank first. Prints the close-tag table and the final ranking.
def find_relevant_results(tagId, tags)
  los_ranked = {}
  puts "\nGetting tags from the same cluster\n"
  puts "Normalization with padding = #{$PAD}\n"
  close_tags = ranked_close_tags(tagId, tags)
  # Uncomment the next line if you want to sort by global tag frequency
  # freq_cluster = cluster_frequency(close_tags)
  puts "\nStarted Ranking LOs...\n"
  puts "\n====== Close Tags =========\n"
  puts "Name | ID | Normalized Ranking\n\n"
  close_tags.each do |ct|
    tag = Tag.find(ct[:id])
    # NOTE(review): tags[ct[:id]] leaves carry :id/:flow/:name/:parent
    # but no :rank, so the last column prints empty — confirm intent.
    p tag.name + " | " + ct[:id].to_s + " | " + ct[:rank].to_s + " | " + tags[ct[:id]][:rank].to_s
    # every LO tagged with this close tag is a candidate result
    tag.taggings.where(taggable_type: "LearningObject").each do |tagging|
      lo = tagging.taggable
      # compute each LO's rank only once, the first time it is reached
      if los_ranked[lo.id].nil?
        # Uncomment the next line if you want to sort by local tag frequency
        # los_ranked[lo.id] = relevance_frequency_rank(lo, close_tags)
        # Uncomment the next line if you want to sort by global tag frequency
        # los_ranked[lo.id] = relevance_frequency_rank_global(lo, close_tags, freq_cluster)
        # Uncomment the next line if you want to sort by tag cluster rank
        los_ranked[lo.id] = relevance_raw_rank(lo, close_tags)
      end
    end
  end
  puts "============\n"
  puts "\nSorting LOs...\n"
  # sorts by its ranking
  los_ranked = los_ranked.sort_by { |id, rank| rank }
  # get highest ranks
  los_ranked = los_ranked.reverse.first($N_LOS)
  los_ranked.each do |key, value|
    puts "#{key}: #{value}"
  end
  return los_ranked
end
# Scores +lo+ with a wdf * itf weighting: wdf dampens objects carrying
# many tags, while each cluster tag found on the object contributes its
# closeness rank boosted by how globally rare the tag is.
# Returns 0 when the object has no tags.
def relevance_frequency_rank(lo, close_tags)
  tag_total = lo.tags.size
  wdf = tag_total != 0 ? 1 / (Math.log2(tag_total) + 1) : 0
  itf_sum = 0
  lo.tags.each do |lo_tag|
    close_tags.each do |entry|
      next unless entry[:id] == lo_tag.id
      # NOTE(review): Tag.all.size / lo_tag.taggings.size is integer
      # division — confirm whether a float ratio was intended.
      itf_sum += entry[:rank] * (Math.log2(Tag.all.size / lo_tag.taggings.size) + 1)
    end
  end
  return wdf * itf_sum
end
# returns the sum of how many times each tag in cluster appears in space
# (i.e. the total number of taggings across all tags of the cluster)
def cluster_frequency(cluster)
  return cluster.reduce(0) { |total, entry| total + Tag.find(entry[:id]).taggings.size }
end
# Scores +lo+ against the cluster using the cluster-wide tagging
# frequency (+freq_cluster+, from cluster_frequency) as the itf term.
def relevance_frequency_rank_global(lo, close_tags, freq_cluster)
  # for each tag in LO that is in the cluster, accumulate its rank;
  # start at 1 so Math.log2 below never sees 0
  rank_sum = 1
  lo.tags.each do |lo_tag|
    close_tags.each do |entry|
      rank_sum += entry[:rank] if entry[:id] == lo_tag.id
    end
  end
  tag_total = lo.tags.size
  wdf = tag_total != 0 ? (Math.log2(rank_sum) / (Math.log2(tag_total) + 1)) : 0
  # NOTE(review): Tag.all.size / freq_cluster is integer division —
  # confirm whether a float ratio was intended.
  itf = Math.log2(Tag.all.size / freq_cluster) + 1
  return wdf * itf
end
# returns the rank sum of the tags in the LO
# For every tag on +lo+ that also appears in +close_tags+, accumulates
# that cluster entry's closeness rank; tags outside the cluster add 0.
def relevance_raw_rank(lo, close_tags)
  total = 0
  lo.tags.each do |lo_tag|
    close_tags.each do |entry|
      total += entry[:rank] if entry[:id] == lo_tag.id
    end
  end
  return total
end
# parse .ftree into a 'tree', leafs are tags with flow number, branches are the clustes
# create tags list, tags[tagId] == the tag leaf inside the cluster tree
#
# Reads "<path>.ftree" (infomap output) and returns a hash mapping
# tagId -> leaf ({id:, flow:, name:, parent:}); the cluster tree is
# reachable through each leaf's :parent links.
#
# Fix: the previous version used the (mutable) cluster Hash itself as a
# countClusters key; inserting the key and then mutating the cluster
# changes its #hash, which corrupts the lookup and can count the same
# cluster more than once. We key by object_id instead, which is stable
# under mutation. Only the printed count is affected; the return value
# is unchanged.
def parseTree(path)
  puts "\nParsing .ftree output into a 'tree'\n"
  clusters = { childs: [], parent: nil }
  tags = {}
  countClusters = {}
  File.open(path + ".ftree", "r") do |f|
    # skip the two header lines
    f.gets
    f.gets
    while line = f.gets
      # the tree section ends at the first line without a ':' path
      break if !line.include? ':'
      tmp = line.split(' ')
      # tmp[0] format like: 2:7:5:4 — 2 and 7 address nested clusters,
      # 5 is the cluster holding the leafs, 4 is the leaf itself (tag)
      ftree = tmp[0].split(':')[0..-2]
      # last number of the sequence is the leaf Id
      leafId = tmp[0].split(':')[-1].to_i
      # last number on the line is the tag Id
      tagId = tmp[-1].to_i
      # second number on the line is the flow value
      flow = tmp[1].to_f
      # between the third and second to last fields lies the quoted name
      name = tmp[2..-2].join(' ')[1..-2]
      # walk down the tree, creating missing cluster nodes on the way
      it = clusters
      ftree.each do |clusterId|
        clusterId = clusterId.to_i - 1 # on the file they start at 1, we want 0
        if it[:childs][clusterId].nil?
          it[:childs][clusterId] = { childs: [], parent: it }
        end
        it = it[:childs][clusterId]
      end
      # remember this cluster for counting, keyed by object identity so
      # later mutation of the cluster hash cannot corrupt the count
      countClusters[it.object_id] = 1
      # 'it' is the cluster leafId is a child of, so add the leaf to it
      it[:childs][leafId - 1] = { id: tagId, flow: flow, name: name, parent: it }
      # index the leaf by tagId for easy access
      tags[tagId] = it[:childs][leafId - 1]
    end
  end
  puts "\nNumber of clusters found: #{countClusters.size}\n"
  return tags
end
end
# namespace
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment