From 23dc88bab4209b8ae2d673f62c1111fc8bfca752 Mon Sep 17 00:00:00 2001
From: Bruno Nocera Zanette <bnzanette@inf.ufpr.br>
Date: Sat, 10 Oct 2015 17:31:19 -0300
Subject: [PATCH] Add DuplicatedItems verification task

---
 lib/tasks/dbInfo.rake | 74 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 73 insertions(+), 1 deletion(-)

diff --git a/lib/tasks/dbInfo.rake b/lib/tasks/dbInfo.rake
index 19e00c350..7eaaaee1e 100644
--- a/lib/tasks/dbInfo.rake
+++ b/lib/tasks/dbInfo.rake
@@ -43,7 +43,7 @@ namespace :dbinfo do
     end
 
     bitstreams_formats = bitstreams_formats_hash.sort_by {|key, value| value}
-    
+
     puts "\n\n"
     puts "---------------------------------"
     puts "---- BITSTREAMS FORMATS LIST ----"
@@ -60,4 +60,76 @@ namespace :dbinfo do
 
   end
 
+  # Pages through all LearningObjects and prints the groups of ids sharing a name.
+  desc "Verify duplicated items"
+  task verify_duplicated_items: :environment do
+    include RepositoriesProxy
+
+    # Quantity of LearningObjects fetched on each iteration
+    limit = 1000
+    # Starting point from where LearningObjects will be fetched
+    offset = 0
+
+    # SHA1 of a LearningObject's name => ids of every LearningObject with that name
+    lo_hash = Hash.new { |hash, key| hash[key] = [] }
+
+    loop do
+      print "\r -> Analysing LearningObjects from #{offset} to #{offset+limit}"
+
+      begin
+        # Get LearningObjects from OrientDB (from offset to offset+limit)
+        learning_objects = learning_object_repository.all_from_offset_to_limit(offset,limit)
+      rescue => e
+        # Report the failure instead of retrying silently forever
+        warn "\n -> Fetch failed (#{e.class}: #{e.message}); retrying in 30 seconds"
+        # Sleeps for a while to wait database's recovery
+        sleep(30.seconds)
+        # Goes to next iteration to retry
+        next
+      else
+        # Terminate loop if there are no more LearningObjects
+        break if learning_objects.empty?
+
+        learning_objects.each do |lo|
+          # Duplicity is keyed on the LO's name only; to_s keeps a nameless LO
+          # from aborting the scan (Digest::SHA1.hexdigest raises on nil)
+          lo_hash[encode_hash_from(lo.name.to_s)] << lo.id
+        end
+
+        offset += limit
+      end
+    end
+
+    # Keep only the names that map to more than one id
+    duplicated_lo = lo_hash.select { |_key, ids| ids.length > 1 }
+
+    puts "\n\n"
+    puts "---------------------------------"
+    puts "----- DUPLICATED ITEMS LIST -----"
+    puts "---------------------------------"
+    puts "TOTAL CASES: #{duplicated_lo.length}"
+    puts "---------------------------------"
+    duplicated_lo.each_value do |ids|
+      puts ids.to_s
+    end
+    puts "---------------------------------"
+
+  end
+
+  private # NOTE(review): `def` inside a rake namespace block is not scoped to the namespace; helpers below are top-level methods - confirm intended
+
+  # Returns the string form of the attributes that together identify a
+  # LearningObject: [name, author, description, type, metadata].to_s
+  # (Array#to_s, i.e. the inspect-style rendering of those five values).
+  def learning_object_unique_attributes(lo)
+    identifying = %i[name author description type metadata]
+    # public_send performs the same public reader calls as lo.name, lo.author, ...
+    values = identifying.map { |reader| lo.public_send(reader) }
+    values.to_s
+  end
+
+  def encode_hash_from(object) # => hex SHA1 digest of object's string form
+    Digest::SHA1.hexdigest(object.to_s) # to_s: hexdigest raises TypeError on nil / non-String input
+  end
+
 end
-- 
GitLab