diff --git a/app/workers/bitstream_downloader_worker.rb b/app/workers/bitstream_downloader_worker.rb
new file mode 100644
index 0000000000000000000000000000000000000000..b312884605ead7e625b9b9baa338728457991a8e
--- /dev/null
+++ b/app/workers/bitstream_downloader_worker.rb
@@ -0,0 +1,10 @@
+class BitstreamDownloaderWorker
+
+  include Sidekiq::Worker
+  include Bitstream::Utils
+
+  def perform(retrieve_link,output_file)
+    download_bitstream(retrieve_link,output_file)
+  end
+
+end
diff --git a/lib/thumbnail/accepted_formats.rb b/lib/bitstream/accepted_formats.rb
similarity index 98%
rename from lib/thumbnail/accepted_formats.rb
rename to lib/bitstream/accepted_formats.rb
index bec7f427de5d0aea0ceba8a80ba69c471a4ab230..8b8088d0e86e1701fc3da15a59d151f3cc23f822 100644
--- a/lib/thumbnail/accepted_formats.rb
+++ b/lib/bitstream/accepted_formats.rb
@@ -1,4 +1,4 @@
-module Thumbnail
+module Bitstream
   module AcceptedFormats
 
     def get_file_basename file
diff --git a/lib/bitstream/utils.rb b/lib/bitstream/utils.rb
new file mode 100644
index 0000000000000000000000000000000000000000..a01ff78e697a00e68d03612c99896c452d82651c
--- /dev/null
+++ b/lib/bitstream/utils.rb
@@ -0,0 +1,143 @@
+module Bitstream
+  # Helpers shared by the thumbnail generator and the bitstream downloader:
+  # temporary-file housekeeping, bitstream download and archive extraction.
+  module Utils
+
+    include AcceptedFormats
+
+    # Deletes every file listed in +files+, then removes +workdir+.
+    def clean_tmpfiles(workdir,files)
+      files.each do |f|
+        delete_file f
+      end
+      delete_dir workdir
+    end
+
+    # Returns "/tmp/<id>/", creating the directory when it is missing.
+    def tmpdir_path(id)
+      tmpdir = "/tmp/#{id}/"
+      create_dir tmpdir
+      tmpdir
+    end
+
+    # Creates +dirname+ (with any missing parents) unless it already exists.
+    def create_dir(dirname)
+      FileUtils.mkdir_p(dirname) unless File.directory?(dirname)
+    end
+
+    # Removes +dir+ when it is a directory. FileUtils.rmdir only removes empty
+    # directories, so the files inside must be deleted first.
+    def delete_dir(dir)
+      FileUtils.rmdir(dir) if File.directory?(dir)
+    end
+
+    # Removes +file+ when it exists.
+    def delete_file(file)
+      FileUtils.rm(file) if File.exist?(file)
+    end
+
+    # Memoized path of the application's public directory.
+    def public_dir
+      @public_dir ||= Rails.root.join('public')
+    end
+
+    # Downloads the bitstream at +retrieve_link+ into +workdir+ and returns
+    # the local paths of the files that are in an accepted format. Archives
+    # are unpacked and then deleted; only their accepted entries are
+    # returned. Returns [] when the format is not accepted or the download
+    # fails (the failure is logged, not raised).
+    def get_accepted_bitstreams(filename,retrieve_link,workdir)
+      file_format = get_file_extname filename
+      is_archive = accepted_archive_formats.include? file_format
+      return [] unless is_archive || accepted_formats.include?(file_format)
+
+      begin
+        # filename comes from repository metadata: keep only its last
+        # component so the download cannot land outside workdir.
+        file = File.join(workdir, File.basename(filename))
+        download_bitstream(retrieve_link,file)
+      rescue StandardError => e
+        puts "#{method_log_tag} #{e}"
+        return []
+      end
+
+      return [file] unless is_archive
+
+      accepted_files = extract_accepted_bitstreams(file,workdir)
+      delete_file file
+      accepted_files
+    end
+
+    # Extracts every entry of +archive_file+ whose extension is an accepted
+    # format directly into +workdir+ and returns the extracted paths.
+    # Extraction errors are logged, not raised.
+    def extract_accepted_bitstreams(archive_file,workdir)
+      return [] if archive_file.nil?
+
+      extracted_files = []
+
+      puts "#{method_log_tag} Extracting accepted files out from '#{get_file_basename(archive_file)}'"
+      begin
+        Archive.read_open_filename(archive_file) do |archive|
+          while entry = archive.next_header
+            next unless accepted_formats.include? get_file_extname(entry.pathname)
+
+            filename = get_file_basename(entry.pathname)
+            # File.basename drops any directory part the entry name may still carry.
+            file = File.join(workdir, File.basename(filename))
+            begin
+              File.open(file, 'wb') do |f|
+                archive.read_data(1024) do |d|
+                  f << d
+                end
+              end
+            rescue StandardError => e
+              puts "#{method_log_tag} #{e}
+              \r#{method_log_tag} ERROR: Some error occurred while extracting file '#{filename}'"
+              # Do not leave a truncated file behind in workdir.
+              delete_file file
+            else
+              puts "#{method_log_tag} SUCCESS: Extracted '#{filename}' to '#{file}'"
+              extracted_files << file
+            end
+          end
+        end
+      rescue StandardError => e
+        puts "#{method_log_tag} #{e}
+        \r#{method_log_tag} ERROR: Some error occurred while extracting files from '#{get_file_basename(archive_file)}'"
+      end
+
+      extracted_files
+    end
+
+    # Downloads +url+ into the file +output+. Returns true on success and
+    # raises RuntimeError on failure, after removing the partial output file.
+    def download_bitstream(url,output)
+      puts "#{method_log_tag} Downloading bitstream: '#{url}' => '#{output}'"
+      begin
+        c = Curl::Easy.new(url)
+        # NOTE(review): certificate and host verification are switched off,
+        # so HTTPS downloads are not authenticated -- confirm this is intended.
+        c.ssl_verify_peer = false
+        c.ssl_verify_host = false
+        File.open(output, 'wb') do |f|
+          c.on_body {|data| f << data; data.size }
+          c.perform
+        end
+      rescue StandardError => e
+        # Do not leave a partially written file behind.
+        delete_file output
+        raise "#{method_log_tag} #{e}
+        \r#{method_log_tag} ERROR: Some error occurred during file download."
+      else
+        return true
+      end
+    end
+
+    # Log prefix built from the label of the calling frame.
+    def method_log_tag
+      return "[#{caller_locations(1,1)[0].label}]"
+    end
+
+  end
+end
diff --git a/lib/tasks/bitstream.rake b/lib/tasks/bitstream.rake
new file mode 100644
index 0000000000000000000000000000000000000000..fda4d8af99d5b36d70f83aac8a2439e6a648cb04
--- /dev/null
+++ b/lib/tasks/bitstream.rake
@@ -0,0 +1,52 @@
+namespace :bitstream do
+
+  desc "Download Bitstreams"
+  task :download_streams => :environment do
+
+    include RepositoriesProxy
+    include Bitstream::AcceptedFormats
+    include Bitstream::Utils
+
+    # Quantity of items fetched on each iteration
+    limit = 1000
+    # Start point from where items will be fetched
+    offset = 0
+
+    # Destination directory of the downloads; it is the same for every page
+    output_dir = "#{public_dir}/bitstreams/"
+    create_dir output_dir
+
+    loop do
+
+      begin
+        # Get items from dspace (from offset to offset+limit)
+        items = learning_object_repository.all_from_offset_to_limit(offset,limit)
+      rescue => e
+        # Log the failure so that a permanent error is not retried silently
+        puts "[download_streams] #{e.class}: #{e.message}"
+        # Sleeps for a while to wait database's recovery
+        sleep(30.seconds)
+        # Goes to next iteration to retry
+        next
+      else
+        # Terminate loop if there are no more items to import
+        break if items.empty?
+
+        items.each do |item|
+          filename = item.get_filename
+          file_format = get_file_extname filename
+          if accepted_video_formats.include? file_format
+            file_basename = get_file_basename filename
+            retrieve_link = item.get_retrievelink
+            output_file = output_dir + file_basename
+            BitstreamDownloaderWorker.perform_async(retrieve_link,output_file)
+          end
+        end
+
+        # Increment offset, to get new items on next iteration
+        offset += limit
+      end
+    end
+  end
+
+end
diff --git a/lib/thumbnail/generate.rb b/lib/thumbnail/generate.rb
index 0afefa4348a41e0531d604ffc43b1dec41e0745d..ad25a3a4c0faeb1e421c28f04e11a1eadef8a05f 100644
--- a/lib/thumbnail/generate.rb
+++ b/lib/thumbnail/generate.rb
@@ -1,8 +1,9 @@
 module Thumbnail
   module Generate
 
-    include AcceptedFormats
     include RepositoriesProxy
+    include Bitstream::Utils
+    include Bitstream::AcceptedFormats
 
     def generate_thumbnail(learning_object_id)
 
@@ -17,9 +18,9 @@ module Thumbnail
 
       workdir = tmpdir_path(item_uniq_hash)
       thumbnail = thumbnail_path(item_uniq_hash, size)
-      output = "#{root_dir}#{thumbnail}"
+      output = "#{public_dir}#{thumbnail}"
 
-      accepted_files = get_accepted_files(@item.get_filename,@item.get_retrievelink,workdir)
+      accepted_files = get_accepted_bitstreams(@item.get_filename,@item.get_retrievelink,workdir)
       accepted_files.sort! { |x,y| File.size(y) <=> File.size(x) }
       accepted_files.each do |input|
         filename = get_file_basename input
@@ -51,7 +52,7 @@ module Thumbnail
     end
 
     def default_thumbnail
-      @default_thumbnail ||= nil
+      @default_thumbnail ||= ''
     end
 
     private
@@ -95,41 +96,12 @@ module Thumbnail
       Digest::SHA1.hexdigest object
     end
 
-    def create_dir(dirname)
-      FileUtils.mkdir_p(dirname) unless File.directory?(dirname)
-    end
-
-    def tmpdir_path(id)
-      tmpdir = "/tmp/#{id}/"
-      create_dir tmpdir
-      return tmpdir
-    end
-
     def thumbnail_path(id, size)
       thumbnails_dir = "/thumbnails"
-      create_dir "#{root_dir}#{thumbnails_dir}"
+      create_dir "#{public_dir}#{thumbnails_dir}"
       return "#{thumbnails_dir}/#{id}_#{size}.jpg"
     end
 
-    def root_dir
-      @root_dir ||= Rails.root.join('public')
-    end
-
-    def clean_tmpfiles(workdir,files)
-      files.each do |f|
-        delete_file f
-      end
-      delete_dir workdir
-    end
-
-    def delete_dir(dir)
-      FileUtils.rmdir(dir) if File.directory?(dir)
-    end
-
-    def delete_file(file)
-      FileUtils.rm(file) if File.exist?(file)
-    end
-
     def item_id
       @item_id ||= @item.id
     end
@@ -138,86 +110,5 @@ module Thumbnail
       return "[#{caller_locations(1,1)[0].label} (#{item_id})]"
     end
 
-    def get_accepted_files(filename,retrieve_link,workdir)
-
-      file_format = get_file_extname filename
-
-      accepted_files = []
-
-      if accepted_formats.include? file_format or accepted_archive_formats.include? file_format
-        begin
-          file = workdir + filename
-          download_bitstream(retrieve_link,file)
-        rescue Exception => e
-          puts "#{method_log_tag} #{e}"
-        else
-          if accepted_archive_formats.include? file_format
-            extract_accepted_files(file,workdir).each do |f|
-              accepted_files << f
-            end
-            delete_file file
-          else
-            accepted_files << file
-          end
-        end
-      end
-
-      return accepted_files
-    end
-
-    def extract_accepted_files(archive_file,workdir)
-
-      return [] if archive_file.nil?
-
-      extracted_files = []
-
-      puts "#{method_log_tag} Extracting accepted files out from '#{get_file_basename(archive_file)}'"
-      begin
-        Archive.read_open_filename(archive_file) do |archive|
-          while entry = archive.next_header
-            if accepted_formats.include? get_file_extname(entry.pathname)
-              filename = get_file_basename(entry.pathname)
-              file = workdir + filename
-              begin
-                File.open(file, 'wb') do |f|
-                  archive.read_data(1024) do |d|
-                    f << d
-                  end
-                end
-              rescue Exception => e
-                puts "#{method_log_tag} #{e}
-                \r#{method_log_tag} ERROR: Some error occurred while extracting file '#{filename}'"
-              else
-                puts "#{method_log_tag} SUCCESS: Extracted '#{filename}' to '#{file}'"
-                extracted_files << file
-              end
-            end
-          end
-        end
-      rescue Exception => e
-        puts "#{method_log_tag} #{e}
-        \r#{method_log_tag} ERROR: Some error occurred while extracting files from '#{get_file_basename(archive_file)}'"
-      end
-
-      return extracted_files
-    end
-
-    def download_bitstream(url,output)
-      begin
-        c = Curl::Easy.new(url)
-        c.ssl_verify_peer = false
-        c.ssl_verify_host = false
-        File.open(output, 'wb') do |f|
-          c.on_body {|data| f << data; data.size }
-          c.perform
-        end
-      rescue Exception => e
-        raise "#{method_log_tag} #{e}
-        \r#{method_log_tag} ERROR: Some error occurred during file download."
-      else
-        return true
-      end
-    end
-
   end
 end
diff --git a/start_sidekiq_workers.sh b/start_sidekiq_workers.sh
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391