From 0f634a0623992fc6b4422abc8732dfec745d5180 Mon Sep 17 00:00:00 2001 From: Cristian Weiland <cw14@inf.ufpr.br> Date: Fri, 17 Mar 2017 17:09:17 -0300 Subject: [PATCH] Issue #25: Fix more bugs and allow to insert multiple universities Signed-off-by: Cristian Weiland <cw14@inf.ufpr.br> --- scripts/expenses/config.sh | 13 ++--- scripts/expenses/create_expenses_config.py | 9 ++-- scripts/expenses/insert_expenses.sh | 25 +++++---- scripts/expenses/logstash_config.example | 2 +- scripts/expenses/resume_expenses.sh | 3 +- scripts/expenses/tmp.sh | 11 ++++ scripts/travel_allowances/config.sh | 13 ++--- .../create_travel_allowance_config.py | 9 ++-- .../insert_travel_allowances.sh | 25 +++++---- .../travel_allowances/logstash_config.example | 2 +- .../resume_travel_allowance.sh | 3 +- scripts/workers/config.json.example | 2 +- scripts/workers/config.sh | 13 ++--- scripts/workers/create_config.py | 11 ++-- scripts/workers/insert_register_payment.sh | 42 +++++++++------ scripts/workers/logstash_config.example | 4 +- scripts/workers/logstash_config_2013.example | 2 +- scripts/workers/merge_files_es.py | 51 ++++++------------- scripts/workers/resume_register.sh | 6 ++- 19 files changed, 127 insertions(+), 119 deletions(-) create mode 100755 scripts/expenses/tmp.sh diff --git a/scripts/expenses/config.sh b/scripts/expenses/config.sh index 9700889..5c22af6 100644 --- a/scripts/expenses/config.sh +++ b/scripts/expenses/config.sh @@ -4,15 +4,12 @@ index="gastos-pagamentos" -# Filter: The string that will be used on 'egrep' to filter data to get only relevant universities. -# Ex: Getting only UFPR: -# filter="UNIVERSIDADE FEDERAL DO PARANA" -# Getting UFPR and UFMG: -# filter="UNIVERSIDADE FEDERAL DO PARANA|UNIVERSIDADE FEDERAL DE MINAS GERAIS" -# Getting all universities: -# filter="UNIVERSIDADE FEDERAL*" +# Filter: An array of strings that will be used on 'egrep' to filter data to get only relevant universities. +# University: An array of initials, corresponding to Filter. +# Warning: Filter's length must be the same as university's!! -filter="UNIVERSIDADE FEDERAL DO PARANA|UNIVERSIDADE FEDERAL DE MINAS GERAIS|UNIVERSIDADE FEDERAL DE SANTA CATARINA|UNIVERSIDADE FEDERAL DE PERNAMBUCO|UNIVERSIDADE FEDERAL DE SANTA MARIA" +filter=("UNIVERSIDADE FEDERAL DO PARANA" "UNIVERSIDADE FEDERAL DE MINAS GERAIS" "UNIVERSIDADE FEDERAL DE SANTA CATARINA" "UNIVERSIDADE FEDERAL DE PERNAMBUCO" "UNIVERSIDADE FEDERAL DE SANTA MARIA") +university=("ufpr" "ufmg" "ufsc" "ufpe" "ufsm") # Host: ElasticSearch's host. Ex: "localhost" diff --git a/scripts/expenses/create_expenses_config.py b/scripts/expenses/create_expenses_config.py index 0c86e6a..be852fb 100755 --- a/scripts/expenses/create_expenses_config.py +++ b/scripts/expenses/create_expenses_config.py @@ -10,8 +10,8 @@ import sys, csv, json, math, subprocess from pathlib import Path from subprocess import call -if len(sys.argv) != 8: - print("Usage: " + sys.argv[0] + " <year (2016)> <month (01)> <day (31)> <index> <host> <username> <password>") +if len(sys.argv) != 9: + print("Usage: " + sys.argv[0] + " <year (2016)> <month (01)> <day (31)> <index> <host> <university> <username> <password>") sys.exit() with open('logstash_config.example') as infile: @@ -21,8 +21,9 @@ output = example % { "timestamp": sys.argv[3] + '/' + sys.argv[2] + '/' + sys.ar , "date": sys.argv[1] + '-' + sys.argv[2] , "index": sys.argv[4] , "host": sys.argv[5] - , "user": sys.argv[6] - , "password": sys.argv[7] } + , "university": sys.argv[6] + , "user": sys.argv[7] + , "password": sys.argv[8] } with open('../../configs/expenses/logstash/config-' + sys.argv[1] + '-' + sys.argv[2], 'w') as outfile: outfile.write(output) diff --git a/scripts/expenses/insert_expenses.sh b/scripts/expenses/insert_expenses.sh index 877b683..3ec22d5 100755 --- a/scripts/expenses/insert_expenses.sh +++ b/scripts/expenses/insert_expenses.sh @@ -46,20 +46,25 @@ request='http://arquivos.portaldatransparencia.gov.br/downloads.asp?a='${1}'&m=' curl -o $path$ym/${1}${2}_GastosDiretos.zip $request -H 'Accept-Encoding: gzip, deflate, sdch' -H 'Accept-Language: en-US,en;q=0.8' -H 'Upgrade-Insecure-Requests: 1' -H 'User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' -H 'Referer: http://transparencia.gov.br/downloads/mensal.asp?c=GastosDiretos' -H 'Cookie: ASPSESSIONIDAQRABSAD=OJDLNBCANLIDINCHJHELHHFB; ASPSESSIONIDAQSDCQAD=BOKBKPNCDKOBJKGAMMEKADFL; _ga=GA1.3.1927288562.1481545643; ASPSESSIONIDSCSBBTCD=IGJLJBBCEEJBGLOOJKGNMHBH' -H 'Connection: keep-alive' --compressed # Unzip them -unzip $path$ym/${1}${2}_GastosDiretos.zip -d $path$ym/ +unzip -o $path$ym/${1}${2}_GastosDiretos.zip -d $path$ym/ # Remove zip file rm $path$ym/${1}${2}_GastosDiretos.zip source ./config.sh -echo $filter +length=${#filter[@]} -# Step 2: -./create_expenses_config.py $1 $2 $day $index $host $3 $4 -# Step 3: -./resume_expenses.sh ../../data/expenses/ ${1}-${2} "$filter" -# Step 4: -logstash -f ../../configs/expenses/logstash/config-${1}-${2} < ../../data/expenses/processed/${1}${2}.csv -# Data inserted, we can now remove it. -rm ../../data/expenses/processed/${1}${2}.csv +for (( i=0; i<${length}; i++ )); +do + # Step 2: + ./create_expenses_config.py $1 $2 "$day" "$index" "$host" "${university[$i]}" $3 $4 + # Step 3: + ./resume_expenses.sh "${path}" ${1}-${2} "${filter[$i]}" + # Step 4: + logstash -f ../../configs/expenses/logstash/config-${1}-${2} < ../../data/expenses/processed/${1}${2}.csv + # Data inserted, we can now remove it. + rm ../../data/expenses/processed/${1}${2}.csv +done + +rm $path${1}-${2}/${1}${2}_GastosDiretos.csv diff --git a/scripts/expenses/logstash_config.example b/scripts/expenses/logstash_config.example index b1f680f..249d217 100644 --- a/scripts/expenses/logstash_config.example +++ b/scripts/expenses/logstash_config.example @@ -42,7 +42,7 @@ output { user => "%(user)s" password => "%(password)s" hosts => "http://%(host)s:9200" - index => "ufpr-%(index)s-%(date)s" + index => "%(index)s-%(university)s-%(date)s" workers => 1 } stdout {} diff --git a/scripts/expenses/resume_expenses.sh b/scripts/expenses/resume_expenses.sh index a389304..38bd5d9 100755 --- a/scripts/expenses/resume_expenses.sh +++ b/scripts/expenses/resume_expenses.sh @@ -32,5 +32,4 @@ fi # - Grep removes everyone that does not work in UFPR. # - Tr removes null characters (ctrl + @). -cat $input | egrep --binary-files=text "$filter" | tr -d '\000' > $output -rm $input +cat "$input" | egrep --binary-files=text "$filter" | tr -d '\000' > "$output" diff --git a/scripts/expenses/tmp.sh b/scripts/expenses/tmp.sh new file mode 100755 index 0000000..38f153c --- /dev/null +++ b/scripts/expenses/tmp.sh @@ -0,0 +1,11 @@ +filter=("UNIVERSIDADE FEDERAL DO PARANA" "UNIVERSIDADE FEDERAL DE MINAS GERAIS" "UNIVERSIDADE FEDERAL DE SANTA CATARINA" "UNIVERSIDADE FEDERAL DE PERNAMBUCO" "UNIVERSIDADE FEDERAL DE SANTA MARIA") +university=("ufpr" "ufmg" "ufsc" "ufpe" "ufsm") + +length=${#filter[@]} + +for (( i=0; i<${length}; i++ )); +do + echo $i + echo ${filter[$i]} + echo ${university[$i]} +done diff --git a/scripts/travel_allowances/config.sh b/scripts/travel_allowances/config.sh index a1897f5..3cf088a 100644 --- a/scripts/travel_allowances/config.sh +++ b/scripts/travel_allowances/config.sh @@ -4,15 +4,12 @@ index="gastos-diarias" -# Filter: The string that will be used on 'egrep' to filter data to get only relevant universities. -# Ex: Getting only UFPR: -# filter="UNIVERSIDADE FEDERAL DO PARANA" -# Getting UFPR and UFMG: -# filter="UNIVERSIDADE FEDERAL DO PARANA|UNIVERSIDADE FEDERAL DE MINAS GERAIS" -# Getting all universities: -# filter="UNIVERSIDADE FEDERAL*" +# Filter: An array of strings that will be used on 'egrep' to filter data to get only relevant universities. +# University: An array of initials, corresponding to Filter. +# Warning: Filter's length must be the same as university's!! -filter="UNIVERSIDADE FEDERAL DO PARANA|UNIVERSIDADE FEDERAL DE MINAS GERAIS|UNIVERSIDADE FEDERAL DE SANTA CATARINA|UNIVERSIDADE FEDERAL DE PERNAMBUCO|UNIVERSIDADE FEDERAL DE SANTA MARIA" +filter=("UNIVERSIDADE FEDERAL DO PARANA" "UNIVERSIDADE FEDERAL DE MINAS GERAIS" "UNIVERSIDADE FEDERAL DE SANTA CATARINA" "UNIVERSIDADE FEDERAL DE PERNAMBUCO" "UNIVERSIDADE FEDERAL DE SANTA MARIA") +university=("ufpr" "ufmg" "ufsc" "ufpe" "ufsm") # Host: ElasticSearch's host. Examples: "localhost" diff --git a/scripts/travel_allowances/create_travel_allowance_config.py b/scripts/travel_allowances/create_travel_allowance_config.py index 5743448..11f340f 100755 --- a/scripts/travel_allowances/create_travel_allowance_config.py +++ b/scripts/travel_allowances/create_travel_allowance_config.py @@ -10,8 +10,8 @@ import sys, csv, json, math, subprocess from pathlib import Path from subprocess import call -if len(sys.argv) != 8: - print("Usage: " + sys.argv[0] + " <year (2016)> <month (01)> <day (31)> <index> <host> <username> <password>") +if len(sys.argv) != 9: + print("Usage: " + sys.argv[0] + " <year (2016)> <month (01)> <day (31)> <index> <host> <university> <username> <password>") sys.exit() with open('logstash_config.example') as infile: @@ -21,8 +21,9 @@ output = example % { "timestamp": sys.argv[3] + '/' + sys.argv[2] + '/' + sys.ar , "date": sys.argv[1] + '-' + sys.argv[2] , "index": sys.argv[4] , "host": sys.argv[5] - , "user": sys.argv[6] - , "password": sys.argv[7] } + , "university": sys.argv[6] + , "user": sys.argv[7] + , "password": sys.argv[8] } with open('../../configs/travel_allowance/logstash/config-' + sys.argv[1] + '-' + sys.argv[2], 'w') as outfile: outfile.write(output) diff --git a/scripts/travel_allowances/insert_travel_allowances.sh b/scripts/travel_allowances/insert_travel_allowances.sh index 844870e..fb2b6b3 100755 --- a/scripts/travel_allowances/insert_travel_allowances.sh +++ b/scripts/travel_allowances/insert_travel_allowances.sh @@ -45,20 +45,25 @@ mkdir -p $path$ym # Download files request='http://arquivos.portaldatransparencia.gov.br/downloads.asp?a='${1}'&m='${2}'&consulta=Diarias' -curl -o $path$ym/${1}${2}_Diarias.zip $request -H 'Accept-Encoding: gzip, deflate, sdch' -H 'Accept-Language: en-US,en;q=0.8' -H 'Upgrade-Insecure-Requests: 1' -H 'User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' -H 'Referer: http://transparencia.gov.br/downloads/mensal.asp?c=GastosDiretos' -H 'Cookie: ASPSESSIONIDAQRABSAD=OJDLNBCANLIDINCHJHELHHFB; ASPSESSIONIDAQSDCQAD=BOKBKPNCDKOBJKGAMMEKADFL; _ga=GA1.3.1927288562.1481545643; ASPSESSIONIDSCSBBTCD=IGJLJBBCEEJBGLOOJKGNMHBH' -H 'Connection: keep-alive' --compressed +curl $request -H 'Accept-Encoding: gzip, deflate, sdch' -H 'Accept-Language: en-US,en;q=0.8' -H 'Upgrade-Insecure-Requests: 1' -H 'User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' -H 'Referer: http://transparencia.gov.br/downloads/mensal.asp?c=GastosDiretos' -H 'Cookie: ASPSESSIONIDAQRABSAD=OJDLNBCANLIDINCHJHELHHFB; ASPSESSIONIDAQSDCQAD=BOKBKPNCDKOBJKGAMMEKADFL; _ga=GA1.3.1927288562.1481545643; ASPSESSIONIDSCSBBTCD=IGJLJBBCEEJBGLOOJKGNMHBH' -H 'Connection: keep-alive' --compressed > $path$ym/${1}${2}_Diarias.zip # Unzip them -unzip $path$ym/${1}${2}_Diarias.zip -d $path$ym/ +unzip -o $path$ym/${1}${2}_Diarias.zip -d $path$ym/ # Remove zip file rm $path$ym/${1}${2}_Diarias.zip -# Step 2: -./create_travel_allowance_config.py $1 $2 "$day" "$index" "$host" $3 $4 -# Step 3: -./resume_travel_allowance.sh $path ${1}-${2} "$filter" -# Step 4: -logstash -f ../../configs/travel_allowance/logstash/config-${1}-${2} < ${path}processed/${1}${2}.csv +length=${#filter[@]} -# Remove processed file -rm ${path}processed/${1}${2}.csv +for (( i=0; i<${length}; i++ )); +do + # Step 2: + ./create_travel_allowance_config.py $1 $2 "$day" "$index" "$host" "${university[$i]}" $3 $4 + # Step 3: + echo "${filter[$i]}" + ./resume_travel_allowance.sh "$path" ${1}-${2} "${filter[$i]}" + # Step 4: + logstash -f ../../configs/travel_allowance/logstash/config-${1}-${2} < ${path}processed/${1}${2}.csv + # Remove processed file + rm ${path}processed/${1}${2}.csv +done diff --git a/scripts/travel_allowances/logstash_config.example b/scripts/travel_allowances/logstash_config.example index 20dc7e9..43f956f 100644 --- a/scripts/travel_allowances/logstash_config.example +++ b/scripts/travel_allowances/logstash_config.example @@ -42,7 +42,7 @@ output { user => "%(user)s" password => "%(password)s" hosts => "http://%(host)s:9200" - index => "ufpr-%(index)s-%(date)s" + index => "%(index)s-%(university)s-%(date)s" workers => 1 } stdout {} diff --git a/scripts/travel_allowances/resume_travel_allowance.sh b/scripts/travel_allowances/resume_travel_allowance.sh index ecac412..9e43825 100755 --- a/scripts/travel_allowances/resume_travel_allowance.sh +++ b/scripts/travel_allowances/resume_travel_allowance.sh @@ -13,6 +13,7 @@ path=$1 date=$2 # dateWithoutHyphen example: 201611 dateWithoutHyphen=${date//-} +filter=$3 if [ "$#" -ne 3 ]; then echo "Usage: $0 <path> <date> <filter>" @@ -30,4 +31,4 @@ fi # - Grep removes everyone that does not work in UFPR. # - Tr removes null characters (ctrl + @). -cat $input | egrep --binary-files=text "$filter" | tr -d '\000' > $output +cat "$input" | egrep --binary-files=text "$filter" | tr -d '\000' > "$output" diff --git a/scripts/workers/config.json.example b/scripts/workers/config.json.example index 8dbce65..01a2d48 100644 --- a/scripts/workers/config.json.example +++ b/scripts/workers/config.json.example @@ -2,7 +2,7 @@ "path": "Dados_Servidores/2016-10/" , "date": "20161031" , "file1" : "_Remuneracao.csv" - , "file2" : "_Cadastro_Ufpr_Unique.csv" + , "file2" : "_Cadastro_Unique.csv" , "idColumn1" : 2 , "idColumn2" : 0 , "quotechar": "\"" diff --git a/scripts/workers/config.sh b/scripts/workers/config.sh index b3fe041..08395d6 100644 --- a/scripts/workers/config.sh +++ b/scripts/workers/config.sh @@ -4,15 +4,12 @@ index="servidores" -# Filter: The string that will be used on 'egrep' to filter data to get only relevant universities. -# Ex: Getting only UFPR: -# filter="UNIVERSIDADE FEDERAL DO PARANA" -# Getting UFPR and UFMG: -# filter="UNIVERSIDADE FEDERAL DO PARANA|UNIVERSIDADE FEDERAL DE MINAS GERAIS" -# Getting all universities: -# filter="UNIVERSIDADE FEDERAL*" +# Filter: An array of strings that will be used on 'egrep' to filter data to get only relevant universities. +# University: An array of initials, corresponding to Filter. +# Warning: Filter's length must be the same as university's!! -filter="UNIVERSIDADE FEDERAL DO PARANA|UNIVERSIDADE FEDERAL DE MINAS GERAIS|UNIVERSIDADE FEDERAL DE SANTA CATARINA|UNIVERSIDADE FEDERAL DE PERNAMBUCO|UNIVERSIDADE FEDERAL DE SANTA MARIA" +filter=("UNIVERSIDADE FEDERAL DO PARANA" "UNIVERSIDADE FEDERAL DE MINAS GERAIS" "UNIVERSIDADE FEDERAL DE SANTA CATARINA" "UNIVERSIDADE FEDERAL DE PERNAMBUCO" "UNIVERSIDADE FEDERAL DE SANTA MARIA") +university=("ufpr" "ufmg" "ufsc" "ufpe" "ufsm") # Host: ElasticSearch's host. Examples: "localhost" diff --git a/scripts/workers/create_config.py b/scripts/workers/create_config.py index 9bc0b15..5594b03 100755 --- a/scripts/workers/create_config.py +++ b/scripts/workers/create_config.py @@ -12,15 +12,15 @@ import sys, csv, json, math, subprocess from pathlib import Path from subprocess import call -if len(sys.argv) != 8: - print("Usage: " + sys.argv[0] + " <year (2016)> <month (01)> <day (31)> <index> <host> <username> <password>") +if len(sys.argv) != 9: + print("Usage: " + sys.argv[0] + " <year (2016)> <month (01)> <day (31)> <index> <host> <university> <username> <password>") sys.exit() data = { "path": "../../data/workers/" + sys.argv[1] + "-" + sys.argv[2] + "/" , "date": sys.argv[1] + sys.argv[2] + sys.argv[3] , "file1": "_Remuneracao.csv" - , "file2": "_Cadastro_Ufpr_Unique.csv" + , "file2": "_Cadastro_Unique.csv" , "idColumn1": 2 , "idColumn2": 0 , "quotechar": "\"" @@ -43,8 +43,9 @@ output = example % { "timestamp": sys.argv[3] + '/' + sys.argv[2] + '/' + sys.ar , "date": sys.argv[1] + '-' + sys.argv[2] , "index": sys.argv[4] , "host": sys.argv[5] - , "user": sys.argv[6] - , "password": sys.argv[7] } + , "university": sys.argv[6] + , "user": sys.argv[7] + , "password": sys.argv[8] } with open('../../configs/workers/logstash/config-' + sys.argv[1] + '-' + sys.argv[2], 'w') as outfile: outfile.write(output) diff --git a/scripts/workers/insert_register_payment.sh b/scripts/workers/insert_register_payment.sh index cdc4ee3..7e286fd 100755 --- a/scripts/workers/insert_register_payment.sh +++ b/scripts/workers/insert_register_payment.sh @@ -43,13 +43,14 @@ fi # Step 1: # Create directory to store files mkdir -p $path$ym +mkdir -p ${path}processed/ # Download files request='http://arquivos.portaldatransparencia.gov.br/downloads.asp?a='${1}'&m='${2}'&d=C&consulta=Servidores' -curl -o $path$ym/${1}${2}_Servidores.zip $request -H 'Accept-Encoding: gzip, deflate, sdch' -H 'Accept-Language: en-US,en;q=0.8' -H 'Upgrade-Insecure-Requests: 1' -H 'User-Agent: Mozilla/5.0 (X11; Linux x86_ 64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' -H 'Referer: http://www.portaldatranspar encia.gov.br/downloads/servidores.asp' -H 'Cookie: ASPSESSIONIDAQRABSAD=OJDLNBCANLIDINCHJHELHHFB; ASPSESSIONIDAQSDCQAD=BOKBKPNCDKOBJKGAMMEKADFL; _ga=GA1.3.1927288562.1481545643; ASPSESSIONIDSCSBBTCD=IGJLJBBC EEJBGLOOJKGNMHBH' -H 'Connection: keep-alive' --compressed +curl $request -H 'Accept-Encoding: gzip, deflate, sdch' -H 'Accept-Language: en-US,en;q=0.8' -H 'Upgrade-Insecure-Requests: 1' -H 'User-Agent: Mozilla/5.0 (X11; Linux x86_ 64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' -H 'Referer: http://www.portaldatranspar encia.gov.br/downloads/servidores.asp' -H 'Cookie: ASPSESSIONIDAQRABSAD=OJDLNBCANLIDINCHJHELHHFB; ASPSESSIONIDAQSDCQAD=BOKBKPNCDKOBJKGAMMEKADFL; _ga=GA1.3.1927288562.1481545643; ASPSESSIONIDSCSBBTCD=IGJLJBBC EEJBGLOOJKGNMHBH' -H 'Connection: keep-alive' --compressed > $path$ym/${1}${2}_Servidores.zip # Unzip them -unzip $path$ym/${1}${2}_Servidores.zip -d $path$ym/ +unzip -o $path$ym/${1}${2}_Servidores.zip -d $path$ym/ # Remove zip file rm $path$ym/${1}${2}_Servidores.zip @@ -57,17 +58,26 @@ rm $path$ym/${1}${2}_Servidores.zip # Get day day=$(ls $path$ym | grep -m 1 $1$2 | cut -c 7,8) -# Step 2: -# Create config files -./create_config.py $1 $2 "$day" "$index" "$host" $3 $4 - -# Step 3: -# Start processing -./merge_files_es.py ../../configs/workers/json/config-${1}-${2}.json "$filter" - -# Step 4: -# Insert data in ElasticSearch -logstash -f ../../configs/workers/logstash/config-${1}-${2} < ../../data/workers/processed/${1}${2}.csv - -# Remove data -rm ../../data/workers/processed/${1}${2}.csv +length=${#filter[@]} + +for (( i=0; i<${length}; i++ )); +do + # Step 2: + # Create config files + ./create_config.py $1 $2 "$day" "$index" "$host" "${university[$i]}" $3 $4 + + # Step 3: + # Start processing + aux=$( echo "${filter[$i]}" | sed 's/ /\\ /g' ) + ./merge_files_es.py ../../configs/workers/json/config-${1}-${2}.json "$aux" + echo "removing..." + rm $path$ym/${1}${2}${day}_Cadastro_Unique.csv + echo "success" + + # Step 4: + # Insert data in ElasticSearch + logstash -f ../../configs/workers/logstash/config-${1}-${2} < ../../data/workers/processed/${1}${2}.csv + + # Remove data + #rm ../../data/workers/processed/${1}${2}.csv +done diff --git a/scripts/workers/logstash_config.example b/scripts/workers/logstash_config.example index 0383a39..b96a760 100644 --- a/scripts/workers/logstash_config.example +++ b/scripts/workers/logstash_config.example @@ -58,8 +58,8 @@ output { action => "index" user => "%(user)s" password => "%(password)s" - hosts => "http://$(host):9200" - index => "ufpr-$(index)s-%(date)s" + hosts => "http://%(host)s:9200" + index => "%(index)s-%(university)s-%(date)s" workers => 1 } stdout {} diff --git a/scripts/workers/logstash_config_2013.example b/scripts/workers/logstash_config_2013.example index b7af5c0..7749904 100644 --- a/scripts/workers/logstash_config_2013.example +++ b/scripts/workers/logstash_config_2013.example @@ -59,7 +59,7 @@ output { user => "%(user)s" password => "%(password)s" hosts => "http://$(host)s:9200" - index => "ufpr-%(index)s-%(date)s" + index => "%(index)s-%(university)s-%(date)s" workers => 1 } stdout {} diff --git a/scripts/workers/merge_files_es.py b/scripts/workers/merge_files_es.py index 68b3175..14cfab3 100755 --- a/scripts/workers/merge_files_es.py +++ b/scripts/workers/merge_files_es.py @@ -4,18 +4,18 @@ # Script made to create a CSV that will be inserted in ElasticSearch. # This script is being used to merge two files: a Remuneration report (ex: 20161031_Remuneracao.csv) with a file that contains the Portal ID from UFPR people -# (ex: 20161031_Cadastro_Ufpr_Unique.csv). This second file can be obtained filtering a Register report (ex: 20161031_Cadastro.csv) using resume_register.sh. +# (ex: 20161031_Cadastro_Unique.csv). This second file can be obtained filtering a Register report (ex: 20161031_Cadastro.csv) using resume_register.sh. # Input: A configuration file, in the same format as the example. This configuration file can be generated by create_config.py. # Documentation of config.json.example: # - Variables ending with number 1 represent something from the first file, while the ones that end with number 2 represent the same thing in the second file. -# - File1 and File2 are the files that will be merged. File1 name is "*_Cadastro_Ufpr_Unique.csv", File2 name is "*_Remuneracao.csv". +# - File1 and File2 are the files that will be merged. File1 name is "*_Cadastro_Unique.csv", File2 name is "*_Remuneracao.csv". # - IdColumn1 and IdColumn2 represent the common column for each CSV (Ex: an ID column). # - Quotechar, Delimiter and LineTerminator are the CSV's quote char, delimiter and line terminator, respectively. # - OutputFile is the name of the output file (the result CSV). -# Output: A CSV that will contain every row from the second file (*_Cadastro_Ufpr_Unique.csv). From the first file (*_Remuneracao.csv), +# Output: A CSV that will contain every row from the second file (*_Cadastro_Unique.csv). From the first file (*_Remuneracao.csv), # I will get only data thats in the second file as well. This means some people in our data will not have data from Remuneracao.csv. import sys, csv, json, math, subprocess @@ -78,51 +78,32 @@ def getDataWithEmptyRow(columns, row): return newRow result = [] -count = 0 hits = 0 errors = 0 -previous = 0 -progress = 0 -const = 100 / len(csv_2) - -print("Preparing data...") # Get number of columns in file 1 columns1 = len(csv_1[0]) -# Separate id_point from useless data in file 2 and append points in result array. +# Create dictionary... +data = {} +for row in csv_1: + data[row[idPointColumn1]] = row for row2 in csv_2: - count += 1 - if(count % 10) == 0: - previous = progress - progress = math.floor(count * const) - if(progress != previous): - print(str(progress) + '% completed.') - #print(count) - # I have IdPoint. Find the correspondent one in the other csv - # and add data from file 2 to file 2. - found = False - for row1 in csv_1: - if row1[idPointColumn1] == row2[idPointColumn2]: - newRow = getDataFromRows(row1, row2) - # To make sure we wont get the same point twice. - row1[idPointColumn1] = -1; - row2[idPointColumn2] = -1; - result.append(newRow) - found = True - hits += 1 - break - if not found: + if row2[idPointColumn2] in data: + newRow = getDataFromRows(data[row2[idPointColumn2]], row2) + # To make sure we wont get the same point twice. + del data[row2[idPointColumn2]] + row2[idPointColumn2] = -1; + result.append(newRow) + hits += 1 + else: # This guy was in the second file, but not in the first one. Add him, but with null values in the second file. newRow = getDataWithEmptyRow(columns1, row2) result.append(newRow) errors += 1 -count = 0 -const = 50 / len(csv_1) - -print("Number of rows in file 2 but not in file 1: " + str(errors)) +#print("Number of rows in file 2 but not in file 1: " + str(errors)) print("Saving data...") with open(params['outputFile'], 'w', newline='') as csvfile: diff --git a/scripts/workers/resume_register.sh b/scripts/workers/resume_register.sh index ea36653..897055c 100755 --- a/scripts/workers/resume_register.sh +++ b/scripts/workers/resume_register.sh @@ -20,7 +20,7 @@ date=$2 filter=$3 input="${path}${date}_Cadastro.csv" -output="${path}${date}_Cadastro_Ufpr_Unique.csv" +output="${path}${date}_Cadastro_Unique.csv" if [ ! -d "${path}" ]; then mkdir -p "${path}" @@ -35,5 +35,7 @@ fi # Get data from all universities. # cat $input | egrep --binary-files=text "(UNIVERSIDADE FED*)" | sed -e 's/"//g' -e 's/^\|$/"/g' -e 's/\t/"\t"/g' | tr -d '\000' > $output +echo $filter + # Get only data from UFPR. -cat $input | egrep --binary-files=text "$filter" | tr -d '\000' > $output +cat "$input" | egrep --binary-files=text "$filter" | tr -d '\000' > "$output" -- GitLab