From 5f970c9e58751aa327a75eed632d1ef9b5c0a0e5 Mon Sep 17 00:00:00 2001 From: Cristian Weiland <cw14@inf.ufpr.br> Date: Thu, 16 Mar 2017 11:32:27 -0300 Subject: [PATCH] Issue #25: Fix bug getting last day of month in script and add config file The config file has the following variables: - Host: Host running ElasticSearch. Must not contain "http://". - Index: The name of the index in which data will be inserted. - Filter: The filter to get only part of Portal Transparencia's data. Scripts were getting always the last day of the current month instead of the last day of the parameter's month. Signed-off-by: Cristian Weiland <cw14@inf.ufpr.br> --- scripts/expenses/config.sh | 19 +++++++++++++++++++ scripts/expenses/create_expenses_config.py | 10 ++++++---- scripts/expenses/insert_expenses.sh | 10 +++++++--- scripts/expenses/logstash_config.example | 4 ++-- scripts/expenses/resume_expenses.sh | 17 +++++++++-------- scripts/travel_allowances/config.sh | 19 +++++++++++++++++++ .../create_travel_allowance_config.py | 10 ++++++---- .../insert_travel_allowances.sh | 11 ++++++++--- .../travel_allowances/logstash_config.example | 4 ++-- .../resume_travel_allowance.sh | 8 +++----- scripts/workers/config.sh | 19 +++++++++++++++++++ scripts/workers/create_config.py | 10 ++++++---- scripts/workers/insert_register_payment.sh | 10 +++++++--- scripts/workers/logstash_config.example | 4 ++-- scripts/workers/logstash_config_2013.example | 4 ++-- scripts/workers/merge_files_es.py | 6 +++--- scripts/workers/resume_register.sh | 13 +++++++------ 17 files changed, 127 insertions(+), 51 deletions(-) create mode 100644 scripts/expenses/config.sh create mode 100644 scripts/travel_allowances/config.sh create mode 100644 scripts/workers/config.sh diff --git a/scripts/expenses/config.sh b/scripts/expenses/config.sh new file mode 100644 index 0000000..9700889 --- /dev/null +++ b/scripts/expenses/config.sh @@ -0,0 +1,19 @@ +# This file only contains some config variables: + +# Index prefix: The 
prefix of the index in elasticsearch. Ex: gastos + +index="gastos-pagamentos" + +# Filter: The string that will be used on 'egrep' to filter data to get only relevant universities. +# Ex: Getting only UFPR: +# filter="UNIVERSIDADE FEDERAL DO PARANA" +# Getting UFPR and UFMG: +# filter="UNIVERSIDADE FEDERAL DO PARANA|UNIVERSIDADE FEDERAL DE MINAS GERAIS" +# Getting all universities: +# filter="UNIVERSIDADE FEDERAL*" + +filter="UNIVERSIDADE FEDERAL DO PARANA|UNIVERSIDADE FEDERAL DE MINAS GERAIS|UNIVERSIDADE FEDERAL DE SANTA CATARINA|UNIVERSIDADE FEDERAL DE PERNAMBUCO|UNIVERSIDADE FEDERAL DE SANTA MARIA" + +# Host: ElasticSearch's host. Ex: "localhost" + +host="localhost" diff --git a/scripts/expenses/create_expenses_config.py b/scripts/expenses/create_expenses_config.py index a40f907..0c86e6a 100755 --- a/scripts/expenses/create_expenses_config.py +++ b/scripts/expenses/create_expenses_config.py @@ -10,8 +10,8 @@ import sys, csv, json, math, subprocess from pathlib import Path from subprocess import call -if len(sys.argv) != 6: - print("Usage: " + sys.argv[0] + " <year (2016)> <month (01)> <day (31)> <username> <password>") +if len(sys.argv) != 8: + print("Usage: " + sys.argv[0] + " <year (2016)> <month (01)> <day (31)> <index> <host> <username> <password>") sys.exit() with open('logstash_config.example') as infile: @@ -19,8 +19,10 @@ with open('logstash_config.example') as infile: output = example % { "timestamp": sys.argv[3] + '/' + sys.argv[2] + '/' + sys.argv[1] + ' 00:00:00' , "date": sys.argv[1] + '-' + sys.argv[2] - , "user": sys.argv[4] - , "password": sys.argv[5] } + , "index": sys.argv[4] + , "host": sys.argv[5] + , "user": sys.argv[6] + , "password": sys.argv[7] } with open('../../configs/expenses/logstash/config-' + sys.argv[1] + '-' + sys.argv[2], 'w') as outfile: outfile.write(output) diff --git a/scripts/expenses/insert_expenses.sh b/scripts/expenses/insert_expenses.sh index a05e7a7..1878082 100755 --- a/scripts/expenses/insert_expenses.sh +++ 
b/scripts/expenses/insert_expenses.sh @@ -19,7 +19,7 @@ fi # Getting the Last day of this month (Using date 2016-05-15 as example): # First, get next month (201606). -aux=$(date +%Y%m -d "$(date +%Y%m15) next month") +aux=$(date +%Y%m -d "$(date +${1}${2}15) next month") # Append day 01 (20160601). temp=$(date -d "${aux}01") # Remove 1 day: 20160531, get only day: 31. @@ -50,9 +50,13 @@ unzip $path$ym/${1}${2}_GastosDiretos.zip -d $path$ym/ # Remove zip file rm $path$ym/${1}${2}_GastosDiretos.zip +source config.sh + # Step 2: -./create_expenses_config.py $1 $2 $day $3 $4 +./create_expenses_config.py $1 $2 $day $index $host $3 $4 # Step 3: -./resume_expenses.sh ../../data/expenses/ ${1}-${2} +./resume_expenses.sh ../../data/expenses/ ${1}-${2} "$filter" # Step 4: logstash -f ../../configs/expenses/logstash/config-${1}-${2} < ../../data/expenses/processed/${1}${2}.csv +# Data inserted, we can now remove it. +rm ../../data/expenses/processed/${1}${2}.csv diff --git a/scripts/expenses/logstash_config.example b/scripts/expenses/logstash_config.example index c5b0ee6..b1f680f 100644 --- a/scripts/expenses/logstash_config.example +++ b/scripts/expenses/logstash_config.example @@ -41,8 +41,8 @@ output { action => "index" user => "%(user)s" password => "%(password)s" - hosts => "localhost:9200" - index => "ufpr-gastos-pagamentos-%(date)s" + hosts => "http://%(host)s:9200" + index => "ufpr-%(index)s-%(date)s" workers => 1 } stdout {} diff --git a/scripts/expenses/resume_expenses.sh b/scripts/expenses/resume_expenses.sh index 7ffe50f..a0edea1 100755 --- a/scripts/expenses/resume_expenses.sh +++ b/scripts/expenses/resume_expenses.sh @@ -7,18 +7,20 @@ # Output: A CSV file in folder processed, filtering the data to get only relevant data (in our case, from UFPR).
+if [ "$#" -ne 3 ]; then + echo "Usage: $0 <path> <date> <filter>" + exit +fi + # Path example: ../../data/expenses/ path=$1 # Date example: 2016-11 date=$2 +# Filter example: UNIVERSIDADE FEDERAL DO PARANA +filter=$3 # dateWithoutHyphen example: 201611 dateWithoutHyphen=${date//-} -if [ "$#" -ne 2 ]; then - echo "Usage: $0 <path> <date>" - exit -fi - echo "Processing data with args = $path and ${date}" input="${path}${date}/${dateWithoutHyphen}_GastosDiretos.csv" @@ -27,7 +29,6 @@ output="${path}processed/${dateWithoutHyphen}.csv" # About this command: # - Grep removes everyone that does not work in UFPR. # - Tr removes null characters (ctrl + @). -# - Head -n1 gets first line (column names). Then, I append the data. -head -n1 $input > $output -cat $input | egrep --binary-files=text "UNIVERSIDADE FEDERAL DO PARANA" | tr -d '\000' >> $output +cat $input | egrep --binary-files=text "$filter" | tr -d '\000' > $output +rm $input diff --git a/scripts/travel_allowances/config.sh b/scripts/travel_allowances/config.sh new file mode 100644 index 0000000..a1897f5 --- /dev/null +++ b/scripts/travel_allowances/config.sh @@ -0,0 +1,19 @@ +# This file only contains some config variables: + +# Index prefix: The prefix of the index in elasticsearch. Ex: gastos + +index="gastos-diarias" + +# Filter: The string that will be used on 'egrep' to filter data to get only relevant universities. +# Ex: Getting only UFPR: +# filter="UNIVERSIDADE FEDERAL DO PARANA" +# Getting UFPR and UFMG: +# filter="UNIVERSIDADE FEDERAL DO PARANA|UNIVERSIDADE FEDERAL DE MINAS GERAIS" +# Getting all universities: +# filter="UNIVERSIDADE FEDERAL*" + +filter="UNIVERSIDADE FEDERAL DO PARANA|UNIVERSIDADE FEDERAL DE MINAS GERAIS|UNIVERSIDADE FEDERAL DE SANTA CATARINA|UNIVERSIDADE FEDERAL DE PERNAMBUCO|UNIVERSIDADE FEDERAL DE SANTA MARIA" + +# Host: ElasticSearch's host. 
Examples: "localhost" + +host="localhost" diff --git a/scripts/travel_allowances/create_travel_allowance_config.py b/scripts/travel_allowances/create_travel_allowance_config.py index 10e00d0..5743448 100755 --- a/scripts/travel_allowances/create_travel_allowance_config.py +++ b/scripts/travel_allowances/create_travel_allowance_config.py @@ -10,8 +10,8 @@ import sys, csv, json, math, subprocess from pathlib import Path from subprocess import call -if len(sys.argv) != 6: - print("Usage: " + sys.argv[0] + " <year (2016)> <month (01)> <day (31)> <username> <password>") +if len(sys.argv) != 8: + print("Usage: " + sys.argv[0] + " <year (2016)> <month (01)> <day (31)> <index> <host> <username> <password>") sys.exit() with open('logstash_config.example') as infile: @@ -19,8 +19,10 @@ with open('logstash_config.example') as infile: output = example % { "timestamp": sys.argv[3] + '/' + sys.argv[2] + '/' + sys.argv[1] + ' 00:00:00' , "date": sys.argv[1] + '-' + sys.argv[2] - , "user": sys.argv[4] - , "password": sys.argv[5] } + , "index": sys.argv[4] + , "host": sys.argv[5] + , "user": sys.argv[6] + , "password": sys.argv[7] } with open('../../configs/travel_allowance/logstash/config-' + sys.argv[1] + '-' + sys.argv[2], 'w') as outfile: outfile.write(output) diff --git a/scripts/travel_allowances/insert_travel_allowances.sh b/scripts/travel_allowances/insert_travel_allowances.sh index d850b38..1fb48b6 100755 --- a/scripts/travel_allowances/insert_travel_allowances.sh +++ b/scripts/travel_allowances/insert_travel_allowances.sh @@ -19,7 +19,7 @@ fi # Getting the Last day of this month (Using date 2016-05-15 as example): # First, get next month (201606). -aux=$(date +%Y%m -d "$(date +%Y%m15) next month") +aux=$(date +%Y%m -d "$(date +${1}${2}15) next month") # Append day 01 (20160601). temp=$(date -d "${aux}01") # Remove 1 day: 20160531, get only day: 31. 
@@ -30,6 +30,8 @@ dataPath="../../data/" path="../../data/travel_allowance/" configPath="../../configs/travel_allowance/logstash/" +source config.sh + if [ ! -d "$dataPath" ]; then mkdir "$dataPath" fi @@ -55,8 +57,11 @@ unzip $path$ym/${1}${2}_Diarias.zip -d $path$ym/ rm $path$ym/${1}${2}_Diarias.zip # Step 2: -./create_travel_allowance_config.py $1 $2 $day $3 $4 +./create_travel_allowance_config.py $1 $2 $day $index $host $3 $4 # Step 3: -./resume_travel_allowance.sh $path ${1}-${2} +./resume_travel_allowance.sh $path ${1}-${2} "$filter" # Step 4: logstash -f ../../configs/travel_allowance/logstash/config-${1}-${2} < ${path}processed/${1}${2}.csv + +# Remove processed file +rm ${path}processed/${1}${2}.csv diff --git a/scripts/travel_allowances/logstash_config.example b/scripts/travel_allowances/logstash_config.example index 50afa9d..20dc7e9 100644 --- a/scripts/travel_allowances/logstash_config.example +++ b/scripts/travel_allowances/logstash_config.example @@ -41,8 +41,8 @@ output { action => "index" user => "%(user)s" password => "%(password)s" - hosts => "localhost:9200" - index => "ufpr-gastos-diarias-%(date)s" + hosts => "http://%(host)s:9200" + index => "ufpr-%(index)s-%(date)s" workers => 1 } stdout {} diff --git a/scripts/travel_allowances/resume_travel_allowance.sh b/scripts/travel_allowances/resume_travel_allowance.sh index 22c302a..a64699e 100755 --- a/scripts/travel_allowances/resume_travel_allowance.sh +++ b/scripts/travel_allowances/resume_travel_allowance.sh @@ -14,8 +14,8 @@ date=$2 # dateWithoutHyphen example: 201611 dateWithoutHyphen=${date//-} -if [ "$#" -ne 2 ]; then - echo "Usage: $0 <path> <date>" +if [ "$#" -ne 3 ]; then + echo "Usage: $0 <path> <date> <filter>" exit fi @@ -27,7 +27,5 @@ output="${path}processed/${dateWithoutHyphen}.csv" # About this command: # - Grep removes everyone that does not work in UFPR. # - Tr removes null characters (ctrl + @). -# - Head -n1 gets first line (column names). Then, I append the data.
-head -n1 $input > $output -cat $input | egrep --binary-files=text "UNIVERSIDADE FEDERAL DO PARANA" | tr -d '\000' >> $output +cat $input | egrep --binary-files=text "$3" | tr -d '\000' > $output diff --git a/scripts/workers/config.sh b/scripts/workers/config.sh new file mode 100644 index 0000000..b3fe041 --- /dev/null +++ b/scripts/workers/config.sh @@ -0,0 +1,19 @@ +# This file only contains some config variables: + +# Index prefix: The prefix of the index in elasticsearch. Ex: gastos + +index="servidores" + +# Filter: The string that will be used on 'egrep' to filter data to get only relevant universities. +# Ex: Getting only UFPR: +# filter="UNIVERSIDADE FEDERAL DO PARANA" +# Getting UFPR and UFMG: +# filter="UNIVERSIDADE FEDERAL DO PARANA|UNIVERSIDADE FEDERAL DE MINAS GERAIS" +# Getting all universities: +# filter="UNIVERSIDADE FEDERAL*" + +filter="UNIVERSIDADE FEDERAL DO PARANA|UNIVERSIDADE FEDERAL DE MINAS GERAIS|UNIVERSIDADE FEDERAL DE SANTA CATARINA|UNIVERSIDADE FEDERAL DE PERNAMBUCO|UNIVERSIDADE FEDERAL DE SANTA MARIA" + +# Host: ElasticSearch's host.
Examples: "localhost" + +host="localhost" diff --git a/scripts/workers/create_config.py b/scripts/workers/create_config.py index 5e5f74f..9bc0b15 100755 --- a/scripts/workers/create_config.py +++ b/scripts/workers/create_config.py @@ -12,8 +12,8 @@ import sys, csv, json, math, subprocess from pathlib import Path from subprocess import call -if len(sys.argv) != 6: - print("Usage: " + sys.argv[0] + " <year (2016)> <month (01)> <day (31)> <username> <password>") +if len(sys.argv) != 8: + print("Usage: " + sys.argv[0] + " <year (2016)> <month (01)> <day (31)> <index> <host> <username> <password>") sys.exit() data = { @@ -41,8 +41,10 @@ else: output = example % { "timestamp": sys.argv[3] + '/' + sys.argv[2] + '/' + sys.argv[1] + ' 00:00:00' , "date": sys.argv[1] + '-' + sys.argv[2] - , "user": sys.argv[4] - , "password": sys.argv[5] } + , "index": sys.argv[4] + , "host": sys.argv[5] + , "user": sys.argv[6] + , "password": sys.argv[7] } with open('../../configs/workers/logstash/config-' + sys.argv[1] + '-' + sys.argv[2], 'w') as outfile: outfile.write(output) diff --git a/scripts/workers/insert_register_payment.sh b/scripts/workers/insert_register_payment.sh index 3b06efd..63fba59 100755 --- a/scripts/workers/insert_register_payment.sh +++ b/scripts/workers/insert_register_payment.sh @@ -26,6 +26,8 @@ ym=$1-$2 dataPath="../../data/" path="../../data/workers/" +source config.sh + # Check if Data and Workers directories already exist: if [ ! 
-d "$dataPath" ]; then mkdir "$dataPath" @@ -51,15 +53,17 @@ rm $path$ym/${1}${2}_Servidores.zip # Get day day=$(ls $path$ym | grep -m 1 $1$2 | cut -c 7,8) - # Step 2: # Create config files -./create_config.py $1 $2 $day $3 $4 +./create_config.py $1 $2 $day $index $host $3 $4 # Step 3: # Start processing -./merge_files_es.py ../../configs/workers/json/config-${1}-${2}.json +./merge_files_es.py ../../configs/workers/json/config-${1}-${2}.json "$filter" # Step 4: # Insert data in ElasticSearch logstash -f ../../configs/workers/logstash/config-${1}-${2} < ../../data/workers/processed/${1}${2}.csv + +# Remove data +rm ../../data/workers/processed/${1}${2}.csv diff --git a/scripts/workers/logstash_config.example b/scripts/workers/logstash_config.example index 568ac5a..0383a39 100644 --- a/scripts/workers/logstash_config.example +++ b/scripts/workers/logstash_config.example @@ -58,8 +58,8 @@ output { action => "index" user => "%(user)s" password => "%(password)s" - hosts => "http://node1.c3sl.ufpr.br:9200" - index => "ufpr-servidores-%(date)s" + hosts => "http://%(host)s:9200" + index => "ufpr-%(index)s-%(date)s" workers => 1 } stdout {} diff --git a/scripts/workers/logstash_config_2013.example b/scripts/workers/logstash_config_2013.example index f8c9031..b7af5c0 100644 --- a/scripts/workers/logstash_config_2013.example +++ b/scripts/workers/logstash_config_2013.example @@ -58,8 +58,8 @@ output { action => "index" user => "%(user)s" password => "%(password)s" - hosts => "http://node1.c3sl.ufpr.br:9200" - index => "ufpr-servidores-%(date)s" + hosts => "http://%(host)s:9200" + index => "ufpr-%(index)s-%(date)s" workers => 1 } stdout {} diff --git a/scripts/workers/merge_files_es.py b/scripts/workers/merge_files_es.py index 6dccc24..68b3175 100755 --- a/scripts/workers/merge_files_es.py +++ b/scripts/workers/merge_files_es.py @@ -22,8 +22,8 @@ import sys, csv, json, math, subprocess from pathlib import Path from subprocess import call -if len(sys.argv) != 2: - print("Usage: "
+ sys.argv[0] + " <config.json>") +if len(sys.argv) != 3: + print("Usage: " + sys.argv[0] + " <config.json> <filter>") sys.exit() with open(sys.argv[1]) as f: @@ -48,7 +48,7 @@ title1 = csv_1.pop(0) file_exists = Path(file2) if not file_exists.is_file(): print("File2 does not exist. Calling script resume_register to create it...") - call(["./resume_register.sh " + params['path'] + " " + params['date']], shell=True) + call(["./resume_register.sh", params['path'], params['date'], sys.argv[2]]) with open(file2, newline='', encoding='Windows-1252') as f: csv_2 = [ i for i in csv.reader(f, 'dialect') ] diff --git a/scripts/workers/resume_register.sh b/scripts/workers/resume_register.sh index eb5cf9d..71f132c 100755 --- a/scripts/workers/resume_register.sh +++ b/scripts/workers/resume_register.sh @@ -10,14 +10,15 @@ # Output: CSV file named YearMonthDay_Cadastro_Ufpr_Unique.csv, in the $path folder. # Example of CSV location (using same parameters as input): ../../data/workers/2016-10/20161031_Cadastro_Ufpr_Unique.csv -path=$1 -date=$2 - -if [ "$#" -ne 2 ]; then - echo "Usage: $0 <path> <date>" +if [ "$#" -ne 3 ]; then + echo "Usage: $0 <path> <date> <filter>" exit fi +path=$1 +date=$2 +filter=$3 + echo "Processing data with args = ${path} and ${date}" input="${path}${date}_Cadastro.csv" @@ -36,4 +37,4 @@ columns="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27 # cat $input | egrep --binary-files=text "(UNIVERSIDADE FED*|Id_SERVIDOR_PORTAL NOME)" | sed -e 's/"//g' -e 's/^\|$/"/g' -e 's/\t/"\t"/g' | tr -d '\000' > $output # Get only data from UFPR. -cat $input | egrep --binary-files=text "(UNIVERSIDADE FEDERAL DO PARANA|Id_SERVIDOR_PORTAL NOME)" | tr -d '\000' > $output +cat $input | egrep --binary-files=text "$filter" | tr -d '\000' > $output -- GitLab