diff --git a/scripts/expenses/config.sh b/scripts/expenses/config.sh
index dc2f2d05ec51a732278ab37cd30d6b8d7de443ea..cc0a16b6103b351c7cff4591b9c43c10e5fa9eb8 100644
--- a/scripts/expenses/config.sh
+++ b/scripts/expenses/config.sh
@@ -2,51 +2,18 @@

 # Index prefix: The prefix of the index in elasticsearch. Ex: gastos

-index="mec-gastos-pagamentos"
+index="despesas-pagamentos"

-# Filter: An associative array that will be used on 'egrep' to filter data to get only relevant universities.
-# The key must be the university initials and the value should be the university name (equal to its name in Portal Transparencia's csv!).
+# ColumnId: The number of the column from the CSV that we will use to filter data.
+
+columnId=2
+
+# Filter: An associative array that will be used to filter data. The key should be the initials, and they will be used to generate the index name.
+# The value should be the same as in the CSV, since it will be used to match data.

 declare -A filter
 filter=(
-    [ufal]="UNIVERSIDADE FEDERAL DE ALAGOAS"
-    [ufba]="UNIVERSIDADE FEDERAL DA BAHIA"
-    [ufc]="UNIVERSIDADE FEDERAL DO CEARA"
-    [ufes]="UNIVERSIDADE FEDERAL DO ESPIRITO SANTO"
-    [ufg]="UNIVERSIDADE FEDERAL DE GOIAS"
-    [uff]="UNIVERSIDADE FEDERAL FLUMINENSE"
-    [ufjf]="UNIVERSIDADE FEDERAL DE JUIZ DE FORA"
-    [ufmg]="UNIVERSIDADE FEDERAL DE MINAS GERAIS"
-    [ufpa]="UNIVERSIDADE FEDERAL DO PARA"
-    [ufpb]="UNIVERSIDADE FEDERAL DA PARAIBA"
-    [ufpr]="UNIVERSIDADE FEDERAL DO PARANA"
-    [ufpe]="UNIVERSIDADE FEDERAL DE PERNAMBUCO"
-    [ufrn]="UNIVERSIDADE FEDERAL DO RIO GRANDE DO NORTE"
-    [ufrgs]="UNIVERSIDADE FEDERAL DO RIO GRANDE DO SUL"
-    [ufrj]="UNIVERSIDADE FEDERAL DO RIO DE JANEIRO"
-    [ufsc]="UNIVERSIDADE FEDERAL DE SANTA CATARINA"
-    [ufsm]="UNIVERSIDADE FEDERAL DE SANTA MARIA"
-    [ufrpe]="UNIVERSIDADE FEDERAL RURAL DE PERNAMBUCO"
-    [ufrrj]="UNIVERSIDADE FEDERAL RURAL DO RIO DE JANEIRO"
-    [ufrr]="UNIVERSIDADE FEDERAL DE RORAIMA"
-    [ufcg]="UNIVERSIDADE FEDERAL DE CAMPINA GRANDE"
-    [ufra]="UNIVERSIDADE FEDERAL RURAL DA AMAZONIA"
-    [uftm]="UNIVERSIDADE FEDERAL DO TRIANGULO MINEIRO"
-    [ufvjm]="UNIVERSIDADE FED.VALES JEQUITINHONHA E MUCURI"
-    [utfpr]="UNIVERSIDADE TECNOLOGICA FEDERAL DO PARANA"
-    [unifal]="UNIVERSIDADE FEDERAL DE ALFENAS"
-    [unifei]="UNIVERSIDADE FEDERAL DE ITAJUBA - MG"
-    [unifesp]="UNIVERSIDADE FEDERAL DE SÃO PAULO"
-    [ufla]="UNIVERSIDADE FEDERAL DE LAVRAS"
-    [ufersa]="UNIVERSIDADE FEDERAL RURAL DO SEMI-ARIDO"
-    [unirio]="UNIVERSIDADE FEDERAL DO ESTADO RIO DE JANEIRO"
-    [furg]="UNIVERSIDADE FEDERAL DO RIO GRANDE - FURG"
-    [ufrb]="UNIVERSIDADE FEDERAL DO RECONCAVO DA BAHIA"
-    [uffs]="UNIVERSIDADE FEDERAL DA FRONTEIRA SUL"
-    [ufopa]="UNIVERSIDADE FEDERAL DO OESTE DO PARA"
-    [ufob]="UNIVERSIDADE FEDERAL DO OESTE DA BAHIA - UFOB"
-    [ufca]="UNIVERSIDADE FEDERAL DO CARIRI - UFCA"
-    [ufsb]="UNIVERSIDADE FEDERAL DO SUL DA BAHIA - UFESBA"
+    [mec]="MINISTERIO DA EDUCACAO"
 )
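+
+# For example, to index a second entity alongside MEC, add one more entry.
+# (The value below is hypothetical; it must match the CSV spelling exactly.)
+#   filter[fnde]="FUNDO NACIONAL DE DESENVOLVIMENTO DA EDUCACAO"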
Ex: "localhost" diff --git a/scripts/expenses/create_expenses_config.py b/scripts/expenses/create_expenses_config.py index be852fbda4dbb055eda88afce2c510badf10ae76..2a4c105255a087e69f97aebdace21efba3a2ca5d 100755 --- a/scripts/expenses/create_expenses_config.py +++ b/scripts/expenses/create_expenses_config.py @@ -10,8 +10,8 @@ import sys, csv, json, math, subprocess from pathlib import Path from subprocess import call -if len(sys.argv) != 9: - print("Usage: " + sys.argv[0] + " <year (2016)> <month (01)> <day (31)> <index> <host> <university> <username> <password>") +if len(sys.argv) != 10: + print("Usage: " + sys.argv[0] + " <year (2016)> <month (01)> <day (31)> <index> <host> <entity> <username> <password> <path>") sys.exit() with open('logstash_config.example') as infile: @@ -19,11 +19,11 @@ with open('logstash_config.example') as infile: output = example % { "timestamp": sys.argv[3] + '/' + sys.argv[2] + '/' + sys.argv[1] + ' 00:00:00' , "date": sys.argv[1] + '-' + sys.argv[2] - , "index": sys.argv[4] + , "index": sys.argv[4] + '-' + sys.argv[6] , "host": sys.argv[5] - , "university": sys.argv[6] , "user": sys.argv[7] , "password": sys.argv[8] } -with open('../../configs/expenses/logstash/config-' + sys.argv[1] + '-' + sys.argv[2], 'w') as outfile: +date = sys.argv[1] + '-' + sys.argv[2] +with open(sys.argv[9] + '/config-' + date, 'w') as outfile: outfile.write(output) diff --git a/scripts/expenses/insert_expenses.sh b/scripts/expenses/insert_expenses.sh index 347bc4814e4cdd25e3436bb0361a87fe9110eda3..d035cb614547cd8e551e198e2ec9e0db9335d8cc 100755 --- a/scripts/expenses/insert_expenses.sh +++ b/scripts/expenses/insert_expenses.sh @@ -44,41 +44,34 @@ temp=$(date -d "${aux}01") day=$(date -d "$temp - 1 day" "+%d") ym=$1-$2 -dataPath="../../data/" -path="../../data/expenses/" -configPath="../../configs/expenses/logstash/" - -if [ ! -d "$path" ]; then - mkdir -p "$path" -fi -if [ ! 
-d "$configPath" ]; then - mkdir -p "$configPath" -fi +path="./tmp_$ym" # Step 1: # Create directory to store files -mkdir -p $path$ym +mkdir -p "$path" # Download files request='http://arquivos.portaldatransparencia.gov.br/downloads.asp?a='${1}'&m='${2}'&consulta=GastosDiretos' -curl -o $path$ym/${1}${2}_GastosDiretos.zip $request -H 'Accept-Encoding: gzip, deflate, sdch' -H 'Accept-Language: en-US,en;q=0.8' -H 'Upgrade-Insecure-Requests: 1' -H 'User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' -H 'Referer: http://transparencia.gov.br/downloads/mensal.asp?c=GastosDiretos' -H 'Cookie: ASPSESSIONIDAQRABSAD=OJDLNBCANLIDINCHJHELHHFB; ASPSESSIONIDAQSDCQAD=BOKBKPNCDKOBJKGAMMEKADFL; _ga=GA1.3.1927288562.1481545643; ASPSESSIONIDSCSBBTCD=IGJLJBBCEEJBGLOOJKGNMHBH' -H 'Connection: keep-alive' --compressed +curl -o $path/${1}${2}_GastosDiretos.zip $request -H 'Accept-Encoding: gzip, deflate, sdch' -H 'Accept-Language: en-US,en;q=0.8' -H 'Upgrade-Insecure-Requests: 1' -H 'User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' -H 'Referer: http://transparencia.gov.br/downloads/mensal.asp?c=GastosDiretos' -H 'Cookie: ASPSESSIONIDAQRABSAD=OJDLNBCANLIDINCHJHELHHFB; ASPSESSIONIDAQSDCQAD=BOKBKPNCDKOBJKGAMMEKADFL; _ga=GA1.3.1927288562.1481545643; ASPSESSIONIDSCSBBTCD=IGJLJBBCEEJBGLOOJKGNMHBH' -H 'Connection: keep-alive' --compressed # Unzip them -unzip -o $path$ym/${1}${2}_GastosDiretos.zip -d $path$ym/ +unzip -o $path/${1}${2}_GastosDiretos.zip -d $path/ # Remove zip file -rm $path$ym/${1}${2}_GastosDiretos.zip +rm $path/${1}${2}_GastosDiretos.zip for key in "${!filter[@]}" do # Step 2: - ./create_expenses_config.py $1 $2 "$day" "$index" "$host" "$key" $3 $4 + ./create_expenses_config.py $1 $2 "$day" "$index" "$host" "$key" $3 $4 "${path}" # Step 3: - ./resume_expenses.sh "${path}" ${1}-${2} "${filter[$key]}" + ./resume_expenses.sh "${path}" ${1}-${2} "${filter[$key]}" "${columnId}" # Step 4: - logstash -f ../../configs/expenses/logstash/config-${1}-${2} < ../../data/expenses/processed/${1}${2}.csv + logstash -f ${path}/config-${1}-${2} < ${path}/${1}${2}.csv # Data inserted, we can now remove it. - rm ../../data/expenses/processed/${1}${2}.csv + rm ${path}/${1}${2}.csv + rm ${path}/config-${1}-${2} done -rm $path${1}-${2}/${1}${2}_GastosDiretos.csv +rm $path/${1}${2}_GastosDiretos.csv +rmdir $path diff --git a/scripts/expenses/resume_expenses.sh b/scripts/expenses/resume_expenses.sh index de61364980f7b35746a9406ad01b69266c0810b4..0386d67166437e9599683e6aa703ae24a722388f 100755 --- a/scripts/expenses/resume_expenses.sh +++ b/scripts/expenses/resume_expenses.sh @@ -1,35 +1,22 @@ #!/bin/bash -# WARNING: This script should not be called directly. Look at 'insert_expenses.sh' before calling this script. - -# Input: First parameter is the path to data files and the second one is the date in the name of the files. Data files can be found in: http://transparencia.gov.br/downloads/mensal.asp?c=GastosDiretos -# Example: ./resume_expenses.sh ../../data/expenses/ 2016-11 - -# Output: A CSV file in folder processed, filtering the data to get only relevant data (in our case, from UFPR). 
- -if [ "$#" -ne 3 ]; then - echo "Usage: $0 <path> <date> <filter>" - exit +if [ "$#" -ne 4 ]; then + echo "Usage $0 <path> <date> <filter> <column-id>" + echo "Example: $0 ./tmp_201612 201612 MEC 2" + exit fi -# Path example: ../../data/expenses/ path=$1 -# Date example: 2016-11 date=$2 -# Filter example: UNIVERSIDADE FEDERAL DO PARANA filter=$3 -# dateWithoutHyphen example: 201611 -dateWithoutHyphen=${date//-} +columnId=$4 -input="${path}${date}/${dateWithoutHyphen}_GastosDiretos.csv" -output="${path}processed/${dateWithoutHyphen}.csv" - -if [ ! -d "${path}processsed" ]; then - mkdir -p "${path}processed" -fi +dateWithoutHyphen=${date//-} +cmd="\$$columnId == \"${filter}\"" -# About this command: -# - Grep removes irrelevant data. -w option forces to match the whole word, to avoid "UNIVERSIDADE FEDERAL DO PARA" from matching with "UNIVERSIDADE FEDERAL DO PARANA" -# - Tr removes null characters (ctrl + @). +# Input will probably look like: ./tmp_201612/201612_GastosDiretos.csv +input="${path}/${dateWithoutHyphen}_GastosDiretos.csv" +# Output will probably look like: ./tmp_201612/201612.csv +output="${path}/${dateWithoutHyphen}.csv" -cat "$input" | egrep -w --binary-files=text "$filter" | tr -d '\000' > "$output" +cat "${input}" | awk -F $'\t' "$cmd" > "$output" diff --git a/scripts/travel_allowances/config.sh b/scripts/travel_allowances/config.sh index 48598caf9f4aa84bceda5c028c87d07a55deba02..645a7315971f24af2ee7690495bc17314e19a17b 100644 --- a/scripts/travel_allowances/config.sh +++ b/scripts/travel_allowances/config.sh @@ -2,53 +2,20 @@ # Index prefix: The prefix of the index in elasticsearch. Ex: gastos -index="mec-gastos-diarias" +index="despesas-diarias" -# Filter: An associative array that will be used on 'egrep' to filter data to get only relevant universities. -# The key must be the university initials and the value should be the university name (equal to its name in Portal Transparencia's csv!). +# ColumnId: The number of the column from the CSV that we will use to filter data. + +columnId=2 + +# Filter: An associative array that will be used to filter data. The key should be the initials, and they will be used to generate the index name. +# The value should be the same as in the CSV, since it will be used to match data. 

 declare -A filter
 filter=(
-    [ufal]="UNIVERSIDADE FEDERAL DE ALAGOAS"
-    [ufba]="UNIVERSIDADE FEDERAL DA BAHIA"
-    [ufc]="UNIVERSIDADE FEDERAL DO CEARA"
-    [ufes]="UNIVERSIDADE FEDERAL DO ESPIRITO SANTO"
-    [ufg]="UNIVERSIDADE FEDERAL DE GOIAS"
-    [uff]="UNIVERSIDADE FEDERAL FLUMINENSE"
-    [ufjf]="UNIVERSIDADE FEDERAL DE JUIZ DE FORA"
-    [ufmg]="UNIVERSIDADE FEDERAL DE MINAS GERAIS"
-    [ufpa]="UNIVERSIDADE FEDERAL DO PARA"
-    [ufpb]="UNIVERSIDADE FEDERAL DA PARAIBA"
-    [ufpr]="UNIVERSIDADE FEDERAL DO PARANA"
-    [ufpe]="UNIVERSIDADE FEDERAL DE PERNAMBUCO"
-    [ufrn]="UNIVERSIDADE FEDERAL DO RIO GRANDE DO NORTE"
-    [ufrgs]="UNIVERSIDADE FEDERAL DO RIO GRANDE DO SUL"
-    [ufrj]="UNIVERSIDADE FEDERAL DO RIO DE JANEIRO"
-    [ufsc]="UNIVERSIDADE FEDERAL DE SANTA CATARINA"
-    [ufsm]="UNIVERSIDADE FEDERAL DE SANTA MARIA"
-    [ufrpe]="UNIVERSIDADE FEDERAL RURAL DE PERNAMBUCO"
-    [ufrrj]="UNIVERSIDADE FEDERAL RURAL DO RIO DE JANEIRO"
-    [ufrr]="UNIVERSIDADE FEDERAL DE RORAIMA"
-    [ufcg]="UNIVERSIDADE FEDERAL DE CAMPINA GRANDE"
-    [ufra]="UNIVERSIDADE FEDERAL RURAL DA AMAZONIA"
-    [uftm]="UNIVERSIDADE FEDERAL DO TRIANGULO MINEIRO"
-    [ufvjm]="UNIVERSIDADE FED.VALES JEQUITINHONHA E MUCURI"
-    [utfpr]="UNIVERSIDADE TECNOLOGICA FEDERAL DO PARANA"
-    [unifal]="UNIVERSIDADE FEDERAL DE ALFENAS"
-    [unifei]="UNIVERSIDADE FEDERAL DE ITAJUBA - MG"
-    [unifesp]="UNIVERSIDADE FEDERAL DE SÃO PAULO"
-    [ufla]="UNIVERSIDADE FEDERAL DE LAVRAS"
-    [ufersa]="UNIVERSIDADE FEDERAL RURAL DO SEMI-ARIDO"
-    [unirio]="UNIVERSIDADE FEDERAL DO ESTADO RIO DE JANEIRO"
-    [furg]="UNIVERSIDADE FEDERAL DO RIO GRANDE - FURG"
-    [ufrb]="UNIVERSIDADE FEDERAL DO RECONCAVO DA BAHIA"
-    [uffs]="UNIVERSIDADE FEDERAL DA FRONTEIRA SUL"
-    [ufopa]="UNIVERSIDADE FEDERAL DO OESTE DO PARA"
-    [ufob]="UNIVERSIDADE FEDERAL DO OESTE DA BAHIA - UFOB"
-    [ufca]="UNIVERSIDADE FEDERAL DO CARIRI - UFCA"
-    [ufsb]="UNIVERSIDADE FEDERAL DO SUL DA BAHIA - UFESBA"
+    [mec]="MINISTERIO DA EDUCACAO"
 )

-# Host: ElasticSearch's host. Examples: "localhost"
+# Host: ElasticSearch's host. Ex: "localhost"
Ex: "localhost" host="localhost" diff --git a/scripts/travel_allowances/create_travel_allowance_config.py b/scripts/travel_allowances/create_travel_allowance_config.py index 11f340ff3236574a4148f20284fd36461905e902..e1a283e7af7caedb9c4fcfdb553555480a8eacfc 100755 --- a/scripts/travel_allowances/create_travel_allowance_config.py +++ b/scripts/travel_allowances/create_travel_allowance_config.py @@ -10,8 +10,8 @@ import sys, csv, json, math, subprocess from pathlib import Path from subprocess import call -if len(sys.argv) != 9: - print("Usage: " + sys.argv[0] + " <year (2016)> <month (01)> <day (31)> <index> <host> <university> <username> <password>") +if len(sys.argv) != 10: + print("Usage: " + sys.argv[0] + " <year (2016)> <month (01)> <day (31)> <index> <host> <entity> <username> <password> <path>") sys.exit() with open('logstash_config.example') as infile: @@ -19,11 +19,11 @@ with open('logstash_config.example') as infile: output = example % { "timestamp": sys.argv[3] + '/' + sys.argv[2] + '/' + sys.argv[1] + ' 00:00:00' , "date": sys.argv[1] + '-' + sys.argv[2] - , "index": sys.argv[4] + , "index": sys.argv[4] + '-' + sys.argv[6] , "host": sys.argv[5] - , "university": sys.argv[6] , "user": sys.argv[7] , "password": sys.argv[8] } -with open('../../configs/travel_allowance/logstash/config-' + sys.argv[1] + '-' + sys.argv[2], 'w') as outfile: +date = sys.argv[1] + '-' + sys.argv[2] +with open(sys.argv[9] + '/config-' + date, 'w') as outfile: outfile.write(output) diff --git a/scripts/travel_allowances/insert_travel_allowances.sh b/scripts/travel_allowances/insert_travel_allowances.sh index fe855f92a0484ad2a870643b5e62ed1f0ae49d9f..5e4e7adfba41c9ec7be093f4b5c5bd31a298cd58 100755 --- a/scripts/travel_allowances/insert_travel_allowances.sh +++ b/scripts/travel_allowances/insert_travel_allowances.sh @@ -17,8 +17,9 @@ if [ "$#" -ne 4 ]; then exit fi -source config.sh +source ./config.sh +# Check if all variables in config file are set: if [ -z ${index+x} ]; then echo "Var 'index' is unset. Set it in file 'scripts/travel_allowance/config.sh'."; exit; @@ -27,6 +28,7 @@ if [ -z ${host+x} ]; then echo "Var 'host' is unset. Set it in file 'scripts/travel_allowance/config.sh'."; exit; fi + size=${#filter[@]} if [ "$size" -lt 1 ]; then echo "Var 'filter' is unset. Set it in file 'scripts/expenses/config.sh'."; @@ -42,39 +44,34 @@ temp=$(date -d "${aux}01") day=$(date -d "$temp - 1 day" "+%d") ym=$1-$2 -dataPath="../../data/" -path="../../data/travel_allowance/" -configPath="../../configs/travel_allowance/logstash/" - -if [ ! -d "$path/processed" ]; then - mkdir -p "$path/processed" -fi -if [ ! 
-d "$configPath" ]; then - mkdir -p "$configPath" -fi +path="./tmp_$ym" # Step 1: # Create directory to store files -mkdir -p $path$ym +mkdir -p "$path" # Download files request='http://arquivos.portaldatransparencia.gov.br/downloads.asp?a='${1}'&m='${2}'&consulta=Diarias' -curl $request -H 'Accept-Encoding: gzip, deflate, sdch' -H 'Accept-Language: en-US,en;q=0.8' -H 'Upgrade-Insecure-Requests: 1' -H 'User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' -H 'Referer: http://transparencia.gov.br/downloads/mensal.asp?c=GastosDiretos' -H 'Cookie: ASPSESSIONIDAQRABSAD=OJDLNBCANLIDINCHJHELHHFB; ASPSESSIONIDAQSDCQAD=BOKBKPNCDKOBJKGAMMEKADFL; _ga=GA1.3.1927288562.1481545643; ASPSESSIONIDSCSBBTCD=IGJLJBBCEEJBGLOOJKGNMHBH' -H 'Connection: keep-alive' --compressed > $path$ym/${1}${2}_Diarias.zip +curl $request -H 'Accept-Encoding: gzip, deflate, sdch' -H 'Accept-Language: en-US,en;q=0.8' -H 'Upgrade-Insecure-Requests: 1' -H 'User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' -H 'Referer: http://transparencia.gov.br/downloads/mensal.asp?c=GastosDiretos' -H 'Cookie: ASPSESSIONIDAQRABSAD=OJDLNBCANLIDINCHJHELHHFB; ASPSESSIONIDAQSDCQAD=BOKBKPNCDKOBJKGAMMEKADFL; _ga=GA1.3.1927288562.1481545643; ASPSESSIONIDSCSBBTCD=IGJLJBBCEEJBGLOOJKGNMHBH' -H 'Connection: keep-alive' --compressed > $path/${1}${2}_Diarias.zip # Unzip them -unzip -o $path$ym/${1}${2}_Diarias.zip -d $path$ym/ +unzip -o $path/${1}${2}_Diarias.zip -d $path/ # Remove zip file -rm $path$ym/${1}${2}_Diarias.zip +rm $path/${1}${2}_Diarias.zip for key in "${!filter[@]}" do # Step 2: - ./create_travel_allowance_config.py $1 $2 "$day" "$index" "$host" "$key" $3 $4 + ./create_travel_allowance_config.py $1 $2 "$day" "$index" "$host" "$key" $3 $4 "${path}" # Step 3: - ./resume_travel_allowance.sh "$path" ${1}-${2} "${filter[$key]}" + ./resume_travel_allowance.sh "$path" ${1}-${2} "${filter[$key]}" "${columnId}" # Step 4: - logstash -f ../../configs/travel_allowance/logstash/config-${1}-${2} < ${path}processed/${1}${2}.csv + logstash -f ${path}/config-${1}-${2} < ${path}/${1}${2}.csv # Remove processed file - rm ${path}processed/${1}${2}.csv + rm ${path}/${1}${2}.csv + rm ${path}/config-${1}-${2} done + +rm $path/${1}${2}_Diarias.csv +rmdir $path diff --git a/scripts/travel_allowances/resume_travel_allowance.sh b/scripts/travel_allowances/resume_travel_allowance.sh index 50afa4c95fe7099b4c93f4a6f4b0ae5e9ae0c558..fc15edc6df115f39462c57b0992e406365ef9d26 100755 --- a/scripts/travel_allowances/resume_travel_allowance.sh +++ b/scripts/travel_allowances/resume_travel_allowance.sh @@ -1,34 +1,22 @@ #!/bin/bash -# WARNING: This script should not be called directly. Look at 'insert_travel_allowance.sh' before calling this script. - -# Input: First parameter is the path to data files and the second one is the date in the name of the files. Data files can be found in: http://transparencia.gov.br/downloads/mensal.asp?c=Diarias -# Example: ./resume_travel_allowance.sh ../../data/travel_allowance/ 2016-11 - -# Output: A CSV file in folder processed, filtering the data to get only relevant data (in our case, from UFPR). 
+if [ "$#" -ne 4 ]; then + echo "Usage $0 <path> <date> <filter> <column-id>" + echo "Example: $0 ./tmp_201612 201612 MEC 2" + exit +fi -# Path example: ../../data/travel_allowance/ path=$1 -# Date example: 2016-11 date=$2 -# dateWithoutHyphen example: 201611 -dateWithoutHyphen=${date//-} filter=$3 +columnId=$4 -if [ "$#" -ne 3 ]; then - echo "Usage: $0 <path> <date> <filter>" - exit -fi - -input="${path}${date}/${dateWithoutHyphen}_Diarias.csv" -output="${path}processed/${dateWithoutHyphen}.csv" - -if [ ! -d "${path}processsed" ]; then - mkdir -p "${path}processed" -fi +dateWithoutHyphen=${date//-} +cmd="\$$columnId == \"${filter}\"" -# About this command: -# - Grep removes irrelevant data. -w option forces to match the whole word, to avoid "UNIVERSIDADE FEDERAL DO PARA" from matching with "UNIVERSIDADE FEDERAL DO PARANA" -# - Tr removes null characters (ctrl + @). +# Input will probably look like: ./tmp_201612/201612_GastosDiretos.csv +input="${path}/${dateWithoutHyphen}_Diarias.csv" +# Output will probably look like: ./tmp_201612/201612.csv +output="${path}/${dateWithoutHyphen}.csv" -cat "$input" | egrep -w --binary-files=text "$filter" | tr -d '\000' > "$output" +cat "${input}" | awk -F $'\t' "$cmd" > "$output" diff --git a/scripts/workers/config.json.example b/scripts/workers/config.json.example index 01a2d48ca65308ac21226077ced011399a3f074d..473c070363517683de76c76634751eb45a65a2f6 100644 --- a/scripts/workers/config.json.example +++ b/scripts/workers/config.json.example @@ -1,5 +1,5 @@ { - "path": "Dados_Servidores/2016-10/" + "path": "tmp_2016-10/" , "date": "20161031" , "file1" : "_Remuneracao.csv" , "file2" : "_Cadastro_Unique.csv" @@ -8,5 +8,5 @@ , "quotechar": "\"" , "delimiter": "\t" , "lineterminator": "\n" - , "outputFile": "Dados_Servidores/Processados/201610.csv" + , "outputFile": "tmp_/201610.csv" } diff --git a/scripts/workers/config.sh b/scripts/workers/config.sh index ba98213d81b4247d646dbc12b222c2860fa301cb..3d5d472c73be9b0d95fc796151fcdc00a6e36863 100644 --- a/scripts/workers/config.sh +++ b/scripts/workers/config.sh @@ -2,51 +2,18 @@ # Index prefix: The prefix of the index in elasticsearch. Ex: gastos -index="mec-servidores" +index="servidores" -# Filter: An associative array that will be used on 'egrep' to filter data to get only relevant universities. -# The key must be the university initials and the value should be the university name (equal to its name in Portal Transparencia's csv!). +# ColumnName: The name of the column from the CSV that we will use to filter data. + +columnName="ORGSUP_LOTACAO" + +# Filter: An associative array that will be used to filter data. The key should be the initials, and they will be used to generate the index name. +# The value should be the same as in the CSV, since it will be used to match data. 
+
+# Filter: An associative array that will be used to filter data. The key should be the initials, and they will be used to generate the index name.
+# The value should be the same as in the CSV, since it will be used to match data.

 declare -A filter
 filter=(
-    [ufal]="UNIVERSIDADE FEDERAL DE ALAGOAS"
-    [ufba]="UNIVERSIDADE FEDERAL DA BAHIA"
-    [ufc]="UNIVERSIDADE FEDERAL DO CEARA"
-    [ufes]="UNIVERSIDADE FEDERAL DO ESPIRITO SANTO"
-    [ufg]="UNIVERSIDADE FEDERAL DE GOIAS"
-    [uff]="UNIVERSIDADE FEDERAL FLUMINENSE"
-    [ufjf]="UNIVERSIDADE FEDERAL DE JUIZ DE FORA"
-    [ufmg]="UNIVERSIDADE FEDERAL DE MINAS GERAIS"
-    [ufpa]="UNIVERSIDADE FEDERAL DO PARA"
-    [ufpb]="UNIVERSIDADE FEDERAL DA PARAIBA"
-    [ufpr]="UNIVERSIDADE FEDERAL DO PARANA"
-    [ufpe]="UNIVERSIDADE FEDERAL DE PERNAMBUCO"
-    [ufrn]="UNIVERSIDADE FEDERAL DO RIO GRANDE DO NORTE"
-    [ufrgs]="UNIVERSIDADE FEDERAL DO RIO GRANDE DO SUL"
-    [ufrj]="UNIVERSIDADE FEDERAL DO RIO DE JANEIRO"
-    [ufsc]="UNIVERSIDADE FEDERAL DE SANTA CATARINA"
-    [ufsm]="UNIVERSIDADE FEDERAL DE SANTA MARIA"
-    [ufrpe]="UNIVERSIDADE FEDERAL RURAL DE PERNAMBUCO"
-    [ufrrj]="UNIVERSIDADE FEDERAL RURAL DO RIO DE JANEIRO"
-    [ufrr]="UNIVERSIDADE FEDERAL DE RORAIMA"
-    [ufcg]="UNIVERSIDADE FEDERAL DE CAMPINA GRANDE"
-    [ufra]="UNIVERSIDADE FEDERAL RURAL DA AMAZONIA"
-    [uftm]="UNIVERSIDADE FEDERAL DO TRIANGULO MINEIRO"
-    [ufvjm]="UNIVERSIDADE FED.VALES JEQUITINHONHA E MUCURI"
-    [utfpr]="UNIVERSIDADE TECNOLOGICA FEDERAL DO PARANA"
-    [unifal]="UNIVERSIDADE FEDERAL DE ALFENAS"
-    [unifei]="UNIVERSIDADE FEDERAL DE ITAJUBA - MG"
-    [unifesp]="UNIVERSIDADE FEDERAL DE SÃO PAULO"
-    [ufla]="UNIVERSIDADE FEDERAL DE LAVRAS"
-    [ufersa]="UNIVERSIDADE FEDERAL RURAL DO SEMI-ARIDO"
-    [unirio]="UNIVERSIDADE FEDERAL DO ESTADO RIO DE JANEIRO"
-    [furg]="UNIVERSIDADE FEDERAL DO RIO GRANDE - FURG"
-    [ufrb]="UNIVERSIDADE FEDERAL DO RECONCAVO DA BAHIA"
-    [uffs]="UNIVERSIDADE FEDERAL DA FRONTEIRA SUL"
-    [ufopa]="UNIVERSIDADE FEDERAL DO OESTE DO PARA"
-    [ufob]="UNIVERSIDADE FEDERAL DO OESTE DA BAHIA - UFOB"
-    [ufca]="UNIVERSIDADE FEDERAL DO CARIRI - UFCA"
-    [ufsb]="UNIVERSIDADE FEDERAL DO SUL DA BAHIA - UFESBA"
+    [mec]="MINISTERIO DA EDUCACAO"
 )

 # Host: ElasticSearch's host. Examples: "localhost"
Examples: "localhost" diff --git a/scripts/workers/create_config.py b/scripts/workers/create_config.py index 3206c52678e9854ed972db6a7e6ea3de01b42dd8..85ef0ad38e6eb9c0532812278196f4f662e3206c 100755 --- a/scripts/workers/create_config.py +++ b/scripts/workers/create_config.py @@ -12,12 +12,12 @@ import sys, csv, json, math, subprocess from pathlib import Path from subprocess import call -if len(sys.argv) != 9: - print("Usage: " + sys.argv[0] + " <year (2016)> <month (01)> <day (31)> <index> <host> <university> <username> <password>") +if len(sys.argv) != 10: + print("Usage: " + sys.argv[0] + " <year (2016)> <month (01)> <day (31)> <index> <host> <entity> <username> <password> <path>") sys.exit() data = { - "path": "../../data/workers/" + sys.argv[1] + "-" + sys.argv[2] + "/" + "path": sys.argv[9] , "date": sys.argv[1] + sys.argv[2] + sys.argv[3] , "file1": "_Remuneracao.csv" , "file2": "_Cadastro_Unique.csv" @@ -26,10 +26,10 @@ data = { , "quotechar": "\"" , "delimiter": "\t" , "lineterminator": "\n" - , "outputFile": "../../data/workers/processed/" + sys.argv[1] + sys.argv[2] + ".csv" + , "outputFile": sys.argv[9] + '/' + sys.argv[1] + sys.argv[2] + sys.argv[3] + ".csv" } -with open('../../configs/workers/json/config-' + sys.argv[1] + '-' + sys.argv[2] + '.json', 'w') as outfile: +with open(sys.argv[9] + '/config-' + sys.argv[1] + '-' + sys.argv[2] + '.json', 'w') as outfile: json.dump(data, outfile, indent=4, sort_keys=True) if int(sys.argv[1]) <= 2014 or (int(sys.argv[1]) == 2015 and int(sys.argv[2]) <= 3): @@ -41,11 +41,10 @@ else: output = example % { "timestamp": sys.argv[3] + '/' + sys.argv[2] + '/' + sys.argv[1] + ' 00:00:00' , "date": sys.argv[1] + '-' + sys.argv[2] - , "index": sys.argv[4] + , "index": sys.argv[4] + sys.argv[6] , "host": sys.argv[5] - , "university": sys.argv[6] , "user": sys.argv[7] , "password": sys.argv[8] } -with open('../../configs/workers/logstash/config-' + sys.argv[1] + '-' + sys.argv[2], 'w') as outfile: +with open(sys.argv[9] + '/config-' + sys.argv[1] + '-' + sys.argv[2], 'w') as outfile: outfile.write(output) diff --git a/scripts/workers/insert_register_payment.sh b/scripts/workers/insert_register_payment.sh index 0838d813e8c41b7abca21ad79306afad913c4905..16e11ed2cafd254a1d78eceffb5f4a3085ad8995 100755 --- a/scripts/workers/insert_register_payment.sh +++ b/scripts/workers/insert_register_payment.sh @@ -22,7 +22,7 @@ if [ "$#" -ne 4 ]; then exit fi -source config.sh +source ./config.sh if [ -z ${index+x} ]; then echo "Var 'index' is unset. Set it in file 'scripts/workers/config.sh'."; @@ -32,6 +32,11 @@ if [ -z ${host+x} ]; then echo "Var 'host' is unset. Set it in file 'scripts/workers/config.sh'."; exit; fi +if [ -z ${columnName+x} ]; then + echo "Var 'columnName' is unset. Set it in file 'scripts/workers/config.sh'."; + exit; +fi + size=${#filter[@]} if [ "$size" -lt 1 ]; then echo "Var 'filter' is unset. Set it in file 'scripts/expenses/config.sh'."; @@ -39,55 +44,51 @@ if [ "$size" -lt 1 ]; then fi ym=$1-$2 -dataPath="../../data/" -path="../../data/workers/" -configPath="../../configs/workers/" - -# Check if Data and Workers directories already exist: -if [ ! -d "$path" ]; then - mkdir -p "$path" -fi -if [ ! -d "$configPath/json" ]; then - mkdir -p "$configPath/json" -fi -if [ ! 
-d "$configPath/logstash" ]; then - mkdir -p "$configPath/logstash" -fi +path="./tmp_$ym" # Step 1: # Create directory to store files -mkdir -p $path$ym -mkdir -p ${path}processed/ +mkdir -p "$path" # Download files request='http://arquivos.portaldatransparencia.gov.br/downloads.asp?a='${1}'&m='${2}'&d=C&consulta=Servidores' -curl $request -H 'Accept-Encoding: gzip, deflate, sdch' -H 'Accept-Language: en-US,en;q=0.8' -H 'Upgrade-Insecure-Requests: 1' -H 'User-Agent: Mozilla/5.0 (X11; Linux x86_ 64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' -H 'Referer: http://www.portaldatranspar encia.gov.br/downloads/servidores.asp' -H 'Cookie: ASPSESSIONIDAQRABSAD=OJDLNBCANLIDINCHJHELHHFB; ASPSESSIONIDAQSDCQAD=BOKBKPNCDKOBJKGAMMEKADFL; _ga=GA1.3.1927288562.1481545643; ASPSESSIONIDSCSBBTCD=IGJLJBBC EEJBGLOOJKGNMHBH' -H 'Connection: keep-alive' --compressed > $path$ym/${1}${2}_Servidores.zip +curl $request -H 'Accept-Encoding: gzip, deflate, sdch' -H 'Accept-Language: en-US,en;q=0.8' -H 'Upgrade-Insecure-Requests: 1' -H 'User-Agent: Mozilla/5.0 (X11; Linux x86_ 64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' -H 'Referer: http://www.portaldatranspar encia.gov.br/downloads/servidores.asp' -H 'Cookie: ASPSESSIONIDAQRABSAD=OJDLNBCANLIDINCHJHELHHFB; ASPSESSIONIDAQSDCQAD=BOKBKPNCDKOBJKGAMMEKADFL; _ga=GA1.3.1927288562.1481545643; ASPSESSIONIDSCSBBTCD=IGJLJBBC EEJBGLOOJKGNMHBH' -H 'Connection: keep-alive' --compressed > $path/${1}${2}_Servidores.zip # Unzip them -unzip -o $path$ym/${1}${2}_Servidores.zip -d $path$ym/ +unzip -o $path/${1}${2}_Servidores.zip -d $path/ # Remove zip file -rm $path$ym/${1}${2}_Servidores.zip +rm $path/${1}${2}_Servidores.zip # Get day -day=$(ls $path$ym | grep -m 1 $1$2 | cut -c 7,8) +day=$(ls $path | grep -m 1 $1$2 | cut -c 7,8) for key in "${!filter[@]}" do # Step 2: # Create config files - ./create_config.py $1 $2 "$day" "$index" "$host" "$key" $3 $4 + ./create_config.py $1 $2 "$day" "$index" "$host" "$key" $3 $4 "${path}" # Step 3: # Start processing aux=$( echo "${filter[$key]}" | sed 's/ /\\ /g' ) - ./merge_files_es.py ../../configs/workers/json/config-${1}-${2}.json "$aux" - rm $path$ym/${1}${2}${day}_Cadastro_Unique.csv + ./merge_files_es.py $path/config-${1}-${2}.json "$aux" "${columnName}" + rm $path/${1}${2}${day}_Cadastro_Unique.csv # Step 4: # Insert data in ElasticSearch - logstash -f ../../configs/workers/logstash/config-${1}-${2} < ../../data/workers/processed/${1}${2}.csv + logstash -f $path/config-${1}-${2} < $path/${1}${2}${day}.csv # Remove data - rm ../../data/workers/processed/${1}${2}.csv + rm -f $path/config-${1}-${2} + rm -f $path/config-${1}-${2}.json + rm -f $path/${1}${2}${day}.csv done + +rm -f $path/${1}${2}${day}_Afastamentos.csv +rm -f $path/${1}${2}${day}_Cadastro.csv +rm -f $path/${1}${2}${day}_Honorarios\(Jetons\).csv +rm -f $path/${1}${2}${day}_Jetom.csv +rm -f $path/${1}${2}${day}_Observacoes.csv +rm -f $path/${1}${2}${day}_Remuneracao.csv +rmdir $path diff --git a/scripts/workers/merge_files_es.py b/scripts/workers/merge_files_es.py index 5aad3749997a6178b6fb9c94f60825fd987d861b..22ad435966d62c732260d17c93cbcac2eb542920 100755 --- a/scripts/workers/merge_files_es.py +++ b/scripts/workers/merge_files_es.py @@ -22,16 +22,16 @@ import sys, csv, json, math, subprocess from pathlib import Path from subprocess import 

 # Which column in each file contains the common column?
 idPointColumn1 = params['idColumn1']
@@ -46,7 +46,7 @@ title1 = csv_1.pop(0)
 file_exists = Path(file2)
 if not file_exists.is_file():
     print("File2 does not exist. Calling script resume_register to create it...")
-    call(["./resume_register.sh " + params['path'] + " " + params['date'] + " " + sys.argv[2]], shell=True)
+    call(["./resume_register.sh " + params['path'] + " " + params['date'] + " " + sys.argv[2] + " " + sys.argv[3]], shell=True)

 with open(file2, newline='', encoding='Windows-1252') as f:
     csv_2 = [ i for i in csv.reader(f, 'dialect') ]
diff --git a/scripts/workers/resume_register.sh b/scripts/workers/resume_register.sh
index bdd793b53c7058ac086c4fec25bd0b61c0d289bc..397c91f1d3b6db7f306a601cc2b9137500937ebb 100755
--- a/scripts/workers/resume_register.sh
+++ b/scripts/workers/resume_register.sh
@@ -2,33 +2,28 @@

 # WARNING: This script should not be called if you dont know what you're doing! Look for 'merge_files_es.py'.

-# This scripts purpose is to filter data and get only data related to UFPR.
-
-# Input: Path to data files and date from data files.
-# Example (inserting data from 2016-10): ./resume_register.sh ../../data/workers/2016-10/ 20161031
-
-# Output: CSV file named YearMonthDay_Cadastro_Ufpr_Unique.csv, in the $path folder.
-# Example of CSV location (using same parameters as input): ../../data/workers/2016-10/20161031_Cadastro_Ufpr_Unique.csv
-
-if [ "$#" -ne 3 ]; then
-    echo "Usage: $0 <path> <date> <filter>"
-    exit
+if [ "$#" -ne 4 ]; then
+    echo "Usage: $0 <path> <date> <filter> <column-name>"
+    echo "Example: $0 ./tmp_2016-10 20161031 'MINISTERIO DA EDUCACAO' ORGSUP_LOTACAO"
+    exit
 fi

 path=$1
 date=$2
 filter=$3

-input="${path}${date}_Cadastro.csv"
-output="${path}${date}_Cadastro_Unique.csv"
+input="${path}/${date}_Cadastro.csv"
+output="${path}/${date}_Cadastro_Unique.csv"

-if [ ! -d "${path}" ]; then
-    mkdir -p "${path}"
-fi
+head -n1 ${input} > $path/header.csv
+iconv -f WINDOWS-1252 -t UTF-8 -o $path/tmp.csv $path/header.csv
+columnId=$(sed s/${4}.*$/${4}/ $path/tmp.csv | sed -e 's/\t/\n/g' | wc -l)
+columnId=`expr $columnId + 1`
+rm -f $path/tmp.csv $path/header.csv

-# About this command:
-# - Grep removes everyone that does not work in UFPR. -w option forces to match the whole word, to avoid "UNIVERSIDADE FEDERAL DO PARA" from matching with "UNIVERSIDADE FEDERAL DO PARANA"
-# - Tr removes null characters (ctrl + @).
+cmd="\$$columnId == \"${filter}\""

-# Get only data from UFPR.
-cat "$input" | egrep -w --binary-files=text "$filter" | tr -d '\000' > "$output"
+cat "${input}" | awk -F $'\t' "$cmd" | tr -d '\000' > "$output"
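+
+# Note on the column lookup above: the header is first converted from
+# Windows-1252 so that "${4}" can match it; sed then truncates the header at
+# the first occurrence of "${4}", and splitting on tabs plus counting lines
+# gives that field's 1-based position. The 'expr + 1' appears to step over the
+# matching COD_${4} column that precedes ${4} itself in the Cadastro header.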