diff --git a/.gitignore b/.gitignore index 4f351d82b88900085c6aa8328d4ee2696c240f37..b9c6a02dca607ecb015b67f31f9df874ea4b2175 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,4 @@ data configs +*.swp +*.swo diff --git a/scripts/expenses/README b/scripts/expenses/README new file mode 100644 index 0000000000000000000000000000000000000000..e37c4de03f715ab8c34063286b8c3a8eb7670222 --- /dev/null +++ b/scripts/expenses/README @@ -0,0 +1,7 @@ +The easiest way to insert expenses data is to use 'insert_expenses.sh'. + +Script's input: Year, month and day from the data to be inserted, ElasticSearch's user and password. The day should be the last day of the month. +Example: ./insert_expenses.sh 2016 10 31 myuser mypass +Example 2: ./insert_expenses.sh 2014 11 30 myuser mypass + +The other scripts will be called by 'insert_expenses.sh' correctly. diff --git a/scripts/expenses/insert_expenses.sh b/scripts/expenses/insert_expenses.sh index 862b9ae2b4a4daf59f88fff1350cf573088b6172..a826346ee77c18cdb2179bfed074e0180ea0ede7 100755 --- a/scripts/expenses/insert_expenses.sh +++ b/scripts/expenses/insert_expenses.sh @@ -2,12 +2,13 @@ # This script is the one that should be called to insert data from one month. -# Input: Year, month and day from the data to be inserted, ElasticSearch's user and password. +# Input: Year, month and day from the data to be inserted, ElasticSearch's user and password. The day should be the last day of the month. # Example: ./insert_expenses.sh 2016 10 31 myuser mypass -# It has 3 steps: -# 1- Generate logstash config file via create_expenses_config.py. -# 2- Generate CSV with only UFPR data via resume_expenses.sh, which is stored in transparencia/data/expenses/processed/year-month.csv -# 3- Insert data in ElasticSearch via logstash, using the config file created and the CSV created by resume_expenses.sh. +# It has 4 steps: +# 1- Download files and put them in the right location. +# 2- Generate logstash config file via create_expenses_config.py. 
+# 3- Generate a CSV with only UFPR data via resume_expenses.sh, which is stored in transparencia/data/expenses/processed/YearMonth.csv (e.g. 201610.csv) +# 4- Insert data in ElasticSearch via logstash, using the config file created and the CSV created by resume_expenses.sh. # Output: The commands/scripts outputs. if [ "$#" -ne 5 ]; then @@ -16,6 +17,35 @@ if [ "$#" -ne 5 ]; then exit fi + +ym="$1-$2" +dataPath="../../data/" +path="../../data/expenses/" + +if [ ! -d "$dataPath" ]; then + mkdir "$dataPath" +fi +if [ ! -d "$path" ]; then + mkdir "$path" +fi + +# Step 1: +# Create directory to store files (-p: do not fail when re-running for a month whose directory already exists) +mkdir -p "$path$ym" + +# Download the monthly GastosDiretos zip; --fail plus the exit keep later steps from running on a missing or partial download. NOTE(review): the Cookie header holds hard-coded browser session IDs - confirm they are still needed. +request='http://arquivos.portaldatransparencia.gov.br/downloads.asp?a='${1}'&m='${2}'&consulta=GastosDiretos' +curl --fail -o "$path$ym/${1}${2}_GastosDiretos.zip" "$request" -H 'Accept-Encoding: gzip, deflate, sdch' -H 'Accept-Language: en-US,en;q=0.8' -H 'Upgrade-Insecure-Requests: 1' -H 'User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' -H 'Referer: http://transparencia.gov.br/downloads/mensal.asp?c=GastosDiretos' -H 'Cookie: ASPSESSIONIDAQRABSAD=OJDLNBCANLIDINCHJHELHHFB; ASPSESSIONIDAQSDCQAD=BOKBKPNCDKOBJKGAMMEKADFL; _ga=GA1.3.1927288562.1481545643; ASPSESSIONIDSCSBBTCD=IGJLJBBCEEJBGLOOJKGNMHBH' -H 'Connection: keep-alive' --compressed || exit 1 + +# Unzip them (abort if the downloaded file is not a valid archive) +unzip "$path$ym/${1}${2}_GastosDiretos.zip" -d "$path$ym/" || exit 1 + +# Remove zip file +rm "$path$ym/${1}${2}_GastosDiretos.zip" + +# Step 2: ./create_expenses_config.py $1 $2 $3 $4 $5 +# Step 3: ./resume_expenses.sh ../../data/expenses/ ${1}-${2} +# Step 4: logstash -f ../../configs/expenses/logstash/config-${1}-${2} < ../../data/expenses/processed/${1}${2}.csv diff --git a/scripts/expenses/resume_expenses.sh b/scripts/expenses/resume_expenses.sh index fa89c3e44d817356f5d3a71f4f4847b48c0f8fe4..7ffe50f2f52d6f80d82e6ee74d4d10bdab387f2d 100755 --- 
a/scripts/expenses/resume_expenses.sh +++ b/scripts/expenses/resume_expenses.sh @@ -21,7 +21,7 @@ fi echo "Processing data with args = $path and ${date}" -input="${path}${date}/AplicacoesDiretasDespesaOrgao.csv" +input="${path}${date}/${dateWithoutHyphen}_GastosDiretos.csv" output="${path}processed/${dateWithoutHyphen}.csv" # About this command: diff --git a/scripts/expenses/unzip.sh b/scripts/expenses/unzip.sh index eb5696efa1bcdc8b39abeee2f635474e32000889..56e917fb9399d2a12e08df19eb736d0c4494620c 100755 --- a/scripts/expenses/unzip.sh +++ b/scripts/expenses/unzip.sh @@ -3,7 +3,7 @@ # This scripts gets a zip file in ~/Downloads, moves it to a folder in path (probably transparencia/data/expenses), unzips it and removes the zip file. # Input: Date (year and month, separated by hyphen). -# Ex: ./unzip.sh 2015-12 201512 +# Ex: ./unzip.sh 2015-12 if [ "$#" -ne 2 ]; then echo "Usage $0 <date>" diff --git a/scripts/workers/README b/scripts/workers/README new file mode 100644 index 0000000000000000000000000000000000000000..190ab41c8aca88dff24f803dc11c0a77df907aca --- /dev/null +++ b/scripts/workers/README @@ -0,0 +1,7 @@ +The easiest way to insert workers data is to use 'insert_register_payment.sh'. + +Script's input: Year and Month from CSV file, ElasticSearch's user and password. +Example (inserting data from file 20130930_Cadastro.csv): ./insert_register_payment.sh 2013 09 myuser mypassword +If you want to look at more examples, check add_registers.sh. + +The other scripts will be called by 'insert_register_payment.sh' correctly. 
diff --git a/scripts/workers/add_registers.sh b/scripts/workers/add_registers.sh index 90a236eba927bee76d8e43cf3972087b435a8be4..957a2a7c5856240c292f3fcabab3d2002f2f374d 100755 --- a/scripts/workers/add_registers.sh +++ b/scripts/workers/add_registers.sh @@ -10,53 +10,53 @@ if [ "$#" -ne 2 ]; then exit fi -./insert_register_payment.sh 2016 11 30 $1 $2 -./insert_register_payment.sh 2016 10 31 $1 $2 -./insert_register_payment.sh 2016 09 30 $1 $2 -./insert_register_payment.sh 2016 08 31 $1 $2 -./insert_register_payment.sh 2016 07 31 $1 $2 -./insert_register_payment.sh 2016 06 30 $1 $2 -./insert_register_payment.sh 2016 05 31 $1 $2 -./insert_register_payment.sh 2016 04 30 $1 $2 -./insert_register_payment.sh 2016 03 31 $1 $2 -./insert_register_payment.sh 2016 02 29 $1 $2 -./insert_register_payment.sh 2016 01 31 $1 $2 +./insert_register_payment.sh 2016 11 $1 $2 +./insert_register_payment.sh 2016 10 $1 $2 +./insert_register_payment.sh 2016 09 $1 $2 +./insert_register_payment.sh 2016 08 $1 $2 +./insert_register_payment.sh 2016 07 $1 $2 +./insert_register_payment.sh 2016 06 $1 $2 +./insert_register_payment.sh 2016 05 $1 $2 +./insert_register_payment.sh 2016 04 $1 $2 +./insert_register_payment.sh 2016 03 $1 $2 +./insert_register_payment.sh 2016 02 $1 $2 +./insert_register_payment.sh 2016 01 $1 $2 -./insert_register_payment.sh 2015 12 31 $1 $2 -./insert_register_payment.sh 2015 11 30 $1 $2 -./insert_register_payment.sh 2015 10 31 $1 $2 -./insert_register_payment.sh 2015 09 30 $1 $2 -./insert_register_payment.sh 2015 08 31 $1 $2 -./insert_register_payment.sh 2015 07 31 $1 $2 -./insert_register_payment.sh 2015 06 30 $1 $2 -./insert_register_payment.sh 2015 05 31 $1 $2 -./insert_register_payment.sh 2015 04 30 $1 $2 -./insert_register_payment.sh 2015 03 31 $1 $2 -./insert_register_payment.sh 2015 02 28 $1 $2 -./insert_register_payment.sh 2015 01 31 $1 $2 +./insert_register_payment.sh 2015 12 $1 $2 +./insert_register_payment.sh 2015 11 $1 $2 +./insert_register_payment.sh 2015 10 
$1 $2 +./insert_register_payment.sh 2015 09 $1 $2 +./insert_register_payment.sh 2015 08 $1 $2 +./insert_register_payment.sh 2015 07 $1 $2 +./insert_register_payment.sh 2015 06 $1 $2 +./insert_register_payment.sh 2015 05 $1 $2 +./insert_register_payment.sh 2015 04 $1 $2 +./insert_register_payment.sh 2015 03 $1 $2 +./insert_register_payment.sh 2015 02 $1 $2 +./insert_register_payment.sh 2015 01 $1 $2 -./insert_register_payment.sh 2014 12 31 $1 $2 -./insert_register_payment.sh 2014 11 30 $1 $2 -./insert_register_payment.sh 2014 10 31 $1 $2 -./insert_register_payment.sh 2014 09 30 $1 $2 -./insert_register_payment.sh 2014 08 31 $1 $2 -./insert_register_payment.sh 2014 07 31 $1 $2 -./insert_register_payment.sh 2014 06 30 $1 $2 -./insert_register_payment.sh 2014 05 31 $1 $2 -./insert_register_payment.sh 2014 04 30 $1 $2 -./insert_register_payment.sh 2014 03 31 $1 $2 -./insert_register_payment.sh 2014 02 28 $1 $2 -./insert_register_payment.sh 2014 01 31 $1 $2 +./insert_register_payment.sh 2014 12 $1 $2 +./insert_register_payment.sh 2014 11 $1 $2 +./insert_register_payment.sh 2014 10 $1 $2 +./insert_register_payment.sh 2014 09 $1 $2 +./insert_register_payment.sh 2014 08 $1 $2 +./insert_register_payment.sh 2014 07 $1 $2 +./insert_register_payment.sh 2014 06 $1 $2 +./insert_register_payment.sh 2014 05 $1 $2 +./insert_register_payment.sh 2014 04 $1 $2 +./insert_register_payment.sh 2014 03 $1 $2 +./insert_register_payment.sh 2014 02 $1 $2 +./insert_register_payment.sh 2014 01 $1 $2 -./insert_register_payment.sh 2013 12 31 $1 $2 -./insert_register_payment.sh 2013 11 30 $1 $2 -./insert_register_payment.sh 2013 10 31 $1 $2 -./insert_register_payment.sh 2013 09 30 $1 $2 -./insert_register_payment.sh 2013 08 31 $1 $2 -./insert_register_payment.sh 2013 07 31 $1 $2 -./insert_register_payment.sh 2013 06 30 $1 $2 -./insert_register_payment.sh 2013 05 31 $1 $2 -./insert_register_payment.sh 2013 04 30 $1 $2 -./insert_register_payment.sh 2013 03 31 $1 $2 -./insert_register_payment.sh 2013 
02 28 $1 $2 -./insert_register_payment.sh 2013 01 31 $1 $2 +./insert_register_payment.sh 2013 12 $1 $2 +./insert_register_payment.sh 2013 11 $1 $2 +./insert_register_payment.sh 2013 10 $1 $2 +./insert_register_payment.sh 2013 09 $1 $2 +./insert_register_payment.sh 2013 08 $1 $2 +./insert_register_payment.sh 2013 07 $1 $2 +./insert_register_payment.sh 2013 06 $1 $2 +./insert_register_payment.sh 2013 05 $1 $2 +./insert_register_payment.sh 2013 04 $1 $2 +./insert_register_payment.sh 2013 03 $1 $2 +./insert_register_payment.sh 2013 02 $1 $2 +./insert_register_payment.sh 2013 01 $1 $2 diff --git a/scripts/workers/insert_register_payment.sh b/scripts/workers/insert_register_payment.sh index 24c3cf0d257f00fdb135e2690fc8ebceefd544e4..3b06efd8e53160492d57faadec13e8117694edc5 100755 --- a/scripts/workers/insert_register_payment.sh +++ b/scripts/workers/insert_register_payment.sh @@ -1,20 +1,65 @@ #!/bin/bash # Script to help using other scripts. Note that calling it to a data that has already been inserted will DUPLICATE it (which we probably dont want). -# This scripts does 3 things: -# 1- Create config files via create_config.py -# 2- Merge CSV data and create a new CSV file via merge_files_es.py. -# 3- Insert CSV file generated in step 2 into ElasticSearch via Logstash. -# Input: Year, Month and Day from CSV file, ElasticSearch's user and password. -# Example (inserting data from file 20130930_Cadastro.csv): ./insert_register_payment.sh 2013 09 30 myuser mypassword + +# This script does 4 things: +# 1- Download required files and store them in the right place. +# 2- Create config files via create_config.py +# 3- Merge CSV data and create a new CSV file via merge_files_es.py. +# 4- Insert CSV file generated in step 3 into ElasticSearch via Logstash. + +# Input: Year, Month from CSV file, ElasticSearch's user and password. 
+# Example (inserting data from file 20130930_Cadastro.csv): ./insert_register_payment.sh 2013 09 myuser mypassword +# If you want to look at more examples, check add_registers.sh. + # Output: The same output as the scripts and commands called. -if [ "$#" -ne 5 ]; then - echo "Usage: $0 <year> <month> <day> <user> <password>" - echo "Example: $0 2016 12 01 myuser mypassword" +# WARNING: We get the day from the CSV file name by using cut in characters 7 and 8. This means we assume they will write something like 01 as day 1. If they change it to 1, this script will not work! + +if [ "$#" -ne 4 ]; then + echo "Usage: $0 <year> <month> <user> <password>" + echo "Example: $0 2016 12 myuser mypassword" exit fi -./create_config.py $1 $2 $3 $4 $5 +ym="$1-$2" +dataPath="../../data/" +path="../../data/workers/" + +# Check if Data and Workers directories already exist: +if [ ! -d "$dataPath" ]; then + mkdir "$dataPath" +fi +if [ ! -d "$path" ]; then + mkdir "$path" +fi + +# Step 1: +# Create directory to store files (-p: do not fail when re-running for a month whose directory already exists) +mkdir -p "$path$ym" + +# Download the monthly Servidores zip; --fail plus the exit keep later steps from running on a missing or partial download. NOTE(review): the Cookie header holds hard-coded browser session IDs - confirm they are still needed. +request='http://arquivos.portaldatransparencia.gov.br/downloads.asp?a='${1}'&m='${2}'&d=C&consulta=Servidores' +curl --fail -o "$path$ym/${1}${2}_Servidores.zip" "$request" -H 'Accept-Encoding: gzip, deflate, sdch' -H 'Accept-Language: en-US,en;q=0.8' -H 'Upgrade-Insecure-Requests: 1' -H 'User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' -H 'Referer: http://www.portaldatransparencia.gov.br/downloads/servidores.asp' -H 'Cookie: ASPSESSIONIDAQRABSAD=OJDLNBCANLIDINCHJHELHHFB; ASPSESSIONIDAQSDCQAD=BOKBKPNCDKOBJKGAMMEKADFL; _ga=GA1.3.1927288562.1481545643; ASPSESSIONIDSCSBBTCD=IGJLJBBCEEJBGLOOJKGNMHBH' -H 'Connection: keep-alive' --compressed || exit 1 + +# Unzip them (abort if the downloaded file is not a valid archive) +unzip "$path$ym/${1}${2}_Servidores.zip" -d "$path$ym/" || exit 1 + +# Remove zip file +rm "$path$ym/${1}${2}_Servidores.zip" 
+ +# Get day: characters 7-8 of the first extracted file name containing YearMonth (e.g. 20130930_Cadastro.csv gives 30) +day=$(ls "$path$ym" | grep -m 1 "$1$2" | cut -c 7,8) + + +# Step 2: +# Create config files (arguments are quoted so an empty day cannot shift user/password into the wrong position) +./create_config.py "$1" "$2" "$day" "$3" "$4" + +# Step 3: +# Start processing ./merge_files_es.py ../../configs/workers/json/config-${1}-${2}.json + +# Step 4: +# Insert data in ElasticSearch logstash -f ../../configs/workers/logstash/config-${1}-${2} < ../../data/workers/processed/${1}${2}.csv