From 5d2b38e9c7a8170ca57d08bd0888c2215fb4c52f Mon Sep 17 00:00:00 2001
From: Cristian Weiland <cw14@inf.ufpr.br>
Date: Tue, 14 Mar 2017 11:20:03 -0300
Subject: [PATCH] Issue #19: Add scripts to insert travel allowance data

Signed-off-by: Cristian Weiland <cw14@inf.ufpr.br>
---
 scripts/travel_allowances/README              |  7 ++
 .../create_travel_allowance_config.py         | 26 +++++++
 scripts/travel_allowances/get_last_day.sh     |  5 ++
 .../insert_travel_allowances.sh               | 62 +++++++++++++++++
 .../travel_allowances/logstash_config.example | 49 +++++++++++++
 .../process_travel_allowances.sh              | 64 +++++++++++++++++
 .../resume_travel_allowance.sh                | 33 +++++++++
 scripts/travel_allowances/unzip.sh            | 21 ++++++
 scripts/travel_allowances/unzipCaller.sh      | 68 +++++++++++++++++++
 9 files changed, 335 insertions(+)
 create mode 100644 scripts/travel_allowances/README
 create mode 100755 scripts/travel_allowances/create_travel_allowance_config.py
 create mode 100755 scripts/travel_allowances/get_last_day.sh
 create mode 100755 scripts/travel_allowances/insert_travel_allowances.sh
 create mode 100644 scripts/travel_allowances/logstash_config.example
 create mode 100755 scripts/travel_allowances/process_travel_allowances.sh
 create mode 100755 scripts/travel_allowances/resume_travel_allowance.sh
 create mode 100755 scripts/travel_allowances/unzip.sh
 create mode 100755 scripts/travel_allowances/unzipCaller.sh

diff --git a/scripts/travel_allowances/README b/scripts/travel_allowances/README
new file mode 100644
index 0000000..165b1f6
--- /dev/null
+++ b/scripts/travel_allowances/README
@@ -0,0 +1,7 @@
+The easiest way to insert travel allowance data is to use 'insert_travel_allowances.sh'.
+
+Script input: year and month of the data to be inserted, and ElasticSearch's user and password.
+Example: ./insert_travel_allowances.sh 2016 10 myuser mypass
+Example 2: ./insert_travel_allowances.sh 2014 11 myuser mypass
+
+The other scripts will be called correctly by 'insert_travel_allowances.sh'.
diff --git a/scripts/travel_allowances/create_travel_allowance_config.py b/scripts/travel_allowances/create_travel_allowance_config.py
new file mode 100755
index 0000000..10e00d0
--- /dev/null
+++ b/scripts/travel_allowances/create_travel_allowance_config.py
@@ -0,0 +1,26 @@
+#!/usr/bin/env python3
+
+# WARNING: This script should not be called directly. Look at 'insert_travel_allowances.sh' before calling this script.
+
+# This script creates a Logstash config file by filling the placeholders of 'logstash_config.example'.
+# Input: year, month and day, ElasticSearch's username and password.
+
+import sys, csv, json, math, subprocess
+from pathlib import Path
+from subprocess import call
+
+if len(sys.argv) != 6:
+    print("Usage: " + sys.argv[0] + " <year (2016)> <month (01)> <day (31)> <username> <password>")
+    # A usage error must not look like success to the calling script.
+    sys.exit(1)
+
+with open('logstash_config.example') as infile:
+    example = infile.read()
+
+output = example % { "timestamp": sys.argv[3] + '/' + sys.argv[2] + '/' + sys.argv[1] + ' 00:00:00'
+                     , "date": sys.argv[1] + '-' + sys.argv[2]
+                     , "user": sys.argv[4]
+                     , "password": sys.argv[5] }
+
+with open('../../configs/travel_allowance/logstash/config-' + sys.argv[1] + '-' + sys.argv[2], 'w') as outfile:
+    outfile.write(output)
diff --git a/scripts/travel_allowances/get_last_day.sh b/scripts/travel_allowances/get_last_day.sh
new file mode 100755
index 0000000..7d5abe5
--- /dev/null
+++ b/scripts/travel_allowances/get_last_day.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+# Prints the last day of a month. Optional input: year and month (default: current month). Ex: ./get_last_day.sh 2016 02
+firstDay="${1:-$(date +%Y)}-${2:-$(date +%m)}-01"
+
+date -d "$firstDay +1 month -1 day" "+%d"
diff --git a/scripts/travel_allowances/insert_travel_allowances.sh b/scripts/travel_allowances/insert_travel_allowances.sh
new file mode 100755
index 0000000..d850b38
--- /dev/null
+++ b/scripts/travel_allowances/insert_travel_allowances.sh
@@ -0,0 +1,62 @@
+#!/bin/bash
+
+# This script is the one that should be called to insert data from one month.
+
+# Input: Year and month of the data to be inserted, ElasticSearch's user and password. The last day of that month is computed by this script.
+# Example: ./insert_travel_allowances.sh 2016 10 myuser mypass
+# It has 4 steps:
+# 1- Download files and put them in the right location.
+# 2- Generate logstash config file via create_travel_allowance_config.py.
+# 3- Generate a CSV with only UFPR data via resume_travel_allowance.sh, which is stored in transparencia/data/travel_allowance/processed/<year><month>.csv
+# 4- Insert data in ElasticSearch via logstash, using the config file created and the CSV created by resume_travel_allowance.sh.
+# Output: The commands/scripts outputs.
+
+if [ "$#" -ne 4 ]; then
+  echo "Usage: $0 <year> <month> <user> <password>"
+  echo "Example: $0 2016 12 myuser mypass"
+  exit 1
+fi
+
+# Getting the last day of the month being inserted (using 2016 05 as example).
+# It is derived from the arguments, not from today's date, so the timestamp is always a valid date.
+# An ISO date is used so the result does not depend on the locale of the 'date' output.
+# First day of the month: 2016-05-01.
+firstDay="${1}-${2}-01"
+# Add 1 month (2016-06-01), remove 1 day (2016-05-31), get only day: 31.
+day=$(date -d "$firstDay +1 month -1 day" "+%d")
+
+ym=$1-$2
+dataPath="../../data/"
+path="../../data/travel_allowance/"
+configPath="../../configs/travel_allowance/logstash/"
+
+if [ ! -d "$dataPath" ]; then
+  mkdir "$dataPath"
+fi
+if [ ! -d "$path/processed" ]; then
+  mkdir -p "$path/processed"
+fi
+if [ ! -d "$configPath" ]; then
+  mkdir -p "$configPath"
+fi
+
+# Step 1:
+# Create directory to store files (-p: do not fail when the month is inserted again)
+mkdir -p "$path$ym"
+
+# Download files
+request='http://arquivos.portaldatransparencia.gov.br/downloads.asp?a='${1}'&m='${2}'&consulta=Diarias'
+curl -o "$path$ym/${1}${2}_Diarias.zip" "$request" -H 'Accept-Encoding: gzip, deflate, sdch' -H 'Accept-Language: en-US,en;q=0.8' -H 'Upgrade-Insecure-Requests: 1' -H 'User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' -H 'Referer: http://transparencia.gov.br/downloads/mensal.asp?c=GastosDiretos' -H 'Cookie: ASPSESSIONIDAQRABSAD=OJDLNBCANLIDINCHJHELHHFB; ASPSESSIONIDAQSDCQAD=BOKBKPNCDKOBJKGAMMEKADFL; _ga=GA1.3.1927288562.1481545643; ASPSESSIONIDSCSBBTCD=IGJLJBBCEEJBGLOOJKGNMHBH' -H 'Connection: keep-alive' --compressed
+
+# Unzip them
+unzip "$path$ym/${1}${2}_Diarias.zip" -d "$path$ym/"
+
+# Remove zip file
+rm "$path$ym/${1}${2}_Diarias.zip"
+
+# Step 2 (arguments are quoted so a user or password with spaces or special characters is passed intact):
+./create_travel_allowance_config.py "$1" "$2" "$day" "$3" "$4"
+# Step 3:
+./resume_travel_allowance.sh "$path" "$ym"
+# Step 4:
+logstash -f "${configPath}config-${ym}" < "${path}processed/${1}${2}.csv"
diff --git a/scripts/travel_allowances/logstash_config.example b/scripts/travel_allowances/logstash_config.example
new file mode 100644
index 0000000..50afa9d
--- /dev/null
+++ b/scripts/travel_allowances/logstash_config.example
@@ -0,0 +1,49 @@
+input {
+  stdin {
+    codec => plain {
+      charset => "Windows-1252"
+    }
+  }
+}
+
+filter {
+  csv {
+    columns => ["Código Órgão Superior","Nome Órgão Superior","Código Órgão","Nome Órgao","Código Unidade Gestora","Nome Unidade Gestora","Código Função","Nome Função","Código Subfunção","Nome Subfunção","Código Programa","Nome Programa","Código Ação","Nome Ação","Linguagem Cidadã","CPF Favorecido","Nome Favorecido","Número Documento","Gestão Pagamento","Data
Pagamento","Valor"]
+    separator => " "
+    add_field => { "timestamp" => "%(timestamp)s" }
+  }
+  mutate {
+    convert => { "Código Órgão Superior" => "integer" }
+    convert => { "Código Órgão" => "integer" }
+    convert => { "Código Unidade Gestora" => "integer" }
+    convert => { "Código Grupo Despesa" => "integer" }
+    convert => { "Código Elemento Despesa" => "integer" }
+    convert => { "Código Função" => "integer" }
+    convert => { "Código Subfunção" => "integer" }
+    convert => { "Código Programa" => "integer" }
+    convert => { "Código Ação" => "integer" }
+    convert => { "Código Favorecido" => "integer" }
+    convert => { "Gestão Pagamento" => "integer" }
+    convert => { "Valor" => "float" }
+  }
+  date {
+    match => [ "timestamp", "dd/MM/YYYY HH:mm:ss", "ISO8601" ]
+    target => [ "@timestamp" ]
+  }
+  date {
+    match => [ "Data Pagamento", "dd/MM/YYYY" ]
+    target => [ "Data Pagamento Timestamp" ]
+  }
+}
+
+output {
+  elasticsearch {
+    action => "index"
+    user => "%(user)s"
+    password => "%(password)s"
+    hosts => "localhost:9200"
+    index => "ufpr-gastos-diarias-%(date)s"
+    workers => 1
+  }
+  stdout {}
+}
diff --git a/scripts/travel_allowances/process_travel_allowances.sh b/scripts/travel_allowances/process_travel_allowances.sh
new file mode 100755
index 0000000..fc7bd7d
--- /dev/null
+++ b/scripts/travel_allowances/process_travel_allowances.sh
@@ -0,0 +1,64 @@
+#!/bin/bash
+
+# WARNING: This script should not be called unless the database is erased. It's still here for 2 reasons:
+# 1- Log: To know what months of data have been inserted.
+# 2- Example: To give example of how to call script insert_travel_allowances.sh.
+
+# This script only calls insert_travel_allowances.sh for all years and months.
+
+if [ "$#" -ne 2 ]; then
+  echo "Usage: $0 <user> <password>"
+  echo "Example: $0 myuser mypass"
+  exit 1
+fi
+
+./insert_travel_allowances.sh 2016 11 "$1" "$2"
+./insert_travel_allowances.sh 2016 10 "$1" "$2"
+./insert_travel_allowances.sh 2016 09 "$1" "$2"
+./insert_travel_allowances.sh 2016 08 "$1" "$2"
+./insert_travel_allowances.sh 2016 07 "$1" "$2"
+./insert_travel_allowances.sh 2016 06 "$1" "$2"
+./insert_travel_allowances.sh 2016 05 "$1" "$2"
+./insert_travel_allowances.sh 2016 04 "$1" "$2"
+./insert_travel_allowances.sh 2016 03 "$1" "$2"
+./insert_travel_allowances.sh 2016 02 "$1" "$2"
+./insert_travel_allowances.sh 2016 01 "$1" "$2"
+
+./insert_travel_allowances.sh 2015 12 "$1" "$2"
+./insert_travel_allowances.sh 2015 11 "$1" "$2"
+./insert_travel_allowances.sh 2015 10 "$1" "$2"
+./insert_travel_allowances.sh 2015 09 "$1" "$2"
+./insert_travel_allowances.sh 2015 08 "$1" "$2"
+./insert_travel_allowances.sh 2015 07 "$1" "$2"
+./insert_travel_allowances.sh 2015 06 "$1" "$2"
+./insert_travel_allowances.sh 2015 05 "$1" "$2"
+./insert_travel_allowances.sh 2015 04 "$1" "$2"
+./insert_travel_allowances.sh 2015 03 "$1" "$2"
+./insert_travel_allowances.sh 2015 02 "$1" "$2"
+./insert_travel_allowances.sh 2015 01 "$1" "$2"
+
+./insert_travel_allowances.sh 2014 12 "$1" "$2"
+./insert_travel_allowances.sh 2014 11 "$1" "$2"
+./insert_travel_allowances.sh 2014 10 "$1" "$2"
+./insert_travel_allowances.sh 2014 09 "$1" "$2"
+./insert_travel_allowances.sh 2014 08 "$1" "$2"
+./insert_travel_allowances.sh 2014 07 "$1" "$2"
+./insert_travel_allowances.sh 2014 06 "$1" "$2"
+./insert_travel_allowances.sh 2014 05 "$1" "$2"
+./insert_travel_allowances.sh 2014 04 "$1" "$2"
+./insert_travel_allowances.sh 2014 03 "$1" "$2"
+./insert_travel_allowances.sh 2014 02 "$1" "$2"
+./insert_travel_allowances.sh 2014 01 "$1" "$2"
+
+./insert_travel_allowances.sh 2013 12 "$1" "$2"
+./insert_travel_allowances.sh 2013 11 "$1" "$2"
+./insert_travel_allowances.sh 2013 10 "$1" "$2"
+./insert_travel_allowances.sh 2013 09 "$1" "$2"
+./insert_travel_allowances.sh 2013 08 "$1" "$2"
+./insert_travel_allowances.sh 2013 07 "$1" "$2"
+./insert_travel_allowances.sh 2013 06 "$1" "$2"
+./insert_travel_allowances.sh 2013 05 "$1" "$2"
+./insert_travel_allowances.sh 2013 04 "$1" "$2"
+./insert_travel_allowances.sh 2013 03 "$1" "$2"
+./insert_travel_allowances.sh 2013 02 "$1" "$2"
+./insert_travel_allowances.sh 2013 01 "$1" "$2"
diff --git a/scripts/travel_allowances/resume_travel_allowance.sh b/scripts/travel_allowances/resume_travel_allowance.sh
new file mode 100755
index 0000000..22c302a
--- /dev/null
+++ b/scripts/travel_allowances/resume_travel_allowance.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+# WARNING: This script should not be called directly. Look at 'insert_travel_allowances.sh' before calling this script.
+
+# Input: First parameter is the path to data files and the second one is the date in the name of the files. Data files can be found in: http://transparencia.gov.br/downloads/mensal.asp?c=Diarias
+# Example: ./resume_travel_allowance.sh ../../data/travel_allowance/ 2016-11
+
+# Output: A CSV file in folder processed, filtering the data to get only relevant data (in our case, from UFPR).
+
+if [ "$#" -ne 2 ]; then
+  echo "Usage: $0 <path> <date>"
+  exit 1
+fi
+
+# Path example: ../../data/travel_allowance/
+path=$1
+# Date example: 2016-11
+date=$2
+# dateWithoutHyphen example: 201611
+dateWithoutHyphen=${date//-}
+
+echo "Processing data with args = $path and ${date}"
+
+input="${path}${date}/${dateWithoutHyphen}_Diarias.csv"
+output="${path}processed/${dateWithoutHyphen}.csv"
+
+# About this command:
+# - Grep removes everyone that does not work in UFPR.
+# - Tr removes null characters (ctrl + @).
+# - Head -n1 gets first line (column names). Then, I append the data.
+
+head -n1 "$input" > "$output"
+grep -E --binary-files=text "UNIVERSIDADE FEDERAL DO PARANA" "$input" | tr -d '\000' >> "$output"
diff --git a/scripts/travel_allowances/unzip.sh b/scripts/travel_allowances/unzip.sh
new file mode 100755
index 0000000..5778a05
--- /dev/null
+++ b/scripts/travel_allowances/unzip.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+
+# This script gets a zip file in ~/Downloads, moves it to a folder in path (probably transparencia/data/travel_allowance), unzips it there and removes the zip file.
+
+# Input: Date (year and month, separated by hyphen).
+# Ex: ./unzip.sh 2015-12
+
+if [ "$#" -ne 1 ]; then
+  echo "Usage $0 <date>"
+  exit 1
+fi
+
+date=$1
+path="../../data/travel_allowance/"
+dateWithoutHyphen=${date//-}
+
+mkdir -p "$path$date"
+mv ~/Downloads/"${dateWithoutHyphen}_Diarias.zip" "$path$date"
+# Extract straight into the month's folder (-d); without it unzip writes into the current directory.
+unzip "$path$date/${dateWithoutHyphen}_Diarias.zip" -d "$path$date"
+rm "$path$date/${dateWithoutHyphen}_Diarias.zip"
diff --git a/scripts/travel_allowances/unzipCaller.sh b/scripts/travel_allowances/unzipCaller.sh
new file mode 100755
index 0000000..ec6088a
--- /dev/null
+++ b/scripts/travel_allowances/unzipCaller.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+
+# This script only calls unzip.sh for all months.
+
+./unzip.sh 2015-12
+./unzip.sh 2015-11
+./unzip.sh 2015-10
+./unzip.sh 2015-09
+./unzip.sh 2015-08
+./unzip.sh 2015-07
+./unzip.sh 2015-06
+./unzip.sh 2015-05
+./unzip.sh 2015-04
+./unzip.sh 2015-03
+./unzip.sh 2015-02
+./unzip.sh 2015-01
+
+./unzip.sh 2014-12
+./unzip.sh 2014-11
+./unzip.sh 2014-10
+./unzip.sh 2014-09
+./unzip.sh 2014-08
+./unzip.sh 2014-07
+./unzip.sh 2014-06
+./unzip.sh 2014-05
+./unzip.sh 2014-04
+./unzip.sh 2014-03
+./unzip.sh 2014-02
+./unzip.sh 2014-01
+
+./unzip.sh 2013-12
+./unzip.sh 2013-11
+./unzip.sh 2013-10
+./unzip.sh 2013-09
+./unzip.sh 2013-08
+./unzip.sh 2013-07
+./unzip.sh 2013-06
+./unzip.sh 2013-05
+./unzip.sh 2013-04
+./unzip.sh 2013-03
+./unzip.sh 2013-02
+./unzip.sh 2013-01
+
+./unzip.sh 2012-12
+./unzip.sh 2012-11
+./unzip.sh 2012-10
+./unzip.sh 2012-09
+./unzip.sh 2012-08
+./unzip.sh 2012-07
+./unzip.sh 2012-06
+./unzip.sh 2012-05
+./unzip.sh 2012-04
+./unzip.sh 2012-03
+./unzip.sh 2012-02
+./unzip.sh 2012-01
+
+./unzip.sh 2011-12
+./unzip.sh 2011-11
+./unzip.sh 2011-10
+./unzip.sh 2011-09
+./unzip.sh 2011-08
+./unzip.sh 2011-07
+./unzip.sh 2011-06
+./unzip.sh 2011-05
+./unzip.sh 2011-04
+./unzip.sh 2011-03
+./unzip.sh 2011-02
+./unzip.sh 2011-01
-- 
GitLab