From 5d2b38e9c7a8170ca57d08bd0888c2215fb4c52f Mon Sep 17 00:00:00 2001
From: Cristian Weiland <cw14@inf.ufpr.br>
Date: Tue, 14 Mar 2017 11:20:03 -0300
Subject: [PATCH] Issue #19: Add scripts to insert travel allowance data

Signed-off-by: Cristian Weiland <cw14@inf.ufpr.br>
---
 scripts/travel_allowances/README              |  7 ++
 .../create_travel_allowance_config.py         | 26 +++++++
 scripts/travel_allowances/get_last_day.sh     |  5 ++
 .../insert_travel_allowances.sh               | 62 +++++++++++++++++
 .../travel_allowances/logstash_config.example | 49 +++++++++++++
 .../process_travel_allowances.sh              | 64 +++++++++++++++++
 .../resume_travel_allowance.sh                | 33 +++++++++
 scripts/travel_allowances/unzip.sh            | 21 ++++++
 scripts/travel_allowances/unzipCaller.sh      | 68 +++++++++++++++++++
 9 files changed, 335 insertions(+)
 create mode 100644 scripts/travel_allowances/README
 create mode 100755 scripts/travel_allowances/create_travel_allowance_config.py
 create mode 100755 scripts/travel_allowances/get_last_day.sh
 create mode 100755 scripts/travel_allowances/insert_travel_allowances.sh
 create mode 100644 scripts/travel_allowances/logstash_config.example
 create mode 100755 scripts/travel_allowances/process_travel_allowances.sh
 create mode 100755 scripts/travel_allowances/resume_travel_allowance.sh
 create mode 100755 scripts/travel_allowances/unzip.sh
 create mode 100755 scripts/travel_allowances/unzipCaller.sh

diff --git a/scripts/travel_allowances/README b/scripts/travel_allowances/README
new file mode 100644
index 0000000..165b1f6
--- /dev/null
+++ b/scripts/travel_allowances/README
@@ -0,0 +1,7 @@
+The easiest way to insert travel allowance data is to use 'insert_travel_allowances.sh'.
+
+Script input: year and month of the data to be inserted, followed by the ElasticSearch user and password.
+Example: ./insert_travel_allowances.sh 2016 10 myuser mypass
+Example 2: ./insert_travel_allowances.sh 2014 11 myuser mypass
+
+The other scripts are called by 'insert_travel_allowances.sh' in the correct order.
diff --git a/scripts/travel_allowances/create_travel_allowance_config.py b/scripts/travel_allowances/create_travel_allowance_config.py
new file mode 100755
index 0000000..10e00d0
--- /dev/null
+++ b/scripts/travel_allowances/create_travel_allowance_config.py
@@ -0,0 +1,26 @@
+#!/usr/bin/env python3
+
+# WARNING: This script should not be called directly. Look at 'insert_travel_allowances.sh' before calling this script.
+
+# Creates a Logstash config file by filling the placeholders of 'logstash_config.example'.
+# Must run from scripts/travel_allowances: template and output paths are relative to the working directory.
+# Input: year, month and day, ElasticSearch's username and password.
+
+import sys, csv, json, math, subprocess
+from pathlib import Path
+from subprocess import call
+
+if len(sys.argv) != 6:
+    # Passing a string to sys.exit prints it on stderr and exits with status 1, so callers can detect the error.
+    sys.exit("Usage: " + sys.argv[0] + " <year (2016)> <month (01)> <day (31)> <username> <password>")
+
+year, month, day, user, password = sys.argv[1:]
+# The template has non-ASCII column names; read/write UTF-8 explicitly instead of relying on the locale.
+example = Path('logstash_config.example').read_text(encoding='utf-8')
+
+output = example % {"timestamp": day + '/' + month + '/' + year + ' 00:00:00',  # dd/MM/YYYY HH:mm:ss
+                    "date": year + '-' + month,  # suffix of the ElasticSearch index name
+                    "user": user,
+                    "password": password}
+
+Path('../../configs/travel_allowance/logstash/config-' + year + '-' + month).write_text(output, encoding='utf-8')
diff --git a/scripts/travel_allowances/get_last_day.sh b/scripts/travel_allowances/get_last_day.sh
new file mode 100755
index 0000000..7d5abe5
--- /dev/null
+++ b/scripts/travel_allowances/get_last_day.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+# Prints the last day (two digits) of the current month.
+next_month=$(date +%Y-%m -d "$(date +%Y%m15) next month")  # day 15 keeps "next month" from skipping a month
+last_day=$(date -d "${next_month}-01 -1 day" "+%d")  # first day of next month minus one day
+echo "$last_day"
diff --git a/scripts/travel_allowances/insert_travel_allowances.sh b/scripts/travel_allowances/insert_travel_allowances.sh
new file mode 100755
index 0000000..d850b38
--- /dev/null
+++ b/scripts/travel_allowances/insert_travel_allowances.sh
@@ -0,0 +1,62 @@
+#!/bin/bash
+
+# This script is the one that should be called to insert data from one month.
+
+# Input: Year and month of the data to be inserted, ElasticSearch's user and password.
+# Example: ./insert_travel_allowances.sh 2016 10 myuser mypass
+# It has 4 steps:
+#   1- Download files and put them in the right location.
+#   2- Generate logstash config file via create_travel_allowance_config.py.
+#   3- Generate a CSV with only UFPR data via resume_travel_allowance.sh, which is stored in transparencia/data/travel_allowance/processed/yearmonth.csv
+#   4- Insert data in ElasticSearch via logstash, using the config file created and the CSV created by resume_travel_allowance.sh.
+# Output: The commands/scripts outputs.
+# NOTE: All paths are relative, so this script must be run from scripts/travel_allowances.
+
+if [ "$#" -ne 4 ]; then
+	echo "Usage: $0 <year> <month> <user> <password>" >&2
+	echo "Example: $0 2016 12 myuser mypass" >&2
+	exit 1
+fi
+
+year=$1
+month=$2
+user=$3
+password=$4
+
+# Getting the last day of the requested month (using 2016 05 as example):
+# First day of that month (2016-05-01), plus one month (2016-06-01), minus one day (2016-05-31); get only day: 31.
+# The year and month arguments are used here, not today's date, so old months get their own last day.
+day=$(date -d "${year}-${month}-01 +1 month -1 day" "+%d") || exit 1
+
+# Date as used in directory and config file names (2016-05).
+ym="${year}-${month}"
+path="../../data/travel_allowance/"
+configPath="../../configs/travel_allowance/logstash/"
+monthDir="${path}${ym}"
+# Where the downloaded archive is stored until it is unzipped.
+zipFile="${monthDir}/${year}${month}_Diarias.zip"
+
+# Create the data, processed, month and config directories.
+# "-p" also creates missing parents and does not fail if a directory already exists (e.g. on a re-run).
+mkdir -p "${path}processed" "$monthDir" "$configPath" || exit 1
+
+# Step 1:
+# Download files
+request="http://arquivos.portaldatransparencia.gov.br/downloads.asp?a=${year}&m=${month}&consulta=Diarias"
+# --fail: on an HTTP error, exit non-zero instead of saving the server's error page as the zip file.
+curl --fail -o "$zipFile" "$request" -H 'Accept-Encoding: gzip, deflate, sdch' -H 'Accept-Language: en-US,en;q=0.8' -H 'Upgrade-Insecure-Requests: 1' -H 'User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' -H 'Referer: http://transparencia.gov.br/downloads/mensal.asp?c=GastosDiretos' -H 'Cookie: ASPSESSIONIDAQRABSAD=OJDLNBCANLIDINCHJHELHHFB; ASPSESSIONIDAQSDCQAD=BOKBKPNCDKOBJKGAMMEKADFL; _ga=GA1.3.1927288562.1481545643; ASPSESSIONIDSCSBBTCD=IGJLJBBCEEJBGLOOJKGNMHBH' -H 'Connection: keep-alive' --compressed || exit 1
+
+# Unzip them
+unzip "$zipFile" -d "$monthDir/" || exit 1
+
+# Remove zip file
+rm "$zipFile"
+
+# Step 2:
+./create_travel_allowance_config.py "$year" "$month" "$day" "$user" "$password" || exit 1
+
+# Step 3:
+./resume_travel_allowance.sh "$path" "$ym" || exit 1
+
+# Step 4:
+logstash -f "${configPath}config-${ym}" < "${path}processed/${year}${month}.csv"
diff --git a/scripts/travel_allowances/logstash_config.example b/scripts/travel_allowances/logstash_config.example
new file mode 100644
index 0000000..50afa9d
--- /dev/null
+++ b/scripts/travel_allowances/logstash_config.example
@@ -0,0 +1,49 @@
+input {
+	stdin {  # events are read from standard input
+		codec => plain {
+			charset => "Windows-1252"  # input text is decoded as Windows-1252
+		}
+	}
+}
+
+filter {
+	csv {
+		columns => ["Código Órgão Superior","Nome Órgão Superior","Código Órgão","Nome Órgao","Código Unidade Gestora","Nome Unidade Gestora","Código Função","Nome Função","Código Subfunção","Nome Subfunção","Código Programa","Nome Programa","Código Ação","Nome Ação","Linguagem Cidadã","CPF Favorecido","Nome Favorecido","Número Documento","Gestão Pagamento","Data Pagamento","Valor"]
+		separator => "	"
+		add_field => { "timestamp" => "%(timestamp)s" }  # placeholder filled in by create_travel_allowance_config.py
+	}
+	mutate {
+		convert => { "Código Órgão Superior" => "integer" }
+		convert => { "Código Órgão" => "integer" }
+		convert => { "Código Unidade Gestora" => "integer" }
+		convert => { "Código Grupo Despesa" => "integer" }  # NOTE(review): not in the csv columns list above
+		convert => { "Código Elemento Despesa" => "integer" }  # NOTE(review): not in the csv columns list above
+		convert => { "Código Função" => "integer" }
+		convert => { "Código Subfunção" => "integer" }
+		convert => { "Código Programa" => "integer" }
+		convert => { "Código Ação" => "integer" }
+		convert => { "Código Favorecido" => "integer" }  # NOTE(review): not in the csv columns list above
+		convert => { "Gestão Pagamento" => "integer" }
+		convert => { "Valor" => "float" }
+	}
+	date {  # parse the added timestamp field into @timestamp
+		match => [ "timestamp", "dd/MM/YYYY HH:mm:ss", "ISO8601" ]
+		target => [ "@timestamp" ]
+	}
+	date {  # parse the payment date into a separate field
+        match => [ "Data Pagamento", "dd/MM/YYYY" ]
+        target => [ "Data Pagamento Timestamp" ]
+	}
+}
+
+output {
+	elasticsearch {
+		action => "index"
+		user => "%(user)s"  # placeholder filled in by create_travel_allowance_config.py
+		password => "%(password)s"  # placeholder filled in by create_travel_allowance_config.py
+		hosts => "localhost:9200"
+		index => "ufpr-gastos-diarias-%(date)s"  # date placeholder is year-month, so there is one index per month
+		workers => 1
+	}
+	stdout {}  # events are also written to standard output
+}
diff --git a/scripts/travel_allowances/process_travel_allowances.sh b/scripts/travel_allowances/process_travel_allowances.sh
new file mode 100755
index 0000000..fc7bd7d
--- /dev/null
+++ b/scripts/travel_allowances/process_travel_allowances.sh
@@ -0,0 +1,64 @@
+#!/bin/bash
+
+# WARNING: This script should not be called unless the database is erased. It is still here for 2 reasons:
+# 1- Log: To know what months of data have been inserted.
+# 2- Example: To give example of how to call script insert_travel_allowances.sh.
+
+# This script only calls insert_travel_allowances.sh for all years and months (credentials are quoted so special characters survive).
+
+if [ "$#" -ne 2 ]; then
+	echo "Usage: $0 <user> <password>" >&2
+	echo "Example: $0 myuser mypass" >&2
+	exit 1
+fi
+
+./insert_travel_allowances.sh 2016 11 "$1" "$2"
+./insert_travel_allowances.sh 2016 10 "$1" "$2"
+./insert_travel_allowances.sh 2016 09 "$1" "$2"
+./insert_travel_allowances.sh 2016 08 "$1" "$2"
+./insert_travel_allowances.sh 2016 07 "$1" "$2"
+./insert_travel_allowances.sh 2016 06 "$1" "$2"
+./insert_travel_allowances.sh 2016 05 "$1" "$2"
+./insert_travel_allowances.sh 2016 04 "$1" "$2"
+./insert_travel_allowances.sh 2016 03 "$1" "$2"
+./insert_travel_allowances.sh 2016 02 "$1" "$2"
+./insert_travel_allowances.sh 2016 01 "$1" "$2"
+
+./insert_travel_allowances.sh 2015 12 "$1" "$2"
+./insert_travel_allowances.sh 2015 11 "$1" "$2"
+./insert_travel_allowances.sh 2015 10 "$1" "$2"
+./insert_travel_allowances.sh 2015 09 "$1" "$2"
+./insert_travel_allowances.sh 2015 08 "$1" "$2"
+./insert_travel_allowances.sh 2015 07 "$1" "$2"
+./insert_travel_allowances.sh 2015 06 "$1" "$2"
+./insert_travel_allowances.sh 2015 05 "$1" "$2"
+./insert_travel_allowances.sh 2015 04 "$1" "$2"
+./insert_travel_allowances.sh 2015 03 "$1" "$2"
+./insert_travel_allowances.sh 2015 02 "$1" "$2"
+./insert_travel_allowances.sh 2015 01 "$1" "$2"
+
+./insert_travel_allowances.sh 2014 12 "$1" "$2"
+./insert_travel_allowances.sh 2014 11 "$1" "$2"
+./insert_travel_allowances.sh 2014 10 "$1" "$2"
+./insert_travel_allowances.sh 2014 09 "$1" "$2"
+./insert_travel_allowances.sh 2014 08 "$1" "$2"
+./insert_travel_allowances.sh 2014 07 "$1" "$2"
+./insert_travel_allowances.sh 2014 06 "$1" "$2"
+./insert_travel_allowances.sh 2014 05 "$1" "$2"
+./insert_travel_allowances.sh 2014 04 "$1" "$2"
+./insert_travel_allowances.sh 2014 03 "$1" "$2"
+./insert_travel_allowances.sh 2014 02 "$1" "$2"
+./insert_travel_allowances.sh 2014 01 "$1" "$2"
+
+./insert_travel_allowances.sh 2013 12 "$1" "$2"
+./insert_travel_allowances.sh 2013 11 "$1" "$2"
+./insert_travel_allowances.sh 2013 10 "$1" "$2"
+./insert_travel_allowances.sh 2013 09 "$1" "$2"
+./insert_travel_allowances.sh 2013 08 "$1" "$2"
+./insert_travel_allowances.sh 2013 07 "$1" "$2"
+./insert_travel_allowances.sh 2013 06 "$1" "$2"
+./insert_travel_allowances.sh 2013 05 "$1" "$2"
+./insert_travel_allowances.sh 2013 04 "$1" "$2"
+./insert_travel_allowances.sh 2013 03 "$1" "$2"
+./insert_travel_allowances.sh 2013 02 "$1" "$2"
+./insert_travel_allowances.sh 2013 01 "$1" "$2"
diff --git a/scripts/travel_allowances/resume_travel_allowance.sh b/scripts/travel_allowances/resume_travel_allowance.sh
new file mode 100755
index 0000000..22c302a
--- /dev/null
+++ b/scripts/travel_allowances/resume_travel_allowance.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+# WARNING: This script should not be called directly. Look at 'insert_travel_allowances.sh' before calling this script.
+
+# Input: First parameter is the path to data files and the second one is the date in the name of the files. Data files can be found in: http://transparencia.gov.br/downloads/mensal.asp?c=Diarias
+# Example: ./resume_travel_allowance.sh ../../data/travel_allowance/ 2016-11
+
+# Output: A CSV file in folder processed, filtering the data to get only relevant data (in our case, from UFPR).
+
+# Validate the arguments before using them; a usage error is reported on stderr with a non-zero status.
+if [ "$#" -ne 2 ]; then
+	echo "Usage: $0 <path> <date>" >&2
+	exit 1
+fi
+
+# Path example: ../../data/travel_allowance/
+path=$1
+# Date example: 2016-11
+date=$2
+# dateWithoutHyphen example: 201611
+dateWithoutHyphen=${date//-}
+
+echo "Processing data with args = $path and ${date}"
+
+input="${path}${date}/${dateWithoutHyphen}_Diarias.csv"
+output="${path}processed/${dateWithoutHyphen}.csv"
+
+# About these commands (file names are quoted so paths containing spaces work):
+# - Head -n1 gets first line (column names); stop here if the input file cannot be read. Then, the filtered data is appended.
+# - Grep keeps only the lines of people that work in UFPR; -F matches the name as a fixed string (replaces the obsolescent egrep).
+# - Tr removes null characters (ctrl + @).
+head -n1 "$input" > "$output" || exit 1
+grep -F --binary-files=text "UNIVERSIDADE FEDERAL DO PARANA" "$input" | tr -d '\000' >> "$output"
diff --git a/scripts/travel_allowances/unzip.sh b/scripts/travel_allowances/unzip.sh
new file mode 100755
index 0000000..5778a05
--- /dev/null
+++ b/scripts/travel_allowances/unzip.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+
+# This script gets a zip file in ~/Downloads, moves it to a folder in path (probably transparencia/data/travel_allowance), unzips it there and removes the zip file.
+
+# Input: Date (year and month, separated by hyphen).
+# Ex: ./unzip.sh 2015-12
+
+if [ "$#" -ne 1 ]; then
+	echo "Usage: $0 <date>" >&2
+	exit 1
+fi
+
+date=$1
+path="../../data/travel_allowance/"
+dateWithoutHyphen=${date//-}
+zipFile="${path}${date}/${dateWithoutHyphen}_Diarias.zip"
+
+mkdir -p "${path}${date}"
+mv ~/Downloads/"${dateWithoutHyphen}_Diarias.zip" "$zipFile" || exit 1
+# -d extracts straight into the month folder (not the current directory); the zip is removed only if unzip succeeds.
+unzip "$zipFile" -d "${path}${date}/" && rm "$zipFile"
diff --git a/scripts/travel_allowances/unzipCaller.sh b/scripts/travel_allowances/unzipCaller.sh
new file mode 100755
index 0000000..ec6088a
--- /dev/null
+++ b/scripts/travel_allowances/unzipCaller.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+
+# This script only calls unzip.sh once per month, from 2015-12 back to 2011-01 (newest first), passing the date as the single argument.
+
+./unzip.sh 2015-12
+./unzip.sh 2015-11
+./unzip.sh 2015-10
+./unzip.sh 2015-09
+./unzip.sh 2015-08
+./unzip.sh 2015-07
+./unzip.sh 2015-06
+./unzip.sh 2015-05
+./unzip.sh 2015-04
+./unzip.sh 2015-03
+./unzip.sh 2015-02
+./unzip.sh 2015-01
+
+./unzip.sh 2014-12
+./unzip.sh 2014-11
+./unzip.sh 2014-10
+./unzip.sh 2014-09
+./unzip.sh 2014-08
+./unzip.sh 2014-07
+./unzip.sh 2014-06
+./unzip.sh 2014-05
+./unzip.sh 2014-04
+./unzip.sh 2014-03
+./unzip.sh 2014-02
+./unzip.sh 2014-01
+
+./unzip.sh 2013-12
+./unzip.sh 2013-11
+./unzip.sh 2013-10
+./unzip.sh 2013-09
+./unzip.sh 2013-08
+./unzip.sh 2013-07
+./unzip.sh 2013-06
+./unzip.sh 2013-05
+./unzip.sh 2013-04
+./unzip.sh 2013-03
+./unzip.sh 2013-02
+./unzip.sh 2013-01
+
+./unzip.sh 2012-12
+./unzip.sh 2012-11
+./unzip.sh 2012-10
+./unzip.sh 2012-09
+./unzip.sh 2012-08
+./unzip.sh 2012-07
+./unzip.sh 2012-06
+./unzip.sh 2012-05
+./unzip.sh 2012-04
+./unzip.sh 2012-03
+./unzip.sh 2012-02
+./unzip.sh 2012-01
+
+./unzip.sh 2011-12
+./unzip.sh 2011-11
+./unzip.sh 2011-10
+./unzip.sh 2011-09
+./unzip.sh 2011-08
+./unzip.sh 2011-07
+./unzip.sh 2011-06
+./unzip.sh 2011-05
+./unzip.sh 2011-04
+./unzip.sh 2011-03
+./unzip.sh 2011-02
+./unzip.sh 2011-01
-- 
GitLab