Commit d88d277a authored by Cristian Weiland

Issue #33: Workers: ready to add entire ministry.

Signed-off-by: Cristian Weiland <cw14@inf.ufpr.br>
parent 1a480b08
#!/bin/bash
input="201612_GastosDiretos.csv"
if [ "$#" -ne 4 ]; then
echo "Usage $0 <path> <date> <filter> <column-id>"
echo "Example: $0 ./tmp_201612 201612 MEC 2"
......
#!/bin/bash
input="201612_GastosDiretos.csv"
if [ "$#" -ne 4 ]; then
echo "Usage $0 <path> <date> <filter> <column-id>"
echo "Example: $0 ./tmp_201612 201612 MEC 2"
......
{
"path": "Dados_Servidores/2016-10/"
"path": "tmp_2016-10/"
, "date": "20161031"
, "file1" : "_Remuneracao.csv"
, "file2" : "_Cadastro_Unique.csv"
......@@ -8,5 +8,5 @@
, "quotechar": "\""
, "delimiter": "\t"
, "lineterminator": "\n"
, "outputFile": "Dados_Servidores/Processados/201610.csv"
, "outputFile": "tmp_/201610.csv"
}
......@@ -2,51 +2,18 @@
# Index prefix: The prefix of the index in elasticsearch. Ex: gastos
index="mec-servidores"
index="servidores"
# Filter: An associative array that will be used on 'egrep' to filter data to get only relevant universities.
# The key must be the university initials and the value should be the university name (equal to its name in Portal Transparencia's csv!).
# ColumnName: The name of the column from the CSV that we will use to filter data.
columnName="ORGSUP_LOTACAO"
# Filter: An associative array that will be used to filter data. The key should be the initials, and they will be used to generate the index name.
# The value should be the same as in the CSV, since it will be used to match data.
# Associative array driving the per-entity processing loop:
#   key   = entity initials, used to build the ElasticSearch index name
#   value = entity name exactly as it appears in Portal Transparencia's CSV,
#           used to match rows when filtering the data.
declare -A filter
filter=(
[ufal]="UNIVERSIDADE FEDERAL DE ALAGOAS"
[ufba]="UNIVERSIDADE FEDERAL DA BAHIA"
[ufc]="UNIVERSIDADE FEDERAL DO CEARA"
[ufes]="UNIVERSIDADE FEDERAL DO ESPIRITO SANTO"
[ufg]="UNIVERSIDADE FEDERAL DE GOIAS"
[uff]="UNIVERSIDADE FEDERAL FLUMINENSE"
[ufjf]="UNIVERSIDADE FEDERAL DE JUIZ DE FORA"
[ufmg]="UNIVERSIDADE FEDERAL DE MINAS GERAIS"
[ufpa]="UNIVERSIDADE FEDERAL DO PARA"
[ufpb]="UNIVERSIDADE FEDERAL DA PARAIBA"
[ufpr]="UNIVERSIDADE FEDERAL DO PARANA"
[ufpe]="UNIVERSIDADE FEDERAL DE PERNAMBUCO"
[ufrn]="UNIVERSIDADE FEDERAL DO RIO GRANDE DO NORTE"
[ufrgs]="UNIVERSIDADE FEDERAL DO RIO GRANDE DO SUL"
[ufrj]="UNIVERSIDADE FEDERAL DO RIO DE JANEIRO"
[ufsc]="UNIVERSIDADE FEDERAL DE SANTA CATARINA"
[ufsm]="UNIVERSIDADE FEDERAL DE SANTA MARIA"
[ufrpe]="UNIVERSIDADE FEDERAL RURAL DE PERNAMBUCO"
[ufrrj]="UNIVERSIDADE FEDERAL RURAL DO RIO DE JANEIRO"
[ufrr]="UNIVERSIDADE FEDERAL DE RORAIMA"
[ufcg]="UNIVERSIDADE FEDERAL DE CAMPINA GRANDE"
[ufra]="UNIVERSIDADE FEDERAL RURAL DA AMAZONIA"
[uftm]="UNIVERSIDADE FEDERAL DO TRIANGULO MINEIRO"
[ufvjm]="UNIVERSIDADE FED.VALES JEQUITINHONHA E MUCURI"
[utfpr]="UNIVERSIDADE TECNOLOGICA FEDERAL DO PARANA"
[unifal]="UNIVERSIDADE FEDERAL DE ALFENAS"
[unifei]="UNIVERSIDADE FEDERAL DE ITAJUBA - MG"
# NOTE(review): "SÃO PAULO" keeps its accent while every other name here is
# unaccented — confirm this matches the CSV spelling exactly, or no rows match.
[unifesp]="UNIVERSIDADE FEDERAL DE SÃO PAULO"
[ufla]="UNIVERSIDADE FEDERAL DE LAVRAS"
[ufersa]="UNIVERSIDADE FEDERAL RURAL DO SEMI-ARIDO"
[unirio]="UNIVERSIDADE FEDERAL DO ESTADO RIO DE JANEIRO"
[furg]="UNIVERSIDADE FEDERAL DO RIO GRANDE - FURG"
[ufrb]="UNIVERSIDADE FEDERAL DO RECONCAVO DA BAHIA"
[uffs]="UNIVERSIDADE FEDERAL DA FRONTEIRA SUL"
[ufopa]="UNIVERSIDADE FEDERAL DO OESTE DO PARA"
[ufob]="UNIVERSIDADE FEDERAL DO OESTE DA BAHIA - UFOB"
[ufca]="UNIVERSIDADE FEDERAL DO CARIRI - UFCA"
[ufsb]="UNIVERSIDADE FEDERAL DO SUL DA BAHIA - UFESBA"
# The ministry itself (not a university) — added by this commit so the
# entire ministry's data can be indexed as well.
[mec]="MINISTERIO DA EDUCACAO"
)
# Host: ElasticSearch's host. Examples: "localhost"
......
......@@ -12,12 +12,12 @@ import sys, csv, json, math, subprocess
from pathlib import Path
from subprocess import call
if len(sys.argv) != 9:
print("Usage: " + sys.argv[0] + " <year (2016)> <month (01)> <day (31)> <index> <host> <university> <username> <password>")
if len(sys.argv) != 10:
print("Usage: " + sys.argv[0] + " <year (2016)> <month (01)> <day (31)> <index> <host> <entity> <username> <password> <path>")
sys.exit()
data = {
"path": "../../data/workers/" + sys.argv[1] + "-" + sys.argv[2] + "/"
"path": sys.argv[9]
, "date": sys.argv[1] + sys.argv[2] + sys.argv[3]
, "file1": "_Remuneracao.csv"
, "file2": "_Cadastro_Unique.csv"
......@@ -26,10 +26,10 @@ data = {
, "quotechar": "\""
, "delimiter": "\t"
, "lineterminator": "\n"
, "outputFile": "../../data/workers/processed/" + sys.argv[1] + sys.argv[2] + ".csv"
, "outputFile": sys.argv[9] + '/' + sys.argv[1] + sys.argv[2] + sys.argv[3] + ".csv"
}
with open('../../configs/workers/json/config-' + sys.argv[1] + '-' + sys.argv[2] + '.json', 'w') as outfile:
with open(sys.argv[9] + '/config-' + sys.argv[1] + '-' + sys.argv[2] + '.json', 'w') as outfile:
json.dump(data, outfile, indent=4, sort_keys=True)
if int(sys.argv[1]) <= 2014 or (int(sys.argv[1]) == 2015 and int(sys.argv[2]) <= 3):
......@@ -41,11 +41,10 @@ else:
output = example % { "timestamp": sys.argv[3] + '/' + sys.argv[2] + '/' + sys.argv[1] + ' 00:00:00'
, "date": sys.argv[1] + '-' + sys.argv[2]
, "index": sys.argv[4]
, "index": sys.argv[4] + sys.argv[6]
, "host": sys.argv[5]
, "university": sys.argv[6]
, "user": sys.argv[7]
, "password": sys.argv[8] }
with open('../../configs/workers/logstash/config-' + sys.argv[1] + '-' + sys.argv[2], 'w') as outfile:
with open(sys.argv[9] + '/config-' + sys.argv[1] + '-' + sys.argv[2], 'w') as outfile:
outfile.write(output)
......@@ -22,7 +22,7 @@ if [ "$#" -ne 4 ]; then
exit
fi
source config.sh
source ./config.sh
if [ -z ${index+x} ]; then
echo "Var 'index' is unset. Set it in file 'scripts/workers/config.sh'.";
......@@ -32,65 +32,63 @@ if [ -z ${host+x} ]; then
echo "Var 'host' is unset. Set it in file 'scripts/workers/config.sh'.";
exit;
fi
if [ -z ${filter+x} ]; then
echo "Var 'filter' is unset. Set it in file 'scripts/workers/config.sh'.";
if [ -z ${columnName+x} ]; then
echo "Var 'columnName' is unset. Set it in file 'scripts/workers/config.sh'.";
exit;
fi
if [ -z ${university+x} ]; then
echo "Var 'university' is unset. Set it in file 'scripts/workers/config.sh'.";
size=${#filter[@]}
if [ "$size" -lt 1 ]; then
echo "Var 'filter' is unset. Set it in file 'scripts/expenses/config.sh'.";
exit;
fi
ym=$1-$2
dataPath="../../data/"
path="../../data/workers/"
configPath="../../configs/workers/"
# Check if Data and Workers directories already exist:
if [ ! -d "$path" ]; then
mkdir -p "$path"
fi
if [ ! -d "$configPath/json" ]; then
mkdir -p "$configPath/json"
fi
if [ ! -d "$configPath/logstash" ]; then
mkdir -p "$configPath/logstash"
fi
path="./tmp_$ym"
# Step 1:
# Create directory to store files
mkdir -p $path$ym
mkdir -p ${path}processed/
mkdir -p "$path"
# Download files
request='http://arquivos.portaldatransparencia.gov.br/downloads.asp?a='${1}'&m='${2}'&d=C&consulta=Servidores'
curl $request -H 'Accept-Encoding: gzip, deflate, sdch' -H 'Accept-Language: en-US,en;q=0.8' -H 'Upgrade-Insecure-Requests: 1' -H 'User-Agent: Mozilla/5.0 (X11; Linux x86_ 64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' -H 'Referer: http://www.portaldatranspar encia.gov.br/downloads/servidores.asp' -H 'Cookie: ASPSESSIONIDAQRABSAD=OJDLNBCANLIDINCHJHELHHFB; ASPSESSIONIDAQSDCQAD=BOKBKPNCDKOBJKGAMMEKADFL; _ga=GA1.3.1927288562.1481545643; ASPSESSIONIDSCSBBTCD=IGJLJBBC EEJBGLOOJKGNMHBH' -H 'Connection: keep-alive' --compressed > $path$ym/${1}${2}_Servidores.zip
curl $request -H 'Accept-Encoding: gzip, deflate, sdch' -H 'Accept-Language: en-US,en;q=0.8' -H 'Upgrade-Insecure-Requests: 1' -H 'User-Agent: Mozilla/5.0 (X11; Linux x86_ 64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' -H 'Referer: http://www.portaldatranspar encia.gov.br/downloads/servidores.asp' -H 'Cookie: ASPSESSIONIDAQRABSAD=OJDLNBCANLIDINCHJHELHHFB; ASPSESSIONIDAQSDCQAD=BOKBKPNCDKOBJKGAMMEKADFL; _ga=GA1.3.1927288562.1481545643; ASPSESSIONIDSCSBBTCD=IGJLJBBC EEJBGLOOJKGNMHBH' -H 'Connection: keep-alive' --compressed > $path/${1}${2}_Servidores.zip
# Unzip them
unzip -o $path$ym/${1}${2}_Servidores.zip -d $path$ym/
unzip -o $path/${1}${2}_Servidores.zip -d $path/
# Remove zip file
rm $path$ym/${1}${2}_Servidores.zip
rm $path/${1}${2}_Servidores.zip
# Get day
day=$(ls $path$ym | grep -m 1 $1$2 | cut -c 7,8)
day=$(ls $path | grep -m 1 $1$2 | cut -c 7,8)
for key in "${!filter[@]}"
do
# Step 2:
# Create config files
./create_config.py $1 $2 "$day" "$index" "$host" "$key" $3 $4
./create_config.py $1 $2 "$day" "$index" "$host" "$key" $3 $4 "${path}"
# Step 3:
# Start processing
aux=$( echo "${filter[$key]}" | sed 's/ /\\ /g' )
./merge_files_es.py ../../configs/workers/json/config-${1}-${2}.json "$aux"
rm $path$ym/${1}${2}${day}_Cadastro_Unique.csv
./merge_files_es.py $path/config-${1}-${2}.json "$aux" "${columnName}"
rm $path/${1}${2}${day}_Cadastro_Unique.csv
# Step 4:
# Insert data in ElasticSearch
logstash -f ../../configs/workers/logstash/config-${1}-${2} < ../../data/workers/processed/${1}${2}.csv
logstash -f $path/config-${1}-${2} < $path/${1}${2}${day}.csv
# Remove data
rm ../../data/workers/processed/${1}${2}.csv
rm -f $path/config-${1}-${2}
rm -f $path/config-${1}-${2}.json
rm -f $path/${1}${2}${day}.csv
done
rm -f $path/${1}${2}${day}_Afastamentos.csv
rm -f $path/${1}${2}${day}_Cadastro.csv
rm -f $path/${1}${2}${day}_Honorarios\(Jetons\).csv
rm -f $path/${1}${2}${day}_Jetom.csv
rm -f $path/${1}${2}${day}_Observacoes.csv
rm -f $path/${1}${2}${day}_Remuneracao.csv
rmdir $path
......@@ -22,16 +22,16 @@ import sys, csv, json, math, subprocess
from pathlib import Path
from subprocess import call
if len(sys.argv) != 3:
print("Usage: " + sys.argv[0] + " <config.json> <filter>")
if len(sys.argv) != 4:
print("Usage: " + sys.argv[0] + " <config.json> <filter> <columnId>")
sys.exit()
with open(sys.argv[1]) as f:
params = json.load(f)
# Which files should be merged?
file1 = params['path'] + params['date'] + params['file1']
file2 = params['path'] + params['date'] + params['file2']
file1 = params['path'] + '/' + params['date'] + params['file1']
file2 = params['path'] + '/' + params['date'] + params['file2']
# Which column in each file contains the common column?
idPointColumn1 = params['idColumn1']
......@@ -46,7 +46,7 @@ title1 = csv_1.pop(0)
file_exists = Path(file2)
if not file_exists.is_file():
print("File2 does not exist. Calling script resume_register to create it...")
call(["./resume_register.sh " + params['path'] + " " + params['date'] + " " + sys.argv[2]], shell=True)
call(["./resume_register.sh " + params['path'] + " " + params['date'] + " " + sys.argv[2] + " " + sys.argv[3]], shell=True)
with open(file2, newline='', encoding='Windows-1252') as f:
csv_2 = [ i for i in csv.reader(f, 'dialect') ]
......
......@@ -2,33 +2,28 @@
# WARNING: This script should not be called if you dont know what you're doing! Look for 'merge_files_es.py'.
# This scripts purpose is to filter data and get only data related to UFPR.
# Input: Path to data files and date from data files.
# Example (inserting data from 2016-10): ./resume_register.sh ../../data/workers/2016-10/ 20161031
# Output: CSV file named YearMonthDay_Cadastro_Ufpr_Unique.csv, in the $path folder.
# Example of CSV location (using same parameters as input): ../../data/workers/2016-10/20161031_Cadastro_Ufpr_Unique.csv
input="${path}${date}_Cadastro.csv"
output="${path}${date}_Cadastro_Unique.csv"
if [ "$#" -ne 3 ]; then
echo "Usage: $0 <path> <date> <filter>"
exit
if [ "$#" -ne 4 ]; then
echo "Usage $0 <path> <date> <filter> <column-name>"
echo "Example: $0 ./tmp_201612 201612 MEC 2"
exit
fi
path=$1
date=$2
filter=$3
input="${path}${date}_Cadastro.csv"
output="${path}${date}_Cadastro_Unique.csv"
input="${path}/${date}_Cadastro.csv"
output="${path}/${date}_Cadastro_Unique.csv"
# Ensure the destination directory exists before writing into it.
[ -d "${path}" ] || mkdir -p "${path}"
# Compute the field number of the column named "$4" in the CSV header, for
# use as an awk field index ($columnId) when filtering rows below.
# The header is Windows-1252 encoded, so convert it to UTF-8 first.
# All expansions are quoted so paths containing spaces do not word-split.
head -n1 "${input}" > "${path}/header.csv"
iconv -f WINDOWS-1252 -t UTF-8 -o "${path}/tmp.csv" "${path}/header.csv"
# Truncate the header right after the wanted column name, split the remaining
# tab-separated fields one per line, and count them to get the column position.
columnId=$(sed "s/${4}.*$/${4}/" "${path}/tmp.csv" | sed -e 's/\t/\n/g' | wc -l)
# NOTE(review): the extra +1 looks like an off-by-one (wc -l already yields the
# column's 1-based position) — confirm against the CSV layout before changing.
columnId=$((columnId + 1))
rm -f "${path}/tmp.csv" "${path}/header.csv"
# About this command:
# - Grep removes everyone that does not work in UFPR. -w option forces to match the whole word, to avoid "UNIVERSIDADE FEDERAL DO PARA" from matching with "UNIVERSIDADE FEDERAL DO PARANA"
# - Tr removes null characters (ctrl + @).
cmd="\$$columnId == \"${filter}\""
# Get only data from UFPR.
cat "$input" | egrep -w --binary-files=text "$filter" | tr -d '\000' > "$output"
# Keep only rows whose target field equals the requested filter value (the
# awk condition was pre-built in $cmd), stripping NUL bytes (ctrl+@) that
# appear in the source data. Redirect input instead of 'cat |' (saves a process).
awk -F $'\t' "$cmd" < "${input}" | tr -d '\000' > "$output"
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment