Skip to content
Snippets Groups Projects
Commit 5f970c9e authored by Cristian Weiland's avatar Cristian Weiland
Browse files

Issue #25: Fix bug getting last day of month in script and add config file


The config file has the following variables:
- Host: Host running ElasticSearch. Must not contain "http://".
- Index: The name of the index in which data will be inserted.
- Filter: The filter to get only part of Portal Transparencia's data.
Scripts were getting always the last day of the current month instead of the last day of the parameter's month.

Signed-off-by: Cristian Weiland <cw14@inf.ufpr.br>
parent 26244a79
No related branches found
No related tags found
No related merge requests found
Showing
with 127 additions and 51 deletions
# This file only contains some config variables:
# Index prefix: The prefix of the index in elasticsearch. Ex: gastos
index="gastos-pagamentos"
# Filter: The string that will be used on 'egrep' to filter data to get only relevant universities.
# Ex: Getting only UFPR:
# filter="UNIVERSIDADE FEDERAL DO PARANA"
# Getting UFPR and UFMG:
# filter="UNIVERSIDADE FEDERAL DO PARANA|UNIVERSIDADE FEDERAL DE MINAS GERAIS"
# Getting all universities:
# filter="UNIVERSIDADE FEDERAL*"
filter="UNIVERSIDADE FEDERAL DO PARANA|UNIVERSIDADE FEDERAL DE MINAS GERAIS|UNIVERSIDADE FEDERAL DE SANTA CATARINA|UNIVERSIDADE FEDERAL DE PERNAMBUCO|UNIVERSIDADE FEDERAL DE SANTA MARIA"
# Host: ElasticSearch's host. Ex: "localhost"
host="localhost"
...@@ -10,8 +10,8 @@ import sys, csv, json, math, subprocess ...@@ -10,8 +10,8 @@ import sys, csv, json, math, subprocess
from pathlib import Path from pathlib import Path
from subprocess import call from subprocess import call
if len(sys.argv) != 6: if len(sys.argv) != 8:
print("Usage: " + sys.argv[0] + " <year (2016)> <month (01)> <day (31)> <username> <password>") print("Usage: " + sys.argv[0] + " <year (2016)> <month (01)> <day (31)> <index> <host> <username> <password>")
sys.exit() sys.exit()
with open('logstash_config.example') as infile: with open('logstash_config.example') as infile:
...@@ -19,8 +19,10 @@ with open('logstash_config.example') as infile: ...@@ -19,8 +19,10 @@ with open('logstash_config.example') as infile:
output = example % { "timestamp": sys.argv[3] + '/' + sys.argv[2] + '/' + sys.argv[1] + ' 00:00:00' output = example % { "timestamp": sys.argv[3] + '/' + sys.argv[2] + '/' + sys.argv[1] + ' 00:00:00'
, "date": sys.argv[1] + '-' + sys.argv[2] , "date": sys.argv[1] + '-' + sys.argv[2]
, "user": sys.argv[4] , "index": sys.argv[4]
, "password": sys.argv[5] } , "host": sys.argv[5]
, "user": sys.argv[6]
, "password": sys.argv[7] }
with open('../../configs/expenses/logstash/config-' + sys.argv[1] + '-' + sys.argv[2], 'w') as outfile: with open('../../configs/expenses/logstash/config-' + sys.argv[1] + '-' + sys.argv[2], 'w') as outfile:
outfile.write(output) outfile.write(output)
...@@ -19,7 +19,7 @@ fi ...@@ -19,7 +19,7 @@ fi
# Getting the Last day of this month (Using date 2016-05-15 as example): # Getting the Last day of this month (Using date 2016-05-15 as example):
# First, get next month (201606). # First, get next month (201606).
aux=$(date +%Y%m -d "$(date +%Y%m15) next month") aux=$(date +%Y%m -d "$(date +${1}${2}15) next month")
# Append day 01 (20160601). # Append day 01 (20160601).
temp=$(date -d "${aux}01") temp=$(date -d "${aux}01")
# Remove 1 day: 20160531, get only day: 31. # Remove 1 day: 20160531, get only day: 31.
...@@ -50,9 +50,13 @@ unzip $path$ym/${1}${2}_GastosDiretos.zip -d $path$ym/ ...@@ -50,9 +50,13 @@ unzip $path$ym/${1}${2}_GastosDiretos.zip -d $path$ym/
# Remove zip file # Remove zip file
rm $path$ym/${1}${2}_GastosDiretos.zip rm $path$ym/${1}${2}_GastosDiretos.zip
source config.sh
# Step 2: # Step 2:
./create_expenses_config.py $1 $2 $day $3 $4 ./create_expenses_config.py $1 $2 $day $index $host $3 $4
# Step 3: # Step 3:
./resume_expenses.sh ../../data/expenses/ ${1}-${2} ./resume_expenses.sh ../../data/expenses/ ${1}-${2} $filter
# Step 4: # Step 4:
logstash -f ../../configs/expenses/logstash/config-${1}-${2} < ../../data/expenses/processed/${1}${2}.csv logstash -f ../../configs/expenses/logstash/config-${1}-${2} < ../../data/expenses/processed/${1}${2}.csv
# Data inserted, we can now remove it.
rm ../../data/expenses/processed/${1}${2}.csv
...@@ -41,8 +41,8 @@ output { ...@@ -41,8 +41,8 @@ output {
action => "index" action => "index"
user => "%(user)s" user => "%(user)s"
password => "%(password)s" password => "%(password)s"
hosts => "localhost:9200" hosts => "http://%(host)s:9200"
index => "ufpr-gastos-pagamentos-%(date)s" index => "ufpr-%(index)s-%(date)s"
workers => 1 workers => 1
} }
stdout {} stdout {}
......
...@@ -7,18 +7,20 @@ ...@@ -7,18 +7,20 @@
# Output: A CSV file in folder processed, filtering the data to get only relevant data (in our case, from UFPR). # Output: A CSV file in folder processed, filtering the data to get only relevant data (in our case, from UFPR).
if [ "$#" -ne 3 ]; then
echo "Usage: $0 <path> <date> <filter>"
exit
fi
# Path example: ../../data/expenses/ # Path example: ../../data/expenses/
path=$1 path=$1
# Date example: 2016-11 # Date example: 2016-11
date=$2 date=$2
# Filter example: UNIVERSIDADE FEDERAL DO PARANA
filter=$3
# dateWithoutHyphen example: 201611 # dateWithoutHyphen example: 201611
dateWithoutHyphen=${date//-} dateWithoutHyphen=${date//-}
if [ "$#" -ne 2 ]; then
echo "Usage: $0 <path> <date>"
exit
fi
echo "Processing data with args = $path and ${date}" echo "Processing data with args = $path and ${date}"
input="${path}${date}/${dateWithoutHyphen}_GastosDiretos.csv" input="${path}${date}/${dateWithoutHyphen}_GastosDiretos.csv"
...@@ -27,7 +29,6 @@ output="${path}processed/${dateWithoutHyphen}.csv" ...@@ -27,7 +29,6 @@ output="${path}processed/${dateWithoutHyphen}.csv"
# About this command: # About this command:
# - Grep removes everyone that does not work in UFPR. # - Grep removes everyone that does not work in UFPR.
# - Tr removes null characters (ctrl + @). # - Tr removes null characters (ctrl + @).
# - Head -n1 gets first line (column names). Then, I append the data.
head -n1 $input > $output cat $input | egrep --binary-files=text "$filter" | tr -d '\000' > $output
cat $input | egrep --binary-files=text "UNIVERSIDADE FEDERAL DO PARANA" | tr -d '\000' >> $output rm $input
# This file only contains some config variables:
# Index prefix: The prefix of the index in elasticsearch. Ex: gastos
index="gastos-diarias"
# Filter: The string that will be used on 'egrep' to filter data to get only relevant universities.
# Ex: Getting only UFPR:
# filter="UNIVERSIDADE FEDERAL DO PARANA"
# Getting UFPR and UFMG:
# filter="UNIVERSIDADE FEDERAL DO PARANA|UNIVERSIDADE FEDERAL DE MINAS GERAIS"
# Getting all universities:
# filter="UNIVERSIDADE FEDERAL*"
filter="UNIVERSIDADE FEDERAL DO PARANA|UNIVERSIDADE FEDERAL DE MINAS GERAIS|UNIVERSIDADE FEDERAL DE SANTA CATARINA|UNIVERSIDADE FEDERAL DE PERNAMBUCO|UNIVERSIDADE FEDERAL DE SANTA MARIA"
# Host: ElasticSearch's host. Examples: "localhost"
host="localhost"
...@@ -10,8 +10,8 @@ import sys, csv, json, math, subprocess ...@@ -10,8 +10,8 @@ import sys, csv, json, math, subprocess
from pathlib import Path from pathlib import Path
from subprocess import call from subprocess import call
if len(sys.argv) != 6: if len(sys.argv) != 8:
print("Usage: " + sys.argv[0] + " <year (2016)> <month (01)> <day (31)> <username> <password>") print("Usage: " + sys.argv[0] + " <year (2016)> <month (01)> <day (31)> <index> <host> <username> <password>")
sys.exit() sys.exit()
with open('logstash_config.example') as infile: with open('logstash_config.example') as infile:
...@@ -19,8 +19,10 @@ with open('logstash_config.example') as infile: ...@@ -19,8 +19,10 @@ with open('logstash_config.example') as infile:
output = example % { "timestamp": sys.argv[3] + '/' + sys.argv[2] + '/' + sys.argv[1] + ' 00:00:00' output = example % { "timestamp": sys.argv[3] + '/' + sys.argv[2] + '/' + sys.argv[1] + ' 00:00:00'
, "date": sys.argv[1] + '-' + sys.argv[2] , "date": sys.argv[1] + '-' + sys.argv[2]
, "user": sys.argv[4] , "index": sys.argv[4]
, "password": sys.argv[5] } , "host": sys.argv[5]
, "user": sys.argv[6]
, "password": sys.argv[7] }
with open('../../configs/travel_allowance/logstash/config-' + sys.argv[1] + '-' + sys.argv[2], 'w') as outfile: with open('../../configs/travel_allowance/logstash/config-' + sys.argv[1] + '-' + sys.argv[2], 'w') as outfile:
outfile.write(output) outfile.write(output)
...@@ -19,7 +19,7 @@ fi ...@@ -19,7 +19,7 @@ fi
# Getting the Last day of this month (Using date 2016-05-15 as example): # Getting the Last day of this month (Using date 2016-05-15 as example):
# First, get next month (201606). # First, get next month (201606).
aux=$(date +%Y%m -d "$(date +%Y%m15) next month") aux=$(date +%Y%m -d "$(date +${1}${2}15) next month")
# Append day 01 (20160601). # Append day 01 (20160601).
temp=$(date -d "${aux}01") temp=$(date -d "${aux}01")
# Remove 1 day: 20160531, get only day: 31. # Remove 1 day: 20160531, get only day: 31.
...@@ -30,6 +30,8 @@ dataPath="../../data/" ...@@ -30,6 +30,8 @@ dataPath="../../data/"
path="../../data/travel_allowance/" path="../../data/travel_allowance/"
configPath="../../configs/travel_allowance/logstash/" configPath="../../configs/travel_allowance/logstash/"
source config.sh
if [ ! -d "$dataPath" ]; then if [ ! -d "$dataPath" ]; then
mkdir "$dataPath" mkdir "$dataPath"
fi fi
...@@ -55,8 +57,11 @@ unzip $path$ym/${1}${2}_Diarias.zip -d $path$ym/ ...@@ -55,8 +57,11 @@ unzip $path$ym/${1}${2}_Diarias.zip -d $path$ym/
rm $path$ym/${1}${2}_Diarias.zip rm $path$ym/${1}${2}_Diarias.zip
# Step 2: # Step 2:
./create_travel_allowance_config.py $1 $2 $day $3 $4 ./create_travel_allowance_config.py $1 $2 $day $index $host $3 $4
# Step 3: # Step 3:
./resume_travel_allowance.sh $path ${1}-${2} ./resume_travel_allowance.sh $path ${1}-${2} $filter
# Step 4: # Step 4:
logstash -f ../../configs/travel_allowance/logstash/config-${1}-${2} < ${path}processed/${1}${2}.csv logstash -f ../../configs/travel_allowance/logstash/config-${1}-${2} < ${path}processed/${1}${2}.csv
# Remove processed file
rm ${path}processed/${1}${2}.csv
...@@ -41,8 +41,8 @@ output { ...@@ -41,8 +41,8 @@ output {
action => "index" action => "index"
user => "%(user)s" user => "%(user)s"
password => "%(password)s" password => "%(password)s"
hosts => "localhost:9200" hosts => "http://%(host)s:9200"
index => "ufpr-gastos-diarias-%(date)s" index => "ufpr-%(index)s-%(date)s"
workers => 1 workers => 1
} }
stdout {} stdout {}
......
...@@ -14,8 +14,8 @@ date=$2 ...@@ -14,8 +14,8 @@ date=$2
# dateWithoutHyphen example: 201611 # dateWithoutHyphen example: 201611
dateWithoutHyphen=${date//-} dateWithoutHyphen=${date//-}
if [ "$#" -ne 2 ]; then if [ "$#" -ne 3 ]; then
echo "Usage: $0 <path> <date>" echo "Usage: $0 <path> <date> <filter>"
exit exit
fi fi
...@@ -27,7 +27,5 @@ output="${path}processed/${dateWithoutHyphen}.csv" ...@@ -27,7 +27,5 @@ output="${path}processed/${dateWithoutHyphen}.csv"
# About this command: # About this command:
# - Grep removes everyone that does not work in UFPR. # - Grep removes everyone that does not work in UFPR.
# - Tr removes null characters (ctrl + @). # - Tr removes null characters (ctrl + @).
# - Head -n1 gets first line (column names). Then, I append the data.
head -n1 $input > $output cat $input | egrep --binary-files=text "$filter" | tr -d '\000' > $output
cat $input | egrep --binary-files=text "UNIVERSIDADE FEDERAL DO PARANA" | tr -d '\000' >> $output
# This file only contains some config variables:
# Index prefix: The prefix of the index in elasticsearch. Ex: gastos
index="servidores"
# Filter: The string that will be used on 'egrep' to filter data to get only relevant universities.
# Ex: Getting only UFPR:
# filter="UNIVERSIDADE FEDERAL DO PARANA"
# Getting UFPR and UFMG:
# filter="UNIVERSIDADE FEDERAL DO PARANA|UNIVERSIDADE FEDERAL DE MINAS GERAIS"
# Getting all universities:
# filter="UNIVERSIDADE FEDERAL*"
filter="UNIVERSIDADE FEDERAL DO PARANA|UNIVERSIDADE FEDERAL DE MINAS GERAIS|UNIVERSIDADE FEDERAL DE SANTA CATARINA|UNIVERSIDADE FEDERAL DE PERNAMBUCO|UNIVERSIDADE FEDERAL DE SANTA MARIA"
# Host: ElasticSearch's host. Examples: "localhost"
host="localhost"
...@@ -12,8 +12,8 @@ import sys, csv, json, math, subprocess ...@@ -12,8 +12,8 @@ import sys, csv, json, math, subprocess
from pathlib import Path from pathlib import Path
from subprocess import call from subprocess import call
if len(sys.argv) != 6: if len(sys.argv) != 8:
print("Usage: " + sys.argv[0] + " <year (2016)> <month (01)> <day (31)> <username> <password>") print("Usage: " + sys.argv[0] + " <year (2016)> <month (01)> <day (31)> <index> <host> <username> <password>")
sys.exit() sys.exit()
data = { data = {
...@@ -41,8 +41,10 @@ else: ...@@ -41,8 +41,10 @@ else:
output = example % { "timestamp": sys.argv[3] + '/' + sys.argv[2] + '/' + sys.argv[1] + ' 00:00:00' output = example % { "timestamp": sys.argv[3] + '/' + sys.argv[2] + '/' + sys.argv[1] + ' 00:00:00'
, "date": sys.argv[1] + '-' + sys.argv[2] , "date": sys.argv[1] + '-' + sys.argv[2]
, "user": sys.argv[4] , "index": sys.argv[4]
, "password": sys.argv[5] } , "host": sys.argv[5]
, "user": sys.argv[6]
, "password": sys.argv[7] }
with open('../../configs/workers/logstash/config-' + sys.argv[1] + '-' + sys.argv[2], 'w') as outfile: with open('../../configs/workers/logstash/config-' + sys.argv[1] + '-' + sys.argv[2], 'w') as outfile:
outfile.write(output) outfile.write(output)
...@@ -26,6 +26,8 @@ ym=$1-$2 ...@@ -26,6 +26,8 @@ ym=$1-$2
dataPath="../../data/" dataPath="../../data/"
path="../../data/workers/" path="../../data/workers/"
source config.sh
# Check if Data and Workers directories already exist: # Check if Data and Workers directories already exist:
if [ ! -d "$dataPath" ]; then if [ ! -d "$dataPath" ]; then
mkdir "$dataPath" mkdir "$dataPath"
...@@ -51,15 +53,17 @@ rm $path$ym/${1}${2}_Servidores.zip ...@@ -51,15 +53,17 @@ rm $path$ym/${1}${2}_Servidores.zip
# Get day # Get day
day=$(ls $path$ym | grep -m 1 $1$2 | cut -c 7,8) day=$(ls $path$ym | grep -m 1 $1$2 | cut -c 7,8)
# Step 2: # Step 2:
# Create config files # Create config files
./create_config.py $1 $2 $day $3 $4 ./create_config.py $1 $2 $day $index $host $3 $4
# Step 3: # Step 3:
# Start processing # Start processing
./merge_files_es.py ../../configs/workers/json/config-${1}-${2}.json ./merge_files_es.py ../../configs/workers/json/config-${1}-${2}.json $filter
# Step 4: # Step 4:
# Insert data in ElasticSearch # Insert data in ElasticSearch
logstash -f ../../configs/workers/logstash/config-${1}-${2} < ../../data/workers/processed/${1}${2}.csv logstash -f ../../configs/workers/logstash/config-${1}-${2} < ../../data/workers/processed/${1}${2}.csv
# Remove data
rm ../../data/workers/processed/${1}${2}.csv
...@@ -58,8 +58,8 @@ output { ...@@ -58,8 +58,8 @@ output {
action => "index" action => "index"
user => "%(user)s" user => "%(user)s"
password => "%(password)s" password => "%(password)s"
hosts => "http://node1.c3sl.ufpr.br:9200" hosts => "http://%(host)s:9200"
index => "ufpr-servidores-%(date)s" index => "ufpr-%(index)s-%(date)s"
workers => 1 workers => 1
} }
stdout {} stdout {}
......
...@@ -58,8 +58,8 @@ output { ...@@ -58,8 +58,8 @@ output {
action => "index" action => "index"
user => "%(user)s" user => "%(user)s"
password => "%(password)s" password => "%(password)s"
hosts => "http://node1.c3sl.ufpr.br:9200" hosts => "http://%(host)s:9200"
index => "ufpr-servidores-%(date)s" index => "ufpr-%(index)s-%(date)s"
workers => 1 workers => 1
} }
stdout {} stdout {}
......
...@@ -22,8 +22,8 @@ import sys, csv, json, math, subprocess ...@@ -22,8 +22,8 @@ import sys, csv, json, math, subprocess
from pathlib import Path from pathlib import Path
from subprocess import call from subprocess import call
if len(sys.argv) != 2: if len(sys.argv) != 3:
print("Usage: " + sys.argv[0] + " <config.json>") print("Usage: " + sys.argv[0] + " <config.json> <filter>")
sys.exit() sys.exit()
with open(sys.argv[1]) as f: with open(sys.argv[1]) as f:
...@@ -48,7 +48,7 @@ title1 = csv_1.pop(0) ...@@ -48,7 +48,7 @@ title1 = csv_1.pop(0)
file_exists = Path(file2) file_exists = Path(file2)
if not file_exists.is_file(): if not file_exists.is_file():
print("File2 does not exist. Calling script resume_register to create it...") print("File2 does not exist. Calling script resume_register to create it...")
call(["./resume_register.sh " + params['path'] + " " + params['date']], shell=True) call(["./resume_register.sh " + params['path'] + " " + params['date'] + " " + sys.argv[2]], shell=True)
with open(file2, newline='', encoding='Windows-1252') as f: with open(file2, newline='', encoding='Windows-1252') as f:
csv_2 = [ i for i in csv.reader(f, 'dialect') ] csv_2 = [ i for i in csv.reader(f, 'dialect') ]
......
...@@ -10,14 +10,15 @@ ...@@ -10,14 +10,15 @@
# Output: CSV file named YearMonthDay_Cadastro_Ufpr_Unique.csv, in the $path folder. # Output: CSV file named YearMonthDay_Cadastro_Ufpr_Unique.csv, in the $path folder.
# Example of CSV location (using same parameters as input): ../../data/workers/2016-10/20161031_Cadastro_Ufpr_Unique.csv # Example of CSV location (using same parameters as input): ../../data/workers/2016-10/20161031_Cadastro_Ufpr_Unique.csv
path=$1 if [ "$#" -ne 3 ]; then
date=$2 echo "Usage: $0 <path> <date> <filter>"
if [ "$#" -ne 2 ]; then
echo "Usage: $0 <path> <date>"
exit exit
fi fi
path=$1
date=$2
filter=$3
echo "Processing data with args = ${path} and ${date}" echo "Processing data with args = ${path} and ${date}"
input="${path}${date}_Cadastro.csv" input="${path}${date}_Cadastro.csv"
...@@ -36,4 +37,4 @@ columns="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27 ...@@ -36,4 +37,4 @@ columns="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27
# cat $input | egrep --binary-files=text "(UNIVERSIDADE FED*|Id_SERVIDOR_PORTAL NOME)" | sed -e 's/"//g' -e 's/^\|$/"/g' -e 's/\t/"\t"/g' | tr -d '\000' > $output # cat $input | egrep --binary-files=text "(UNIVERSIDADE FED*|Id_SERVIDOR_PORTAL NOME)" | sed -e 's/"//g' -e 's/^\|$/"/g' -e 's/\t/"\t"/g' | tr -d '\000' > $output
# Get only data from UFPR. # Get only data from UFPR.
cat $input | egrep --binary-files=text "(UNIVERSIDADE FEDERAL DO PARANA|Id_SERVIDOR_PORTAL NOME)" | tr -d '\000' > $output cat $input | egrep --binary-files=text "$filter" | tr -d '\000' > $output
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment