Skip to content
Snippets Groups Projects
Commit 5f970c9e authored by Cristian Weiland's avatar Cristian Weiland
Browse files

Issue #25: Fix bug getting last day of month in script and add config file


The config file has the following variables:
- Host: Host running ElasticSearch. Must not contain "http://".
- Index: The name of the index in which data will be inserted.
- Filter: The filter to get only part of Portal Transparencia's data.
Scripts were getting always the last day of the current month instead of the last day of the parameter's month.

Signed-off-by: Cristian Weiland <cw14@inf.ufpr.br>
parent 26244a79
No related branches found
No related tags found
No related merge requests found
Showing
with 127 additions and 51 deletions
# This file only contains some config variables:
# Index prefix: The prefix of the index in elasticsearch. Ex: gastos
index="gastos-pagamentos"
# Filter: The string that will be used on 'egrep' to filter data to get only relevant universities.
# Ex: Getting only UFPR:
# filter="UNIVERSIDADE FEDERAL DO PARANA"
# Getting UFPR and UFMG:
# filter="UNIVERSIDADE FEDERAL DO PARANA|UNIVERSIDADE FEDERAL DE MINAS GERAIS"
# Getting all universities:
# filter="UNIVERSIDADE FEDERAL*"
filter="UNIVERSIDADE FEDERAL DO PARANA|UNIVERSIDADE FEDERAL DE MINAS GERAIS|UNIVERSIDADE FEDERAL DE SANTA CATARINA|UNIVERSIDADE FEDERAL DE PERNAMBUCO|UNIVERSIDADE FEDERAL DE SANTA MARIA"
# Host: ElasticSearch's host. Ex: "localhost"
host="localhost"
...@@ -10,8 +10,8 @@ import sys, csv, json, math, subprocess ...@@ -10,8 +10,8 @@ import sys, csv, json, math, subprocess
from pathlib import Path from pathlib import Path
from subprocess import call from subprocess import call
if len(sys.argv) != 6: if len(sys.argv) != 8:
print("Usage: " + sys.argv[0] + " <year (2016)> <month (01)> <day (31)> <username> <password>") print("Usage: " + sys.argv[0] + " <year (2016)> <month (01)> <day (31)> <index> <host> <username> <password>")
sys.exit() sys.exit()
with open('logstash_config.example') as infile: with open('logstash_config.example') as infile:
...@@ -19,8 +19,10 @@ with open('logstash_config.example') as infile: ...@@ -19,8 +19,10 @@ with open('logstash_config.example') as infile:
output = example % { "timestamp": sys.argv[3] + '/' + sys.argv[2] + '/' + sys.argv[1] + ' 00:00:00' output = example % { "timestamp": sys.argv[3] + '/' + sys.argv[2] + '/' + sys.argv[1] + ' 00:00:00'
, "date": sys.argv[1] + '-' + sys.argv[2] , "date": sys.argv[1] + '-' + sys.argv[2]
, "user": sys.argv[4] , "index": sys.argv[4]
, "password": sys.argv[5] } , "host": sys.argv[5]
, "user": sys.argv[6]
, "password": sys.argv[7] }
with open('../../configs/expenses/logstash/config-' + sys.argv[1] + '-' + sys.argv[2], 'w') as outfile: with open('../../configs/expenses/logstash/config-' + sys.argv[1] + '-' + sys.argv[2], 'w') as outfile:
outfile.write(output) outfile.write(output)
...@@ -19,7 +19,7 @@ fi ...@@ -19,7 +19,7 @@ fi
# Getting the Last day of this month (Using date 2016-05-15 as example): # Getting the Last day of this month (Using date 2016-05-15 as example):
# First, get next month (201606). # First, get next month (201606).
aux=$(date +%Y%m -d "$(date +%Y%m15) next month") aux=$(date +%Y%m -d "$(date +${1}${2}15) next month")
# Append day 01 (20160601). # Append day 01 (20160601).
temp=$(date -d "${aux}01") temp=$(date -d "${aux}01")
# Remove 1 day: 20160531, get only day: 31. # Remove 1 day: 20160531, get only day: 31.
...@@ -50,9 +50,13 @@ unzip $path$ym/${1}${2}_GastosDiretos.zip -d $path$ym/ ...@@ -50,9 +50,13 @@ unzip $path$ym/${1}${2}_GastosDiretos.zip -d $path$ym/
# Remove zip file # Remove zip file
rm $path$ym/${1}${2}_GastosDiretos.zip rm $path$ym/${1}${2}_GastosDiretos.zip
source config.sh
# Step 2: # Step 2:
./create_expenses_config.py $1 $2 $day $3 $4 ./create_expenses_config.py $1 $2 $day $index $host $3 $4
# Step 3: # Step 3:
./resume_expenses.sh ../../data/expenses/ ${1}-${2} ./resume_expenses.sh ../../data/expenses/ ${1}-${2} $filter
# Step 4: # Step 4:
logstash -f ../../configs/expenses/logstash/config-${1}-${2} < ../../data/expenses/processed/${1}${2}.csv logstash -f ../../configs/expenses/logstash/config-${1}-${2} < ../../data/expenses/processed/${1}${2}.csv
# Data inserted, we can now remove it.
rm ../../data/expenses/processed/${1}${2}.csv
...@@ -41,8 +41,8 @@ output { ...@@ -41,8 +41,8 @@ output {
action => "index" action => "index"
user => "%(user)s" user => "%(user)s"
password => "%(password)s" password => "%(password)s"
hosts => "localhost:9200" hosts => "http://%(host)s:9200"
index => "ufpr-gastos-pagamentos-%(date)s" index => "ufpr-%(index)s-%(date)s"
workers => 1 workers => 1
} }
stdout {} stdout {}
......
...@@ -7,18 +7,20 @@ ...@@ -7,18 +7,20 @@
# Output: A CSV file in folder processed, filtering the data to get only relevant data (in our case, from UFPR). # Output: A CSV file in folder processed, filtering the data to get only relevant data (in our case, from UFPR).
if [ "$#" -ne 3 ]; then
echo "Usage: $0 <path> <date> <filter>"
exit
fi
# Path example: ../../data/expenses/ # Path example: ../../data/expenses/
path=$1 path=$1
# Date example: 2016-11 # Date example: 2016-11
date=$2 date=$2
# Filter example: UNIVERSIDADE FEDERAL DO PARANA
filter=$3
# dateWithoutHyphen example: 201611 # dateWithoutHyphen example: 201611
dateWithoutHyphen=${date//-} dateWithoutHyphen=${date//-}
if [ "$#" -ne 2 ]; then
echo "Usage: $0 <path> <date>"
exit
fi
echo "Processing data with args = $path and ${date}" echo "Processing data with args = $path and ${date}"
input="${path}${date}/${dateWithoutHyphen}_GastosDiretos.csv" input="${path}${date}/${dateWithoutHyphen}_GastosDiretos.csv"
...@@ -27,7 +29,6 @@ output="${path}processed/${dateWithoutHyphen}.csv" ...@@ -27,7 +29,6 @@ output="${path}processed/${dateWithoutHyphen}.csv"
# About this command: # About this command:
# - Grep removes everyone that does not work in UFPR. # - Grep removes everyone that does not work in UFPR.
# - Tr removes null characters (ctrl + @). # - Tr removes null characters (ctrl + @).
# - Head -n1 gets first line (column names). Then, I append the data.
head -n1 $input > $output cat $input | egrep --binary-files=text "$filter" | tr -d '\000' > $output
cat $input | egrep --binary-files=text "UNIVERSIDADE FEDERAL DO PARANA" | tr -d '\000' >> $output rm $input
# This file only contains some config variables:
# Index prefix: The prefix of the index in elasticsearch. Ex: gastos
index="gastos-diarias"
# Filter: The string that will be used on 'egrep' to filter data to get only relevant universities.
# Ex: Getting only UFPR:
# filter="UNIVERSIDADE FEDERAL DO PARANA"
# Getting UFPR and UFMG:
# filter="UNIVERSIDADE FEDERAL DO PARANA|UNIVERSIDADE FEDERAL DE MINAS GERAIS"
# Getting all universities:
# filter="UNIVERSIDADE FEDERAL*"
filter="UNIVERSIDADE FEDERAL DO PARANA|UNIVERSIDADE FEDERAL DE MINAS GERAIS|UNIVERSIDADE FEDERAL DE SANTA CATARINA|UNIVERSIDADE FEDERAL DE PERNAMBUCO|UNIVERSIDADE FEDERAL DE SANTA MARIA"
# Host: ElasticSearch's host. Examples: "localhost"
host="localhost"
...@@ -10,8 +10,8 @@ import sys, csv, json, math, subprocess ...@@ -10,8 +10,8 @@ import sys, csv, json, math, subprocess
from pathlib import Path from pathlib import Path
from subprocess import call from subprocess import call
if len(sys.argv) != 6: if len(sys.argv) != 8:
print("Usage: " + sys.argv[0] + " <year (2016)> <month (01)> <day (31)> <username> <password>") print("Usage: " + sys.argv[0] + " <year (2016)> <month (01)> <day (31)> <index> <host> <username> <password>")
sys.exit() sys.exit()
with open('logstash_config.example') as infile: with open('logstash_config.example') as infile:
...@@ -19,8 +19,10 @@ with open('logstash_config.example') as infile: ...@@ -19,8 +19,10 @@ with open('logstash_config.example') as infile:
output = example % { "timestamp": sys.argv[3] + '/' + sys.argv[2] + '/' + sys.argv[1] + ' 00:00:00' output = example % { "timestamp": sys.argv[3] + '/' + sys.argv[2] + '/' + sys.argv[1] + ' 00:00:00'
, "date": sys.argv[1] + '-' + sys.argv[2] , "date": sys.argv[1] + '-' + sys.argv[2]
, "user": sys.argv[4] , "index": sys.argv[4]
, "password": sys.argv[5] } , "host": sys.argv[5]
, "user": sys.argv[6]
, "password": sys.argv[7] }
with open('../../configs/travel_allowance/logstash/config-' + sys.argv[1] + '-' + sys.argv[2], 'w') as outfile: with open('../../configs/travel_allowance/logstash/config-' + sys.argv[1] + '-' + sys.argv[2], 'w') as outfile:
outfile.write(output) outfile.write(output)
...@@ -19,7 +19,7 @@ fi ...@@ -19,7 +19,7 @@ fi
# Getting the Last day of this month (Using date 2016-05-15 as example): # Getting the Last day of this month (Using date 2016-05-15 as example):
# First, get next month (201606). # First, get next month (201606).
aux=$(date +%Y%m -d "$(date +%Y%m15) next month") aux=$(date +%Y%m -d "$(date +${1}${2}15) next month")
# Append day 01 (20160601). # Append day 01 (20160601).
temp=$(date -d "${aux}01") temp=$(date -d "${aux}01")
# Remove 1 day: 20160531, get only day: 31. # Remove 1 day: 20160531, get only day: 31.
...@@ -30,6 +30,8 @@ dataPath="../../data/" ...@@ -30,6 +30,8 @@ dataPath="../../data/"
path="../../data/travel_allowance/" path="../../data/travel_allowance/"
configPath="../../configs/travel_allowance/logstash/" configPath="../../configs/travel_allowance/logstash/"
source config.sh
if [ ! -d "$dataPath" ]; then if [ ! -d "$dataPath" ]; then
mkdir "$dataPath" mkdir "$dataPath"
fi fi
...@@ -55,8 +57,11 @@ unzip $path$ym/${1}${2}_Diarias.zip -d $path$ym/ ...@@ -55,8 +57,11 @@ unzip $path$ym/${1}${2}_Diarias.zip -d $path$ym/
rm $path$ym/${1}${2}_Diarias.zip rm $path$ym/${1}${2}_Diarias.zip
# Step 2: # Step 2:
./create_travel_allowance_config.py $1 $2 $day $3 $4 ./create_travel_allowance_config.py $1 $2 $day $index $host $3 $4
# Step 3: # Step 3:
./resume_travel_allowance.sh $path ${1}-${2} ./resume_travel_allowance.sh $path ${1}-${2} $filter
# Step 4: # Step 4:
logstash -f ../../configs/travel_allowance/logstash/config-${1}-${2} < ${path}processed/${1}${2}.csv logstash -f ../../configs/travel_allowance/logstash/config-${1}-${2} < ${path}processed/${1}${2}.csv
# Remove processed file
rm ${path}processed/${1}${2}.csv
...@@ -41,8 +41,8 @@ output { ...@@ -41,8 +41,8 @@ output {
action => "index" action => "index"
user => "%(user)s" user => "%(user)s"
password => "%(password)s" password => "%(password)s"
hosts => "localhost:9200" hosts => "http://%(host)s:9200"
index => "ufpr-gastos-diarias-%(date)s" index => "ufpr-%(index)s-%(date)s"
workers => 1 workers => 1
} }
stdout {} stdout {}
......
...@@ -14,8 +14,8 @@ date=$2 ...@@ -14,8 +14,8 @@ date=$2
# dateWithoutHyphen example: 201611 # dateWithoutHyphen example: 201611
dateWithoutHyphen=${date//-} dateWithoutHyphen=${date//-}
if [ "$#" -ne 2 ]; then if [ "$#" -ne 3 ]; then
echo "Usage: $0 <path> <date>" echo "Usage: $0 <path> <date> <filter>"
exit exit
fi fi
...@@ -27,7 +27,5 @@ output="${path}processed/${dateWithoutHyphen}.csv" ...@@ -27,7 +27,5 @@ output="${path}processed/${dateWithoutHyphen}.csv"
# About this command: # About this command:
# - Grep removes everyone that does not work in UFPR. # - Grep removes everyone that does not work in UFPR.
# - Tr removes null characters (ctrl + @). # - Tr removes null characters (ctrl + @).
# - Head -n1 gets first line (column names). Then, I append the data.
head -n1 $input > $output cat $input | egrep --binary-files=text "$filter" | tr -d '\000' > $output
cat $input | egrep --binary-files=text "UNIVERSIDADE FEDERAL DO PARANA" | tr -d '\000' >> $output
# This file only contains some config variables:
# Index prefix: The prefix of the index in elasticsearch. Ex: gastos
index="servidores"
# Filter: The string that will be used on 'egrep' to filter data to get only relevant universities.
# Ex: Getting only UFPR:
# filter="UNIVERSIDADE FEDERAL DO PARANA"
# Getting UFPR and UFMG:
# filter="UNIVERSIDADE FEDERAL DO PARANA|UNIVERSIDADE FEDERAL DE MINAS GERAIS"
# Getting all universities:
# filter="UNIVERSIDADE FEDERAL*"
filter="UNIVERSIDADE FEDERAL DO PARANA|UNIVERSIDADE FEDERAL DE MINAS GERAIS|UNIVERSIDADE FEDERAL DE SANTA CATARINA|UNIVERSIDADE FEDERAL DE PERNAMBUCO|UNIVERSIDADE FEDERAL DE SANTA MARIA"
# Host: ElasticSearch's host. Examples: "localhost"
host="localhost"
...@@ -12,8 +12,8 @@ import sys, csv, json, math, subprocess ...@@ -12,8 +12,8 @@ import sys, csv, json, math, subprocess
from pathlib import Path from pathlib import Path
from subprocess import call from subprocess import call
if len(sys.argv) != 6: if len(sys.argv) != 8:
print("Usage: " + sys.argv[0] + " <year (2016)> <month (01)> <day (31)> <username> <password>") print("Usage: " + sys.argv[0] + " <year (2016)> <month (01)> <day (31)> <index> <host> <username> <password>")
sys.exit() sys.exit()
data = { data = {
...@@ -41,8 +41,10 @@ else: ...@@ -41,8 +41,10 @@ else:
output = example % { "timestamp": sys.argv[3] + '/' + sys.argv[2] + '/' + sys.argv[1] + ' 00:00:00' output = example % { "timestamp": sys.argv[3] + '/' + sys.argv[2] + '/' + sys.argv[1] + ' 00:00:00'
, "date": sys.argv[1] + '-' + sys.argv[2] , "date": sys.argv[1] + '-' + sys.argv[2]
, "user": sys.argv[4] , "index": sys.argv[4]
, "password": sys.argv[5] } , "host": sys.argv[5]
, "user": sys.argv[6]
, "password": sys.argv[7] }
with open('../../configs/workers/logstash/config-' + sys.argv[1] + '-' + sys.argv[2], 'w') as outfile: with open('../../configs/workers/logstash/config-' + sys.argv[1] + '-' + sys.argv[2], 'w') as outfile:
outfile.write(output) outfile.write(output)
...@@ -26,6 +26,8 @@ ym=$1-$2 ...@@ -26,6 +26,8 @@ ym=$1-$2
dataPath="../../data/" dataPath="../../data/"
path="../../data/workers/" path="../../data/workers/"
source config.sh
# Check if Data and Workers directories already exist: # Check if Data and Workers directories already exist:
if [ ! -d "$dataPath" ]; then if [ ! -d "$dataPath" ]; then
mkdir "$dataPath" mkdir "$dataPath"
...@@ -51,15 +53,17 @@ rm $path$ym/${1}${2}_Servidores.zip ...@@ -51,15 +53,17 @@ rm $path$ym/${1}${2}_Servidores.zip
# Get day # Get day
day=$(ls $path$ym | grep -m 1 $1$2 | cut -c 7,8) day=$(ls $path$ym | grep -m 1 $1$2 | cut -c 7,8)
# Step 2: # Step 2:
# Create config files # Create config files
./create_config.py $1 $2 $day $3 $4 ./create_config.py $1 $2 $day $index $host $3 $4
# Step 3: # Step 3:
# Start processing # Start processing
./merge_files_es.py ../../configs/workers/json/config-${1}-${2}.json ./merge_files_es.py ../../configs/workers/json/config-${1}-${2}.json $filter
# Step 4: # Step 4:
# Insert data in ElasticSearch # Insert data in ElasticSearch
logstash -f ../../configs/workers/logstash/config-${1}-${2} < ../../data/workers/processed/${1}${2}.csv logstash -f ../../configs/workers/logstash/config-${1}-${2} < ../../data/workers/processed/${1}${2}.csv
# Remove data
rm ../../data/workers/processed/${1}${2}.csv
...@@ -58,8 +58,8 @@ output { ...@@ -58,8 +58,8 @@ output {
action => "index" action => "index"
user => "%(user)s" user => "%(user)s"
password => "%(password)s" password => "%(password)s"
hosts => "http://node1.c3sl.ufpr.br:9200" hosts => "http://%(host)s:9200"
index => "ufpr-servidores-%(date)s" index => "ufpr-%(index)s-%(date)s"
workers => 1 workers => 1
} }
stdout {} stdout {}
......
...@@ -58,8 +58,8 @@ output { ...@@ -58,8 +58,8 @@ output {
action => "index" action => "index"
user => "%(user)s" user => "%(user)s"
password => "%(password)s" password => "%(password)s"
hosts => "http://node1.c3sl.ufpr.br:9200" hosts => "http://%(host)s:9200"
index => "ufpr-servidores-%(date)s" index => "ufpr-%(index)s-%(date)s"
workers => 1 workers => 1
} }
stdout {} stdout {}
......
...@@ -22,8 +22,8 @@ import sys, csv, json, math, subprocess ...@@ -22,8 +22,8 @@ import sys, csv, json, math, subprocess
from pathlib import Path from pathlib import Path
from subprocess import call from subprocess import call
if len(sys.argv) != 2: if len(sys.argv) != 3:
print("Usage: " + sys.argv[0] + " <config.json>") print("Usage: " + sys.argv[0] + " <config.json> <filter>")
sys.exit() sys.exit()
with open(sys.argv[1]) as f: with open(sys.argv[1]) as f:
...@@ -48,7 +48,7 @@ title1 = csv_1.pop(0) ...@@ -48,7 +48,7 @@ title1 = csv_1.pop(0)
file_exists = Path(file2) file_exists = Path(file2)
if not file_exists.is_file(): if not file_exists.is_file():
print("File2 does not exist. Calling script resume_register to create it...") print("File2 does not exist. Calling script resume_register to create it...")
call(["./resume_register.sh " + params['path'] + " " + params['date']], shell=True) call(["./resume_register.sh " + params['path'] + " " + params['date'] + " " + sys.argv[2]], shell=True)
with open(file2, newline='', encoding='Windows-1252') as f: with open(file2, newline='', encoding='Windows-1252') as f:
csv_2 = [ i for i in csv.reader(f, 'dialect') ] csv_2 = [ i for i in csv.reader(f, 'dialect') ]
......
...@@ -10,14 +10,15 @@ ...@@ -10,14 +10,15 @@
# Output: CSV file named YearMonthDay_Cadastro_Ufpr_Unique.csv, in the $path folder. # Output: CSV file named YearMonthDay_Cadastro_Ufpr_Unique.csv, in the $path folder.
# Example of CSV location (using same parameters as input): ../../data/workers/2016-10/20161031_Cadastro_Ufpr_Unique.csv # Example of CSV location (using same parameters as input): ../../data/workers/2016-10/20161031_Cadastro_Ufpr_Unique.csv
path=$1 if [ "$#" -ne 3 ]; then
date=$2 echo "Usage: $0 <path> <date> <filter>"
if [ "$#" -ne 2 ]; then
echo "Usage: $0 <path> <date>"
exit exit
fi fi
path=$1
date=$2
filter=$3
echo "Processing data with args = ${path} and ${date}" echo "Processing data with args = ${path} and ${date}"
input="${path}${date}_Cadastro.csv" input="${path}${date}_Cadastro.csv"
...@@ -36,4 +37,4 @@ columns="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27 ...@@ -36,4 +37,4 @@ columns="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27
# cat $input | egrep --binary-files=text "(UNIVERSIDADE FED*|Id_SERVIDOR_PORTAL NOME)" | sed -e 's/"//g' -e 's/^\|$/"/g' -e 's/\t/"\t"/g' | tr -d '\000' > $output # cat $input | egrep --binary-files=text "(UNIVERSIDADE FED*|Id_SERVIDOR_PORTAL NOME)" | sed -e 's/"//g' -e 's/^\|$/"/g' -e 's/\t/"\t"/g' | tr -d '\000' > $output
# Get only data from UFPR. # Get only data from UFPR.
cat $input | egrep --binary-files=text "(UNIVERSIDADE FEDERAL DO PARANA|Id_SERVIDOR_PORTAL NOME)" | tr -d '\000' > $output cat $input | egrep --binary-files=text "$filter" | tr -d '\000' > $output
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment