From 0f634a0623992fc6b4422abc8732dfec745d5180 Mon Sep 17 00:00:00 2001
From: Cristian Weiland <cw14@inf.ufpr.br>
Date: Fri, 17 Mar 2017 17:09:17 -0300
Subject: [PATCH] Issue #25: Fix more bugs and allow to insert multiple
 universities

Signed-off-by: Cristian Weiland <cw14@inf.ufpr.br>
---
 scripts/expenses/config.sh                    | 13 ++---
 scripts/expenses/create_expenses_config.py    |  9 ++--
 scripts/expenses/insert_expenses.sh           | 25 +++++----
 scripts/expenses/logstash_config.example      |  2 +-
 scripts/expenses/resume_expenses.sh           |  3 +-
 scripts/expenses/tmp.sh                       | 11 ++++
 scripts/travel_allowances/config.sh           | 13 ++---
 .../create_travel_allowance_config.py         |  9 ++--
 .../insert_travel_allowances.sh               | 25 +++++----
 .../travel_allowances/logstash_config.example |  2 +-
 .../resume_travel_allowance.sh                |  3 +-
 scripts/workers/config.json.example           |  2 +-
 scripts/workers/config.sh                     | 13 ++---
 scripts/workers/create_config.py              | 11 ++--
 scripts/workers/insert_register_payment.sh    | 42 +++++++++------
 scripts/workers/logstash_config.example       |  4 +-
 scripts/workers/logstash_config_2013.example  |  2 +-
 scripts/workers/merge_files_es.py             | 51 ++++++-------------
 scripts/workers/resume_register.sh            |  6 ++-
 19 files changed, 127 insertions(+), 119 deletions(-)
 create mode 100755 scripts/expenses/tmp.sh

diff --git a/scripts/expenses/config.sh b/scripts/expenses/config.sh
index 9700889..5c22af6 100644
--- a/scripts/expenses/config.sh
+++ b/scripts/expenses/config.sh
@@ -4,15 +4,12 @@
 
 index="gastos-pagamentos"
 
-# Filter: The string that will be used on 'egrep' to filter data to get only relevant universities.
-# Ex: Getting only UFPR:
-# filter="UNIVERSIDADE FEDERAL DO PARANA"
-# Getting UFPR and UFMG:
-# filter="UNIVERSIDADE FEDERAL DO PARANA|UNIVERSIDADE FEDERAL DE MINAS GERAIS"
-# Getting all universities:
-# filter="UNIVERSIDADE FEDERAL*"
+# Filter: An array of strings used by 'egrep' to keep only data from the relevant universities.
+# University: An array of initials (used in index names), positionally matching Filter.
+# Warning: The filter and university arrays must have the same length.
 
-filter="UNIVERSIDADE FEDERAL DO PARANA|UNIVERSIDADE FEDERAL DE MINAS GERAIS|UNIVERSIDADE FEDERAL DE SANTA CATARINA|UNIVERSIDADE FEDERAL DE PERNAMBUCO|UNIVERSIDADE FEDERAL DE SANTA MARIA"
+filter=("UNIVERSIDADE FEDERAL DO PARANA" "UNIVERSIDADE FEDERAL DE MINAS GERAIS" "UNIVERSIDADE FEDERAL DE SANTA CATARINA" "UNIVERSIDADE FEDERAL DE PERNAMBUCO" "UNIVERSIDADE FEDERAL DE SANTA MARIA")
+university=("ufpr" "ufmg" "ufsc" "ufpe" "ufsm")
 
 # Host: ElasticSearch's host. Ex: "localhost"
 
diff --git a/scripts/expenses/create_expenses_config.py b/scripts/expenses/create_expenses_config.py
index 0c86e6a..be852fb 100755
--- a/scripts/expenses/create_expenses_config.py
+++ b/scripts/expenses/create_expenses_config.py
@@ -10,8 +10,8 @@ import sys, csv, json, math, subprocess
 from pathlib import Path
 from subprocess import call
 
-if len(sys.argv) != 8:
-    print("Usage: " + sys.argv[0] + " <year (2016)> <month (01)> <day (31)> <index> <host> <username> <password>")
+if len(sys.argv) != 9:
+    print("Usage: " + sys.argv[0] + " <year (2016)> <month (01)> <day (31)> <index> <host> <university> <username> <password>")
     sys.exit()
 
 with open('logstash_config.example') as infile:
@@ -21,8 +21,9 @@ output = example % { "timestamp": sys.argv[3] + '/' + sys.argv[2] + '/' + sys.ar
 					 , "date": sys.argv[1] + '-' + sys.argv[2]
                      , "index": sys.argv[4]
                      , "host": sys.argv[5]
-					 , "user": sys.argv[6]
-					 , "password": sys.argv[7] }
+                     , "university": sys.argv[6]
+					 , "user": sys.argv[7]
+					 , "password": sys.argv[8] }
 
 with open('../../configs/expenses/logstash/config-' + sys.argv[1] + '-' + sys.argv[2], 'w') as outfile:
 	outfile.write(output)
diff --git a/scripts/expenses/insert_expenses.sh b/scripts/expenses/insert_expenses.sh
index 877b683..3ec22d5 100755
--- a/scripts/expenses/insert_expenses.sh
+++ b/scripts/expenses/insert_expenses.sh
@@ -46,20 +46,25 @@ request='http://arquivos.portaldatransparencia.gov.br/downloads.asp?a='${1}'&m='
 curl -o $path$ym/${1}${2}_GastosDiretos.zip $request -H 'Accept-Encoding: gzip, deflate, sdch' -H 'Accept-Language: en-US,en;q=0.8' -H 'Upgrade-Insecure-Requests: 1' -H 'User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' -H 'Referer: http://transparencia.gov.br/downloads/mensal.asp?c=GastosDiretos' -H 'Cookie: ASPSESSIONIDAQRABSAD=OJDLNBCANLIDINCHJHELHHFB; ASPSESSIONIDAQSDCQAD=BOKBKPNCDKOBJKGAMMEKADFL; _ga=GA1.3.1927288562.1481545643; ASPSESSIONIDSCSBBTCD=IGJLJBBCEEJBGLOOJKGNMHBH' -H 'Connection: keep-alive' --compressed
 
 # Unzip them
-unzip $path$ym/${1}${2}_GastosDiretos.zip -d $path$ym/
+unzip -o $path$ym/${1}${2}_GastosDiretos.zip -d $path$ym/
 
 # Remove zip file
 rm $path$ym/${1}${2}_GastosDiretos.zip
 
 source ./config.sh
 
-echo $filter
+length=${#filter[@]}
 
-# Step 2:
-./create_expenses_config.py $1 $2 $day $index $host $3 $4
-# Step 3:
-./resume_expenses.sh ../../data/expenses/ ${1}-${2} "$filter"
-# Step 4:
-logstash -f ../../configs/expenses/logstash/config-${1}-${2} < ../../data/expenses/processed/${1}${2}.csv
-# Data inserted, we can now remove it.
-rm ../../data/expenses/processed/${1}${2}.csv
+for (( i=0; i<${length}; i++ ));
+do
+    # Step 2:
+    ./create_expenses_config.py $1 $2 "$day" "$index" "$host" "${university[$i]}" $3 $4
+    # Step 3:
+    ./resume_expenses.sh "${path}" ${1}-${2} "${filter[$i]}"
+    # Step 4:
+    logstash -f ../../configs/expenses/logstash/config-${1}-${2} < ../../data/expenses/processed/${1}${2}.csv
+    # Data inserted, we can now remove it.
+    rm ../../data/expenses/processed/${1}${2}.csv
+done
+
+rm $path${1}-${2}/${1}${2}_GastosDiretos.csv
diff --git a/scripts/expenses/logstash_config.example b/scripts/expenses/logstash_config.example
index b1f680f..249d217 100644
--- a/scripts/expenses/logstash_config.example
+++ b/scripts/expenses/logstash_config.example
@@ -42,7 +42,7 @@ output {
 		user => "%(user)s"
 		password => "%(password)s"
 		hosts => "http://%(host)s:9200"
-		index => "ufpr-%(index)s-%(date)s"
+		index => "%(index)s-%(university)s-%(date)s"
 		workers => 1
 	}
 	stdout {}
diff --git a/scripts/expenses/resume_expenses.sh b/scripts/expenses/resume_expenses.sh
index a389304..38bd5d9 100755
--- a/scripts/expenses/resume_expenses.sh
+++ b/scripts/expenses/resume_expenses.sh
@@ -32,5 +32,4 @@ fi
 # - Grep removes everyone that does not work in UFPR.
 # - Tr removes null characters (ctrl + @).
 
-cat $input | egrep --binary-files=text "$filter" | tr -d '\000' > $output
-rm $input
+cat "$input" | egrep --binary-files=text "$filter" | tr -d '\000' > "$output"
diff --git a/scripts/expenses/tmp.sh b/scripts/expenses/tmp.sh
new file mode 100755
index 0000000..38f153c
--- /dev/null
+++ b/scripts/expenses/tmp.sh
@@ -0,0 +1,11 @@
+filter=("UNIVERSIDADE FEDERAL DO PARANA" "UNIVERSIDADE FEDERAL DE MINAS GERAIS" "UNIVERSIDADE FEDERAL DE SANTA CATARINA" "UNIVERSIDADE FEDERAL DE PERNAMBUCO" "UNIVERSIDADE FEDERAL DE SANTA MARIA")
+university=("ufpr" "ufmg" "ufsc" "ufpe" "ufsm")
+
+length=${#filter[@]}
+
+for (( i=0; i<${length}; i++ ));
+do
+    echo $i
+    echo ${filter[$i]}
+    echo ${university[$i]}
+done
diff --git a/scripts/travel_allowances/config.sh b/scripts/travel_allowances/config.sh
index a1897f5..3cf088a 100644
--- a/scripts/travel_allowances/config.sh
+++ b/scripts/travel_allowances/config.sh
@@ -4,15 +4,12 @@
 
 index="gastos-diarias"
 
-# Filter: The string that will be used on 'egrep' to filter data to get only relevant universities.
-# Ex: Getting only UFPR:
-# filter="UNIVERSIDADE FEDERAL DO PARANA"
-# Getting UFPR and UFMG:
-# filter="UNIVERSIDADE FEDERAL DO PARANA|UNIVERSIDADE FEDERAL DE MINAS GERAIS"
-# Getting all universities:
-# filter="UNIVERSIDADE FEDERAL*"
+# Filter: An array of strings used by 'egrep' to keep only data from the relevant universities.
+# University: An array of initials (used in index names), positionally matching Filter.
+# Warning: The filter and university arrays must have the same length.
 
-filter="UNIVERSIDADE FEDERAL DO PARANA|UNIVERSIDADE FEDERAL DE MINAS GERAIS|UNIVERSIDADE FEDERAL DE SANTA CATARINA|UNIVERSIDADE FEDERAL DE PERNAMBUCO|UNIVERSIDADE FEDERAL DE SANTA MARIA"
+filter=("UNIVERSIDADE FEDERAL DO PARANA" "UNIVERSIDADE FEDERAL DE MINAS GERAIS" "UNIVERSIDADE FEDERAL DE SANTA CATARINA" "UNIVERSIDADE FEDERAL DE PERNAMBUCO" "UNIVERSIDADE FEDERAL DE SANTA MARIA")
+university=("ufpr" "ufmg" "ufsc" "ufpe" "ufsm")
 
 # Host: ElasticSearch's host. Examples: "localhost"
 
diff --git a/scripts/travel_allowances/create_travel_allowance_config.py b/scripts/travel_allowances/create_travel_allowance_config.py
index 5743448..11f340f 100755
--- a/scripts/travel_allowances/create_travel_allowance_config.py
+++ b/scripts/travel_allowances/create_travel_allowance_config.py
@@ -10,8 +10,8 @@ import sys, csv, json, math, subprocess
 from pathlib import Path
 from subprocess import call
 
-if len(sys.argv) != 8:
-    print("Usage: " + sys.argv[0] + " <year (2016)> <month (01)> <day (31)> <index> <host> <username> <password>")
+if len(sys.argv) != 9:
+    print("Usage: " + sys.argv[0] + " <year (2016)> <month (01)> <day (31)> <index> <host> <university> <username> <password>")
     sys.exit()
 
 with open('logstash_config.example') as infile:
@@ -21,8 +21,9 @@ output = example % { "timestamp": sys.argv[3] + '/' + sys.argv[2] + '/' + sys.ar
 					 , "date": sys.argv[1] + '-' + sys.argv[2]
                      , "index": sys.argv[4]
                      , "host": sys.argv[5]
-					 , "user": sys.argv[6]
-					 , "password": sys.argv[7] }
+                     , "university": sys.argv[6]
+					 , "user": sys.argv[7]
+					 , "password": sys.argv[8] }
 
 with open('../../configs/travel_allowance/logstash/config-' + sys.argv[1] + '-' + sys.argv[2], 'w') as outfile:
 	outfile.write(output)
diff --git a/scripts/travel_allowances/insert_travel_allowances.sh b/scripts/travel_allowances/insert_travel_allowances.sh
index 844870e..fb2b6b3 100755
--- a/scripts/travel_allowances/insert_travel_allowances.sh
+++ b/scripts/travel_allowances/insert_travel_allowances.sh
@@ -45,20 +45,25 @@ mkdir -p $path$ym
 
 # Download files
 request='http://arquivos.portaldatransparencia.gov.br/downloads.asp?a='${1}'&m='${2}'&consulta=Diarias'
-curl -o $path$ym/${1}${2}_Diarias.zip $request -H 'Accept-Encoding: gzip, deflate, sdch' -H 'Accept-Language: en-US,en;q=0.8' -H 'Upgrade-Insecure-Requests: 1' -H 'User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' -H 'Referer: http://transparencia.gov.br/downloads/mensal.asp?c=GastosDiretos' -H 'Cookie: ASPSESSIONIDAQRABSAD=OJDLNBCANLIDINCHJHELHHFB; ASPSESSIONIDAQSDCQAD=BOKBKPNCDKOBJKGAMMEKADFL; _ga=GA1.3.1927288562.1481545643; ASPSESSIONIDSCSBBTCD=IGJLJBBCEEJBGLOOJKGNMHBH' -H 'Connection: keep-alive' --compressed
+curl $request -H 'Accept-Encoding: gzip, deflate, sdch' -H 'Accept-Language: en-US,en;q=0.8' -H 'Upgrade-Insecure-Requests: 1' -H 'User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' -H 'Referer: http://transparencia.gov.br/downloads/mensal.asp?c=GastosDiretos' -H 'Cookie: ASPSESSIONIDAQRABSAD=OJDLNBCANLIDINCHJHELHHFB; ASPSESSIONIDAQSDCQAD=BOKBKPNCDKOBJKGAMMEKADFL; _ga=GA1.3.1927288562.1481545643; ASPSESSIONIDSCSBBTCD=IGJLJBBCEEJBGLOOJKGNMHBH' -H 'Connection: keep-alive' --compressed > $path$ym/${1}${2}_Diarias.zip
 
 # Unzip them
-unzip $path$ym/${1}${2}_Diarias.zip -d $path$ym/
+unzip -o $path$ym/${1}${2}_Diarias.zip -d $path$ym/
 
 # Remove zip file
 rm $path$ym/${1}${2}_Diarias.zip
 
-# Step 2:
-./create_travel_allowance_config.py $1 $2 "$day" "$index" "$host" $3 $4
-# Step 3:
-./resume_travel_allowance.sh $path ${1}-${2} "$filter"
-# Step 4:
-logstash -f ../../configs/travel_allowance/logstash/config-${1}-${2} < ${path}processed/${1}${2}.csv
+length=${#filter[@]}
 
-# Remove processed file
-rm ${path}processed/${1}${2}.csv
+for (( i=0; i<${length}; i++ ));
+do
+    # Step 2:
+    ./create_travel_allowance_config.py $1 $2 "$day" "$index" "$host" "${university[$i]}" $3 $4
+    # Step 3:
+    echo "${filter[$i]}"
+    ./resume_travel_allowance.sh "$path" ${1}-${2} "${filter[$i]}"
+    # Step 4:
+    logstash -f ../../configs/travel_allowance/logstash/config-${1}-${2} < ${path}processed/${1}${2}.csv
+    # Remove processed file
+    rm ${path}processed/${1}${2}.csv
+done
diff --git a/scripts/travel_allowances/logstash_config.example b/scripts/travel_allowances/logstash_config.example
index 20dc7e9..43f956f 100644
--- a/scripts/travel_allowances/logstash_config.example
+++ b/scripts/travel_allowances/logstash_config.example
@@ -42,7 +42,7 @@ output {
 		user => "%(user)s"
 		password => "%(password)s"
 		hosts => "http://%(host)s:9200"
-		index => "ufpr-%(index)s-%(date)s"
+		index => "%(index)s-%(university)s-%(date)s"
 		workers => 1
 	}
 	stdout {}
diff --git a/scripts/travel_allowances/resume_travel_allowance.sh b/scripts/travel_allowances/resume_travel_allowance.sh
index ecac412..9e43825 100755
--- a/scripts/travel_allowances/resume_travel_allowance.sh
+++ b/scripts/travel_allowances/resume_travel_allowance.sh
@@ -13,6 +13,7 @@ path=$1
 date=$2
 # dateWithoutHyphen example: 201611
 dateWithoutHyphen=${date//-}
+filter=$3
 
 if [ "$#" -ne 3 ]; then
 	echo "Usage: $0 <path> <date> <filter>"
@@ -30,4 +31,4 @@ fi
 # - Grep removes everyone that does not work in UFPR.
 # - Tr removes null characters (ctrl + @).
 
-cat $input | egrep --binary-files=text "$filter" | tr -d '\000' > $output
+cat "$input" | egrep --binary-files=text "$filter" | tr -d '\000' > "$output"
diff --git a/scripts/workers/config.json.example b/scripts/workers/config.json.example
index 8dbce65..01a2d48 100644
--- a/scripts/workers/config.json.example
+++ b/scripts/workers/config.json.example
@@ -2,7 +2,7 @@
 	"path": "Dados_Servidores/2016-10/"
 	, "date": "20161031"
     , "file1" : "_Remuneracao.csv"
-    , "file2" : "_Cadastro_Ufpr_Unique.csv"
+    , "file2" : "_Cadastro_Unique.csv"
     , "idColumn1" : 2
     , "idColumn2" : 0
 	, "quotechar": "\""
diff --git a/scripts/workers/config.sh b/scripts/workers/config.sh
index b3fe041..08395d6 100644
--- a/scripts/workers/config.sh
+++ b/scripts/workers/config.sh
@@ -4,15 +4,12 @@
 
 index="servidores"
 
-# Filter: The string that will be used on 'egrep' to filter data to get only relevant universities.
-# Ex: Getting only UFPR:
-# filter="UNIVERSIDADE FEDERAL DO PARANA"
-# Getting UFPR and UFMG:
-# filter="UNIVERSIDADE FEDERAL DO PARANA|UNIVERSIDADE FEDERAL DE MINAS GERAIS"
-# Getting all universities:
-# filter="UNIVERSIDADE FEDERAL*"
+# Filter: An array of strings used by 'egrep' to keep only data from the relevant universities.
+# University: An array of initials (used in index names), positionally matching Filter.
+# Warning: The filter and university arrays must have the same length.
 
-filter="UNIVERSIDADE FEDERAL DO PARANA|UNIVERSIDADE FEDERAL DE MINAS GERAIS|UNIVERSIDADE FEDERAL DE SANTA CATARINA|UNIVERSIDADE FEDERAL DE PERNAMBUCO|UNIVERSIDADE FEDERAL DE SANTA MARIA"
+filter=("UNIVERSIDADE FEDERAL DO PARANA" "UNIVERSIDADE FEDERAL DE MINAS GERAIS" "UNIVERSIDADE FEDERAL DE SANTA CATARINA" "UNIVERSIDADE FEDERAL DE PERNAMBUCO" "UNIVERSIDADE FEDERAL DE SANTA MARIA")
+university=("ufpr" "ufmg" "ufsc" "ufpe" "ufsm")
 
 # Host: ElasticSearch's host. Examples: "localhost"
 
diff --git a/scripts/workers/create_config.py b/scripts/workers/create_config.py
index 9bc0b15..5594b03 100755
--- a/scripts/workers/create_config.py
+++ b/scripts/workers/create_config.py
@@ -12,15 +12,15 @@ import sys, csv, json, math, subprocess
 from pathlib import Path
 from subprocess import call
 
-if len(sys.argv) != 8:
-    print("Usage: " + sys.argv[0] + " <year (2016)> <month (01)> <day (31)> <index> <host> <username> <password>")
+if len(sys.argv) != 9:
+    print("Usage: " + sys.argv[0] + " <year (2016)> <month (01)> <day (31)> <index> <host> <university> <username> <password>")
     sys.exit()
 
 data = {
 	"path": "../../data/workers/" + sys.argv[1] + "-" + sys.argv[2] + "/"
 	, "date": sys.argv[1] + sys.argv[2] + sys.argv[3]
 	, "file1": "_Remuneracao.csv"
-	, "file2": "_Cadastro_Ufpr_Unique.csv"
+	, "file2": "_Cadastro_Unique.csv"
 	, "idColumn1": 2
 	, "idColumn2": 0
 	, "quotechar": "\""
@@ -43,8 +43,9 @@ output = example % { "timestamp": sys.argv[3] + '/' + sys.argv[2] + '/' + sys.ar
 					 , "date": sys.argv[1] + '-' + sys.argv[2]
                      , "index": sys.argv[4]
                      , "host": sys.argv[5]
-					 , "user": sys.argv[6]
-					 , "password": sys.argv[7] }
+                     , "university": sys.argv[6]
+					 , "user": sys.argv[7]
+					 , "password": sys.argv[8] }
 
 with open('../../configs/workers/logstash/config-' + sys.argv[1] + '-' + sys.argv[2], 'w') as outfile:
 	outfile.write(output)
diff --git a/scripts/workers/insert_register_payment.sh b/scripts/workers/insert_register_payment.sh
index cdc4ee3..7e286fd 100755
--- a/scripts/workers/insert_register_payment.sh
+++ b/scripts/workers/insert_register_payment.sh
@@ -43,13 +43,14 @@ fi
 # Step 1:
 # Create directory to store files
 mkdir -p $path$ym
+mkdir -p ${path}processed/
 
 # Download files
 request='http://arquivos.portaldatransparencia.gov.br/downloads.asp?a='${1}'&m='${2}'&d=C&consulta=Servidores'
-curl -o $path$ym/${1}${2}_Servidores.zip $request -H 'Accept-Encoding: gzip, deflate, sdch' -H 'Accept-Language: en-US,en;q=0.8' -H 'Upgrade-Insecure-Requests: 1' -H 'User-Agent: Mozilla/5.0 (X11; Linux x86_    64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' -H 'Referer: http://www.portaldatranspar    encia.gov.br/downloads/servidores.asp' -H 'Cookie: ASPSESSIONIDAQRABSAD=OJDLNBCANLIDINCHJHELHHFB; ASPSESSIONIDAQSDCQAD=BOKBKPNCDKOBJKGAMMEKADFL; _ga=GA1.3.1927288562.1481545643; ASPSESSIONIDSCSBBTCD=IGJLJBBC    EEJBGLOOJKGNMHBH' -H 'Connection: keep-alive' --compressed
+curl $request -H 'Accept-Encoding: gzip, deflate, sdch' -H 'Accept-Language: en-US,en;q=0.8' -H 'Upgrade-Insecure-Requests: 1' -H 'User-Agent: Mozilla/5.0 (X11; Linux x86_    64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' -H 'Referer: http://www.portaldatranspar    encia.gov.br/downloads/servidores.asp' -H 'Cookie: ASPSESSIONIDAQRABSAD=OJDLNBCANLIDINCHJHELHHFB; ASPSESSIONIDAQSDCQAD=BOKBKPNCDKOBJKGAMMEKADFL; _ga=GA1.3.1927288562.1481545643; ASPSESSIONIDSCSBBTCD=IGJLJBBC    EEJBGLOOJKGNMHBH' -H 'Connection: keep-alive' --compressed > $path$ym/${1}${2}_Servidores.zip
 
 # Unzip them
-unzip $path$ym/${1}${2}_Servidores.zip -d $path$ym/
+unzip -o $path$ym/${1}${2}_Servidores.zip -d $path$ym/
 
 # Remove zip file
 rm $path$ym/${1}${2}_Servidores.zip
@@ -57,17 +58,26 @@ rm $path$ym/${1}${2}_Servidores.zip
 # Get day
 day=$(ls $path$ym | grep -m 1 $1$2 | cut -c 7,8)
 
-# Step 2:
-# Create config files
-./create_config.py $1 $2 "$day" "$index" "$host" $3 $4
-
-# Step 3:
-# Start processing
-./merge_files_es.py ../../configs/workers/json/config-${1}-${2}.json "$filter"
-
-# Step 4:
-# Insert data in ElasticSearch
-logstash -f ../../configs/workers/logstash/config-${1}-${2} < ../../data/workers/processed/${1}${2}.csv
-
-# Remove data
-rm ../../data/workers/processed/${1}${2}.csv
+length=${#filter[@]}
+
+for (( i=0; i<${length}; i++ ));
+do
+    # Step 2:
+    # Create config files
+    ./create_config.py $1 $2 "$day" "$index" "$host" "${university[$i]}" $3 $4
+
+    # Step 3:
+    # Start processing
+    aux=$( echo "${filter[$i]}" | sed 's/ /\\ /g' )
+    ./merge_files_es.py ../../configs/workers/json/config-${1}-${2}.json "$aux"
+    echo "removing..."
+    rm $path$ym/${1}${2}${day}_Cadastro_Unique.csv
+    echo "success"
+
+    # Step 4:
+    # Insert data in ElasticSearch
+    logstash -f ../../configs/workers/logstash/config-${1}-${2} < ../../data/workers/processed/${1}${2}.csv
+
+    # Remove data
+    #rm ../../data/workers/processed/${1}${2}.csv
+done
diff --git a/scripts/workers/logstash_config.example b/scripts/workers/logstash_config.example
index 0383a39..b96a760 100644
--- a/scripts/workers/logstash_config.example
+++ b/scripts/workers/logstash_config.example
@@ -58,8 +58,8 @@ output {
 		action => "index"
 		user => "%(user)s"
 		password => "%(password)s"
-		hosts => "http://$(host):9200"
-		index => "ufpr-$(index)s-%(date)s"
+		hosts => "http://%(host)s:9200"
+		index => "%(index)s-%(university)s-%(date)s"
 		workers => 1
 	}
 	stdout {}
diff --git a/scripts/workers/logstash_config_2013.example b/scripts/workers/logstash_config_2013.example
index b7af5c0..7749904 100644
--- a/scripts/workers/logstash_config_2013.example
+++ b/scripts/workers/logstash_config_2013.example
@@ -59,7 +59,7 @@ output {
 		user => "%(user)s"
 		password => "%(password)s"
 		hosts => "http://$(host)s:9200"
-		index => "ufpr-%(index)s-%(date)s"
+		index => "%(index)s-%(university)s-%(date)s"
 		workers => 1
 	}
 	stdout {}
diff --git a/scripts/workers/merge_files_es.py b/scripts/workers/merge_files_es.py
index 68b3175..14cfab3 100755
--- a/scripts/workers/merge_files_es.py
+++ b/scripts/workers/merge_files_es.py
@@ -4,18 +4,18 @@
 
 # Script made to create a CSV that will be inserted in ElasticSearch.
 # This script is being used to merge two files: a Remuneration report (ex: 20161031_Remuneracao.csv) with a file that contains the Portal ID from UFPR people
-# (ex: 20161031_Cadastro_Ufpr_Unique.csv). This second file can be obtained filtering a Register report (ex: 20161031_Cadastro.csv) using resume_register.sh.
+# (ex: 20161031_Cadastro_Unique.csv). This second file can be obtained filtering a Register report (ex: 20161031_Cadastro.csv) using resume_register.sh.
 
 # Input: A configuration file, in the same format as the example. This configuration file can be generated by create_config.py.
 
 # Documentation of config.json.example:
 # - Variables ending with number 1 represent something from the first file, while the ones that end with number 2 represent the same thing in the second file.
-# - File1 and File2 are the files that will be merged. File1 name is "*_Cadastro_Ufpr_Unique.csv", File2 name is "*_Remuneracao.csv".
+# - File1 and File2 are the files that will be merged. File1 name is "*_Cadastro_Unique.csv", File2 name is "*_Remuneracao.csv".
 # - IdColumn1 and IdColumn2 represent the common column for each CSV (Ex: an ID column).
 # - Quotechar, Delimiter and LineTerminator are the CSV's quote char, delimiter and line terminator, respectively.
 # - OutputFile is the name of the output file (the result CSV).
 
-# Output: A CSV that will contain every row from the second file (*_Cadastro_Ufpr_Unique.csv). From the first file (*_Remuneracao.csv),
+# Output: A CSV that will contain every row from the second file (*_Cadastro_Unique.csv). From the first file (*_Remuneracao.csv),
 # I will get only data thats in the second file as well. This means some people in our data will not have data from Remuneracao.csv.
 
 import sys, csv, json, math, subprocess
@@ -78,51 +78,32 @@ def getDataWithEmptyRow(columns, row):
     return newRow
 
 result = []
-count = 0
 hits = 0
 errors = 0
-previous = 0
-progress = 0
-const = 100 / len(csv_2)
-
-print("Preparing data...")
 
 # Get number of columns in file 1
 columns1 = len(csv_1[0])
 
-# Separate id_point from useless data in file 2 and append points in result array.
+# Build a dictionary keyed by ID from file 1, so each row of file 2 is matched with one O(1) lookup instead of a linear scan.
+data = {}
+for row in csv_1:
+    data[row[idPointColumn1]] = row
 
 for row2 in csv_2:
-    count += 1
-    if(count % 10) == 0:
-        previous = progress
-        progress = math.floor(count * const)
-        if(progress != previous):
-            print(str(progress) + '% completed.')
-        #print(count)
-    # I have IdPoint. Find the correspondent one in the other csv
-    # and add data from file 2 to file 2.
-    found = False
-    for row1 in csv_1:
-        if row1[idPointColumn1] == row2[idPointColumn2]:
-            newRow = getDataFromRows(row1, row2)
-            # To make sure we wont get the same point twice.
-            row1[idPointColumn1] = -1;
-            row2[idPointColumn2] = -1;
-            result.append(newRow)
-            found = True
-            hits += 1
-            break
-    if not found:
+    if row2[idPointColumn2] in data:
+        newRow = getDataFromRows(data[row2[idPointColumn2]], row2)
+        # To make sure we wont get the same point twice.
+        del data[row2[idPointColumn2]]
+        row2[idPointColumn2] = -1;
+        result.append(newRow)
+        hits += 1
+    else:
 		# This guy was in the second file, but not in the first one. Add him, but with null values in the second file.
         newRow = getDataWithEmptyRow(columns1, row2)
         result.append(newRow)
         errors += 1
 
-count = 0
-const = 50 / len(csv_1)
-
-print("Number of rows in file 2 but not in file 1: " + str(errors))
+#print("Number of rows in file 2 but not in file 1: " + str(errors))
 print("Saving data...")
 
 with open(params['outputFile'], 'w', newline='') as csvfile:
diff --git a/scripts/workers/resume_register.sh b/scripts/workers/resume_register.sh
index ea36653..897055c 100755
--- a/scripts/workers/resume_register.sh
+++ b/scripts/workers/resume_register.sh
@@ -20,7 +20,7 @@ date=$2
 filter=$3
 
 input="${path}${date}_Cadastro.csv"
-output="${path}${date}_Cadastro_Ufpr_Unique.csv"
+output="${path}${date}_Cadastro_Unique.csv"
 
 if [ ! -d "${path}" ]; then
     mkdir -p "${path}"
@@ -35,5 +35,7 @@ fi
 # Get data from all universities.
 # cat $input | egrep --binary-files=text "(UNIVERSIDADE FED*)" | sed -e 's/"//g' -e 's/^\|$/"/g' -e 's/\t/"\t"/g' | tr -d '\000' > $output
 
+echo $filter
+
 # Get only data from UFPR.
-cat $input | egrep --binary-files=text "$filter" | tr -d '\000' > $output
+cat "$input" | egrep --binary-files=text "$filter" | tr -d '\000' > "$output"
-- 
GitLab