insert_register_payment.sh

#!/bin/bash

# Script to help using other scripts. Note that calling it to a data that has already been inserted will DUPLICATE it (which we probably dont want).

# This scripts does 4 things:
#   1- Download required files and store them in the right place.
#   2- Create config files via create_config.py
#   3- Merge CSV data and create a new CSV file via merge_files_es.py.
#   4- Insert CSV file generated in step 2 into ElasticSearch via Logstash.

# Input: Year, Month from CSV file, ElasticSearch's user and password.
# Example (inserting data from file 20130930_Cadastro.csv): ./insert_register_payment.sh 2013 09 myuser mypassword
# If you want to look at more examples, check add_registers.sh.

# Output: The same output as the scripts and commands called.

# WARNING: We get the day from the CSV file by using cut in characters 7 and 8. This means we assume they will write something like 01 as day 1. If they change it to 1, this script will not work!

if [ "$#" -ne 4 ]; then
	echo "Usage: $0 <year> <month> <user> <password>"
	echo "Example: $0 2016 12 myuser mypassword"
	exit
fi

source config.sh

if [ -z ${index+x} ]; then
    echo "Var 'index' is unset. Set it in file 'scripts/workers/config.sh'.";
    exit;
fi
if [ -z ${host+x} ]; then
    echo "Var 'host' is unset. Set it in file 'scripts/workers/config.sh'.";
    exit;
fi
size=${#filter[@]}
if [ "$size" -lt 1 ]; then
    echo "Var 'filter' is unset. Set it in file 'scripts/expenses/config.sh'.";
    exit;
fi

ym=$1-$2
dataPath="../../data/"
path="../../data/workers/"
configPath="../../configs/workers/"

# Check if Data and Workers directories already exist:
if [ ! -d "$path" ]; then
	mkdir -p "$path"
fi
if [ ! -d "$configPath/json" ]; then
	mkdir -p "$configPath/json"
fi
if [ ! -d "$configPath/logstash" ]; then
	mkdir -p "$configPath/logstash"
fi

# Step 1:
# Create directory to store files
mkdir -p $path$ym
mkdir -p ${path}processed/

# Download files
request='http://arquivos.portaldatransparencia.gov.br/downloads.asp?a='${1}'&m='${2}'&d=C&consulta=Servidores'
curl $request -H 'Accept-Encoding: gzip, deflate, sdch' -H 'Accept-Language: en-US,en;q=0.8' -H 'Upgrade-Insecure-Requests: 1' -H 'User-Agent: Mozilla/5.0 (X11; Linux x86_    64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' -H 'Referer: http://www.portaldatranspar    encia.gov.br/downloads/servidores.asp' -H 'Cookie: ASPSESSIONIDAQRABSAD=OJDLNBCANLIDINCHJHELHHFB; ASPSESSIONIDAQSDCQAD=BOKBKPNCDKOBJKGAMMEKADFL; _ga=GA1.3.1927288562.1481545643; ASPSESSIONIDSCSBBTCD=IGJLJBBC    EEJBGLOOJKGNMHBH' -H 'Connection: keep-alive' --compressed > $path$ym/${1}${2}_Servidores.zip

# Unzip them
unzip -o $path$ym/${1}${2}_Servidores.zip -d $path$ym/

# Remove zip file
rm $path$ym/${1}${2}_Servidores.zip

# Get day
day=$(ls $path$ym | grep -m 1 $1$2 | cut -c 7,8)

for key in "${!filter[@]}"
do
    # Step 2:
    # Create config files
    ./create_config.py $1 $2 "$day" "$index" "$host" "$key" $3 $4

    # Step 3:
    # Start processing
    aux=$( echo "${filter[$key]}" | sed 's/ /\\ /g' )
    ./merge_files_es.py ../../configs/workers/json/config-${1}-${2}.json "$aux"
    rm $path$ym/${1}${2}${day}_Cadastro_Unique.csv

    # Step 4:
    # Insert data in ElasticSearch
    logstash -f ../../configs/workers/logstash/config-${1}-${2} < ../../data/workers/processed/${1}${2}.csv

    # Remove data
    rm ../../data/workers/processed/${1}${2}.csv
done