diff --git a/.gitignore b/.gitignore index 751d80a41718c3506cdaeed25dca93d660b666bd..ef9ffce5a3513eab4c6abb309fcf1e2505eb28e8 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,3 @@ Dados_Servidores +logstash_configs +configs diff --git a/config.json.example b/config.json.example index f804ba63c6208ab89c85fa450f3d6edb3723bdbf..8dbce659a0b18040c139f0848f9d8229927b4bc7 100644 --- a/config.json.example +++ b/config.json.example @@ -1,16 +1,12 @@ { - "comment": "this file's documentation is in merge_csv_files.py" - , "path": "Dados_Servidores/2016-10/" + "path": "Dados_Servidores/2016-10/" , "date": "20161031" , "file1" : "_Remuneracao.csv" , "file2" : "_Cadastro_Ufpr_Unique.csv" , "idColumn1" : 2 , "idColumn2" : 0 - , "columnsToAdd1" : [2, 4, 5] - , "columnsToAdd2" : [] + , "quotechar": "\"" , "delimiter": "\t" , "lineterminator": "\n" - , "outputFile": "remuneracao+ufpr.csv" - , "notFoundFile1": "not_found_1.txt" - , "notFoundFile2": "not_found_2.txt" + , "outputFile": "Dados_Servidores/Processados/201610.csv" } diff --git a/configs/.config-2016-01.json.swp b/configs/.config-2016-01.json.swp deleted file mode 100644 index c1c86e6d326d89b4dc2794769f9da0f1717c63de..0000000000000000000000000000000000000000 Binary files a/configs/.config-2016-01.json.swp and /dev/null differ diff --git a/create_config.py b/create_config.py new file mode 100755 index 0000000000000000000000000000000000000000..24424f07bffabeb1483ee322342984b437117402 --- /dev/null +++ b/create_config.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python3 + +import sys, csv, json, math, subprocess +from pathlib import Path +from subprocess import call + +if len(sys.argv) != 4: + print("Usage: " + sys.argv[0] + " <year (2016)> <month (01)> <day (31)>") + sys.exit() + +data = { + "path": "Dados_Servidores/" + sys.argv[1] + "-" + sys.argv[2] + "/" + , "date": sys.argv[1] + sys.argv[2] + sys.argv[3] + , "file1": "_Remuneracao.csv" + , "file2": "_Cadastro_Ufpr_Unique.csv" + , "idColumn1": 2 + , "idColumn2": 0 + , "quotechar": "\"" + , "delimiter": "\t" + , "lineterminator": "\n" + , "outputFile": "Dados_Servidores/Processados/" + sys.argv[1] + sys.argv[2] + ".csv" +} + +with open('configs/config-' + sys.argv[1] + '-' + sys.argv[2] + '.json', 'w') as outfile: + json.dump(data, outfile, indent=4, sort_keys=True) + +with open('logstash_config.example') as infile: + example = infile.read() + +output = example % { "timestamp": sys.argv[3] + '/' + sys.argv[2] + '/' + sys.argv[1] + ' 00:00:00' + , "date": sys.argv[1] + '-' + sys.argv[2] } + +with open('logstash_configs/config-' + sys.argv[1] + '-' + sys.argv[2], 'w') as outfile: + outfile.write(output) diff --git a/insert_data.sh b/insert_data.sh new file mode 100755 index 0000000000000000000000000000000000000000..01a692499491c513ad4fcb4ab44d893222c70378 --- /dev/null +++ b/insert_data.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +echo "Usage: $0 <year> <month> <day>" + +./create_config.py $1 $2 $3 +./merge_files_es.py configs/config-${1}-${2}.json +logstash -f logstash_configs/config-${1}-${2} < ~/transparencia/Dados_Servidores/Processados/${1}${2}.csv diff --git a/logstash_config.example b/logstash_config.example new file mode 100644 index 0000000000000000000000000000000000000000..bfe3b5586dbe81c3818276b1e20f88c7f545eee4 --- /dev/null +++ b/logstash_config.example @@ -0,0 +1,66 @@ +input { + stdin { + codec => plain { + charset => "Windows-1252" + } + } +} + +filter { + csv { + columns => [ "ID_SERVIDOR_PORTAL", "NOME", "CPF", "MATRICULA", "DESCRICAO_CARGO", "CLASSE_CARGO", "REFERENCIA_CARGO", "PADRAO_CARGO", "NIVEL_CARGO", "SIGLA_FUNCAO", "NIVEL_FUNCAO", "FUNCAO", "CODIGO_ATIVIDADE", "ATIVIDADE", "OPCAO_PARCIAL", "COD_UORG_LOTACAO", "UORG_LOTACAO", "COD_ORG_LOTACAO", "ORG_LOTACAO", "COD_ORGSUP_LOTACAO", "ORGSUP_LOTACAO", "COD_UORG_EXERCICIO", "UORG_EXERCICIO", "COD_ORG_EXERCICIO", "ORG_EXERCICIO", "COD_ORGSUP_EXERCICIO", "ORGSUP_EXERCICIO", "TIPO_VINCULO", "SITUACAO_VINCULO", "DATA_INICIO_AFASTAMENTO", "DATA_TERMINO_AFASTAMENTO", "REGIME_JURIDICO", "JORNADA_DE_TRABALHO", "DATA_INGRESSO_CARGOFUNCAO", "DATA_NOMEACAO_CARGOFUNCAO", "DATA_INGRESSO_ORGAO", "DOCUMENTO_INGRESSO_SERVICOPUBLICO", "DATA_DIPLOMA_INGRESSO_SERVICOPUBLICO", "DIPLOMA_INGRESSO_CARGOFUNCAO", "DIPLOMA_INGRESSO_ORGAO", "DIPLOMA_INGRESSO_SERVICOPUBLICO", "UF_EXERCICIO", "ANO", "MES", "ID_SERVIDOR_PORTAL", "CPF", "NOME", "REMUNERAÇÃO BÃSICA BRUTA (R$)", "REMUNERAÇÃO BÃSICA BRUTA (U$)", "ABATE-TETO (R$)", "ABATE-TETO (U$)", "GRATIFICAÇÃO NATALINA (R$)", "GRATIFICAÇÃO NATALINA (U$)", "ABATE-TETO DA GRATIFICAÇÃO NATALINA (R$)", "ABATE-TETO DA GRATIFICAÇÃO NATALINA (U$)", "FÉRIAS (R$)", "FÉRIAS (U$)", "OUTRAS REMUNERAÇÕES EVENTUAIS (R$)", "OUTRAS REMUNERAÇÕES EVENTUAIS (U$)", "IRRF (R$)", "IRRF (U$)", "PSS/RPGS (R$)", "PSS/RPGS (U$)", "PENSÃO MILITAR (R$)", "PENSÃO MILITAR (U$)", "FUNDO DE SAÚDE (R$)", "FUNDO DE SAÚDE (U$)", "DEMAIS DEDUÇÕES (R$)", "DEMAIS DEDUÇÕES (U$)", "REMUNERAÇÃO APÓS DEDUÇÕES OBRIGATÓRIAS (R$)", "REMUNERAÇÃO APÓS DEDUÇÕES OBRIGATÓRIAS (U$)", "VERBAS INDENIZATÓRIAS REGISTRADAS EM SISTEMAS DE PESSOAL - CIVIL (R$)(*)", "VERBAS INDENIZATÓRIAS REGISTRADAS EM SISTEMAS DE PESSOAL - CIVIL (U$)(*) ", "VERBAS INDENIZATÓRIAS REGISTRADAS EM SISTEMAS DE PESSOAL - MILITAR (R$)(*)", "VERBAS INDENIZATÓRIAS REGISTRADAS EM SISTEMAS DE PESSOAL - MILITAR (U$)(*)", "TOTAL DE VERBAS INDENIZATÓRIAS (R$)(*)", "TOTAL DE VERBAS INDENIZATÓRIAS (U$)(*)", "TOTAL DE HONORÃRIOS (JETONS)"] + separator => " " + add_field => { "timestamp" => "%(timestamp)s" } + } + mutate { + convert => { "ANO" => "integer" } + convert => { "MES" => "integer" } + convert => { "REMUNERAÇÃO BÃSICA BRUTA (R$)" => "float" } + convert => { "REMUNERAÇÃO BÃSICA BRUTA (U$)" => "float" } + convert => { "ABATE-TETO (R$)" => "float" } + convert => { "ABATE-TETO (U$)" => "float" } + convert => { "GRATIFICAÇÃO NATALINA (R$)" => "float" } + convert => { "GRATIFICAÇÃO NATALINA (U$)" => "float" } + convert => { "ABATE-TETO DA GRATIFICAÇÃO NATALINA (R$)" => "float" } + convert => { "ABATE-TETO DA GRATIFICAÇÃO NATALINA (U$)" => "float" } + convert => { "FÉRIAS (R$)" => "float" } + convert => { "FÉRIAS (U$)" => "float" } + convert => { "OUTRAS REMUNERAÇÕES EVENTUAIS (R$)" => "float" } + convert => { "OUTRAS REMUNERAÇÕES EVENTUAIS (U$)" => "float" } + convert => { "IRRF (R$)" => "float" } + convert => { "IRRF (U$)" => "float" } + convert => { "PSS/RPGS (R$)" => "float" } + convert => { "PSS/RPGS (U$)" => "float" } + convert => { "PENSÃO MILITAR (R$)" => "float" } + convert => { "PENSÃO MILITAR (U$)" => "float" } + convert => { "FUNDO DE SAÚDE (R$)" => "float" } + convert => { "FUNDO DE SAÚDE (U$)" => "float" } + convert => { "DEMAIS DEDUÇÕES (R$)" => "float" } + convert => { "DEMAIS DEDUÇÕES (U$)" => "float" } + convert => { "REMUNERAÇÃO APÓS DEDUÇÕES OBRIGATÓRIAS (R$)" => "float" } + convert => { "REMUNERAÇÃO APÓS DEDUÇÕES OBRIGATÓRIAS (U$)" => "float" } + convert => { "VERBAS INDENIZATÓRIAS REGISTRADAS EM SISTEMAS DE PESSOAL - CIVIL (R$)(*)" => "float" } + convert => { "VERBAS INDENIZATÓRIAS REGISTRADAS EM SISTEMAS DE PESSOAL - CIVIL (U$)(*)" => "float" } + convert => { "VERBAS INDENIZATÓRIAS REGISTRADAS EM SISTEMAS DE PESSOAL - MILITAR (R$)(*)" => "float" } + convert => { "VERBAS INDENIZATÓRIAS REGISTRADAS EM SISTEMAS DE PESSOAL - MILITAR (U$)(*)" => "float" } + convert => { "TOTAL DE VERBAS INDENIZATÓRIAS (R$)(*)" => "float" } + convert => { "TOTAL DE VERBAS INDENIZATÓRIAS (U$)(*)" => "float" } + convert => { "TOTAL DE HONORÃRIOS (JETONS)" => "float" } + } + date { + match => [ "timestamp", "dd/MM/YYYY HH:mm:ss", "ISO8601" ] + target => [ "@timestamp" ] + } +} + +output { + elasticsearch { + action => "index" + user => "cw14" + password => "123mudar" + hosts => "http://node1.c3sl.ufpr.br:9200" + index => "ufpr-csv-%(date)s" + workers => 1 + } + stdout {} +} diff --git a/merge_files_es.py b/merge_files_es.py index 96a4e416ebc5d62ca6db6c6e33d883d4113e7198..48517fcd04d750d123d5ee36bab165b8b4834b7b 100755 --- a/merge_files_es.py +++ b/merge_files_es.py @@ -42,14 +42,6 @@ file2 = params['path'] + params['date'] + params['file2'] idPointColumn1 = params['idColumn1'] idPointColumn2 = params['idColumn2'] -# What columns are important from each file? -columnsToAdd1 = params['columnsToAdd1'] -columnsToAdd2 = params['columnsToAdd2'] - -# In which files should we save the data not found? -notFoundFile1 = open(params['path'] + params['notFoundFile1'], 'w') -notFoundFile2 = open(params['path'] + params['notFoundFile2'], 'w') - print("Reading files...") csv.register_dialect('dialect', lineterminator = params['lineterminator'], delimiter=params['delimiter'], quotechar=params['quotechar']) @@ -64,21 +56,13 @@ if not file_exists.is_file(): call(["./resumo_cadastro.sh " + params['path'] + " " + params['date']], shell=True) with open(file2, newline='', encoding='Windows-1252') as f: - #csv_2 = [ i for i in csv.reader(f, 'dialect') ] - #for x in csv.reader(f, 'dialect'): - # print(x) csv_2 = [ i for i in csv.reader(f, 'dialect') ] -# I do not do csv_2.pop(0) because in this file we are not using a header. title2 = csv_2.pop(0) # Having data from both files, I have to merge them. def getDataFromRows(row1, row2): newRow = [] - #for column in columnsToAdd1: - # newRow.append(row1[column]) - #for column in columnsToAdd2: - # newRow.append(row2[column]) for value in row2: newRow.append(value) for value in row1: @@ -96,11 +80,10 @@ def getDataWithEmptyRow(columns, row): result = [] count = 0 hits = 0 -error1 = 0 -error2 = 0 +errors = 0 previous = 0 progress = 0 -const = 50 / len(csv_2) +const = 100 / len(csv_2) print("Preparing data...") @@ -133,48 +116,18 @@ for row2 in csv_2: break if not found: # This guy was in the second file, but not in the first one. Add him, but with null values in the second file. - #print("Error finding field " + row2[idPointColumn2]) newRow = getDataWithEmptyRow(columns1, row2) result.append(newRow) - notFoundFile1.write(row2[idPointColumn2] + '\n') - error2 += 1 + errors += 1 count = 0 const = 50 / len(csv_1) -# Check if there was a point in csv_1 but not in csv_2. This is not useful anymore. It never printed 'found one'. -""" -for row1 in csv_1: - count += 1 - if(count % 10) == 0: - previous = progress - progress = math.floor(count * const) + 50 - if(progress != previous): - print(str(progress) + '% completed.') - if row1[idPointColumn1] != -1: - #print(row1[idPointColumn1]) - found = False - for row2 in csv_2: - if row1[idPointColumn1] == row2[idPointColumn2]: - print('found one') - newRow = getDataFromRows(row1, row2) - result.append(newRow) - found = True - break - if not found: - #print("Error finding field " + row1[idPointColumn1]) - error1 += 1 - -print("Number of rows in file 1 but not in file 2: " + str(error1)) -""" -print("Number of rows in file 2 but not in file 1: " + str(error2)) +print("Number of rows in file 2 but not in file 1: " + str(errors)) print("Saving data to file result.csv...") -with open(params['path'] + params['outputFile'], 'w', newline='') as csvfile: +with open(params['outputFile'], 'w', newline='') as csvfile: writer = csv.writer(csvfile, delimiter='\t') writer.writerow(getDataFromRows(title1, title2)) writer.writerows(result) - -notFoundFile1.close() -notFoundFile2.close()