From 8e9349742937d360d93481b0bee8ad683462cd25 Mon Sep 17 00:00:00 2001 From: Cristian Weiland <cw14@inf.ufpr.br> Date: Tue, 31 Jan 2017 09:22:02 -0200 Subject: [PATCH] Add base scripts and config files Signed-off-by: Cristian Weiland <cw14@inf.ufpr.br> --- .gitignore | 1 + config.json.example | 16 +++ configs/.config-2016-01.json.swp | Bin 0 -> 12288 bytes configs/config-2016-01.json | 16 +++ configs/config-2016-02.json | 16 +++ configs/config-2016-03.json | 16 +++ configs/config-2016-04.json | 16 +++ configs/config-2016-05.json | 16 +++ configs/config-2016-06.json | 16 +++ configs/config-2016-07.json | 16 +++ configs/config-2016-08.json | 16 +++ configs/config-2016-09.json | 16 +++ configs/config-2016-10.json | 16 +++ configs/config-2016-11.json | 16 +++ configs/generate_config.sh | 6 ++ merge_files_es.py | 180 +++++++++++++++++++++++++++++++ resumo_cadastro.sh | 42 ++++++++ 17 files changed, 421 insertions(+) create mode 100644 .gitignore create mode 100644 config.json.example create mode 100644 configs/.config-2016-01.json.swp create mode 100644 configs/config-2016-01.json create mode 100644 configs/config-2016-02.json create mode 100644 configs/config-2016-03.json create mode 100644 configs/config-2016-04.json create mode 100644 configs/config-2016-05.json create mode 100644 configs/config-2016-06.json create mode 100644 configs/config-2016-07.json create mode 100644 configs/config-2016-08.json create mode 100644 configs/config-2016-09.json create mode 100644 configs/config-2016-10.json create mode 100644 configs/config-2016-11.json create mode 100644 configs/generate_config.sh create mode 100755 merge_files_es.py create mode 100755 resumo_cadastro.sh diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..751d80a --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +Dados_Servidores diff --git a/config.json.example b/config.json.example new file mode 100644 index 0000000..f804ba6 --- /dev/null +++ b/config.json.example @@ -0,0 +1,16 @@ +{ + "comment": 
"this file's documentation is in merge_csv_files.py" + , "path": "Dados_Servidores/2016-10/" + , "date": "20161031" + , "file1" : "_Remuneracao.csv" + , "file2" : "_Cadastro_Ufpr_Unique.csv" + , "idColumn1" : 2 + , "idColumn2" : 0 + , "columnsToAdd1" : [2, 4, 5] + , "columnsToAdd2" : [] + , "delimiter": "\t" + , "lineterminator": "\n" + , "outputFile": "remuneracao+ufpr.csv" + , "notFoundFile1": "not_found_1.txt" + , "notFoundFile2": "not_found_2.txt" +} diff --git a/configs/.config-2016-01.json.swp b/configs/.config-2016-01.json.swp new file mode 100644 index 0000000000000000000000000000000000000000..c1c86e6d326d89b4dc2794769f9da0f1717c63de GIT binary patch literal 12288 zcmeI2&1w}f6vtC{B3047j1gSj-uY;?(4AC@;>SY0BKR?+lUy5_o6OB*svzns_y~Os zm*P{n@flpXa3P+gcP@@db!+(__!H)w|4DLwyBTITdwBCsvN_&h6lWQ`Fn|B(d%DX$ zY%*3k-FhudC><65>U-N8Q}2Xn8{w3xv`8y!ruuoiSl5ayzq*#?;}@;{>(NJD3pEe` zfdPS4wqE2}NL}I=lk?{$gDM_C00ck)1V8`;KmY_l00fRV0pF~!cQoRO#hCO7AjZmX z>*5g$2!H?xfB*=900@8p2!H?xfB*=9z<)?UNXEXdGIskkU7lY4`}hAJXBhiT`b0V) zdD1h|4bo-ODbhFU`9k_gIv~9y-6e&+Zjlfm00JNY0w4eaAOHd&00JNY0>_iU8<tRv z5^k*Dv7M1Sx>g0>iivU>GG#ILuRQ;gF&`T9fzfup>3rBPxSVRc+K;Ptf4Nhwjq=LP zv=QFAV0vPfMX72%(^QUmI{qYGa^6|5sy%U-<V$m9>uzS+2lkqj1y8n;$4~p~{#wdo zsTd_2qvXocq2+dLkBwR1*Joo-(PyWT4jsVDZ6QVLoh>KR#+4JJ=baji7re2Xm-p1H xGs=ld*h5PgMhYLvgn#F=d_AWTc_aK@82Gx7wk_`~x38robm`&;`;@cS>?h8Yr|<v( literal 0 HcmV?d00001 diff --git a/configs/config-2016-01.json b/configs/config-2016-01.json new file mode 100644 index 0000000..79f65e9 --- /dev/null +++ b/configs/config-2016-01.json @@ -0,0 +1,16 @@ +{ + "path": "Dados_Servidores/2016-01/" + , "date": "20160131" + , "file1" : "_Remuneracao.csv" + , "file2" : "_Cadastro_Ufpr_Unique.csv" + , "idColumn1" : 2 + , "idColumn2" : 0 + , "columnsToAdd1" : [2, 4, 5] + , "columnsToAdd2" : [] + , "quotechar": "\"" + , "delimiter": "\t" + , "lineterminator": "\n" + , "outputFile": "result.csv" + , "notFoundFile1": "not_found_1.txt" + , "notFoundFile2": "not_found_2.txt" +} diff --git 
a/configs/config-2016-02.json b/configs/config-2016-02.json new file mode 100644 index 0000000..f5b11af --- /dev/null +++ b/configs/config-2016-02.json @@ -0,0 +1,16 @@ +{ + "path": "Dados_Servidores/2016-02/" + , "date": "20160229" + , "file1" : "_Remuneracao.csv" + , "file2" : "_Cadastro_Ufpr_Unique.csv" + , "idColumn1" : 2 + , "idColumn2" : 0 + , "columnsToAdd1" : [2, 4, 5] + , "columnsToAdd2" : [] + , "quotechar": "\"" + , "delimiter": "\t" + , "lineterminator": "\n" + , "outputFile": "result.csv" + , "notFoundFile1": "not_found_1.txt" + , "notFoundFile2": "not_found_2.txt" +} diff --git a/configs/config-2016-03.json b/configs/config-2016-03.json new file mode 100644 index 0000000..08db9b1 --- /dev/null +++ b/configs/config-2016-03.json @@ -0,0 +1,16 @@ +{ + "path": "Dados_Servidores/2016-03/" + , "date": "20160331" + , "file1" : "_Remuneracao.csv" + , "file2" : "_Cadastro_Ufpr_Unique.csv" + , "idColumn1" : 2 + , "idColumn2" : 0 + , "columnsToAdd1" : [2, 4, 5] + , "columnsToAdd2" : [] + , "quotechar": "\"" + , "delimiter": "\t" + , "lineterminator": "\n" + , "outputFile": "result.csv" + , "notFoundFile1": "not_found_1.txt" + , "notFoundFile2": "not_found_2.txt" +} diff --git a/configs/config-2016-04.json b/configs/config-2016-04.json new file mode 100644 index 0000000..b1c6c2e --- /dev/null +++ b/configs/config-2016-04.json @@ -0,0 +1,16 @@ +{ + "path": "Dados_Servidores/2016-04/" + , "date": "20160430" + , "file1" : "_Remuneracao.csv" + , "file2" : "_Cadastro_Ufpr_Unique.csv" + , "idColumn1" : 2 + , "idColumn2" : 0 + , "columnsToAdd1" : [2, 4, 5] + , "columnsToAdd2" : [] + , "quotechar": "\"" + , "delimiter": "\t" + , "lineterminator": "\n" + , "outputFile": "result.csv" + , "notFoundFile1": "not_found_1.txt" + , "notFoundFile2": "not_found_2.txt" +} diff --git a/configs/config-2016-05.json b/configs/config-2016-05.json new file mode 100644 index 0000000..7fd76de --- /dev/null +++ b/configs/config-2016-05.json @@ -0,0 +1,16 @@ +{ + "path": 
"Dados_Servidores/2016-05/" + , "date": "20160531" + , "file1" : "_Remuneracao.csv" + , "file2" : "_Cadastro_Ufpr_Unique.csv" + , "idColumn1" : 2 + , "idColumn2" : 0 + , "columnsToAdd1" : [2, 4, 5] + , "columnsToAdd2" : [] + , "quotechar": "\"" + , "delimiter": "\t" + , "lineterminator": "\n" + , "outputFile": "result.csv" + , "notFoundFile1": "not_found_1.txt" + , "notFoundFile2": "not_found_2.txt" +} diff --git a/configs/config-2016-06.json b/configs/config-2016-06.json new file mode 100644 index 0000000..0d8123e --- /dev/null +++ b/configs/config-2016-06.json @@ -0,0 +1,16 @@ +{ + "path": "Dados_Servidores/2016-06/" + , "date": "20160630" + , "file1" : "_Remuneracao.csv" + , "file2" : "_Cadastro_Ufpr_Unique.csv" + , "idColumn1" : 2 + , "idColumn2" : 0 + , "columnsToAdd1" : [2, 4, 5] + , "columnsToAdd2" : [] + , "quotechar": "\"" + , "delimiter": "\t" + , "lineterminator": "\n" + , "outputFile": "result.csv" + , "notFoundFile1": "not_found_1.txt" + , "notFoundFile2": "not_found_2.txt" +} diff --git a/configs/config-2016-07.json b/configs/config-2016-07.json new file mode 100644 index 0000000..eb70d53 --- /dev/null +++ b/configs/config-2016-07.json @@ -0,0 +1,16 @@ +{ + "path": "Dados_Servidores/2016-07/" + , "date": "20160731" + , "file1" : "_Remuneracao.csv" + , "file2" : "_Cadastro_Ufpr_Unique.csv" + , "idColumn1" : 2 + , "idColumn2" : 0 + , "columnsToAdd1" : [2, 4, 5] + , "columnsToAdd2" : [] + , "quotechar": "\"" + , "delimiter": "\t" + , "lineterminator": "\n" + , "outputFile": "result.csv" + , "notFoundFile1": "not_found_1.txt" + , "notFoundFile2": "not_found_2.txt" +} diff --git a/configs/config-2016-08.json b/configs/config-2016-08.json new file mode 100644 index 0000000..a3cd033 --- /dev/null +++ b/configs/config-2016-08.json @@ -0,0 +1,16 @@ +{ + "path": "Dados_Servidores/2016-08/" + , "date": "20160831" + , "file1" : "_Remuneracao.csv" + , "file2" : "_Cadastro_Ufpr_Unique.csv" + , "idColumn1" : 2 + , "idColumn2" : 0 + , "columnsToAdd1" : [2, 4, 5] + , 
"columnsToAdd2" : [] + , "quotechar": "\"" + , "delimiter": "\t" + , "lineterminator": "\n" + , "outputFile": "result.csv" + , "notFoundFile1": "not_found_1.txt" + , "notFoundFile2": "not_found_2.txt" +} diff --git a/configs/config-2016-09.json b/configs/config-2016-09.json new file mode 100644 index 0000000..ea230f3 --- /dev/null +++ b/configs/config-2016-09.json @@ -0,0 +1,16 @@ +{ + "path": "Dados_Servidores/2016-09/" + , "date": "20160930" + , "file1" : "_Remuneracao.csv" + , "file2" : "_Cadastro_Ufpr_Unique.csv" + , "idColumn1" : 2 + , "idColumn2" : 0 + , "columnsToAdd1" : [2, 4, 5] + , "columnsToAdd2" : [] + , "quotechar": "\"" + , "delimiter": "\t" + , "lineterminator": "\n" + , "outputFile": "result.csv" + , "notFoundFile1": "not_found_1.txt" + , "notFoundFile2": "not_found_2.txt" +} diff --git a/configs/config-2016-10.json b/configs/config-2016-10.json new file mode 100644 index 0000000..c6e61ed --- /dev/null +++ b/configs/config-2016-10.json @@ -0,0 +1,16 @@ +{ + "path": "Dados_Servidores/2016-10/" + , "date": "20161031" + , "file1" : "_Remuneracao.csv" + , "file2" : "_Cadastro_Ufpr_Unique.csv" + , "idColumn1" : 2 + , "idColumn2" : 0 + , "columnsToAdd1" : [2, 4, 5] + , "columnsToAdd2" : [] + , "quotechar": "\"" + , "delimiter": "\t" + , "lineterminator": "\n" + , "outputFile": "result.csv" + , "notFoundFile1": "not_found_1.txt" + , "notFoundFile2": "not_found_2.txt" +} diff --git a/configs/config-2016-11.json b/configs/config-2016-11.json new file mode 100644 index 0000000..9db5dce --- /dev/null +++ b/configs/config-2016-11.json @@ -0,0 +1,16 @@ +{ + "path": "Dados_Servidores/2016-11/" + , "date": "20161130" + , "file1" : "_Remuneracao.csv" + , "file2" : "_Cadastro_Ufpr_Unique.csv" + , "idColumn1" : 2 + , "idColumn2" : 0 + , "columnsToAdd1" : [2, 4, 5] + , "columnsToAdd2" : [] + , "quotechar": "\"" + , "delimiter": "\t" + , "lineterminator": "\n" + , "outputFile": "result.csv" + , "notFoundFile1": "not_found_1.txt" + , "notFoundFile2": "not_found_2.txt" 
+} diff --git a/configs/generate_config.sh b/configs/generate_config.sh new file mode 100644 index 0000000..22810d5 --- /dev/null +++ b/configs/generate_config.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +read -p "What is the script's date? [YYYY-MM-DD]: " date +case "$date" in + +esac diff --git a/merge_files_es.py b/merge_files_es.py new file mode 100755 index 0000000..96a4e41 --- /dev/null +++ b/merge_files_es.py @@ -0,0 +1,180 @@ +#!/usr/bin/env python3 + +""" +Versão feita visando inserção no ElasticSearch. +Isso significa que eu vou escrever TODOS os dados que eu achar do segundo arquivo, mas do primeiro arquivo só escrevo os que estiverem no segundo. + + +Recebe como parâmetro um arquivo de configuração, no mesmo formato que o exemplo. + +Documentação do config.json.example: +file1 and file2 are the files that will be merged. +The variables that end with number 1 represent something in the first file and the ones that end with 2 represent the same thing in the second file. +idColumn represents the common column in both files. +columnsToAdd1 are the ids of the columns that will be printed in the output file. + We might want to add columns 4, 13, 16 and 22 in columnsToAdd2, but this does not work right now. +delimiter is the CSV's delimiter. +lineterminator is the CSV's line terminator. +outputFile is the name of the output file. +notFoundFile is the name of a file with errors: they represent columns that were in one file but not in the other. In this case, notFoundFile1 are the columns that are in the second file but not in the first file. + + +Nesse momento, ele tá sendo usado pra unir dois arquivos: um relatório de Remuneração (ex: 201610_Remuneracao.csv) com um arquivo que contém o ID do portal das pessoas da UFPR. +Esse segundo arquivo pode ser obtido a partir da filtragem do arquivo de Cadastros (ex: 201610_Cadastros.csv). A filtragem é feita com o resumo_cadastro.sh. 
+""" + +import sys, csv, json, math, subprocess +from pathlib import Path +from subprocess import call + +if len(sys.argv) != 2: + print("Usage: " + sys.argv[0] + " <config.json>") + sys.exit() + +with open(sys.argv[1]) as f: + params = json.load(f) + +# Which files should be merged? +file1 = params['path'] + params['date'] + params['file1'] +file2 = params['path'] + params['date'] + params['file2'] + +# Which column in each file contains the common column? +idPointColumn1 = params['idColumn1'] +idPointColumn2 = params['idColumn2'] + +# What columns are important from each file? +columnsToAdd1 = params['columnsToAdd1'] +columnsToAdd2 = params['columnsToAdd2'] + +# In which files should we save the data not found? +notFoundFile1 = open(params['path'] + params['notFoundFile1'], 'w') +notFoundFile2 = open(params['path'] + params['notFoundFile2'], 'w') + +print("Reading files...") + +csv.register_dialect('dialect', lineterminator = params['lineterminator'], delimiter=params['delimiter'], quotechar=params['quotechar']) + +with open(file1, newline='', encoding='Windows-1252') as f: + csv_1 = [ i for i in csv.reader(f, 'dialect') ] +title1 = csv_1.pop(0) + +file_exists = Path(file2) +if not file_exists.is_file(): + print("File2 does not exist. Calling script to create it...") + call(["./resumo_cadastro.sh " + params['path'] + " " + params['date']], shell=True) + +with open(file2, newline='', encoding='Windows-1252') as f: + #csv_2 = [ i for i in csv.reader(f, 'dialect') ] + #for x in csv.reader(f, 'dialect'): + # print(x) + csv_2 = [ i for i in csv.reader(f, 'dialect') ] +# I do not do csv_2.pop(0) because in this file we are not using a header. +title2 = csv_2.pop(0) + +# Having data from both files, I have to merge them. 
+ +def getDataFromRows(row1, row2): + newRow = [] + #for column in columnsToAdd1: + # newRow.append(row1[column]) + #for column in columnsToAdd2: + # newRow.append(row2[column]) + for value in row2: + newRow.append(value) + for value in row1: + newRow.append(value) + return newRow + +def getDataWithEmptyRow(columns, row): + newRow = [] + for value in row: + newRow.append(value) + for i in range(0, columns): + newRow.append('') + return newRow + +result = [] +count = 0 +hits = 0 +error1 = 0 +error2 = 0 +previous = 0 +progress = 0 +const = 50 / len(csv_2) + +print("Preparing data...") + +# Get number of columns in file 1 +columns1 = len(csv_1[0]) + +# Separate id_point from useless data in file 2 and append points in result array. +# This for takes about 50% of the total time. + +for row2 in csv_2: + count += 1 + if(count % 10) == 0: + previous = progress + progress = math.floor(count * const) + if(progress != previous): + print(str(progress) + '% completed.') + #print(count) + # I have IdPoint. Find the correspondent one in the other csv + # and add data from file 1 to file 2. + found = False + for row1 in csv_1: + if row1[idPointColumn1] == row2[idPointColumn2]: + newRow = getDataFromRows(row1, row2) + # To make sure we won't get the same point twice. + row1[idPointColumn1] = -1; + row2[idPointColumn2] = -1; + result.append(newRow) + found = True + hits += 1 + break + if not found: + # This guy was in the second file, but not in the first one. Add him, but with empty values in the columns coming from the first file. + #print("Error finding field " + row2[idPointColumn2]) + newRow = getDataWithEmptyRow(columns1, row2) + result.append(newRow) + notFoundFile1.write(row2[idPointColumn2] + '\n') + error2 += 1 + +count = 0 +const = 50 / len(csv_1) + +# Check if there was a point in csv_1 but not in csv_2. This is not useful anymore. It never printed 'found one'. 
+""" +for row1 in csv_1: + count += 1 + if(count % 10) == 0: + previous = progress + progress = math.floor(count * const) + 50 + if(progress != previous): + print(str(progress) + '% completed.') + if row1[idPointColumn1] != -1: + #print(row1[idPointColumn1]) + found = False + for row2 in csv_2: + if row1[idPointColumn1] == row2[idPointColumn2]: + print('found one') + newRow = getDataFromRows(row1, row2) + result.append(newRow) + found = True + break + if not found: + #print("Error finding field " + row1[idPointColumn1]) + error1 += 1 + +print("Number of rows in file 1 but not in file 2: " + str(error1)) +""" +print("Number of rows in file 2 but not in file 1: " + str(error2)) + +print("Saving data to file result.csv...") + +with open(params['path'] + params['outputFile'], 'w', newline='') as csvfile: + writer = csv.writer(csvfile, delimiter='\t') + writer.writerow(getDataFromRows(title1, title2)) + writer.writerows(result) + +notFoundFile1.close() +notFoundFile2.close() diff --git a/resumo_cadastro.sh b/resumo_cadastro.sh new file mode 100755 index 0000000..d79d524 --- /dev/null +++ b/resumo_cadastro.sh @@ -0,0 +1,42 @@ +# Setembro 2016 +path=$1 +date=$2 + +echo "Processing data with args = ${path} and ${date}" + +input="${path}${date}_Cadastro.csv" +output="${path}${date}_Cadastro_Ufpr_Unique.csv" + +# Outubro 2016 +# input="Dados_Servidores/2016-10/20161031_Cadastro.csv" +# output="Dados_Servidores/2016-10/cadastro_2016-10-31_filters_ufpr_unique.csv" + +# For now, this does not work. It does not create a properly CSV. +columns="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42" +#columns="1,2,23,24" +#columns="1" + +#cat $input | grep --binary-files=text "UNIVERSIDADE FEDERAL DO PARANA" | cut -f $columns | sort | uniq > $output + +# The same as above, but wrap fields in double quotes. 
+#cat $input | grep --binary-files=text "UNIVERSIDADE FEDERAL DO PARANA" | cut -f $columns | sort | uniq | sed -e 's/"//g' -e 's/^\|$/"/g' -e 's/\t/"\t"/g' > $output +# Sed wraps fields in double quotes. Grep removes everyone that does not work in UFPR. Cut selects the important columns. Uniq removes repeated values. Tr removes null characters (ctrl + @). +#cat $input | grep --binary-files=text "UNIVERSIDADE FEDERAL DO PARANA" | cut -f $columns | sort | uniq | sed -e 's/"//g' -e 's/^\|$/"/g' -e 's/\t/"\t"/g' | tr -d '\000' > $output + +# Parece funcionar, mas pra todas as Universidades +#cat $input | egrep --binary-files=text "(UNIVERSIDADE FED*|Id_SERVIDOR_PORTAL NOME)" | sed -e 's/"//g' -e 's/^\|$/"/g' -e 's/\t/"\t"/g' | tr -d '\000' > $output + +#cat $input | egrep --binary-files=text "(UNIVERSIDADE FEDERAL DO PARANA|Id_SERVIDOR_PORTAL NOME)" | sed -e 's/"//g' -e 's/^\|$/"/g' -e 's/\t/"\t"/g' | tr -d '\000' > $output +#Mesmo que de cima, mas sem wrapar com " +cat $input | egrep --binary-files=text "(UNIVERSIDADE FEDERAL DO PARANA|Id_SERVIDOR_PORTAL NOME)" | tr -d '\000' > $output + +# +--------+---------------------+--------------------------------+ +# | Column | Contains | Example | +# +--------+---------------------+--------------------------------+ +# | 1 | Id_SERVIDOR_PORTAL | 1000021 | +# | 2 | NOME | MARIA AUREA DOS SANTOS RIBEIRO | +# | 24 | COD_ORG_LOTACAO | 26241 | +# | 25 | ORG_LOTACAO | UNIVERSIDADE FEDERAL DO PARANA | +# | 29 | UORG_EXERCICIO | BL - DEPARTAMENTO DE GENETICA | Parece que é a coluna 23 na verdade... +# | 33 | JORNADA_DE_TRABALHO | 40 HORAS SEMANAIS | +# +--------+---------------------+--------------------------------+ -- GitLab