Commit 5c0a23c6 authored by Henrique Varella Ehrenfried's avatar Henrique Varella Ehrenfried
Browse files

Initial commit

parents
env/
__pycache__
*.pyc
pairing/
[submodule "mapping_protocols"]
path = mapping_protocols
url = git@gitlab.c3sl.ufpr.br:simcaq/mapping_protocols.git
[MASTER]
# A comma-separated list of package or module names from where C extensions may
# be loaded. Extensions are loaded into the active Python interpreter and may
# run arbitrary code.
extension-pkg-whitelist=
# Add files or directories to the blacklist. They should be base names, not
# paths.
ignore=CVS
# Add files or directories matching the regex patterns to the blacklist. The
# regex matches against base names, not paths.
ignore-patterns=
# Python code to execute, usually for sys.path manipulation such as
# pygtk.require().
#init-hook=
# Use multiple processes to speed up Pylint.
jobs=1
# List of plugins (as comma separated values of python modules names) to load,
# usually to register additional checkers.
load-plugins=
# Pickle collected data for later comparisons.
persistent=yes
# Specify a configuration file.
#rcfile=
# Allow loading of arbitrary C extensions. Extensions are imported into the
# active Python interpreter and may run arbitrary code.
unsafe-load-any-extension=no
[MESSAGES CONTROL]
# Only show warnings with the listed confidence levels. Leave empty to show
# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED
confidence=
# Disable the message, report, category or checker with the given id(s). You
# can either give multiple identifiers separated by comma (,) or put this
# option multiple times (only on the command line, not in the configuration
# file where it should appear only once). You can also use "--disable=all" to
# disable everything first and then reenable specific checks. For example, if
# you want to run only the similarities checker, you can use "--disable=all
# --enable=similarities". If you want to run only the classes checker, but have
# no Warning level messages displayed, use "--disable=all --enable=classes
# --disable=W".
disable=print-statement,parameter-unpacking,unpacking-in-except,old-raise-syntax,backtick,long-suffix,old-ne-operator,old-octal-literal,import-star-module-level,raw-checker-failed,bad-inline-option,locally-disabled,locally-enabled,file-ignored,suppressed-message,useless-suppression,deprecated-pragma,apply-builtin,basestring-builtin,buffer-builtin,cmp-builtin,coerce-builtin,execfile-builtin,file-builtin,long-builtin,raw_input-builtin,reduce-builtin,standarderror-builtin,unicode-builtin,xrange-builtin,coerce-method,delslice-method,getslice-method,setslice-method,no-absolute-import,old-division,dict-iter-method,dict-view-method,next-method-called,metaclass-assignment,indexing-exception,raising-string,reload-builtin,oct-method,hex-method,nonzero-method,cmp-method,input-builtin,round-builtin,intern-builtin,unichr-builtin,map-builtin-not-iterating,zip-builtin-not-iterating,range-builtin-not-iterating,filter-builtin-not-iterating,using-cmp-argument,eq-without-hash,div-method,idiv-method,rdiv-method,exception-message-attribute,invalid-str-codec,sys-max-int,bad-python3-import,deprecated-string-function,deprecated-str-translate-call
# Enable the message, report, category or checker with the given id(s). You can
# either give multiple identifiers separated by comma (,) or put this option
# multiple times (only on the command line, not in the configuration file where
# it should appear only once). See also the "--disable" option for examples.
enable=
[REPORTS]
# Python expression which should return a score of at most 10 (10 is the
# highest score). You have access to the variables error, warning, refactor,
# convention and statement, which respectively contain the number of messages
# in each category and the total number of statements analyzed. This is used
# by the global evaluation report (RP0004).
evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
# Template used to display messages. This is a python new-style format string
# used to format the message information. See doc for all details
#msg-template=
# Set the output format. Available formats are text, parseable, colorized, json
# and msvs (Visual Studio). You can also give a reporter class, e.g.
# mypackage.mymodule.MyReporterClass.
output-format=text
# Tells whether to display a full report or only the messages
reports=no
# Activate the evaluation score.
score=yes
[REFACTORING]
# Maximum number of nested blocks for function / method body
max-nested-blocks=5
[VARIABLES]
# List of additional names supposed to be defined in builtins. Remember that
# you should avoid defining new builtins when possible.
additional-builtins=
# Tells whether unused global variables should be treated as a violation.
allow-global-unused-variables=yes
# List of strings which can identify a callback function by name. A callback
# name must start or end with one of those strings.
callbacks=cb_,_cb
# A regular expression matching the name of dummy variables (i.e. expectedly
# not used).
dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_
# Argument names that match this expression will be ignored. Default to name
# with leading underscore
ignored-argument-names=_.*|^ignored_|^unused_
# Tells whether we should check for unused import in __init__ files.
init-import=no
# List of qualified module names which can have objects that can redefine
# builtins.
redefining-builtins-modules=six.moves,future.builtins
[BASIC]
# Naming hint for argument names
argument-name-hint=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$
# Regular expression matching correct argument names
argument-rgx=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$
# Naming hint for attribute names
attr-name-hint=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$
# Regular expression matching correct attribute names
attr-rgx=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$
# Bad variable names which should always be refused, separated by a comma
bad-names=foo,bar,baz,toto,tutu,tata
# Naming hint for class attribute names
class-attribute-name-hint=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$
# Regular expression matching correct class attribute names
class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$
# Naming hint for class names
class-name-hint=[A-Z_][a-zA-Z0-9]+$
# Regular expression matching correct class names
class-rgx=[A-Z_][a-zA-Z0-9]+$
# Naming hint for constant names
const-name-hint=(([A-Z_][A-Z0-9_]*)|(__.*__))$
# Regular expression matching correct constant names
const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$
# Minimum line length for functions/classes that require docstrings, shorter
# ones are exempt.
docstring-min-length=-1
# Naming hint for function names
function-name-hint=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$
# Regular expression matching correct function names
function-rgx=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$
# Good variable names which should always be accepted, separated by a comma
good-names=i,j,k,ex,Run,_
# Include a hint for the correct naming format with invalid-name
include-naming-hint=no
# Naming hint for inline iteration names
inlinevar-name-hint=[A-Za-z_][A-Za-z0-9_]*$
# Regular expression matching correct inline iteration names
inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$
# Naming hint for method names
method-name-hint=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$
# Regular expression matching correct method names
method-rgx=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$
# Naming hint for module names
module-name-hint=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$
# Regular expression matching correct module names
module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$
# Colon-delimited sets of names that determine each other's naming style when
# the name regexes allow several styles.
name-group=
# Regular expression which should only match function or class names that do
# not require a docstring.
no-docstring-rgx=^_
# List of decorators that produce properties, such as abc.abstractproperty. Add
# to this list to register other decorators that produce valid properties.
property-classes=abc.abstractproperty
# Naming hint for variable names
variable-name-hint=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$
# Regular expression matching correct variable names
variable-rgx=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$
[FORMAT]
# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
expected-line-ending-format=
# Regexp for a line that is allowed to be longer than the limit.
ignore-long-lines=^\s*(# )?<?https?://\S+>?$
# Number of spaces of indent required inside a hanging or continued line.
indent-after-paren=4
# String used as indentation unit. This is usually "    " (4 spaces) or "\t" (1
# tab).
indent-string=' '
# Maximum number of characters on a single line.
max-line-length=100
# Maximum number of lines in a module
max-module-lines=1000
# List of optional constructs for which whitespace checking is disabled. `dict-
# separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}.
# `trailing-comma` allows a space between comma and closing bracket: (a, ).
# `empty-line` allows space-only lines.
no-space-check=trailing-comma,dict-separator
# Allow the body of a class to be on the same line as the declaration if body
# contains single statement.
single-line-class-stmt=no
# Allow the body of an if to be on the same line as the test if there is no
# else.
single-line-if-stmt=no
[SIMILARITIES]
# Ignore comments when computing similarities.
ignore-comments=yes
# Ignore docstrings when computing similarities.
ignore-docstrings=yes
# Ignore imports when computing similarities.
ignore-imports=no
# Minimum lines number of a similarity.
min-similarity-lines=4
[LOGGING]
# Logging modules to check that the string format arguments are in logging
# function parameter format
logging-modules=logging
[SPELLING]
# Spelling dictionary name. Available dictionaries: none. To make it work,
# install the python-enchant package.
spelling-dict=
# List of comma separated words that should not be checked.
spelling-ignore-words=
# A path to a file that contains private dictionary; one word per line.
spelling-private-dict-file=
# Tells whether to store unknown words to indicated private dictionary in
# --spelling-private-dict-file option instead of raising a message.
spelling-store-unknown-words=no
[TYPECHECK]
# List of decorators that produce context managers, such as
# contextlib.contextmanager. Add to this list to register other decorators that
# produce valid context managers.
contextmanager-decorators=contextlib.contextmanager
# List of members which are set dynamically and missed by pylint inference
# system, and so shouldn't trigger E1101 when accessed. Python regular
# expressions are accepted.
generated-members=Table.*
# Tells whether missing members accessed in mixin class should be ignored. A
# mixin class is detected if its name ends with "mixin" (case insensitive).
ignore-mixin-members=yes
# This flag controls whether pylint should warn about no-member and similar
# checks whenever an opaque object is returned when inferring. The inference
# can return multiple potential results while evaluating a Python object, but
# some branches might not be evaluated, which results in partial inference. In
# that case, it might be useful to still emit no-member and other checks for
# the rest of the inferred objects.
ignore-on-opaque-inference=yes
# List of class names for which member attributes should not be checked (useful
# for classes with dynamically set attributes). This supports the use of
# qualified names.
ignored-classes=optparse.Values,thread._local,_thread._local
# List of module names for which member attributes should not be checked
# (useful for modules/projects where namespaces are manipulated during runtime
# and thus existing member attributes cannot be deduced by static analysis. It
# supports qualified module names, as well as Unix pattern matching.
ignored-modules=
# Show a hint with possible names when a member name was not found. The aspect
# of finding the hint is based on edit distance.
missing-member-hint=yes
# The minimum edit distance a name should have in order to be considered a
# similar match for a missing member name.
missing-member-hint-distance=1
# The total number of similar names that should be taken in consideration when
# showing a hint for a missing member.
missing-member-max-choices=1
[MISCELLANEOUS]
# List of note tags to take in consideration, separated by a comma.
notes=FIXME,XXX,TODO
[DESIGN]
# Maximum number of arguments for function / method
max-args=5
# Maximum number of attributes for a class (see R0902).
max-attributes=7
# Maximum number of boolean expressions in a if statement
max-bool-expr=5
# Maximum number of branch for function / method body
max-branches=12
# Maximum number of locals for function / method body
max-locals=15
# Maximum number of parents for a class (see R0901).
max-parents=7
# Maximum number of public methods for a class (see R0904).
max-public-methods=20
# Maximum number of return / yield for function / method body
max-returns=6
# Maximum number of statements in function / method body
max-statements=50
# Minimum number of public methods for a class (see R0903).
min-public-methods=2
[CLASSES]
# List of method names used to declare (i.e. assign) instance attributes.
defining-attr-methods=__init__,__new__,setUp
# List of member names, which should be excluded from the protected access
# warning.
exclude-protected=_asdict,_fields,_replace,_source,_make
# List of valid names for the first argument in a class method.
valid-classmethod-first-arg=cls
# List of valid names for the first argument in a metaclass class method.
valid-metaclass-classmethod-first-arg=mcs
[IMPORTS]
# Allow wildcard imports from modules that define __all__.
allow-wildcard-with-all=no
# Analyse import fallback blocks. This can be used to support both Python 2 and
# 3 compatible code, which means that the block might have code that exists
# only in one or another interpreter, leading to false positives when analysed.
analyse-fallback-blocks=no
# Deprecated modules which should not be used, separated by a comma
deprecated-modules=optparse,tkinter.tix
# Create a graph of external dependencies in the given file (report RP0402 must
# not be disabled)
ext-import-graph=
# Create a graph of every (i.e. internal and external) dependencies in the
# given file (report RP0402 must not be disabled)
import-graph=
# Create a graph of internal dependencies in the given file (report RP0402 must
# not be disabled)
int-import-graph=
# Force import order to recognize a module as part of the standard
# compatibility libraries.
known-standard-library=
# Force import order to recognize a module as part of a third party library.
known-third-party=enchant
[EXCEPTIONS]
# Exceptions that will emit a warning when being caught. Defaults to
# "Exception"
overgeneral-exceptions=Exception
# Administrador de base de dados SimCAQ/SMPPIR #
Esse repositório implementa a classe DatabaseTable e funções para verificar pareamento entre
diferentes anos inseridos no banco de dados. A ferramenta é desenvolvida em Python 3, e usa
como base arquivos de mapeamento em formato CSV.
Para a utilização a partir da linha de comando, a CLI manage.py pode ser utilizada sem
que se invoquem manualmente as funções a partir da linha de comando Python.
## Requisitos ##
O utilitário foi desenvolvido em Python 3 usando a biblioteca SQLAlchemy com vistas ao banco
de dados MonetDB. Versões futuras podem ter modificações visando a compatibilidade com outros
bancos de dados, aproveitando as capacidades da biblioteca base.
Para a instalação dos requisitos conforme usados durante o desenvolvimento, o arquivo
requirements.txt pode ser usado como base (Recomenda-se o uso de um ambiente virtual).
```bash
(env) $ pip install -r requirements.txt
```
A CLI depende do módulo manage.py. Demais dependências serão listadas a seguir.
### Requisitos para a interface com a base de dados ###
* pymonetdb
* SQLAlchemy
* sqlalchemy-monetdb
### Requisitos para geração de pareamentos ###
* numpy
* pandas
* xlrd
* XlsxWriter
## Interface de linha de comando ##
A invocação da CLI utiliza o padrão do pacote manage.py, que é:
```bash
$ python manage.py [comando] [argumentos posicionais] [argumentos opcionais com valor]
```
Os comandos já implementados são:
* create: Cria a tabela conforme definido no protocolo de mapeamento.
```bash
$ python manage.py create <nome da tabela>
```
O único argumento usado é o nome da tabela. O script procurará por um protocolo de
mapeamento com o mesmo nome para a busca do esquema das colunas.
* insert: insere um arquivo de dados em formato CSV ou similar em uma tabela existente.
```bash
$ python manage.py insert <caminho para o arquivo> <nome da tabela> <ano> [--sep separador] [--null valor_nulo]
```
O caminho para o arquivo deve ser absoluto. A tabela utilizada deve existir e estar
sincronizada com o protocolo de mapeamento correspondente. O separador padrão utilizado
é ponto e vírgula (';'); caso outros separadores sejam utilizados pelo arquivo fonte,
devem ser especificados com --sep (por exemplo --sep \\| para pipe). O valor nulo padrão
é string vazia. Caso outro valor seja usado, deve ser especificado com --null.
* drop: derruba uma tabela do banco de dados.
```bash
$ python manage.py drop <nome da tabela>
```
O comando não contorna chaves estrangeiras que apontem para a tabela, e o banco de dados
pode retornar um erro caso exista alguma.
* remap: sincroniza uma tabela com o protocolo de mapeamento.
```bash
$ python manage.py remap <nome da tabela>
```
Esse comando deve ser utilizado sempre que um protocolo de mapeamento for atualizado.
O remapeamento permite a criação de novas colunas, derrubada de colunas existentes,
renomeamento de colunas e mudança de tipo. Dependendo do tamanho da tabela, o uso de
memória primária pode ser intenso.
* generate_pairing_report: gera relatórios de pareamento para comparação de dados ano
a ano.
```bash
$ python manage.py generate_pairing_report [--output xlsx|csv]
```
Os relatórios são criados na pasta pairing. Caso o formato não seja especificado,
csv será utilizado (um arquivo será criado para cada tabela). Caso xlsx seja o formato
utilizado, um arquivo será criado com todas as tabelas separadas em diferentes planilhas.
* generate_backup: Cria/Atualiza o arquivo monitorado para o backup.
```bash
$ python manage.py generate_backup
```
O arquivo é criado ou atualizado na máquina onde o banco de dados da produção está;
o procedimento de backup da equipe de infraestrutura o monitora para realizar o procedimento.
\ No newline at end of file
'''Database manipulation actions - these can be used as models for other modules.'''
import logging
from sqlalchemy import create_engine, MetaData
from database.database_table import gen_data_table, gen_temporary, copy_to_temporary
from mapping_functions import generate_pairing_xlsx, generate_pairing_csv
import settings
# Engine shared by every action in this module; the connection URI and the SQL
# echo flag are read from the project settings module.
ENGINE = create_engine(settings.DATABASE_URI, echo=settings.ECHO)
# Metadata bound to ENGINE; handed to the table factories used below.
META = MetaData(bind=ENGINE)
# Set the root logger format, then apply the configured level to the two named
# loggers (the project's database_table module and SQLAlchemy's engine logger).
logging.basicConfig(format = settings.LOGGING_FORMAT)
database_table_logger = logging.getLogger('database.database_table')
database_table_logger.setLevel(settings.LOGGING_LEVEL)
sqlalchemy_logger = logging.getLogger('sqlalchemy.engine')
sqlalchemy_logger.setLevel(settings.LOGGING_LEVEL)
def temporary_data(connection, file_name, table, year, offset=2, sep=';', null=''):
    '''Creates a temporary table for file_name and copies the file into it.

    The first line of the file (read as ISO-8859-9) is split on sep and passed
    to table.mount_original_columns together with year to obtain the column
    definitions of the temporary table, which is named 't_' + table.name.

    Args:
        connection: open database connection used to create and fill the table.
        file_name: path of the delimited text file to load.
        table: data table object providing the column mapping.
        year: year passed to the mapping helpers.
        offset, sep, null: forwarded unchanged to copy_to_temporary.

    Returns:
        The temporary table object that was created and filled.
    '''
    # Only the header line is needed here. The context manager closes the file
    # right away instead of leaving the handle open until garbage collection.
    with open(file_name, encoding="ISO-8859-9") as source_file:
        header = source_file.readline()
    header = header.split(sep)

    columns = table.mount_original_columns(header, year)
    ttable = gen_temporary('t_' + table.name, META, *columns)
    table.set_temporary_primary_keys(ttable, year)
    ttable.create(bind=connection)

    copy_to_temporary(connection, file_name, ttable, offset, sep, null)
    return ttable
def insert(file_name, table, year, offset=2, sep=';', null=''):
    '''Loads the csv at file_name into table, using year as index for mapping.

    The file is first copied into a temporary table, which is then handed to
    the data table's insert_from_temporary. Both steps run on one connection
    inside a single transaction that is committed at the end.
    '''
    data_table = gen_data_table(table, META)
    with ENGINE.connect() as connection:
        transaction = connection.begin()
        staging = temporary_data(connection, file_name, data_table, year,
                                 offset, sep, null)
        data_table.insert_from_temporary(connection, staging, year)
        transaction.commit()
def create(table):
    '''Builds the data table object for the given name and creates it.'''
    gen_data_table(table, META).create()
def drop(table):
    '''Builds the data table object for the given name and drops it.'''
    gen_data_table(table, META).drop()
def remap(table):
    '''Applies changes made in the mapping protocols to the database table.'''
    gen_data_table(table, META).remap()
def generate_pairing_report(output='csv'):
    '''Generates the pairing report in the requested output format.

    Args:
        output: 'csv' (default) or 'xlsx'. Any other value only prints a
            message; nothing is generated and no exception is raised.
    '''
    if output == 'csv':
        generate_pairing_csv(ENGINE)
    elif output == 'xlsx':
        generate_pairing_xlsx(ENGINE)
    else:
        # Kept as a printed message (not an exception) so existing callers
        # continue to work; only the spelling of the message was corrected.
        print('Unsupported output type "{}"'.format(output))
def update_from_file(csv_file, table, year, columns=None, target_list=None,
                     offset=2, sep=';', null=''):
    '''Updates table columns from an input csv file.

    The columns to update are the explicitly given columns (if any) followed
    by those returned by the data table's columns_from_targets(target_list).
    The file is copied into a temporary table, which is then passed to
    update_from_temporary; both steps share one committed transaction.
    '''
    data_table = gen_data_table(table, META)
    explicit_columns = [] if columns is None else columns
    update_columns = explicit_columns + data_table.columns_from_targets(target_list)

    with ENGINE.connect() as connection:
        transaction = connection.begin()
        staging = temporary_data(connection, csv_file, data_table, year,
                                 offset, sep, null)
        data_table.update_from_temporary(connection, staging, year, update_columns)
        transaction.commit()
This diff is collapsed.
''' Routines related to column dictionary generation.
Names commonly used:
- original columns: columns as they are named in the original database;
- target columns: columns as named internally in project;