diff --git a/Pipfile b/Pipfile index 0a2040c890c377b2d72eb488a4178d5d19af3aa4..a71efcf8e3575ce0378264f3106c08c8a011fd90 100644 --- a/Pipfile +++ b/Pipfile @@ -15,7 +15,7 @@ ipython = "*" django = "==1.11.10" django-widget-tweaks = "*" -pandas = "==0.18.1" +pandas = "==0.22" "psycopg2" = "*" xlrd = "*" django-extensions = "*" diff --git a/Pipfile.lock b/Pipfile.lock index 7a905e16f578418a109aed7fdf34b5a572cbb6b9..8356c5a86d8ac8fb955998dd87fc8fe33a9f8baa 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "0b62cd0d5cd72fee71fa1f56dde87cabb220300bca052e3c9006ba8da5edeca6" + "sha256": "36924d8179f463a93a6998ff08f3840a6e3125ae52f0dbf7a2a82a3563d8e337" }, "host-environment-markers": { "implementation_name": "cpython", @@ -36,10 +36,10 @@ }, "django-extensions": { "hashes": [ - "sha256:24c24bbc6ef6dd36fe6b2b7c48d171a9d22fe76895610fe19087af657fa27930", - "sha256:1f424a7f87974c2e2602b8b41cae52eb08105523f0c70320203abf58bcb84404" + "sha256:37a543af370ee3b0721ff50442d33c357dd083e6ea06c5b94a199283b6f9e361", + "sha256:bc9f2946c117bb2f49e5e0633eba783787790ae810ea112fe7fd82fa64de2ff1" ], - "version": "==2.0.5" + "version": "==2.0.6" }, "django-widget-tweaks": { "hashes": [ @@ -77,26 +77,8 @@ "version": "==1.14.2" }, "pandas": { - "hashes": [ - "sha256:2aeebd55027eb1fcb5020ec141696be47fff65fb86c276e46bae42f04b3bfeaf", - "sha256:6f31b4510da92f8beec17fe9ecb3f386984a4b35e1d1dee062b3463f63e70bbc", - "sha256:b7a6ce196452bf9a020074b68c184b174c12a22c285603ceebb09c645cf001d1", - "sha256:42c933501341263194926d00c1039d314039f6fbe763e13d983918d273a0ad68", - "sha256:aa50475fafbc689dead2e9a4e98b96fc43f1190f6661d1daf560f8c05ac26496", - "sha256:9984b284ab6d7672c720ea960f4d19b9dd0bea061c2ccd641b0c20d34ce03f7a", - "sha256:a0af231d6bf20d3f94a4d694bb3cd26c1b330aa4ed124ea99eff49a583ed10ff", - "sha256:9b1a7834e10c5a2afacaae8ba10054dc2ee5ae81eeaecf44d9eaa4d726962817", - "sha256:6621db235422aa48d7513a7f332a7bfc6e9a54b0283bac145cccec7c4c0ccd7d", - "sha256:c39dbc38bc031f099bcfa408a93c801f0141ee49a7d4e0df09cdf9dcf01f27e6", - "sha256:fccbc771a23d51b366182c136cd735cf1642744270fee964f5b1fe9103d66239", - "sha256:80bf0d32432fe588a0e94ff6b216aa5c61ddba2348ca904bda240218f9cbe122", - "sha256:931d25b391eb01c52239a41e2b1c29c8337a6789852ecc0d4ce39ce2491424e6", - "sha256:563720b6302a4e2b513471c16bd7e89db2ae44d3f6b0745896b9c289f3c6b2fb", - "sha256:c850d8c41b5417ba361967d3e2b6119c681b9f0bd5eb77f4c013c46dbf0ebe95", - "sha256:d2e483692c7915916dffd1b83256ea9761b4224c8d45646ceddf48b977ee77b2", - "sha256:de8661d3a71bac8b5100c2a85fdb1b55c9b41534aba7a9671d1130d43ab2de59" - ], - "version": "==0.18.1" + "hashes": [], + "version": "==0.22" }, "psycopg2": { "hashes": [ diff --git a/src/script/base/dataframe_base.py b/src/script/base/dataframe_base.py index 7097a4f38c77c77d83832aebae06d1e7d5d5dfcc..048a9f549f71248e0ec470fc957fd3ba935dcafc 100644 --- a/src/script/base/dataframe_base.py +++ b/src/script/base/dataframe_base.py @@ -3,112 +3,134 @@ import os import pandas as pd import numpy as np from script.utils.situations import * +from script.utils.utils import invert_dict class DataframeHolder: - def __init__(self, dataframe): - self.students = dataframe.groupby('MATR_ALUNO') - self.courses = dataframe.groupby('COD_ATIV_CURRIC') - self.admission = dataframe.groupby(['ANO_INGRESSO', 'SEMESTRE_INGRESSO']) + def __init__(self, dataframe): + self.students = dataframe.groupby('MATR_ALUNO') + self.courses = dataframe.groupby('COD_ATIV_CURRIC') + self.admission = dataframe.groupby(['ANO_INGRESSO', 'SEMESTRE_INGRESSO']) def load_dataframes(cwd='.'): - dataframes = [] - for path, dirs, files in os.walk(cwd): - for f in files: - file_path = path + '/' + f - dh = {'name': f, 'dataframe': None} - if 'csv' in f: - dh['dataframe'] = read_csv(file_path) - if 'xls' in f: - dh['dataframe'] = read_excel(file_path) - - if dh['dataframe'] is not None: - dataframes.append(dh) - - dataframe = fix_dataframes(dataframes) - dh = DataframeHolder(dataframe) - #~ dh.students.aggregate(teste) + dataframes = [] + for path, dirs, files in os.walk(cwd): + for f in files: + file_path = path + '/' + f + dh = {'name': f, 'dataframe': None} + if 'csv' in f: + dh['dataframe'] = read_csv(file_path) + if 'xls' in f: + dh['dataframe'] = read_excel(file_path) + + if dh['dataframe'] is not None: + dataframes.append(dh) + + dataframe = fix_dataframes(dataframes) + dh = DataframeHolder(dataframe) + #~ dh.students.aggregate(teste) # print(dh.students['MEDIA_FINAL'].aggregate(teste)) - return dataframe + return dataframe def read_excel(path, planilha='Planilha1'): - return pd.read_excel(path) + return pd.read_excel(path) def read_csv(path): - return pd.read_csv(path) + return pd.read_csv(path) def fix_dataframes(dataframes): - for df in dataframes: - if df['name'] == 'historico.xls' or df['name'] == 'historico.csv': - history = df['dataframe'] - history.rename(columns={'DESCR_SITUACAO': 'SITUACAO'}, inplace=True) - if df['name'] == 'matricula.xls' or df['name'] == 'matricula.csv': - register = df['dataframe'] + for df in dataframes: + if df['name'] == 'historico.xls' or df['name'] == 'historico.csv': + history = df['dataframe'] + history.rename(columns={'DESCR_SITUACAO': 'SITUACAO'}, inplace=True) + if df['name'] == 'matricula.xls' or df['name'] == 'matricula.csv': + register = df['dataframe'] - #~ clean_history(history) - clean_register(register) - #~ df.dropna(axis=0, how='all') - history["MEDIA_FINAL"] = pd.to_numeric(history["MEDIA_FINAL"], errors='coerce') - history = history[np.isfinite(history['MEDIA_FINAL'])] + #~ clean_history(history) + clean_register(register) + #~ df.dropna(axis=0, how='all') + history["MEDIA_FINAL"] = pd.to_numeric(history["MEDIA_FINAL"], errors='coerce') + history = history[np.isfinite(history['MEDIA_FINAL'])] + # inner = exste nos dois relatórios, é o que a gente quer + # o que fazer com quem não está em um dos dois é um questão em aberto + merged = pd.merge(history, register, how='inner', on=['MATR_ALUNO']) + merged = merged.rename(index=str, columns={"ANO_INGRESSO_x": "ANO_INGRESSO", "SEMESTRE_INGRESSO_x": "SEMESTRE_INGRESSO", "FORMA_INGRESSO_x": "FORMA_INGRESSO"}) - merged = pd.merge(history, register, how='outer', on=['MATR_ALUNO']) - merged = merged.rename(index=str, columns={"ANO_INGRESSO_x": "ANO_INGRESSO", "SEMESTRE_INGRESSO_x": "SEMESTRE_INGRESSO", "FORMA_INGRESSO_x": "FORMA_INGRESSO"}) + fix_situation(merged) + fix_admission(merged) + fix_evasion(merged) + fix_carga(merged) - fix_situation(merged) - fix_admission(merged) - fix_evasion(merged) - fix_carga(merged) - - return merged + return merged def clean_history(df): - df.drop(['ID_NOTA', 'CONCEITO', 'ID_LOCAL_DISPENSA', 'SITUACAO_CURRICULO', - 'ID_CURSO_ALUNO', 'ID_VERSAO_CURSO', 'ID_CURRIC_ALUNO', - 'ID_ATIV_CURRIC', 'SITUACAO_ITEM', 'ID_ESTRUTURA_CUR', 'NUM_VERSAO' - ], axis=1, inplace=True) + print(df.columns) + + drop_columns = ['ID_NOTA', 'CONCEITO', 'ID_LOCAL_DISPENSA', 'SITUACAO_CURRICULO', + 'ID_CURSO_ALUNO', 'ID_VERSAO_CURSO', 'ID_CURRIC_ALUNO', + 'ID_ATIV_CURRIC', 'SITUACAO_ITEM', 'ID_ESTRUTURA_CUR' + ] + + drop_columns = [x for x in drop_columns if x in df.columns] + + df.drop(drop_columns, axis=1, inplace=True) + df['PERIODO'] = df['PERIODO'].str.split('o').str[0] def clean_register(df): - df_split = df['PERIODO_INGRESSO'].str.split('/') - df['ANO_INGRESSO'] = df_split.str[0] - df['SEMESTRE_INGRESSO'] = df_split.str[1].str.split('o').str[0] - df_split = df['PERIODO_EVASAO'].str.split('/') - df['ANO_EVASAO'] = df_split.str[0] - df['SEMESTRE_EVASAO'] = df_split.str[1].str.split('o').str[0] + df_split = df['PERIODO_INGRESSO'].str.split('/') + df['ANO_INGRESSO'] = df_split.str[0] + df['SEMESTRE_INGRESSO'] = df_split.str[1].str.split('o').str[0] + df_split = df['PERIODO_EVASAO'].str.split('/') + df['ANO_EVASAO'] = df_split.str[0] + df['SEMESTRE_EVASAO'] = df_split.str[1].str.split('o').str[0] + + drop_columns = ['ID_PESSOA', 'NOME_PESSOA', 'DT_NASCIMENTO', 'NOME_UNIDADE','COD_CURSO', + 'PERIODO_INGRESSO', 'PERIODO_EVASAO'] + + drop_columns = [x for x in drop_columns if x in df.columns] + + df.drop(drop_columns, axis=1, inplace=True) - df.drop(['ID_PESSOA', 'NOME_PESSOA', 'DT_NASCIMENTO', 'NOME_UNIDADE','COD_CURSO', 'PERIODO_INGRESSO', 'PERIODO_EVASAO'],axis=1, inplace=True) + +def get_situation(d, default): + def getter(x): + return invert_dict(d).get(x, default) + return getter def fix_situation(df): - for situation in Situation.SITUATIONS: - df.loc[df.SITUACAO == situation[1], 'SITUACAO'] = situation[0] + df.rename(columns={"SITUACAO": "SITUACAO2"}, inplace=True) + + df['SITUACAO'] = df.SITUACAO2.apply(get_situation(Situation.SITUATIONS, Situation.SIT_OUTROS)) + + df.drop(['SITUACAO2'], axis=1, inplace=True) def fix_admission(df): - for adm in AdmissionType.ADMISSION_FORM: - df.loc[df.FORMA_INGRESSO == adm[1], 'FORMA_INGRESSO'] = adm[0] + df.rename(columns={'FORMA_INGRESSO': 'FORMA_INGRESSO2'}, inplace=True) + + df['FORMA_INGRESSO'] = df.FORMA_INGRESSO2.apply(get_situation(AdmissionType.ADMISSION_FORM, + AdmissionType.AT_OUTROS)) + + df.drop(['FORMA_INGRESSO2'], axis=1, inplace=True) def fix_carga(df): - df["CH_TOTAL"] = df["CH_TEORICA"]+df["CH_PRATICA"] + df["CH_TOTAL"] = df["CH_TEORICA"]+df["CH_PRATICA"] + def fix_evasion(df): - evasionForms = [x[1] for x in EvasionForm.EVASION_FORM] - df.loc[~df.FORMA_EVASAO.isin(evasionForms), 'FORMA_EVASAO'] = 100 - for evasion in EvasionForm.EVASION_FORM: - #~ df.loc[df.FORMA_EVASAO.str.contains(evasion[1]).fillna(1.0), 'FORMA_EVASAO'] = evasion[0] - df.loc[df.FORMA_EVASAO == evasion[1], 'FORMA_EVASAO'] = evasion[0] - - #~ if(evasion[0] == 100): - #~ for x in df.FORMA_EVASAO.str.contains(evasion[1]).fillna(False): - #~ if(x != 0.0): - #~ print(x) - #~ print(df.FORMA_EVASAO.str.contains(evasion[1]).fillna(5)) - #~ print(df[['MATR_ALUNO','FORMA_EVASAO']]) + df.rename(columns={'FORMA_EVASAO': 'FORMA_EVASAO2'}, inplace=True) + + df['FORMA_EVASAO'] = df.FORMA_EVASAO2.apply(get_situation(EvasionForm.EVASION_FORM, + EvasionForm.EF_OUTROS)) + + df.drop(['FORMA_EVASAO2'], axis=1, inplace=True) diff --git a/src/script/main.py b/src/script/main.py index fb45fa4376ba1c55fd62aa39c2787af16ba594c1..a4f8ea7337d9fc7569f813de2e07f1a28c5b8eb1 100644 --- a/src/script/main.py +++ b/src/script/main.py @@ -31,6 +31,7 @@ def main(): start_time_exec = time.time() dataframe = load_dataframes(os.getcwd() + '/script/' + 'base') + build_cache(dataframe) cpu_time = timedelta(seconds=round(time.clock() - start_time)) analises_disciplinas(dataframe) diff --git a/src/script/utils/situations.py b/src/script/utils/situations.py index d44993528dbff0fc440702eacae7b1cfc44efaf1..3f4f1ef9859176c0b48af2c5d8580c4cf6872546 100644 --- a/src/script/utils/situations.py +++ b/src/script/utils/situations.py @@ -74,6 +74,7 @@ class EvasionForm: # orientaçao: verificar se media_final é maior que 100 se sim atribua 0 se nao # atribua media_final + class Situation: SIT_DESCONHECIDA = 0 diff --git a/src/script/utils/utils.py b/src/script/utils/utils.py index 791237926af8d384ae0a93ed082b2376084bc1a6..6be89c644e15c3e93ba473dcdecc8a852b06ba2b 100644 --- a/src/script/utils/utils.py +++ b/src/script/utils/utils.py @@ -10,6 +10,10 @@ except: DEBUG = True +def invert_dict(d): + return {v: k for k, v in d} + + def build_path(path): if not os.path.exists(path): os.mkdir(path)