Skip to content

Commit

Permalink
Further cleanup
Browse files Browse the repository at this point in the history
Merge pull request #170 from openfisca/cleanup
  • Loading branch information
Mauko Quiroga authored Apr 29, 2019
2 parents b3ac645 + 518eb17 commit cd1a717
Show file tree
Hide file tree
Showing 19 changed files with 74 additions and 181 deletions.
7 changes: 6 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
# Changelog

### 0.8.4 [1679](https://github.com/openfisca/openfisca-france-data/pull/169)
## 0.9.0 [170](https://github.com/openfisca/openfisca-france-data/pull/170)

- Further cleanup
- Removal of `new_simulation_from_array_dict`

### 0.8.4 [169](https://github.com/openfisca/openfisca-france-data/pull/169)

- Cleanup tests

Expand Down
4 changes: 2 additions & 2 deletions openfisca_france_data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@
import openfisca_france

# Load input variables and output variables into entities
from .model import common, survey_variables, id_variables # noqa analysis:ignore
from .model.base import * # noqa analysis:ignore
from openfisca_france_data.model import common, survey_variables, id_variables # noqa analysis:ignore
from openfisca_france_data.model.base import * # noqa analysis:ignore


log = logging.getLogger(__name__)
Expand Down
8 changes: 3 additions & 5 deletions openfisca_france_data/assets/zone_apl_data/codeAplReader.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-


import csv, pickle
import pickle
import csv

if __name__ == '__main__':

Expand All @@ -11,9 +11,7 @@
code = csv.reader(open(fileName), delimiter = ";")

for row in code:
codeDict.update({row[2]:(row[1],row[4])})

# print(codeDict['75017'])
codeDict.update({row[2]: (row[1], row[4])})

outputFile = open("code_apl", 'wb')
pickle.dump(codeDict, outputFile)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
# -*- coding: utf-8 -*-


import pickle
from pandas import read_csv, DataFrame
from pandas import read_csv
import numpy as np


Expand All @@ -20,7 +19,6 @@
#% TU99 0 à 8
#% zone
#

grouped_5 = Z.groupby(['TU99','TAU99','REG','POL99','Zone'], as_index=False)
pop = grouped_5['Pop_mun_2006'].aggregate(np.sum)

Expand All @@ -33,24 +31,25 @@
res['zone2'] = 0
res['zone3'] = 0
print(res)

print(pop.Pop_mun_2006[pop['Zone']==1])
print(pop.Pop_mun_2006[pop['Zone'] == 1])

res['zone1'] = res['zone1'] + pop.Pop_mun_2006[pop['Zone']==1]
res['zone2'] = res['zone2'] + pop.Pop_mun_2006[pop['Zone']==2]
res['zone3'] = res['zone3'] + pop.Pop_mun_2006[pop['Zone']==3]

print(res.to_string())

for col in ('zone1','zone2','zone3'):
for col in ('zone1', 'zone2', 'zone3'):
res[col][np.isnan(res[col])] = 0

print(res.to_string())
res2 = res.groupby(['TU99','TAU99', 'REG','POL99'])
res2 = res.groupby(['TU99', 'TAU99', 'REG', 'POL99'])

final = res2.agg({'zone1': np.sum,
'zone2': np.sum,
'zone3': np.sum})
final = res2.agg({
'zone1': np.sum,
'zone2': np.sum,
'zone3': np.sum
})
final['total'] = final['zone1'] + final['zone2'] + final['zone3']
final['proba_zone1'] = final['zone1']/final['total']
final['proba_zone2'] = final['zone2']/final['total']
Expand Down
1 change: 0 additions & 1 deletion openfisca_france_data/erfs/input_data_builder/run_all.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,4 +71,3 @@ def run_all(year = None, check = False):
logging.basicConfig(level = logging.INFO, filename = 'run_all.log', filemode = 'w')
run_all(year = 2009, check = False)
log.info("Script finished after {}".format(time.time() - start))
print(time.time() - start)
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,6 @@ def invalide(temporary_store = None, year = None):
# # invalides[!(invalides$quifoy %in% c("vous","conj")),c("noindiv","invalide","alt")] <- foy_inv_pac
log.info(u" 1.3 : enfants invalides et en garde alternée (variables inv et alt)")
pacIndiv = temporary_store['pacIndiv_{}'.format(year)]
# print(pacIndiv.type_pac.value_counts())
log.info(pacIndiv.type_pac.value_counts())

foy_inv_pac = invalides[['noindiv', 'invalide']][~(invalides.quifoy.isin([0, 1]))].copy()
Expand Down
18 changes: 0 additions & 18 deletions openfisca_france_data/erfs/input_data_builder/step_08_final.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,29 +260,11 @@ def final(temporary_store = None, year = None, check = True):
log.info('{} sali nuls'.format(len(final2[final2['sali'].isnull()])))
log.info("{} individus d'âges nuls".format(len(final2[final2.age.isnull()])))
log.info("longueur de final2 avant purge : {}".format(len(final2)))
# columns_w_nan = []
# for col in final2.columns:
# if final2[final2['idfoy'].notnull()][col].isnull().any() and not final2[col].isnull().all():
# columns_w_nan.append(col)
# print(columns_w_nan)
log.info('check doublons : {}'.format(len(final2[final2.duplicated(['noindiv'])])))
log.info("{}".format(final2.age.isnull().sum()))

print_id(final2)

# # var <- names(foyer)
# #a1 <- c('f7rb', 'f7ra', 'f7gx', 'f2aa', 'f7gt', 'f2an', 'f2am', 'f7gw', 'f7gs', 'f8td', 'f7nz', 'f1br', 'f7jy', 'f7cu', 'f7xi', 'f7xo', 'f7xn', 'f7xw', 'f7xy', 'f6hj', 'f7qt', 'f7ql', 'f7qm', 'f7qd', 'f7qb', 'f7qc', 'f1ar', 'f7my', 'f3vv', 'f3vu', 'f3vt', 'f7gu', 'f3vd', 'f2al', 'f2bh', 'f7fm', 'f8uy', 'f7td', 'f7gv', 'f7is', 'f7iy', 'f7il', 'f7im', 'f7ij', 'f7ik', 'f1er', 'f7wl', 'f7wk', 'f7we', 'f6eh', 'f7la', 'f7uh', 'f7ly', 'f8wy', 'f8wx', 'f8wv', 'f7sb', 'f7sc', 'f7sd', 'f7se', 'f7sf', 'f7sh', 'f7si', 'f1dr', 'f7hs', 'f7hr', 'f7hy', 'f7hk', 'f7hj', 'f7hm', 'f7hl', 'f7ho', 'f7hn', 'f4gc', 'f4gb', 'f4ga', 'f4gg', 'f4gf', 'f4ge', 'f7vz', 'f7vy', 'f7vx', 'f7vw', 'f7xe', 'f6aa', 'f1cr', 'f7ka', 'f7ky', 'f7db', 'f7dq', 'f2da')
# #a2 <- setdiff(a1,names(foyer))
# #b1 <- c('pondfin', 'alt', 'hsup', 'ass_mat', 'zone_apl', 'inactif', 'ass', 'aer', 'code_postal', 'activite', 'categorie_salarie', 'jour_xyz', 'boursier', 'etr', 'partiel1', 'partiel2', 'empl_dir', 'gar_dom', 'categ_inv', 'opt_colca', 'csg_taux_plein','coloc')
# # hsup feuille d'impot
# # boursier pas dispo
# # inactif etc : extraire cela des donn?es clca etc
#
# # tester activit? car 0 vaut actif
# table(is.na(final2$activite),useNA="ifany")
#
# saveTmp(final2, file= "final2.Rdata")

control(final2, debug = True)
temporary_store['final2'] = final2
log.info("Nombre de personne d'âge NaN: {} ".format(final2.age.isnull().sum()))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ def final_check(year=2006):
survey = HDFStore(survey_filename)

final2 = store.get('survey_2006')
print(survey)
finalT = survey.get('survey_2006')

varlist = [
Expand Down
6 changes: 0 additions & 6 deletions openfisca_france_data/erfs/old/datatable.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,16 +139,10 @@ def build_erfs_survey_collection():
config_local_ini = os.path.join(CONFIG_DIR, 'config_local.ini')
config_ini = os.path.join(CONFIG_DIR, 'config.ini')
found = parser.read(config_local_ini, config_ini)
print(found)

data_directory = parser.get('data', 'input_directory')
for table in erf_tables:
table["RData_filename"] = os.path.join(os.path.dirname(data_directory),'R','erf')





def initialize(self):
"""
Initialize survey data
Expand Down
8 changes: 6 additions & 2 deletions openfisca_france_data/erfs_fpr/get_survey_scenario.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,13 @@ def get_survey_scenario(
:param data: Les données de l'enquête.
:param reform: Une réforme à appliquer à *france_data_tax_benefit_system*.
"""
tax_benefit_system = get_tax_benefit_system(tax_benefit_system, reform)
tax_benefit_system = get_tax_benefit_system(
tax_benefit_system,
reform,
)

baseline_tax_benefit_system = get_baseline_tax_benefit_system(
baseline_tax_benefit_system
baseline_tax_benefit_system,
)

survey_scenario = ErfsFprSurveyScenario.create(
Expand Down
84 changes: 5 additions & 79 deletions openfisca_france_data/surveys.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,10 @@

import logging
import pandas
import numpy

from typing import Optional

from openfisca_core import periods, simulations # type: ignore
from openfisca_core import periods # type: ignore
from openfisca_core.taxbenefitsystems import TaxBenefitSystem # type: ignore
from openfisca_france_data import base_survey as base # type: ignore
from openfisca_survey_manager.scenarios import AbstractSurveyScenario # type: ignore
Expand Down Expand Up @@ -95,7 +94,7 @@ def create(

reform_is_provided = (reform is not None) or (reform_key is not None)
# With booleans != is xor
# See https://stackoverflow.com/questions/432842/how-do-you-get-the-logical-xor-of-two-variables-in-python
# See https://stackoverflow.com/questions/432842/how-do-you-get-the-logical-xor-of-two-variables-in-python # noqa: E501
assert reform_is_provided != (tax_benefit_system is not None)

if reform_is_provided:
Expand Down Expand Up @@ -152,7 +151,7 @@ def custom_initialize(self, simulation):
for offset in [-1, -2]:
for variable in three_year_span_variables:
assert variable in self.used_as_input_variables, \
f"{variable} is not a in the input_varaibles to be used {self.used_as_input_variables}"
f"{variable} is not a in the input_varaibles to be used {self.used_as_input_variables}" # noqa: E501

holder = simulation.get_holder(variable)

Expand All @@ -162,7 +161,8 @@ def custom_initialize(self, simulation):
simulation.calculate_add(variable, period = self.year),
)

except TypeError: # TODO Should explicitly test about Enums, avoid enums sum which is forbidden
# TODO: should explicitly test about Enums, enums sum is forbidden
except TypeError:
holder.set_input(
simulation.period.offset(offset),
simulation.calculate(
Expand All @@ -172,8 +172,6 @@ def custom_initialize(self, simulation):
)

def custom_input_data_frame(self, input_data_frame, **kwargs):
# input_data_frame['salaire_imposable_pour_inversion'] = input_data_frame.salaire_imposable

if "loyer" in input_data_frame:
input_data_frame["loyer"] = 12 * input_data_frame.loyer

Expand All @@ -184,75 +182,3 @@ def custom_input_data_frame(self, input_data_frame, **kwargs):

for variable in ["quifam", "quifoy", "quimen"]:
log.debug(input_data_frame[variable].value_counts(dropna = False))


def new_simulation_from_array_dict(
        array_dict = None,
        debug = False,
        tax_benefit_system = None,
        trace = False,
        year = None,
        ):
    """Build a simulation and fill its holders from a dict of arrays.

    A ``simulations.Simulation`` is created for ``periods.period(year)``, the
    entity counts are derived from the role columns, and every entry of
    ``array_dict`` is copied into the holder of the variable of the same name.

    NOTE(review): this relies on ``simulation.entity_by_key_plural``,
    ``entity.step_size``/``roles_count`` and ``holder.array``; confirm these
    attributes still exist in the installed ``openfisca_core`` version.

    :param array_dict: Mapping of variable name to array-like. Required despite
        the ``None`` default. It is modified in place: missing role columns
        (``quifam``, ``quifoy``, ``quimen``) are filled with zeros and missing
        id columns (``idfam``, ``idfoy``, ``idmen``) with ``0..n-1``.
    :param debug: Forwarded to ``simulations.Simulation``.
    :param tax_benefit_system: Tax and benefit system; every key of
        ``array_dict`` must be one of its ``variables``.
    :param trace: Forwarded to ``simulations.Simulation``.
    :param year: Passed to ``periods.period`` to build the simulation period.
    :return: The populated simulation.
    :raises AssertionError: If the arrays (other than length-1 ones) do not
        share a single length, if a key is not a known variable, or if an
        array does not match the count of its entity.
    """
    simulation = simulations.Simulation(
        debug = debug,
        period = periods.period(year),
        tax_benefit_system = tax_benefit_system,
        trace = trace,
        )

    # Length-1 arrays are exempt from the size check; all the other arrays
    # must agree on one single length.
    sizes = {len(values) for values in array_dict.values() if len(values) != 1}
    assert len(sizes) == 1, "Arrays do not have the same size"

    # Use the shared length rather than the length of whichever array happens
    # to come first, which could be one of the exempt length-1 arrays.
    global_count = sizes.pop()

    # By default everybody has role 0 ...
    for role_var in ("quifam", "quifoy", "quimen"):
        if role_var not in array_dict:
            array_dict[role_var] = numpy.zeros(global_count, dtype = int)

    # ... in an entity of their own.
    for id_var in ("idfam", "idfoy", "idmen"):
        if id_var not in array_dict:
            array_dict[id_var] = numpy.arange(global_count, dtype = int)

    column_by_name = tax_benefit_system.variables

    for column_name in array_dict:
        assert column_name in column_by_name, column_name

    entity_by_key_plural = simulation.entity_by_key_plural

    # A group entity has as many members as there are rows with role 0.
    familles = entity_by_key_plural[u"familles"]
    familles.count = familles.step_size = (array_dict["quifam"] == 0).sum()
    foyers_fiscaux = entity_by_key_plural[u"foyers_fiscaux"]
    foyers_fiscaux.count = foyers_fiscaux.step_size = (array_dict["quifoy"] == 0).sum()
    individus = entity_by_key_plural[u"individus"]
    individus.count = individus.step_size = global_count
    menages = entity_by_key_plural[u"menages"]
    menages.count = menages.step_size = (array_dict["quimen"] == 0).sum()

    # The id and role columns are guaranteed to be present at this point,
    # thanks to the defaults set above.
    familles.roles_count = array_dict["quifam"].max() + 1
    menages.roles_count = array_dict["quimen"].max() + 1
    foyers_fiscaux.roles_count = array_dict["quifoy"].max() + 1

    for column_name, column_array in array_dict.items():
        holder = simulation.get_holder(column_name)
        entity = holder.entity

        if entity.is_persons_entity:
            array = column_array

        else:
            # Keep only the rows with role 0 in this entity. ``numpy.asarray``
            # accepts both pandas Series and plain ndarrays, the latter being
            # what the defaults above produce.
            roles = numpy.asarray(array_dict["qui" + entity.symbol])
            array = column_array[roles == 0]

        assert array.size == entity.count, \
            f"Bad size for {column_name}: {array.size} instead of {entity.count}"

        holder.array = numpy.array(array, dtype = holder.variable.dtype)

    return simulation
6 changes: 3 additions & 3 deletions openfisca_france_data/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,12 +240,12 @@ def build_cerfa_fields_by_column_name(year, sections_cerfa):

if end is None or end.year >= year:
if column.entity.key == "individu":
cerfa_field = ['f' + x.lower() for x in column.cerfa_field.values()]
cerfa_field = ["f" + x.lower() for x in column.cerfa_field.values()]

elif column.entity.key == "foyer_fiscal":
cerfa_field = ['f' + column.cerfa_field.lower()]
cerfa_field = ["f" + column.cerfa_field.lower()]

cerfa_fields_by_column_name[name.encode("ascii", "ignore")] = cerfa_field
cerfa_fields_by_column_name[name] = cerfa_field

return cerfa_fields_by_column_name

Expand Down
19 changes: 10 additions & 9 deletions setup.cfg
Original file line number Diff line number Diff line change
@@ -1,19 +1,20 @@
; E128/133: We prefer hang-closing visual indents
; E251: We prefer `function(x = 1)` over `function(x=1)`
; E501: We do not enforce a maximum line length
; F403/405: We ignore * imports
; W503/504: We break lines before binary operators (Knuth's style)

[flake8]
hang-closing = true
ignore = E128,E251,F403,F405,E501,W503
max-line-length = 88
hang-closing = true
ignore = E128,E251,F403,F405,W503

[pep8]
hang-closing = true
ignore = E128,E251,F403,F405,E501,W503
in-place = true
max-line-length = 88
hang-closing = true
ignore = E128,E251,F403,F405,W503
in-place = true

[tool:pytest]
addopts = --showlocals --doctest-modules --disable-pytest-warnings
testpaths = tests
python_files = **/*.py
addopts = --showlocals --doctest-modules --disable-pytest-warnings
testpaths = tests
python_files = **/*.py
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

setup(
name = "OpenFisca-France-Data",
version = "0.8.4",
version = "0.9.0",
description = "OpenFisca-France module to work with French survey data",
long_description = long_description,
author = "OpenFisca Team",
Expand Down
4 changes: 2 additions & 2 deletions tests/erfs_fpr/integration/test_af.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@
individu = data_frame_by_entity['individu']
menage = data_frame_by_entity['menage']

#%%
population_by_age = individu.groupby('age')[['weight_individus']].sum().reset_index()
# %%
population_by_age = individu.groupby('age')['weight_individus'].sum().reset_index()
# Les 0-16 ans sont au moins 700 000 et au plus 850 000
assert (population_by_age.query('age <= 16 & age >= 0')['weight_individus'] > 7e5).all(), \
'Les 0-16 ans doivent être au moins 700 000'
Expand Down
Loading

0 comments on commit cd1a717

Please sign in to comment.