-
-
Notifications
You must be signed in to change notification settings - Fork 118
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Start EIA-176 pipelines: company data #2949
Changes from 16 commits
826a77a
bc6eddf
ff6b5bf
9903674
3a0bfe2
1fb52e9
8dbd975
1531313
9aff0a8
caaa212
fe3fbb7
0b703c6
cb8e7e1
4dc4ad9
87b7c51
82504f0
8f4d93e
35de6a8
35fabe6
07b48f3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,6 +8,7 @@ | |
:mod:`pudl.transform` subpackage. | ||
""" | ||
from . import ( | ||
eia176, | ||
eia860, | ||
eia860m, | ||
eia861, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
"""Extractor for CSV data.""" | ||
from csv import DictReader | ||
from importlib import resources | ||
from zipfile import ZipFile | ||
|
||
import pandas as pd | ||
|
||
import pudl.logging_helpers | ||
|
||
# Module-level logger, named after this module via the project's logging helper.
logger = pudl.logging_helpers.get_logger(__name__)
|
||
|
||
def open_csv_resource(dataset: str, base_filename: str) -> DictReader:
    """Open the given resource file as :class:`csv.DictReader`.

    The resource is read fully into memory and its file handle is closed
    before this function returns, so the returned reader never holds an open
    file that the caller would have to clean up.

    Args:
        dataset: used to load metadata from package_data/{dataset} subdirectory.
        base_filename: the name of the file in the subdirectory to open.

    Returns:
        A reader yielding one dict per data row, keyed by the CSV header row.
    """
    from io import StringIO

    csv_path = resources.files(f"pudl.package_data.{dataset}") / base_filename
    # newline="" leaves line endings untouched so the csv module can handle
    # them itself; an explicit encoding avoids depending on the locale.
    with csv_path.open(encoding="utf-8", newline="") as csv_file:
        return DictReader(StringIO(csv_file.read(), newline=""))
|
||
|
||
def get_table_file_map(dataset: str) -> dict[str, str]:
    """Build the table-name-to-filename lookup for a dataset.

    The mapping is read from the dataset's ``table_file_map.csv`` resource,
    using its ``table`` column as keys and its ``filename`` column as values.

    Args:
        dataset: used to load metadata from package_data/{dataset} subdirectory.

    Returns:
        A dictionary mapping each table name to its source filename.
    """
    table_file_map = {}
    for record in open_csv_resource(dataset, "table_file_map.csv"):
        table_file_map[record["table"]] = record["filename"]
    return table_file_map
|
||
|
||
class CsvExtractor:
    """Generic extractor that turns CSV files inside a zip archive into dataframes.

    Call :meth:`extract_one` for a single table or :meth:`extract_all` for
    every table named in the table/file map.
    """

    def __init__(self, zipfile: ZipFile, table_file_map: dict[str, str]):
        """Store the archive and the table-to-file lookup used for extraction.

        Args:
            zipfile: zipfile object containing source files
            table_file_map: map of table name to source file in zipfile archive
        """
        # NOTE(review): a reviewer suggested validating the map's filenames
        # against the archive contents; no such check is performed here.
        self._table_file_map = table_file_map
        self._zipfile = zipfile

    def get_table_names(self) -> list[str]:
        """Return the names of the tables this extractor can read."""
        return [*self._table_file_map]

    def extract_one(self, table_name: str) -> pd.DataFrame:
        """Read one table's CSV source file from the archive into a dataframe.

        Args:
            table_name: key into the table/file map selecting the file to read.

        Returns:
            The dataframe produced by :func:`pandas.read_csv` for that file.
        """
        logger.info(f"Extracting {table_name} from CSV into pandas DataFrame.")
        with self._zipfile.open(self._table_file_map[table_name]) as csv_file:
            return pd.read_csv(csv_file)

    def extract_all(self) -> dict[str, pd.DataFrame]:
        """Extract every known table, returning dataframes keyed by table name."""
        return {name: self.extract_one(name) for name in self.get_table_names()}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
"""Extract EIA Form 176 data from CSVs. | ||
|
||
The EIA Form 176 archive also contains CSVs for EIA Form 191 and EIA Form 757. | ||
davidmudrauskas marked this conversation as resolved.
Show resolved
Hide resolved
|
||
""" | ||
|
||
from dagster import asset | ||
|
||
from pudl.extract.csv import CsvExtractor, get_table_file_map | ||
|
||
# Dataset identifier passed to both the datastore and the table/file map lookup.
DATASET = "eia176"
|
||
|
||
@asset(required_resource_keys={"datastore"})
def raw_eia176__company(context):
    """Extract raw EIA company data from CSV sheets into dataframes.

    Args:
        context: dagster keyword that provides access to resources and config.

    Returns:
        An extracted EIA dataframe with company data.
    """
    # NOTE: written as a concrete asset for now; the author intends to replace
    # it with an asset factory at some point.
    zipfile = context.resources.datastore.get_zipfile_resource(DATASET)
    table_file_map = get_table_file_map(DATASET)
    extractor = CsvExtractor(zipfile, table_file_map)
    # The dataframe must be returned: without it the asset materializes None.
    return extractor.extract_one("company")
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
table,filename | ||
company,all_company_176.csv | ||
davidmudrauskas marked this conversation as resolved.
Show resolved
Hide resolved
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
"""Unit tests for pudl.extract.csv module.""" | ||
from unittest.mock import MagicMock, patch | ||
|
||
from pudl.extract.csv import CsvExtractor, get_table_file_map, open_csv_resource | ||
|
||
# Dataset whose packaged table/file map the tests read.
DATASET = "eia176"
# Name of the packaged CSV that maps table names to source filenames.
BASE_FILENAME = "table_file_map.csv"
# The single table/file pair the tests expect to find in that map.
TABLE_NAME = "company"
FILENAME = "all_company_176.csv"
# Expected parsed mapping; also used to build the extractor under test.
TABLE_FILE_MAP = {TABLE_NAME: FILENAME}
|
||
|
||
def get_csv_extractor():
    """Build a CsvExtractor backed by a mocked zipfile and the test table map."""
    return CsvExtractor(MagicMock(), TABLE_FILE_MAP)
|
||
|
||
def test_open_csv_resource():
    """The packaged table/file map opens with the expected header columns."""
    reader = open_csv_resource(DATASET, BASE_FILENAME)
    expected_columns = ["table", "filename"]
    assert reader.fieldnames == expected_columns
|
||
|
||
def test_get_table_file_map():
    """The parsed map for the dataset matches the expected table/file pairs."""
    assert get_table_file_map(DATASET) == TABLE_FILE_MAP
|
||
|
||
def test_get_table_names():
    """The extractor reports exactly the table names from its map."""
    names = get_csv_extractor().get_table_names()
    assert names == [TABLE_NAME]
|
||
|
||
@patch("pudl.extract.csv.pd")
def test_csv_extractor_read_source(mock_pd):
    """extract_one opens the mapped file and returns what pandas.read_csv yields."""
    extractor = get_csv_extractor()
    res = extractor.extract_one(TABLE_NAME)
    mock_zipfile = extractor._zipfile
    mock_zipfile.open.assert_called_once_with(FILENAME)
    # The handle given to read_csv is whatever the archive's context manager yields.
    f = mock_zipfile.open.return_value.__enter__.return_value
    mock_pd.read_csv.assert_called_once_with(f)
    # Compare against return_value rather than calling the mock again, which
    # would record an extra read_csv call.
    assert res is mock_pd.read_csv.return_value
|
||
|
||
def test_csv_extractor_extract():
    """extract_all calls extract_one once per table and keys results by table name."""
    extractor = get_csv_extractor()
    fake_df = MagicMock()
    with patch.object(
        CsvExtractor, "extract_one", return_value=fake_df
    ) as mock_extract_one:
        extracted = extractor.extract_all()
    mock_extract_one.assert_called_once_with(TABLE_NAME)
    assert extracted == {TABLE_NAME: fake_df}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This can be moved to an even more general space at some point but I didn't want to introduce another moving part in this PR.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, I could see this moving to `pudl.helpers` or something, but this is a fine place for it.