diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 index 0000000..c0df623 --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,28 @@ +name: docs + +on: + push: + branches: + - main +permissions: + contents: write +jobs: + deploy: + runs-on: ubuntu-latest + strategy: + max-parallel: 4 + matrix: + python-version: ["3.10"] + steps: + - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + cache: pip + - name: Install dependencies + run: | + pip install -e . + pip install hatch + - name: Deploy docs + run: hatch run docs:deploy \ No newline at end of file diff --git a/README.md b/README.md index f9ad285..8be017b 100644 --- a/README.md +++ b/README.md @@ -8,8 +8,12 @@ A python module for getting useful data out of ixbrl files. The library is at an early stage - feedback and improvements are very welcome. +Full documentation is available at [dkane.net/ixbrl-parse/](https://dkane.net/ixbrl-parse/) + ## Changelog +**New in version 0.7.0**: Add plugin support. Add documentation + **New in version 0.6.0**: Switch to use the [hatch](https://hatch.pypa.io/latest/) build and development system. **New in version 0.5.4**: Added backreferences to BeautifulSoup objects - thanks to @avyfain for PR. @@ -62,8 +66,6 @@ python -m ixbrlparse -h ### Use as a python module -An example of usage is shown in [`test.py`](test.py). - #### Import the `IXBRL` class which parses the file. ```python @@ -159,7 +161,7 @@ Note that the error catching is only available for parsing of `.nonnumeric` and `numeric` items in the document. Any other errors with parsing will be thrown as normal no matter what `raise_on_error` is set to. -## Code checks +## Development The module is setup for development using [hatch](https://hatch.pypa.io/latest/). 
diff --git a/docs/changelog.md b/docs/changelog.md new file mode 100644 index 0000000..5a78422 --- /dev/null +++ b/docs/changelog.md @@ -0,0 +1,13 @@ +# Changelog + +**New in version 0.7.0**: Add plugin support. Add documentation + +**New in version 0.6.0**: Switch to use the [hatch](https://hatch.pypa.io/latest/) build and development system. + +**New in version 0.5.4**: Added backreferences to BeautifulSoup objects - thanks to @avyfain for PR. + +**New in version 0.5.3**: Support for `exclude` and `continuation` elements within XBRL documents. Thanks to @wcollinscw for adding support for exclude elements. + +**New in version 0.5**: Support for Python 3.11 has been added. I've had some problems with Python 3.11 and Windows as lxml binaries aren't yet available. Also new in version 0.5 is type checking - the whole library now has types added. + +**New in version 0.4**: I've added initial support for pure XBRL files as well as tagged HTML iXBRL files. Feedback on this feature is welcome - particularly around getting values out of numeric items. \ No newline at end of file diff --git a/docs/command-line.md b/docs/command-line.md new file mode 100644 index 0000000..370ac5e --- /dev/null +++ b/docs/command-line.md @@ -0,0 +1,22 @@ +# Command line + +You can run the module directly to extract data from an IXBRL file. 
+ +```bash +ixbrlparse example_file.html +# or +python -m ixbrlparse example_file.html +``` + +The various options for using this can be found through: + +```bash +python -m ixbrlparse -h +# optional arguments: +# -h, --help show this help message and exit +# --outfile OUTFILE Where to output the file +# --format {csv,json,jsonlines,jsonl} +# format of the output +# --fields {numeric,nonnumeric,all} +# Which fields to output +``` \ No newline at end of file diff --git a/docs/development.md b/docs/development.md new file mode 100644 index 0000000..58e7c8e --- /dev/null +++ b/docs/development.md @@ -0,0 +1,62 @@ +# Development + +The module is set up for development using [hatch](https://hatch.pypa.io/latest/). + +## Run tests + +Tests can be run with `pytest`: + +```bash +hatch run test +``` + +## Test coverage + +Run tests then report on coverage + +```bash +hatch run cov +``` + +Run tests then run a server showing where coverage is missing + +```bash +hatch run cov-html +``` + +## Run typing checks + +```bash +hatch run lint:typing +``` + +## Linting + +Black and ruff should be run before committing any changes. + +To check for any changes needed: + +```bash +hatch run lint:style +``` + +To run any autoformatting possible: + +```sh +hatch run lint:fmt +``` + +## Run all checks at once + +```sh +hatch run lint:all +``` + +## Publish to PyPI + +```bash +hatch build +hatch publish +git tag v<VERSION_NUMBER> +git push origin v<VERSION_NUMBER> +``` \ No newline at end of file diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 0000000..11aba38 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,29 @@ +# ixbrlParse + +![Test status](https://github.com/drkane/ixbrl-parse/workflows/tests/badge.svg) +[![PyPI version](https://img.shields.io/pypi/v/ixbrlparse)](https://pypi.org/project/ixbrlparse/) +![PyPI - Python Version](https://img.shields.io/pypi/pyversions/ixbrlparse) +![PyPI - License](https://img.shields.io/pypi/l/ixbrlparse) + +A python module for getting useful data out of ixbrl files. 
The library is at an early stage - feedback and improvements are very welcome. + +## Requirements + +The module requires [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) and [lxml](https://lxml.de/) to parse the documents. + +[word2number](https://github.com/akshaynagpal/w2n) is used to process the +numeric items with the `numwordsen` format. + +## How to install + +You can install from pypi using pip: + +``` +pip install ixbrlparse +``` + +## Acknowledgements + +Originally developed for a project with +[Power to Change](https://www.powertochange.org.uk/) looking at how to extract data from +financial documents of community businesses. diff --git a/docs/plugins.md b/docs/plugins.md new file mode 100644 index 0000000..83b23e9 --- /dev/null +++ b/docs/plugins.md @@ -0,0 +1,59 @@ +# Plugins + +The module allows for plugins to customize functionality, using the [pluggy](https://pluggy.readthedocs.io/en/stable/) framework. + +The only current plugin endpoint is to add more Formatters. A formatter takes a value from an ixbrl item and converts it into the appropriate python value. For example, the `ixtNumWordsEn` formatter would take a value like "eighty-five" and turn it into 85. + +The formats used within ixbrl files can vary between schemas and countries. Rather than try to cover everything in this module, you can write a plugin to support the format that you need. + +## Creating a plugin + +### Create a custom format class + +To create a plugin, you first need to create a new format class that subclasses `ixbrlparse.ixbrlFormat`. This has two key components: + +- a `format_names` attribute which consists of a tuple of possible names for the format. These are the values that will be checked against the ixbrl items. These names must not clash with other formats that have already been defined. +- a `parse_value` function which takes the original text value and returns the processed value. 
+ +An example class might look like (in the file `ixbrlparse-dateplugin/ixbrlparse_dateplugin.py`): + +```python +import ixbrlparse + +class ixtParseIsoDate(ixbrlparse.ixbrlFormat): + format_names = ("isodateformat",) + + def parse_value(self, value): + return datetime.datetime.strptime(value, "%Y-%m-%d").astimezone().date() +``` + +### Hook into ixbrlparse + +Next you need to add a function which will hook into ixbrlparse at the right point. This function needs to be called `ixbrl_add_formats`, and returns a list of new format classes (added to the bottom of `ixbrlparse-dateplugin/ixbrlparse_dateplugin.py`): + +```python +@ixbrlparse.hookimpl +def ixbrl_add_formats(): + return [ixtParseIsoDate] +``` + +You then need to add an entrypoint to `setup.py` or to `pyproject.toml` which will be activated when your project is installed. This should look something like (using an example `ixbrlparse-dateplugin/setup.py`): + +```python +from setuptools import setup + +setup( + name="ixbrlparse-dateplugin", + install_requires="ixbrlparse", + entry_points={"ixbrlparse": ["dateplugin = ixbrlparse_dateplugin"]}, + py_modules=["ixbrlparse_dateplugin"], +) +``` + +### Install the plugin + +If you then install the plugin it should be picked up by ixbrlparse and will also include the additional formats when checking. + +## Acknowledgements + +The implementation of pluggy used here draws heavily on [pluggy's own tutorial](https://pluggy.readthedocs.io/en/stable/#a-complete-example) and @simonw's [implementation of plugins for datasette](https://docs.datasette.io/en/stable/plugins.html). diff --git a/docs/python-module.md b/docs/python-module.md new file mode 100644 index 0000000..8b39a3e --- /dev/null +++ b/docs/python-module.md @@ -0,0 +1,96 @@ +# Python module + +## Import the `IXBRL` class which parses the file. + +```python +from ixbrlparse import IXBRL +``` + +## Initialise an object and parse the file + +You need to pass a file handle or other object with a `.read()` method. 
+ +```python +with open('sample_ixbrl.html', encoding="utf8") as a: + x = IXBRL(a) +``` + +If your IXBRL data comes as a string then use an `io.StringIO` wrapper to +pass it to the class: + +```python +import io +from ixbrlparse import IXBRL + +content = '''<some ixbrl content>''' +x = IXBRL(io.StringIO(content)) +``` + + +## Get the contexts and units used in the data + +These are held in the object. The contexts are stored as a dictionary with the context +id as the key, and a `ixbrlContext` object as the value. + +```python +print(x.contexts) +# { +# "cfwd_2018_03_31": ixbrlContext( +# id="cfwd_2018_03_31", +# entity="0123456", # company number +# segments=[], # used for hypercubes +# instant="2018-03-31", +# startdate=None, # used for periods +# enddate=None, # used for periods +# ), +# .... +# } +``` + +The units are stored as key:value dictionary entries +```python +print(x.units) +# { +# "GBP": "ISO4217:GBP", +# "shares": "shares" +# } +``` + +## Get financial facts + +Numeric facts are stored in `x.numeric` as a list of `ixbrlNumeric` objects. +The `ixbrlNumeric.value` object contains the value as a parsed python number +(after the sign and scale formatting values have been applied). + +`ixbrlNumeric.context` holds the context object relating to this value. +The `.name` and `.schema` values give the key of this value, according to +the applied schema. + +Non-numeric facts are stored in `x.nonnumeric` as a list of `ixbrlNonNumeric` +objects, with similar `.value`, `.context`, `.name` and `.schema` values. +The value of `.value` will be a string for non-numeric facts, unless the item declares a supported format, in which case it holds the parsed value (for example a date) and the original string is kept in `.text`. + +## Check for any parsing errors + +By default, the parser will throw an exception if it encounters an error +when processing the document. + +You can pass `raise_on_error=False` to the initial object to suppress +these exceptions. You can then access a list of the errors (and the element) +that created them through the `.errors` attribute. 
For example: + +```python +with open('sample_ixbrl.html', encoding="utf8") as a: + x = IXBRL(a, raise_on_error=False) + print(x.errors) # populated with any exceptions found + # [ eg... + # { + # "error": , + # "element": + # } + # ] +``` + +Note that the error catching is only available for parsing of `.nonnumeric` +and `numeric` items in the document. Any other errors with parsing will be +thrown as normal no matter what `raise_on_error` is set to. diff --git a/docs/reference.md b/docs/reference.md new file mode 100644 index 0000000..6969c0f --- /dev/null +++ b/docs/reference.md @@ -0,0 +1,21 @@ +# API documentation + +## ixbrlparse.IXBRL + +::: src.ixbrlparse.core.IXBRL + +## ixbrlparse.ixbrlFormat + +::: src.ixbrlparse.components._base.ixbrlFormat + +## ixbrlparse.ixbrlContext + +::: src.ixbrlparse.components.context.ixbrlContext + +## ixbrlparse.ixbrlNonNumeric + +::: src.ixbrlparse.components.nonnumeric.ixbrlNonNumeric + +## ixbrlparse.ixbrlNumeric + +::: src.ixbrlparse.components.numeric.ixbrlNumeric diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 0000000..b41ac8d --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,27 @@ +site_name: ixbrlParse +repo_url: https://github.com/drkane/ixbrl-parse +site_description: A python module for getting useful data out of ixbrl files. 
+site_author: David Kane +theme: + name: material + features: + # - navigation.tabs + - navigation.sections +nav: + - index.md + - changelog.md + - 'Usage': + - 'command-line.md' + - 'python-module.md' + - 'Development': + - 'development.md' + - 'plugins.md' + - reference.md +plugins: +- mkdocstrings: + enabled: true + default_handler: python + handlers: + python: + options: + heading_level: 3 \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index de4a673..8252233 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,11 +27,12 @@ dependencies = [ "beautifulsoup4", "lxml", "word2number", + "pluggy", ] [project.urls] Homepage = "https://github.com/drkane/ixbrl-parse" -Documentation = "https://github.com/drkane/ixbrl-parse#readme" +Documentation = "https://dkane.net/ixbrl-parse/" Issues = "https://github.com/drkane/ixbrl-parse/issues" Source = "https://github.com/drkane/ixbrl-parse" @@ -73,6 +74,17 @@ cov-html = [ [[tool.hatch.envs.all.matrix]] python = ["3.8", "3.9", "3.10", "3.11"] +[tool.hatch.envs.docs] +dependencies = [ + "mkdocs", + "mkdocs-material", + "mkdocstrings[python]", +] +[tool.hatch.envs.docs.scripts] +serve = "mkdocs serve" +build = "mkdocs build" +deploy = "mkdocs gh-deploy --force" + [tool.hatch.envs.lint] detached = true dependencies = [ diff --git a/src/ixbrlparse/__about__.py b/src/ixbrlparse/__about__.py index 906d362..49e0fc1 100644 --- a/src/ixbrlparse/__about__.py +++ b/src/ixbrlparse/__about__.py @@ -1 +1 @@ -__version__ = "0.6.0" +__version__ = "0.7.0" diff --git a/src/ixbrlparse/__init__.py b/src/ixbrlparse/__init__.py index bec621c..b7de331 100644 --- a/src/ixbrlparse/__init__.py +++ b/src/ixbrlparse/__init__.py @@ -1,5 +1,5 @@ +from ixbrlparse.components import ixbrlContext, ixbrlFormat, ixbrlNonNumeric, ixbrlNumeric from ixbrlparse.core import IXBRL +from ixbrlparse.hookspecs import hookimpl, hookspec -__all__ = [ - "IXBRL", -] +__all__ = ["IXBRL", "hookimpl", "hookspec", "ixbrlContext", "ixbrlFormat", 
"ixbrlNonNumeric", "ixbrlNumeric"] diff --git a/src/ixbrlparse/cli/__init__.py b/src/ixbrlparse/cli/__init__.py index a8cd9ed..c735eaf 100644 --- a/src/ixbrlparse/cli/__init__.py +++ b/src/ixbrlparse/cli/__init__.py @@ -2,6 +2,7 @@ import json import logging import sys +from datetime import date from typing import Any, Dict import click @@ -44,6 +45,8 @@ def ixbrlparse_cli(output_format: str, fields: str, outfile, infile): elif output_format in ["jsonlines", "jsonl"]: values = x.to_table(fields) for v in values: + if isinstance(v["value"], date): + v["value"] = str(v["value"]) json.dump(v, outfile) outfile.write("\n") elif output_format == "json": diff --git a/src/ixbrlparse/components/__init__.py b/src/ixbrlparse/components/__init__.py index 1c18589..43613a0 100644 --- a/src/ixbrlparse/components/__init__.py +++ b/src/ixbrlparse/components/__init__.py @@ -1,5 +1,6 @@ +from ixbrlparse.components._base import ixbrlFormat from ixbrlparse.components.context import ixbrlContext from ixbrlparse.components.nonnumeric import ixbrlNonNumeric from ixbrlparse.components.numeric import ixbrlNumeric -__all__ = ["ixbrlContext", "ixbrlNonNumeric", "ixbrlNumeric"] +__all__ = ["ixbrlContext", "ixbrlNonNumeric", "ixbrlNumeric", "ixbrlFormat"] diff --git a/src/ixbrlparse/components/_base.py b/src/ixbrlparse/components/_base.py new file mode 100644 index 0000000..da5aff4 --- /dev/null +++ b/src/ixbrlparse/components/_base.py @@ -0,0 +1,81 @@ +from copy import deepcopy +from datetime import date +from typing import List, Optional, Tuple, Union + + +class ixbrlFormat: # noqa: N801 + """Class to represent an ixbrl format. + + This class should generally be subclassed to provide additional functionality. + + Attributes: + format_names: A tuple of format names that this class should be used for.""" + + format_names: Tuple[str, ...] 
= () + + def __init__( + self, + format_: str, + decimals: Optional[Union[int, str]] = None, + scale: Union[int, str] = 0, + sign: Optional[str] = None, + ) -> None: + """Initialise the ixbrl format object. + + Parameters: + format_: The name of the format. + decimals: The number of decimal places (only used for numeric formats). + scale: The scale of the format (only for numeric formats). + If more than 0 this value is used as the exponent for a value, so for example with a scale of + 4 and a value of 20, the parsed value is 20 * (10 ^ 4) == 200000. + sign: The sign of the format (only for numeric formats). The sign given is usually "-" or empty. + """ + if isinstance(decimals, str): + if decimals.lower() == "inf": + self.decimals = None + else: + self.decimals = int(decimals) + + self.format: Optional[str] = None + self.namespace: Optional[str] = None + if format_: + format_array: List[str] = format_.split(":") + if len(format_array) > 1: + self.format = ":".join(format_array[1:]) + self.namespace = format_array[0] + else: + self.format = ":".join(format_array) + self.namespace = None + + self.scale = int(scale) + self.sign = sign + + def to_json(self): + """Convert the object to a JSON serialisable dictionary.""" + return deepcopy(self.__dict__) + + def parse_value(self, value: Union[str, int, float]) -> Optional[Union[int, float, bool, date]]: + """Parse a value using the format. + + Parameters: + value: The value to parse. + + Returns: + The parsed value in the appropriate python type. 
+ """ + if isinstance(value, (int, float)): + return value + + if isinstance(value, str): + if value in ("-", ""): + return 0 + + value_numeric: float = float(value.replace(" ", "").replace(",", "")) + + if self.sign == "-": + value_numeric = value_numeric * -1 + + if self.scale != 0: + value_numeric = value_numeric * (10**self.scale) + + return value_numeric diff --git a/src/ixbrlparse/components/context.py b/src/ixbrlparse/components/context.py index 63fc623..fcb2d0b 100644 --- a/src/ixbrlparse/components/context.py +++ b/src/ixbrlparse/components/context.py @@ -4,6 +4,18 @@ class ixbrlContext: # noqa: N801 + """Class to represent an ixbrl context. + + The context should either have an instant date or a start and end date. + + Attributes: + id: The id of the context. + entity: A dictionary of the entity information. + segments: A list of dictionaries of the segment information. + instant: The instant date of the context. + startdate: The start date of the context. + enddate: The end date of the context.""" + def __init__( self, _id: str, @@ -37,6 +49,7 @@ def __repr__(self) -> str: return f"" def to_json(self) -> Dict[str, List[Dict[str, Any]]]: + """Convert the object to a JSON serialisable dictionary.""" values = deepcopy(self.__dict__) for i in ["startdate", "enddate", "instant"]: if isinstance(values[i], datetime.date): diff --git a/src/ixbrlparse/components/formats.py b/src/ixbrlparse/components/formats.py new file mode 100644 index 0000000..4e9a74e --- /dev/null +++ b/src/ixbrlparse/components/formats.py @@ -0,0 +1,165 @@ +import datetime +import re +import warnings +from typing import List, Optional, Tuple, Type, Union + +from ixbrlparse.components._base import ixbrlFormat +from ixbrlparse.hookspecs import hookimpl + + +class ixtZeroDash(ixbrlFormat): # noqa: N801 + format_names = ( + "zerodash", + "numdash", + "fixedzero", + "ixt:zerodash", + "ixt:numdash", + "ixt:fixedzero", + ) + + def parse_value(self, *_args, **_kwargs) -> Union[int, float]: + return 
class ixtNoContent(ixbrlFormat):  # noqa: N801
    """Format for items marked as having no content; every value parses to ``None``."""

    format_names = (
        "nocontent",
        "fixedempty",
        "ixt:nocontent",
        "ixt:fixedempty",
    )

    def parse_value(self, *_args, **_kwargs) -> None:
        """Ignore the input and return ``None``."""
        return None


class ixtFixedFalse(ixbrlFormat):  # noqa: N801
    """Format whose value is the constant ``False``."""

    format_names = (
        "booleanfalse",
        "fixedfalse",
        "ixt:booleanfalse",
        "ixt:fixedfalse",
    )

    def parse_value(self, *_args, **_kwargs) -> bool:
        """Ignore the input and return ``False``."""
        return False


class ixtFixedTrue(ixbrlFormat):  # noqa: N801
    """Format whose value is the constant ``True``."""

    format_names = (
        "booleantrue",
        "fixedtrue",
        "ixt:booleantrue",
        "ixt:fixedtrue",
    )

    def parse_value(self, *_args, **_kwargs) -> bool:
        """Ignore the input and return ``True``."""
        return True


class ixtNumComma(ixbrlFormat):  # noqa: N801
    """Numeric format that uses "," as the decimal mark and "." for grouping."""

    format_names = (
        "numcomma",
        "numdotcomma",
        "numspacecomma",
        "numcommadecimal",
        "ixt:numcomma",
        "ixt:numdotcomma",
        "ixt:numspacecomma",
        "ixt:numcommadecimal",
    )

    def parse_value(self, value: Union[str, int, float]) -> Optional[Union[int, float]]:
        """Drop "." separators, turn "," into ".", then defer to the base parser.

        Returns the parsed number; if the base parser yields something that is
        not a number, a warning is emitted and ``None`` is returned.
        """
        normalised = value.replace(".", "").replace(",", ".") if isinstance(value, str) else value
        number = super().parse_value(normalised)
        if not isinstance(number, (float, int)):
            msg = f"Could not parse value {normalised} as a number"  # pragma: no cover
            warnings.warn(msg, stacklevel=2)  # pragma: no cover
            return None  # pragma: no cover
        return number


class ixtNumWordsEn(ixbrlFormat):  # noqa: N801
    """Numeric format for numbers written out as English words."""

    format_names = (
        "numwordsen",
        "ixt:numwordsen",
    )

    def parse_value(self, value: Union[str, int, float]) -> Optional[Union[int, float]]:
        """Convert English number words to a number.

        "no" and "none" (any case) parse to ``0``; other strings are handed to
        ``word2number``. Non-string input goes through the base parser, with a
        warning and ``None`` if that does not produce a number.
        """
        if not isinstance(value, str):
            number = super().parse_value(value)
            if isinstance(number, (float, int)):
                return number
            msg = f"Could not parse value {value} as a number"  # pragma: no cover
            warnings.warn(msg, stacklevel=2)  # pragma: no cover
            return None  # pragma: no cover

        words = value.lower()
        if words in ("no", "none"):
            return 0
        # Imported lazily, as in the original, so word2number is only loaded
        # when this format is actually used.
        from word2number import w2n

        return w2n.word_to_num(words)
class ixbrlNonNumeric:  # noqa: N801
    """Class to represent a non-numeric ixbrl item.

    Attributes:
        schema: The schema prefix of the item name, or "unknown" if the name
            had no prefix or no name was given.
        name: The item name without its schema prefix, or ``None`` if no name
            was given.
        context: The context the item belongs to.
        format: The format object used to parse the value, if a format was
            given and is implemented.
        text: The original, unparsed string value.
        value: The parsed value if a format was applied, otherwise the
            original string.
        soup_tag: The BeautifulSoup tag the item was created from, if any.
    """

    def __init__(
        self,
        context: Optional[Union[ixbrlContext, str]] = None,
        name: Optional[str] = None,
        format_: Optional[str] = None,
        value: Optional[str] = None,
        soup_tag: Optional[Tag] = None,
    ) -> None:
        """Initialise the non-numeric item.

        Parameters:
            context: The context object (or context id) for the item.
            name: The item name, optionally prefixed with "schema:".
            format_: The name of the format used to parse ``value``.
            value: The raw text of the item.
            soup_tag: The source BeautifulSoup tag.
        """
        # ``name`` is optional, so give ``schema``/``name`` defaults up front;
        # otherwise the attributes would not exist when no name is passed.
        self.schema: str = "unknown"
        self.name: Optional[str] = None
        if isinstance(name, str):
            name_split: List[str] = name.split(":", maxsplit=1)
            if len(name_split) == NAME_SPLIT_EXPECTED:
                self.schema = name_split[0]
                self.name = name_split[1]
            else:
                self.name = name_split[0]

        self.context = context
        self.format: Optional[ixbrlFormat] = None
        self.text: Optional[str] = value
        # ``typing`` generics rather than ``X | Y`` to match the rest of the
        # package and the Python 3.8 support matrix.
        self.value: Optional[Union[int, float, bool, date, str]] = value
        if isinstance(format_, str) and format_ != "" and self.text is not None:
            try:
                self.format = get_format(format_)(format_=format_)
                self.value = self.format.parse_value(self.text)
            except NotImplementedError:
                # Unknown formats are not fatal: keep the raw text as the value.
                msg = f"Format {format_} not implemented - value '{value}' not parsed"
                warnings.warn(msg, stacklevel=2)
        self.soup_tag = soup_tag

    def to_json(self) -> Dict[str, Any]:
        """Convert the object to a JSON serialisable dictionary.

        The ``soup_tag`` attribute is left out, date values are written in ISO
        format, and the format and context objects are expanded through their
        own ``to_json`` methods.
        """
        values = {k: deepcopy(v) for k, v in self.__dict__.items() if k != "soup_tag"}
        if isinstance(self.value, date):
            values["value"] = self.value.isoformat()
        if isinstance(self.format, ixbrlFormat):
            values["format"] = self.format.to_json()
        if isinstance(self.context, ixbrlContext):
            values["context"] = self.context.to_json()
        return values
self.format.parse_value(self.text) + parsed_value = self.format.parse_value(self.text) + if isinstance(parsed_value, (int, float)): + self.value = parsed_value except ValueError: logging.info(attrs) raise diff --git a/src/ixbrlparse/components/transform.py b/src/ixbrlparse/components/transform.py index 8f2f780..ce19a60 100644 --- a/src/ixbrlparse/components/transform.py +++ b/src/ixbrlparse/components/transform.py @@ -1,95 +1,7 @@ -from copy import deepcopy -from typing import List, Optional, Type, Union +from typing import List, Optional, Type - -class ixbrlFormat: # noqa: N801 - def __init__( - self, - format_: str, - decimals: Optional[Union[int, str]], - scale: Union[int, str] = 1, - sign: Optional[str] = None, - ) -> None: - if isinstance(decimals, str): - if decimals.lower() == "inf": - self.decimals = None - else: - self.decimals = int(decimals) - - self.format: Optional[str] = None - self.namespace: Optional[str] = None - if format_: - format_array: List[str] = format_.split(":") - if len(format_array) > 1: - self.format = ":".join(format_array[1:]) - self.namespace = format_array[0] - else: - self.format = ":".join(format_array) - self.namespace = None - - self.scale = int(scale) - self.sign = sign - - def to_json(self): - return deepcopy(self.__dict__) - - def parse_value(self, value: Union[str, int, float]) -> Optional[Union[int, float, bool]]: - if isinstance(value, (int, float)): - return value - - if isinstance(value, str): - if value in ("-", ""): - return 0 - - value_numeric: float = float(value.replace(" ", "").replace(",", "")) - - if self.sign == "-": - value_numeric = value_numeric * -1 - - if self.scale != 0: - value_numeric = value_numeric * (10**self.scale) - - return value_numeric - - -class ixtZeroDash(ixbrlFormat): # noqa: N801 - def parse_value(self, *_args, **_kwargs) -> Union[int, float]: - return 0 - - -class ixtNoContent(ixbrlFormat): # noqa: N801 - def parse_value(self, *_args, **_kwargs) -> None: - return None - - -class 
ixtFixedFalse(ixbrlFormat): # noqa: N801 - def parse_value(self, *_args, **_kwargs) -> bool: - return False - - -class ixtFixedTrue(ixbrlFormat): # noqa: N801 - def parse_value(self, *_args, **_kwargs) -> bool: - return True - - -class ixtNumComma(ixbrlFormat): # noqa: N801 - def parse_value(self, value: Union[str, int, float]) -> Optional[Union[int, float]]: - if isinstance(value, str): - value = value.replace(".", "") - value = value.replace(",", ".") - return super().parse_value(value) - - -class ixtNumWordsEn(ixbrlFormat): # noqa: N801 - def parse_value(self, value: Union[str, int, float]) -> Optional[Union[int, float]]: - if isinstance(value, str): - value = value.lower() - if value in ("no", "none"): - return 0 - from word2number import w2n - - return w2n.word_to_num(value) - return super().parse_value(value) +from ixbrlparse.components._base import ixbrlFormat +from ixbrlparse.plugins import pm def get_format(format_: Optional[str]) -> Type[ixbrlFormat]: @@ -108,26 +20,20 @@ def get_format(format_: Optional[str]) -> Type[ixbrlFormat]: format_ = format_.replace("-", "") - if format_ in ("zerodash", "numdash", "fixedzero"): - return ixtZeroDash - - if format_ in ("nocontent", "fixedempty"): - return ixtNoContent - - if format_ in ("booleanfalse", "fixedfalse"): - return ixtFixedFalse - - if format_ in ("booleantrue", "fixedtrue"): - return ixtFixedTrue - - if format_ in ("numdotdecimal", "numcommadot", "numspacedot"): - return ixbrlFormat - - if format_ in ("numcomma", "numdotcomma", "numspacecomma", "numcommadecimal"): - return ixtNumComma - - if format_ in ("numwordsen"): - return ixtNumWordsEn + formats = {} + for additional_formats in pm.hook.ixbrl_add_formats(): + for format_class in additional_formats: + for format_str in format_class.format_names: + if format_str in formats: + msg = 'Format "{}" already exists (namespace "{}")'.format( + format_str, + namespace, + ) + raise ValueError(msg) + formats[format_str] = format_class + + if format_ in formats: 
+ return formats[format_] msg = 'Format "{}" not implemented (namespace "{}")'.format( original_format, diff --git a/src/ixbrlparse/core.py b/src/ixbrlparse/core.py index cf42358..7a45f0b 100644 --- a/src/ixbrlparse/core.py +++ b/src/ixbrlparse/core.py @@ -276,7 +276,17 @@ def _get_nonnumeric(self) -> None: class IXBRL: + """ + Parse an iXBRL file. + """ + def __init__(self, f: IO, raise_on_error: bool = True) -> None: # noqa: FBT001, FBT002 + """Constructor for the IXBRL class. + + Parameters: + f: File-like object to parse. + raise_on_error: Whether to raise an exception on error + """ self.soup = BeautifulSoup(f.read(), "xml", multi_valued_attributes=None) self.raise_on_error = raise_on_error self._get_parser() @@ -288,6 +298,12 @@ def __init__(self, f: IO, raise_on_error: bool = True) -> None: # noqa: FBT001, @classmethod def open(cls, filename: Union[str, Path], raise_on_error: bool = True): # noqa: FBT001, FBT002, A003 + """Open an iXBRL file. + + Parameters: + filename: Path to file to parse. + raise_on_error: Whether to raise an exception on error + """ with open(filename, "rb") as a: return cls(a, raise_on_error=raise_on_error) @@ -307,6 +323,19 @@ def __getattr__(self, name: str): return getattr(self.parser, name) def to_json(self) -> Dict: + """Return a JSON representation of the iXBRL file. + + Returns: + A dictionary containing the following keys: + + - schema: The schema used in the iXBRL file. + - namespaces: The namespaces used in the iXBRL file. + - contexts: The contexts used in the iXBRL file. + - units: The units used in the iXBRL file. + - nonnumeric: The non-numeric elements in the iXBRL file. + - numeric: The numeric elements in the iXBRL file. + - errors: The number of errors encountered when parsing the iXBRL file. 
+ """ return { "schema": self.schema, "namespaces": self.namespaces, @@ -318,6 +347,22 @@ def to_json(self) -> Dict: } def to_table(self, fields: str = "numeric") -> List[Dict]: + """Return a list of dictionaries representing the iXBRL file. + + This is suitable for passing to pandas.DataFrame.from_records(). + + Parameters: + fields: Which fields to include in the output. Can be "numeric", "nonnumeric" or "all". + + Returns: + A list of dictionaries representing the iXBRL file. + + Examples: + >>> import pandas as pd + >>> i = IXBRL.open("tests/fixtures/ixbrl/uk-gaap/2009-12-31/Company-Accounts-Data.xml") + >>> df = pd.DataFrame.from_records(i.to_table(fields="numeric")) + >>> df.head() + """ if fields == "nonnumeric": values = self.nonnumeric elif fields == "numeric": diff --git a/src/ixbrlparse/hookspecs.py b/src/ixbrlparse/hookspecs.py new file mode 100644 index 0000000..f336849 --- /dev/null +++ b/src/ixbrlparse/hookspecs.py @@ -0,0 +1,17 @@ +from typing import List, Type + +import pluggy + +from ixbrlparse.components.transform import ixbrlFormat + +hookimpl = pluggy.HookimplMarker("ixbrlparse") +hookspec = pluggy.HookspecMarker("ixbrlparse") + + +@hookspec +def ixbrl_add_formats() -> List[Type[ixbrlFormat]]: # type: ignore + """Add new formats to the ixbrlparse library. + + Returns: + List[[ixbrlFormat]]: A list of ixbrlFormat classes. 
+ """ diff --git a/src/ixbrlparse/plugins.py b/src/ixbrlparse/plugins.py new file mode 100644 index 0000000..911724c --- /dev/null +++ b/src/ixbrlparse/plugins.py @@ -0,0 +1,17 @@ +import importlib + +import pluggy + +from ixbrlparse import hookspecs + +DEFAULT_PLUGINS = ["ixbrlparse.components.formats"] + +pm = pluggy.PluginManager("ixbrlparse") +pm.add_hookspecs(hookspecs) + +pm.load_setuptools_entrypoints("ixbrlparse") + +# Load default plugins +for plugin in DEFAULT_PLUGINS: + mod = importlib.import_module(plugin) + pm.register(mod, plugin) diff --git a/tests/test_classes.py b/tests/test_classes.py index 6cee2e3..6f026e8 100644 --- a/tests/test_classes.py +++ b/tests/test_classes.py @@ -110,7 +110,7 @@ def test_nonnumeric_json(): def test_nonnumeric_schema(): - a = {"context": "", "format_": "", "value": ""} + a = {"context": "", "format_": "", "value": "", "soup_tag": None} x = ixbrlNonNumeric(name="schema:value", **a) assert x.schema == "schema" @@ -118,15 +118,15 @@ def test_nonnumeric_schema(): def test_numeric_value(): - assert ixbrlNumeric(**{"text": "1234"}).value == 1234 - assert ixbrlNumeric(**{"value": "1234"}).value == 1234 + assert ixbrlNumeric(text="1234").value == 1234 + assert ixbrlNumeric(value="1234").value == 1234 def test_numeric_value_error(): with pytest.raises(ValueError): - ixbrlNumeric(**{"text": "1234blahblab"}) + ixbrlNumeric(text="1234blahblab") with pytest.raises(ValueError): - ixbrlNumeric(**{"value": "1234blahblah"}) + ixbrlNumeric(value="1234blahblah") with pytest.raises(ValueError): ixbrlNumeric() @@ -171,102 +171,112 @@ def test_numeric_to_json(): def test_numeric_already_float(): - assert ixbrlNumeric(**{"value": 1234}).value == 1234 - assert ixbrlNumeric(**{"value": 1234.0}).value == 1234 + assert ixbrlNumeric(value=1234).value == 1234 + assert ixbrlNumeric(value=1234.0).value == 1234 def test_numeric_comma_replace(): - assert ixbrlNumeric(**{"text": "1,234"}).value == 1234 - assert ixbrlNumeric(**{"value": "1,234"}).value == 
1234 + assert ixbrlNumeric(text="1,234").value == 1234 + assert ixbrlNumeric(value="1,234").value == 1234 def test_numeric_sign(): - assert ixbrlNumeric(**{"text": "1,234", "sign": "-"}).value == -1234 - assert ixbrlNumeric(**{"value": "1,234", "sign": "-"}).value == -1234 - assert ixbrlNumeric(**{"value": "1,234", "sign": ""}).value == 1234 + assert ixbrlNumeric(text="1,234", sign="-").value == -1234 + assert ixbrlNumeric(value="1,234", sign="-").value == -1234 + assert ixbrlNumeric(value="1,234", sign="").value == 1234 def test_numeric_blank(): - assert ixbrlNumeric(**{"value": "-"}).value == 0 - assert ixbrlNumeric(**{"text": "-"}).value == 0 + assert ixbrlNumeric(value="-").value == 0 + assert ixbrlNumeric(text="-").value == 0 def test_numeric_scale(): - assert ixbrlNumeric(**{"value": "1,234", "scale": "0"}).value == 1234 - assert ixbrlNumeric(**{"value": "1,234", "scale": "1"}).value == 12340 - assert ixbrlNumeric(**{"text": "1,234", "scale": "2"}).value == 123400 + assert ixbrlNumeric(value="1,234", scale="0").value == 1234 + assert ixbrlNumeric(value="1,234", scale="1").value == 12340 + assert ixbrlNumeric(text="1,234", scale="2").value == 123400 def test_numeric_scale_sign(): - assert ixbrlNumeric(**{"value": "1,234", "scale": "3", "sign": "-"}).value == -1234000 - assert ixbrlNumeric(**{"text": "1,234", "scale": "3", "sign": "-"}).value == -1234000 + assert ixbrlNumeric(value="1,234", scale="3", sign="-").value == -1234000 + assert ixbrlNumeric(text="1,234", scale="3", sign="-").value == -1234000 def test_numeric_inf_format(): - assert ixbrlNumeric(**{"text": "1234", "decimals": "INF"}).value == 1234 + assert ixbrlNumeric(text="1234", decimals="INF").value == 1234 def test_format_zerodash(): - assert ixbrlNumeric(**{"text": "-", "format": "zerodash"}).value == 0 - assert ixbrlNumeric(**{"text": "-", "format": "numdash"}).value == 0 - assert ixbrlNumeric(**{"text": "-", "format": "numdotdecimal"}).value == 0 + assert ixbrlNumeric(text="-", 
format="zerodash").value == 0 + assert ixbrlNumeric(text="-", format="numdash").value == 0 + assert ixbrlNumeric(text="-", format="numdotdecimal").value == 0 def test_format_nocontent(): - assert ixbrlNumeric(**{"text": "-", "format": "nocontent"}).value is None - assert ixbrlNumeric(**{"text": "-", "format": "fixed-empty"}).value is None + assert ixbrlNumeric(text="-", format="nocontent").value is None + assert ixbrlNumeric(text="-", format="fixed-empty").value is None def test_format_fixed_true(): - assert ixbrlNumeric(**{"text": "-", "format": "fixed-true"}).value is True - assert ixbrlNumeric(**{"text": "-", "format": "booleantrue"}).value is True + assert ixbrlNumeric(text="-", format="fixed-true").value is True + assert ixbrlNumeric(text="-", format="booleantrue").value is True def test_format_fixed_false(): - assert ixbrlNumeric(**{"text": "-", "format": "fixed-false"}).value is False - assert ixbrlNumeric(**{"text": "-", "format": "booleanfalse"}).value is False + assert ixbrlNumeric(text="-", format="fixed-false").value is False + assert ixbrlNumeric(text="-", format="booleanfalse").value is False def test_format_numdotdecimal(): - assert ixbrlNumeric(**{"text": "1234.12", "format": "numdotdecimal"}).value == 1234.12 - assert ixbrlNumeric(**{"text": "1234", "format": "numdotdecimal"}).value == 1234 - assert ixbrlNumeric(**{"text": "1234.34", "format": "numcommadot"}).value == 1234.34 - assert ixbrlNumeric(**{"text": "1234.45", "format": "numspacedot"}).value == 1234.45 - assert ixbrlNumeric(**{"text": "1,234.45", "format": "numspacedot"}).value == 1234.45 - assert ixbrlNumeric(**{"text": "1234.12", "format": "num-dot-decimal"}).value == 1234.12 + assert ixbrlNumeric(text="1234.12", format="numdotdecimal").value == 1234.12 + assert ixbrlNumeric(text="1234", format="numdotdecimal").value == 1234 + assert ixbrlNumeric(text="1234.34", format="numcommadot").value == 1234.34 + assert ixbrlNumeric(text="1234.45", format="numspacedot").value == 1234.45 + assert 
ixbrlNumeric(text="1,234.45", format="numspacedot").value == 1234.45 + assert ixbrlNumeric(text="1234.12", format="num-dot-decimal").value == 1234.12 def test_format_numcomma(): - assert ixbrlNumeric(**{"text": "1234,12", "format": "numcomma"}).value == 1234.12 - assert ixbrlNumeric(**{"text": "1234", "format": "numcomma"}).value == 1234 - assert ixbrlNumeric(**{"text": "1234,34", "format": "numcomma"}).value == 1234.34 - assert ixbrlNumeric(**{"text": "1234,45", "format": "numcomma"}).value == 1234.45 - assert ixbrlNumeric(**{"text": "1.234,45", "format": "numcomma"}).value == 1234.45 - assert ixbrlNumeric(**{"text": "1234,12", "format": "numcomma"}).value == 1234.12 + assert ixbrlNumeric(text="1234,12", format="numcomma").value == 1234.12 + assert ixbrlNumeric(text="1234", format="numcomma").value == 1234 + assert ixbrlNumeric(text="1234,34", format="numcomma").value == 1234.34 + assert ixbrlNumeric(text="1234,45", format="numcomma").value == 1234.45 + assert ixbrlNumeric(text="1.234,45", format="numcomma").value == 1234.45 + assert ixbrlNumeric(text="1234,12", format="numcomma").value == 1234.12 def test_format_numwordsen(): assert ( ixbrlNumeric( - **{ - "text": "one thousand two hundred and thirty four", - "format": "numwordsen", - } + text="one thousand two hundred and thirty four", + format="numwordsen", ).value == 1234 ) - assert ixbrlNumeric(**{"text": "eight", "format": "numwordsen"}).value == 8 - assert ixbrlNumeric(**{"text": 8, "format": "numwordsen"}).value == 8 - assert ixbrlNumeric(**{"text": "Eight", "format": "numwordsen"}).value == 8 + assert ixbrlNumeric(text="eight", format="numwordsen").value == 8 + assert ixbrlNumeric(text=8, format="numwordsen").value == 8 + assert ixbrlNumeric(text="Eight", format="numwordsen").value == 8 assert ( ixbrlNumeric( - **{ - "text": "one thousand two hundred and thirty four point four five", - "format": "numwordsen", - } + text="one thousand two hundred and thirty four point four five", + format="numwordsen", 
).value == 1234.45 ) - assert ixbrlNumeric(**{"text": "no", "format": "numwordsen"}).value == 0 - assert ixbrlNumeric(**{"text": "None", "format": "numwordsen"}).value == 0 - assert ixbrlNumeric(**{"text": "none", "format": "numwordsen"}).value == 0 + assert ixbrlNumeric(text="no", format="numwordsen").value == 0 + assert ixbrlNumeric(text="None", format="numwordsen").value == 0 + assert ixbrlNumeric(text="none", format="numwordsen").value == 0 + + +def test_format_dates(): + assert ixbrlNonNumeric(value="20 September 2020", format_="datelonguk").value == datetime.date(2020, 9, 20) + assert ixbrlNonNumeric(value="20 September 2020", format_="datedaymonthyearen").value == datetime.date(2020, 9, 20) + assert ixbrlNonNumeric(value="20th September 2020", format_="datedaymonthyearen").value == datetime.date( + 2020, 9, 20 + ) + assert ixbrlNonNumeric(value="20.9.20", format_="datedaymonthyear").value == datetime.date(2020, 9, 20) + + +def test_format_notimplemented(): + with pytest.warns(): + assert ixbrlNonNumeric(value="blahdeblah", format_="blahdeblah").value == "blahdeblah" diff --git a/tests/test_formats.py b/tests/test_formats.py new file mode 100644 index 0000000..4f8f91a --- /dev/null +++ b/tests/test_formats.py @@ -0,0 +1,35 @@ +from datetime import date + +import pytest + +from ixbrlparse.components.formats import ixtDateFormat, ixtNumComma, ixtNumWordsEn + + +def test_ixbrl_date_format(): + f = ixtDateFormat("dateformat") + assert f.parse_value("2019-01-01") == date(2019, 1, 1) + + with pytest.warns(): + assert f.parse_value(1234) is None + + with pytest.raises(ValueError): + assert f.parse_value("04/05/2019") is None + + +def test_ixtnumwordsen(): + f = ixtNumWordsEn("format") + assert f.parse_value("no") == 0 + assert f.parse_value("eighty-five") == 85.0 + + with pytest.raises(ValueError): + assert f.parse_value("blurdy-burg") is None + + +def test_ixtnumcomma(): + f = ixtNumComma("format") + assert f.parse_value("0") == 0 + assert f.parse_value("85") == 
85.0 + assert f.parse_value("85.123") == 85123.0 + + with pytest.raises(ValueError): + assert f.parse_value("blurdy-burg") is None diff --git a/tests/test_parse.py b/tests/test_parse.py index f3daa40..b6a8735 100644 --- a/tests/test_parse.py +++ b/tests/test_parse.py @@ -312,7 +312,7 @@ def test_exclude(): value_seen = False for n in x.nonnumeric: if n.name == "BalanceSheetDate": - assert n.value == "31 July 2022" + assert n.value == date(2022, 7, 31) value_seen = True assert value_seen