Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make excelparser that converts a filled excel sheet to an ontology #309

Merged
merged 34 commits into from
Dec 12, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
8b248fd
Added first draft of excel2onto
francescalb Nov 26, 2021
eab0472
Dummy example for make_microstructure_onto.py added.
francescalb Dec 1, 2021
e7e2df3
Added module excelparser which is used by excel2onto
francescalb Dec 1, 2021
cf035af
Merge branch 'master' into flb/close-302-excelparser
francescalb Dec 1, 2021
8db19da
Added choice for sheetname for Concepts
francescalb Dec 1, 2021
9c621c5
Merge branch 'master' of github.com:emmo-repo/EMMO-python into flb/cl…
francescalb Dec 1, 2021
dd5794a
Cleaned up dataframe to be analysed
francescalb Dec 1, 2021
7db9a8e
Added option to read base_iri from metadata
francescalb Dec 1, 2021
991278f
Added import ontologies from excel
francescalb Dec 1, 2021
ed11bcf
Try to fix documentation
francescalb Dec 2, 2021
b0d9362
Merge branch 'master' of github.com:emmo-repo/EMMO-python into flb/cl…
francescalb Dec 7, 2021
8d69516
Fixed documentation error
francescalb Dec 7, 2021
912ad2b
Added pandas to requirements
francescalb Dec 7, 2021
ce0476b
Corrected google docstring for returning Tuple with typing
francescalb Dec 7, 2021
9bd9e83
Fixed typing error
francescalb Dec 8, 2021
d746812
Added openpyxl to requirements
francescalb Dec 8, 2021
83f48b8
Added first test for adding metadata authors and contributors
francescalb Dec 8, 2021
b4f436c
Added own function for metadata
francescalb Dec 8, 2021
38199f1
Added more metadata and altLabels
francescalb Dec 8, 2021
b81fd0a
Added altLabel
francescalb Dec 8, 2021
72dd4b0
Corrected so that empty fields are not included
francescalb Dec 8, 2021
9f89d10
owlready2.Ontology->ontopy.Ontology
francescalb Dec 9, 2021
8dca4b2
parent=owl:Thing if parents missing in parser
francescalb Dec 9, 2021
a51dcb1
typo
francescalb Dec 9, 2021
091d2cb
Changed owlready.Ontology to ontopy.ontology.Ontology
francescalb Dec 9, 2021
b3b7c22
Added helper function for adding metadata
francescalb Dec 9, 2021
0e62f86
Use function for adding literals also on concepts
francescalb Dec 10, 2021
b527856
_add_literal/_parse_literal accespt pd.Series and pd.DataFrame for data
francescalb Dec 10, 2021
9209a4a
Merge branch 'master' of github.com:emmo-repo/EMMO-python into flb/cl…
francescalb Dec 10, 2021
8a59622
Added a test for the excel_parser
francescalb Dec 10, 2021
c44a63b
Corrected path in test
francescalb Dec 10, 2021
862eabe
Added creator instead of author and changed excel2onto arg to --output
francescalb Dec 10, 2021
b9ec0a1
Merge branch 'master' of github.com:emmo-repo/EMMO-python into flb/cl…
francescalb Dec 10, 2021
ee6621e
Updated excelparser ontology for comparison
francescalb Dec 10, 2021
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions docs/api_reference/ontopy/excelparser.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# excelparser

::: ontopy.excelparser
10 changes: 10 additions & 0 deletions examples/ontology-from-excel/make_microstructure_onto.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
"""
Example: create an ontology from a filled-in Excel workbook.

The workbook `tool/onto.xlsx` is parsed with
`create_ontology_from_excel`, the resulting ontology is saved in turtle
format and the returned catalog is handed to `write_catalog`.
"""
from ontopy.excelparser import create_ontology_from_excel
from ontopy.utils import write_catalog

# Parse the workbook; the call returns the ontology and a catalog dict.
ontology, catalog = create_ontology_from_excel("tool/onto.xlsx")

# Serialise the ontology, replacing any previous output file.
ontology.save(
    "microstructure_ontology.ttl",
    format="turtle",
    overwrite=True,
)

# Write out the catalog returned by the parser.
write_catalog(catalog)
Binary file added examples/ontology-from-excel/tool/onto.xlsx
Binary file not shown.
264 changes: 264 additions & 0 deletions ontopy/excelparser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,264 @@
"""
Module for parsing an excelfile and creating an
ontology from it.

The excelfile is read by pandas and the pandas
dataframe should have column names:
prefLabel, altLabel, Elucidation, Comments, Examples,
subClassOf, Relations.

Note that correct case is mandatory.
"""
import warnings
from typing import Tuple, Union
import pyparsing
import pandas as pd
import ontopy
from ontopy import World, get_ontology
from ontopy.utils import NoSuchLabelError
from ontopy.manchester import evaluate
import owlready2 # pylint: disable=C0411


def english(string):
    """Return `string` wrapped as an owlready2 `locstr` tagged "en".

    Arguments:
        string: The text to tag with the English language code.

    Returns:
        The result of `owlready2.locstr(string, lang="en")`.
    """
    language = "en"
    return owlready2.locstr(string, lang=language)


def create_ontology_from_excel(  # pylint: disable=too-many-arguments
    excelpath: str,
    concept_sheet_name: str = "Concepts",
    metadata_sheet_name: str = "Metadata",
    base_iri: str = "http://emmo.info/emmo/domain/onto#",
    base_iri_from_metadata: bool = True,
    catalog: dict = None,
) -> Tuple[ontopy.ontology.Ontology, dict]:
    """
    Build an ontology from the sheets of an excel workbook.

    Two sheets are read with pandas and handed over to
    `create_ontology_from_pandas`, whose result is returned unchanged.

    Arguments:
        excelpath: Path to the excel workbook.
        concept_sheet_name: Name of the sheet holding the concepts.
        metadata_sheet_name: Name of the sheet holding the metadata.
        base_iri: Base IRI forwarded to `create_ontology_from_pandas`.
        base_iri_from_metadata: Forwarded to
            `create_ontology_from_pandas`.
        catalog: Dict of imported ontologies with key name and value
            path; forwarded to `create_ontology_from_pandas`.

    Returns:
        The tuple returned by `create_ontology_from_pandas`.
    """
    # TODO: Some magic to identify the header row.  For now the first
    # and third rows of the concept sheet are skipped unconditionally.
    rows_to_skip = [0, 2]
    concept_frame = pd.read_excel(
        excelpath,
        sheet_name=concept_sheet_name,
        skiprows=rows_to_skip,
    )
    metadata_frame = pd.read_excel(excelpath, sheet_name=metadata_sheet_name)

    result = create_ontology_from_pandas(
        data=concept_frame,
        metadata=metadata_frame,
        base_iri=base_iri,
        base_iri_from_metadata=base_iri_from_metadata,
        catalog=catalog,
    )
    return result


def create_ontology_from_pandas(  # pylint: disable=too-many-locals,too-many-branches,too-many-statements
    data: pd.DataFrame,
    metadata: pd.DataFrame,
    base_iri: str = "http://emmo.info/emmo/domain/onto#",
    base_iri_from_metadata: bool = True,
    catalog: dict = None,
) -> Tuple[ontopy.ontology.Ontology, dict]:
    """
    Create an ontology from a pandas DataFrame.

    Arguments:
        data: Concept table.  The columns "prefLabel", "subClassOf",
            "Relations", "Elucidation", "Examples", "Comments" and
            "altLabel" are read.  Rows without a prefLabel are ignored.
        metadata: Metadata table, passed to
            `get_metadata_from_dataframe`.
        base_iri: Base IRI of the new ontology.  It is kept as the final
            base IRI when `base_iri_from_metadata` is False.
        base_iri_from_metadata: Whether the base IRI may be taken from
            the metadata table.
        catalog: Optional dict of imported ontologies that is passed on
            to `get_metadata_from_dataframe` and returned.

    Returns:
        Tuple `(onto, catalog)` with the populated ontology and the
        catalog returned by `get_metadata_from_dataframe`.
    """

    # Remove Concepts without prefLabel and make all to string
    data = data[data["prefLabel"].notna()]
    data = data.astype({"prefLabel": "str"})

    # Make new ontology
    world = World()
    onto = world.get_ontology(base_iri)

    # Forward the caller's options; otherwise a catalog supplied by the
    # caller would be replaced by a fresh one and silently lost.
    onto, catalog = get_metadata_from_dataframe(
        metadata,
        onto,
        base_iri_from_metadata=base_iri_from_metadata,
        catalog=catalog,
    )

    # Keep the explicitly given base_iri when metadata must not be used
    if not base_iri_from_metadata:
        onto.base_iri = base_iri

    onto.sync_python_names()
    with onto:
        # Loop through the rows until a pass adds no more classes, then
        # run exactly one fallback pass in which concepts with missing
        # parents are placed directly under owl:Thing.
        final_loop = False
        while True:
            number_of_added_classes = 0
            for _, row in data.iterrows():
                name = row["prefLabel"]
                try:
                    existing = onto.get_by_label(name)
                    if isinstance(existing, owlready2.ThingClass):
                        # Already created in an earlier pass
                        continue
                except NoSuchLabelError:
                    pass

                parent_names = str(row["subClassOf"]).split(";")

                try:
                    parents = [onto.get_by_label(pn) for pn in parent_names]
                except NoSuchLabelError:
                    if not final_loop:
                        # A parent may still be created later; retry
                        # this row in the next pass.
                        continue
                    parents = owlready2.Thing
                    warnings.warn(
                        "Missing at least one of the defined parents. "
                        f"Concept: {name}; Defined parents: {parent_names}"
                    )

                concept = onto.new_entity(name, parents)
                # Add elucidation
                _add_literal(
                    row, concept.elucidation, "Elucidation", only_one=True
                )

                # Add examples
                _add_literal(row, concept.example, "Examples")

                # Add comments
                _add_literal(row, concept.comment, "Comments")

                # Add altLabels
                _add_literal(row, concept.altLabel, "altLabel")

                number_of_added_classes += 1

            if final_loop:
                # The fallback pass has run, so every row is handled.
                # Stopping here unconditionally also terminates the loop
                # when no concept had a missing parent.
                break
            if number_of_added_classes == 0:
                final_loop = True

    # Add properties in a second loop
    for _, row in data.iterrows():
        properties = row["Relations"]
        if not isinstance(properties, str):
            # Empty cell (NaN): no relations for this concept
            continue
        label = row["prefLabel"]
        try:
            concept = onto.get_by_label(label)
        except NoSuchLabelError:
            # Skip the row instead of attaching its relations to the
            # concept left over from a previous iteration.
            warnings.warn(
                f"Concept {label} not found in ontology. "
                f"Relations not added: {properties}."
            )
            continue
        for prop in properties.split(";"):
            try:
                concept.is_a.append(evaluate(onto, prop))
            except pyparsing.ParseException as err:
                warnings.warn(
                    f"Error in Property assignment for: {concept}. "
                    f"Property to be Evaluated: {prop}. "
                    f"Error is {err}."
                )

    # Synchronise Python attributes to ontology
    onto.sync_attributes(
        name_policy="uuid", name_prefix="EMMO_", class_docstring="elucidation"
    )
    onto.dir_label = False
    return onto, catalog


def get_metadata_from_dataframe(  # pylint: disable=too-many-locals,too-many-branches,too-many-statements
    metadata: pd.DataFrame,
    onto: owlready2.Ontology = None,
    base_iri_from_metadata: bool = True,
    catalog: dict = None,
) -> Tuple[ontopy.ontology.Ontology, dict]:
    """
    Populate an ontology with metadata from a pd.DataFrame.

    The metadata names "Ontology IRI", "Imported ontologies", "Title",
    "License", "Author", "Contributor" and "Ontology version Info" are
    looked up in `metadata`.

    Arguments:
        metadata: Table with the metadata.
        onto: Ontology to populate.  If None, `get_ontology()` is called
            to obtain one.
        base_iri_from_metadata: If True, try to set `onto.base_iri` to
            the first "Ontology IRI" value with "#" appended.
        catalog: Dict that receives one entry per imported ontology
            (base IRI without trailing "/" -> path).  A new dict is
            created if None.

    Returns:
        Tuple `(onto, catalog)`.
    """
    handled_errors = (TypeError, ValueError, AttributeError)

    if onto is None:
        onto = get_ontology()

    # Take the base IRI from the metadata if requested and available
    if base_iri_from_metadata:
        try:
            iris = _parse_literal(metadata, "Ontology IRI", metadata=True)
            if len(iris) > 1:
                warnings.warn(
                    "More than one Ontology IRI given. The first was chosen."
                )
            onto.base_iri = iris[0] + "#"
        except handled_errors:
            pass

    # Paths of the ontologies to import; none if the entry is unusable
    try:
        import_paths = _parse_literal(
            metadata,
            "Imported ontologies",
            metadata=True,
        )
    except handled_errors:
        import_paths = []

    # Load and register every imported ontology
    if catalog is None:
        catalog = {}
    for import_path in import_paths:
        loaded = onto.world.get_ontology(import_path).load()
        onto.imported_ontologies.append(loaded)
        catalog[loaded.base_iri.rstrip("/")] = import_path

    # (attribute of onto.metadata, metadata name, keep only first value)
    # Authors are stored as creator: onto.metadata.author does not work!
    literal_fields = (
        ("title", "Title", True),
        ("license", "License", False),
        ("creator", "Author", False),
        ("contributor", "Contributor", False),
        ("versionInfo", "Ontology version Info", True),
    )
    with onto:
        for attribute, field_name, single in literal_fields:
            _add_literal(
                metadata,
                getattr(onto.metadata, attribute),
                field_name,
                metadata=True,
                only_one=single,
            )

    return onto, catalog


def _parse_literal(
data: Union[pd.DataFrame, pd.Series],
name: str,
metadata: bool = False,
sep: str = ";",
) -> list:
"""Helper function to make list ouf strings from ';'-delimited
strings in one string.
"""

if metadata is True:
values = data.loc[data["Metadata name"] == name]["Value"].item()
else:
values = data[name]
if not pd.isna(values):
return str(values).split(sep)
return values.split(sep)


def _add_literal(  # pylint: disable=too-many-arguments
    data: Union[pd.DataFrame, pd.Series],
    destination: owlready2.prop.IndividualValueList,  #
    name: str,
    metadata: bool = False,
    only_one: bool = False,
    sep: str = ";",
) -> None:
    """Helper function to add English literals from `data` to
    `destination`.

    The strings are obtained with `_parse_literal` and wrapped with
    `english` before being added.

    Arguments:
        data: Table or row to read from; passed to `_parse_literal`.
        destination: List-like object the literals are added to.
        name: Name of the entry to read; passed to `_parse_literal`.
        metadata: Passed to `_parse_literal`.
        only_one: If True and more than one string is found, warn and
            add only the first one.
        sep: Separator between the strings; passed to `_parse_literal`.

    A TypeError, ValueError or AttributeError raised while parsing or
    adding results in a warning that nothing was added.
    """
    try:
        literals = _parse_literal(data, name, metadata=metadata, sep=sep)
        surplus = only_one is True and len(literals) > 1
        if not surplus:
            destination.extend([english(text) for text in literals])
        else:
            warnings.warn(
                f"More than one {name} is given. The first was chosen."
            )
            destination.append(english(literals[0]))
    except (TypeError, ValueError, AttributeError):
        warnings.warn(f"No {name} added.")
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@ blessings>=1.7,<2
Cython>=0.29.21,<0.30
defusedxml>=0.7.1,<1
graphviz>=0.16,<0.20
openpyxl>=3.0.9,<3.1
Owlready2>=0.28,<0.36,!=0.32,!=0.34
packaging>=21.0,<22
pandas>=1.2,<1.4
pydot>=1.4.1,<2
Pygments>=2.7.4,<3
pyparsing>=2.4.7
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ def fglob(patt):
"tools/emmocheck",
"tools/ontoconvert",
"tools/ontoversion",
"tools/excel2onto",
],
package_data={
"ontopy.factpluspluswrapper.java.lib.so": ["*"],
Expand Down
17 changes: 17 additions & 0 deletions tests/test_excelparser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from ontopy import get_ontology
from ontopy.excelparser import create_ontology_from_excel
from ontopy.utils import write_catalog


def test_excelparser(repo_dir: "Path") -> None:
    """Compare the ontology generated from `onto.xlsx` with the stored
    reference ontology `fromexcelonto.ttl`.

    Arguments:
        repo_dir: Root directory of the repository.
    """
    testdir = repo_dir / "tests" / "testonto" / "excelparser"

    # Reference ontology to compare against
    onto = get_ontology(str(testdir / "fromexcelonto.ttl")).load()

    # Ontology generated from the excel workbook; the catalog is unused
    ontology, _ = create_ontology_from_excel(testdir / "onto.xlsx")

    assert onto == ontology


if __name__ == "__main__":
    # `test_excelparser` requires the repository root as argument
    # (presumably supplied by a pytest fixture when run under pytest).
    # This file lives in `tests/`, so the root is two levels up.
    from pathlib import Path

    test_excelparser(Path(__file__).resolve().parent.parent)
6 changes: 6 additions & 0 deletions tests/testonto/excelparser/catalog-v001.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<catalog prefer="public" xmlns="urn:oasis:names:tc:entity:xmlns:xml:catalog">
<group id="Folder Repository, directory=, recursive=true, Auto-Update=false, version=2" prefer="public" xml:base="">
<uri name="https://raw.githubusercontent.com/emmo-repo/emmo-repo.github.io/master/versions/1.0.0-beta/emmo-inferred-chemistry" uri="https://raw.githubusercontent.com/emmo-repo/emmo-repo.github.io/master/versions/1.0.0-beta/emmo-inferred-chemistry2.ttl"/>
</group>
</catalog>
Loading