Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make excelparser that converts a filled excel sheet to an ontology #309

Merged
merged 34 commits into from
Dec 12, 2021
Merged
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
8b248fd
Added first draft of excel2onto
francescalb Nov 26, 2021
eab0472
Dummy example for make_microstructure_onto.py added.
francescalb Dec 1, 2021
e7e2df3
Added module excelparser which is used by excel2onto
francescalb Dec 1, 2021
cf035af
Merge branch 'master' into flb/close-302-excelparser
francescalb Dec 1, 2021
8db19da
Added choice for sheetname for Concepts
francescalb Dec 1, 2021
9c621c5
Merge branch 'master' of github.com:emmo-repo/EMMO-python into flb/cl…
francescalb Dec 1, 2021
dd5794a
Cleaned up dataframe to be analysed
francescalb Dec 1, 2021
7db9a8e
Added option to read base_iri from metadata
francescalb Dec 1, 2021
991278f
Added import ontologies from excel
francescalb Dec 1, 2021
ed11bcf
Try to fix documentation
francescalb Dec 2, 2021
b0d9362
Merge branch 'master' of github.com:emmo-repo/EMMO-python into flb/cl…
francescalb Dec 7, 2021
8d69516
Fixed documentation error
francescalb Dec 7, 2021
912ad2b
Added pandas to requirements
francescalb Dec 7, 2021
ce0476b
Corrected google docstring for returning Tuple with typing
francescalb Dec 7, 2021
9bd9e83
Fixed typing error
francescalb Dec 8, 2021
d746812
Added openpyxl to requirements
francescalb Dec 8, 2021
83f48b8
Added first test for adding metadata authors and contributors
francescalb Dec 8, 2021
b4f436c
Added own function for metadata
francescalb Dec 8, 2021
38199f1
Added more metadata and altLabels
francescalb Dec 8, 2021
b81fd0a
Added altLabel
francescalb Dec 8, 2021
72dd4b0
Corrected so that empty fields are not included
francescalb Dec 8, 2021
9f89d10
owlready2.Ontology->ontopy.Ontology
francescalb Dec 9, 2021
8dca4b2
parent=owl:Thing if parents missing in parser
francescalb Dec 9, 2021
a51dcb1
typo
francescalb Dec 9, 2021
091d2cb
Changed owlready.Ontology to ontopy.ontology.Ontology
francescalb Dec 9, 2021
b3b7c22
Added helper function for adding metadata
francescalb Dec 9, 2021
0e62f86
Use function for adding literals also on concepts
francescalb Dec 10, 2021
b527856
_add_literal/_parse_literal accespt pd.Series and pd.DataFrame for data
francescalb Dec 10, 2021
9209a4a
Merge branch 'master' of github.com:emmo-repo/EMMO-python into flb/cl…
francescalb Dec 10, 2021
8a59622
Added a test for the excel_parser
francescalb Dec 10, 2021
c44a63b
Corrected path in test
francescalb Dec 10, 2021
862eabe
Added creator instead of author and changed excel2onto arg to --output
francescalb Dec 10, 2021
b9ec0a1
Merge branch 'master' of github.com:emmo-repo/EMMO-python into flb/cl…
francescalb Dec 10, 2021
ee6621e
Updated excelparser ontology for comparison
francescalb Dec 10, 2021
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions docs/api_reference/ontopy/excelparser.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# excelparser

::: ontopy.excelparser
10 changes: 10 additions & 0 deletions examples/ontology-from-excel/make_microstructure_onto.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
"""
python example for creating ontology from excel
"""
from ontopy.excelparser import create_ontology_from_excel
from ontopy.utils import write_catalog

# Input workbook and output file used by this example.
EXCEL_PATH = "tool/onto.xlsx"
TURTLE_PATH = "microstructure_ontology.ttl"

# Build the ontology together with the catalog of its imported ontologies.
ontology, catalog = create_ontology_from_excel(EXCEL_PATH)

# Serialise the ontology as turtle, then write the catalog.
ontology.save(TURTLE_PATH, format="turtle", overwrite=True)
write_catalog(catalog)
Binary file added examples/ontology-from-excel/tool/onto.xlsx
Binary file not shown.
267 changes: 267 additions & 0 deletions ontopy/excelparser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,267 @@
#!/usr/bin/env python3
francescalb marked this conversation as resolved.
Show resolved Hide resolved
"""
Module for parsing an Excel file and creating an
ontology from it.

The excelfile is read by pandas and the pandas
dataframe should have column names:
[
francescalb marked this conversation as resolved.
Show resolved Hide resolved
"""
import warnings
from typing import Tuple
import pyparsing
import pandas as pd
from ontopy import World, get_ontology
from ontopy.utils import NoSuchLabelError
from ontopy.manchester import evaluate
import owlready2 # pylint: disable=C0411


def english(string):
    """Wrap `string` in an owlready2 ``locstr`` tagged with language "en"."""
    language = "en"
    return owlready2.locstr(string, lang=language)


def create_ontology_from_excel(  # pylint: disable=too-many-arguments
    excelpath: str,
    concept_sheet_name: str = "Concepts",
    metadata_sheet_name: str = "Metadata",
    base_iri: str = "http://emmo.info/emmo/domain/onto#",
    base_iri_from_metadata: bool = True,
    catalog: dict = None,
) -> Tuple[owlready2.Ontology, dict]:
    """
    Read an Excel workbook and build an ontology from it.

    Two sheets are read with pandas: the concept sheet
    (`concept_sheet_name`) and the metadata sheet (`metadata_sheet_name`).
    Both dataframes, together with the remaining arguments, are handed
    unchanged to `create_ontology_from_pandas`, whose result is returned.

    catalog is dict of imported ontologies with key name and value path
    """
    # TODO: Some magic to identify the header row
    # Rows 0 and 2 (zero-based) of the concept sheet are skipped on read.
    rows_to_skip = [0, 2]
    concepts = pd.read_excel(
        excelpath, sheet_name=concept_sheet_name, skiprows=rows_to_skip
    )
    metadata_frame = pd.read_excel(excelpath, sheet_name=metadata_sheet_name)

    return create_ontology_from_pandas(
        data=concepts,
        metadata=metadata_frame,
        base_iri=base_iri,
        base_iri_from_metadata=base_iri_from_metadata,
        catalog=catalog,
    )


def _split_cell(cell) -> list:
    """Split a ';'-delimited cell into stripped, non-empty entries.

    Returns an empty list when `cell` is not a string (e.g. an empty
    spreadsheet cell, which pandas reads as NaN).
    """
    if not isinstance(cell, str):
        return []
    return [entry.strip() for entry in cell.split(";") if entry.strip()]


def _fallback_parent(onto):
    """Return the parent used for concepts whose declared parents are missing.

    `onto.EMMO` is used when it can be resolved; otherwise owl:Thing is
    used so that a missing EMMO import does not abort the whole run.
    """
    try:
        parent = onto.EMMO
    except (NoSuchLabelError, AttributeError):
        parent = None
    return owlready2.Thing if parent is None else parent


def create_ontology_from_pandas(  # pylint: disable=too-many-locals,too-many-branches,too-many-statements
    data: pd.DataFrame,
    metadata: pd.DataFrame,
    base_iri: str = "http://emmo.info/emmo/domain/onto#",
    base_iri_from_metadata: bool = True,
    catalog: dict = None,
) -> Tuple[owlready2.Ontology, dict]:
    """
    Create an ontology from a pandas DataFrame

    Args:
        data: Concept table. The columns "prefLabel", "subClassOf",
            "Elucidation", "Examples", "Comments", "altLabel" and
            "Relations" are read. Rows without a prefLabel are ignored.
            Multiple values in a cell are separated by ';'.
        metadata: Metadata table, forwarded to
            `get_metadata_from_dataframe`.
        base_iri: Base IRI the new ontology is created with.
        base_iri_from_metadata: Forwarded to
            `get_metadata_from_dataframe`; when false, `base_iri` is kept.
        catalog: Optional dict forwarded to `get_metadata_from_dataframe`,
            which fills it with the imported ontologies.

    Returns:
        Tuple of the populated ontology and the catalog dict.
    """

    # Remove Concepts without prefLabel and make all to string
    data = data[data["prefLabel"].notna()]
    data = data.astype({"prefLabel": "str"})

    # Make new ontology
    world = World()
    onto = world.get_ontology(base_iri)

    # Metadata is applied exactly once, and the caller's choices are
    # forwarded; applying it again later would duplicate every annotation
    # and import and would silently override `base_iri_from_metadata`.
    onto, catalog = get_metadata_from_dataframe(
        metadata,
        onto,
        base_iri_from_metadata=base_iri_from_metadata,
        catalog=catalog,
    )

    if not base_iri_from_metadata:
        onto.base_iri = base_iri

    onto.sync_python_names()
    with onto:
        # A concept can only be created once all its parents exist, so the
        # rows are scanned repeatedly.  When a full pass adds nothing, one
        # final pass creates the remaining concepts under a fallback parent.
        # The loop always stops after that final pass, also when nothing
        # was left to create.
        final_loop = False
        while True:
            number_of_added_classes = 0
            for _, row in data.iterrows():
                name = row["prefLabel"]
                try:
                    if isinstance(
                        onto.get_by_label(name), owlready2.ThingClass
                    ):
                        continue
                except NoSuchLabelError:
                    pass

                # Strip blanks so that "A; B" resolves both "A" and "B".
                parent_names = [
                    pn.strip() for pn in str(row["subClassOf"]).split(";")
                ]

                try:
                    parents = [onto.get_by_label(pn) for pn in parent_names]
                except NoSuchLabelError:
                    if not final_loop:
                        # Parent may be created later in this or a
                        # following pass.
                        continue
                    parents = _fallback_parent(onto)
                    warnings.warn(
                        "Missing at least one of the defined parents. "
                        f"Concept: {name}; Defined parents: {parent_names}"
                    )

                concept = onto.new_entity(name, parents)

                # The elucidation is a single text and is not split on ';'.
                elucidation = row["Elucidation"]
                if isinstance(elucidation, str):
                    concept.elucidation.append(english(elucidation))

                for example in _split_cell(row["Examples"]):
                    concept.example.append(english(example))

                for comment in _split_cell(row["Comments"]):
                    concept.comment.append(english(comment))

                for altlabel in _split_cell(row["altLabel"]):
                    concept.altLabel.append(english(altlabel))

                number_of_added_classes += 1

            if final_loop:
                break
            if number_of_added_classes == 0:
                final_loop = True

        # Add properties in a second loop, when all concepts exist
        for _, row in data.iterrows():
            props = _split_cell(row["Relations"])
            if not props:
                continue
            try:
                concept = onto.get_by_label(row["prefLabel"])
            except NoSuchLabelError:
                # Never fall through with the concept of a previous row.
                warnings.warn(
                    f"Concept {row['prefLabel']} not found; "
                    "its relations were not added."
                )
                continue
            for prop in props:
                try:
                    concept.is_a.append(evaluate(onto, prop))
                except pyparsing.ParseException as err:
                    warnings.warn(
                        f"Error in Property assignment for: {concept}. "
                        f"Property to be Evaluated: {prop}. "
                        f"Error is {err}."
                    )

    # Synchronise Python attributes to ontology
    onto.sync_attributes(
        name_policy="uuid", name_prefix="EMMO_", class_docstring="elucidation"
    )
    onto.dir_label = False

    return onto, catalog


# To test: with and without ontology as input
def get_metadata_from_dataframe(  # pylint: disable=too-many-locals,too-many-branches,too-many-statements
    metadata: pd.DataFrame,
    onto: owlready2.Ontology = None,
    base_iri_from_metadata: bool = True,
    catalog: dict = None,
) -> Tuple[owlready2.Ontology, dict]:
    """
    Populate ontology with metadata from pd.DataFrame

    The metadata fields "Ontology IRI", "Imported ontologies", "Title",
    "Ontology version Info", "License", "Author" and "Contributor" are
    looked up with `_parse_metadata_string`.  A field that cannot be read
    is skipped; for authors and contributors a warning is issued instead.

    Args:
        metadata: Table holding the metadata fields.
        onto: Ontology to populate.  When None, `get_ontology()` is called
            to obtain one.
        base_iri_from_metadata: Whether to set the base IRI of `onto` from
            the "Ontology IRI" field.
        catalog: Optional dict that is extended with one entry per
            imported ontology (base IRI without trailing '/' -> path).

    Returns:
        Tuple of the populated ontology and the catalog dict.
    """

    if onto is None:
        onto = get_ontology()

    # base_iri from metadata if it exists and base_iri_from_metadata
    if base_iri_from_metadata:
        try:
            base_iris = _parse_metadata_string(metadata, "Ontology IRI")
            if len(base_iris) > 1:
                warnings.warn(
                    "More than one Ontology IRI given. The first was chosen."
                )
            base_iri = base_iris[0].strip()
            # Append the fragment separator only if it is not there yet,
            # so that an IRI given with a trailing '#' does not end in '##'.
            if not base_iri.endswith("#"):
                base_iri += "#"
            onto.base_iri = base_iri
        except (TypeError, ValueError, AttributeError):
            pass

    # Get imported ontologies from metadata
    try:
        imported_ontology_paths = _parse_metadata_string(
            metadata, "Imported ontologies"
        )
    except (TypeError, ValueError, AttributeError):
        imported_ontology_paths = []
    # Add imported ontologies
    catalog = {} if catalog is None else catalog
    for path in imported_ontology_paths:
        imported = onto.world.get_ontology(path).load()
        onto.imported_ontologies.append(imported)
        catalog[imported.base_iri.rstrip("/")] = path

    # Add title
    try:
        titles = _parse_metadata_string(metadata, "Title")
        if len(titles) > 1:
            warnings.warn("More than one title is given. The first was chosen.")
        onto.metadata.title.append(english(titles[0]))
    except (TypeError, ValueError, AttributeError):
        pass

    # Add versionINFO
    try:
        version_infos = _parse_metadata_string(
            metadata, "Ontology version Info"
        )
        # Check the version infos themselves; testing the titles here
        # raised NameError whenever no title had been parsed.
        if len(version_infos) > 1:
            warnings.warn(
                "More than one versionINFO is given. The first was chosen."
            )
        onto.metadata.versionInfo.append(english(version_infos[0]))
    except (TypeError, ValueError, AttributeError):
        pass

    # Add license
    try:
        licenses = _parse_metadata_string(metadata, "License")
        if len(licenses) > 1:
            warnings.warn(
                "More than one license is given. The first was chosen."
            )
        onto.metadata.license.append(english(licenses[0]))
    except (TypeError, ValueError, AttributeError):
        pass

    # Add authors
    try:
        authors = _parse_metadata_string(metadata, "Author")
        for author in authors:
            onto.metadata.creator.append(english(author))
    except (TypeError, ValueError, AttributeError):
        warnings.warn("No authors or creators added.")

    # Add contributors
    try:
        contributors = _parse_metadata_string(metadata, "Contributor")
        for contributor in contributors:
            onto.metadata.contributor.append(english(contributor))
    except (TypeError, ValueError, AttributeError):
        warnings.warn("No contributors added.")

    return onto, catalog


def _parse_metadata_string(metadata: pd.DataFrame, name: str) -> list:
"""Helper function to make list ouf strings from ';'-delimited
strings in one string.
"""
return metadata.loc[metadata["Metadata name"] == name]["Value"].item().split(";")
francescalb marked this conversation as resolved.
Show resolved Hide resolved
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@ blessings>=1.7,<2
Cython>=0.29.21,<0.30
defusedxml>=0.7.1,<1
graphviz>=0.16,<0.20
openpyxl>=3.0.9,<3.1
Owlready2>=0.28,<0.36,!=0.32,!=0.34
packaging>=21.0,<22
pandas>=1.2,<1.4
pydot>=1.4.1,<2
Pygments>=2.7.4,<3
pyparsing>=2.4.7
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ def fglob(patt):
"tools/emmocheck",
"tools/ontoconvert",
"tools/ontoversion",
"tools/excel2onto",
],
package_data={
"ontopy.factpluspluswrapper.java.lib.so": ["*"],
Expand Down
46 changes: 46 additions & 0 deletions tools/excel2onto
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#!/usr/bin/env python3
"""Creates an ontology from an Excel file.

The Excel file must be in the format provided by
ontology_template.xlsx
"""
import argparse
import sys
import os
from ontopy.excelparser import create_ontology_from_excel
from ontopy.utils import write_catalog
import owlready2 # pylint: disable=C0411


def english(string):
    """Wrap `string` in an owlready2 ``locstr`` tagged with language "en"."""
    language = "en"
    return owlready2.locstr(string, lang=language)


def main(argv=None):
    """Main run function.

    Parses the command line, creates an ontology from the given Excel
    workbook, saves it as turtle under the name given by ``--name`` and
    writes the catalog of imported ontologies.

    Args:
        argv: Optional list of command-line arguments.  When None,
            argparse reads them from ``sys.argv``.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "excelpath",
        help="path to excel book",
    )
    parser.add_argument(
        "--name",
        "-n",
        default="ontology.ttl",
        help="Name of ontology, ´ontology.ttl´ is default",
    )
    # argparse raises SystemExit itself on invalid arguments and on --help;
    # no traceback is printed for it, so it is left to propagate.
    args = parser.parse_args(argv)

    ontology, catalog = create_ontology_from_excel(args.excelpath)

    # Save new ontology as turtle
    ontology.save(args.name, format="turtle", overwrite=True)
    write_catalog(catalog)


if __name__ == "__main__":
main()