Skip to content

Commit

Permalink
Retrieve only the experimentally characterized CAZy entries (#43)
Browse files (browse the repository at this point in the history)
* upgrade poetry

* simplify cli

* add missing characterized option

* update tests

* update version
  • Loading branch information
rvhonorato authored Oct 12, 2023
1 parent cf3657e commit f267586
Show file tree
Hide file tree
Showing 6 changed files with 85 additions and 127 deletions.
20 changes: 10 additions & 10 deletions .trunk/trunk.yaml
Original file line number Diff line number Diff line change
@@ -1,23 +1,23 @@
version: 0.1
cli:
version: 1.6.1
version: 1.16.2
plugins:
sources:
- id: trunk
ref: v0.0.13
ref: v1.2.1
uri: https://github.com/trunk-io/plugins
lint:
enabled:
- actionlint@1.6.23
- black@23.1.0
- actionlint@1.6.26
- black@23.9.1
- git-diff-check
- gitleaks@8.16.1
- gitleaks@8.18.0
- isort@5.12.0
- markdownlint@0.33.0
- prettier@2.8.4
- ruff@0.0.256
- taplo@0.7.0
- yamllint@1.29.0
- markdownlint@0.37.0
- prettier@3.0.3
- ruff@0.0.292
- taplo@0.8.1
- yamllint@1.32.0
ignore:
- linters: [ALL]
paths:
Expand Down
109 changes: 36 additions & 73 deletions cazy_parser/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,75 +16,52 @@
log.addHandler(ch)
log.setLevel("DEBUG")

# ===========================================================================================================
# Define arguments
ap = argparse.ArgumentParser()

ap.add_argument(
"enzyme_class",
choices=["GH", "GT", "PL", "CA", "AA"],
)

ap.add_argument("-f", "--family", type=int)

ap.add_argument("-s", "--subfamily")

ap.add_argument("-c", "--characterized")

ap.add_argument(
"-v",
"--version",
help="show version",
action="version",
version=f"Running {ap.prog} v{VERSION}",
)


def load_args(ap):
"""
Load argument parser.
Parameters
----------
ap : argparse.ArgumentParser
Argument parser.
# ====================================================================================#
# Main code
def main():
"""Main function."""

Returns
-------
cmd : argparse.Namespace
Parsed command-line arguments.
ap = argparse.ArgumentParser()

"""
return ap.parse_args()
ap.add_argument(
"enzyme_class",
choices=["GH", "GT", "PL", "CA", "AA"],
)

ap.add_argument("-f", "--family", type=int, default=None)

# ====================================================================================#
# Define CLI
def cli(ap, main):
"""
Command-line interface entry point.
ap.add_argument("-s", "--subfamily", type=int, default=None)

Parameters
----------
ap : argparse.ArgumentParser
Argument parser.
main : function
Main function.
ap.add_argument("-c", "--characterized", action="store_true", default=False)

"""
cmd = load_args(ap)
main(**vars(cmd))
ap.add_argument(
"-v",
"--version",
help="show version",
action="version",
version=f"Running {ap.prog} v{VERSION}",
)

args = ap.parse_args()

def maincli():
"""Execute main client."""
cli(ap, main)
if args.enzyme_class not in ENZYME_LIST:
logging.error(f"Enzyme class {args.enzyme_class} not supported")
sys.exit()
else:
enzyme_name = ENZYME_LIST[args.enzyme_class]

id_list = retrieve_genbank_ids(
enzyme_name, args.family, args.subfamily, args.characterized
)

# ====================================================================================#
# Main code
def main(enzyme_class, family, subfamily, characterized):
"""Main function."""
output_fname = f"{args.enzyme_class}"
if args.family:
output_fname += f"{args.family}"
if args.subfamily:
output_fname += f"_{args.subfamily}"
if args.characterized:
output_fname += "_characterized"

log.info("-" * 42)
log.info("")
Expand All @@ -94,20 +71,6 @@ def main(enzyme_class, family, subfamily, characterized):
log.info("")
log.info("-" * 42)

if enzyme_class not in ENZYME_LIST:
logging.error(f"Enzyme class {enzyme_class} not supported")
sys.exit()
else:
enzyme_name = ENZYME_LIST[enzyme_class]

id_list = retrieve_genbank_ids(enzyme_name, family, subfamily, characterized)

output_fname = f"{enzyme_class}"
if family:
output_fname += f"{family}"
if subfamily:
output_fname += f"_{subfamily}"

today = time.strftime("%d%m%Y")
output_fname += f"_{today}.fasta"
try:
Expand All @@ -128,4 +91,4 @@ def main(enzyme_class, family, subfamily, characterized):


if __name__ == "__main__":
sys.exit(maincli())
sys.exit(main())
57 changes: 20 additions & 37 deletions cazy_parser/modules/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
import string
import sys
import urllib
import urllib.request
from typing import Optional

import requests
from bs4 import BeautifulSoup
Expand Down Expand Up @@ -176,21 +178,13 @@ def get_data_from_txt(link):
return data_list


def fetch_links(enzyme_class, family, subfamily):
"""
Fetch link structure for an enzyme class.
Parameters
----------
enzyme_class : str
Enzyme class to fetch links for.
Returns
-------
page_list : list
List of links to the pages.
"""
def fetch_links(
enzyme_class: str,
family: Optional[int] = None,
subfamily: Optional[int] = None,
characterized: Optional[bool] = None,
) -> list[str]:
"""Fetch link structure for an enzyme class."""

main_class_link = f"http://www.cazy.org/{enzyme_class}.html"
log.info(f"Fetching links for {enzyme_class}, url: {main_class_link}")
Expand All @@ -205,6 +199,9 @@ def fetch_links(enzyme_class, family, subfamily):
log.info(f"Only using links of family {family}")
family_list = [e for e in family_list if int(e[2:]) == family]

if characterized:
log.info("Only using characterized links")

if not family_list:
log.error("No links were found.")
sys.exit()
Expand Down Expand Up @@ -259,6 +256,9 @@ def fetch_links(enzyme_class, family, subfamily):
else:
page_list.append(page_zero)

if characterized:
page_list = [e for e in page_list if "characterized" in e]

return page_list


Expand Down Expand Up @@ -341,28 +341,11 @@ def fetch_species():
return species_dic


def retrieve_genbank_ids(enzyme_name, family=None, subfamily=None, characterized=None):
"""
Retrieve genbank IDs for a given enzyme.
Parameters
----------
enzyme_name : str
Enzyme name to retrieve genbank IDs for.
family : int
Family number to retrieve genbank IDs for.
subfamily : int
Subfamily number to retrieve genbank IDs for.
characterized : bool
Whether to retrieve genbank IDs for characterized enzymes.
Returns
-------
genbank_id_list : list
List of genbank IDs.
"""
page_list = fetch_links(enzyme_name, family, subfamily)
def retrieve_genbank_ids(
enzyme_name: str, family: int, subfamily: int, characterized: bool
) -> list[str]:
"""Retrieve genbank IDs for a given enzyme."""
page_list = fetch_links(enzyme_name, family, subfamily, characterized)
data = fetch_data(page_list)
genbank_id_list = []
for element in data:
Expand Down
2 changes: 1 addition & 1 deletion cazy_parser/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
VERSION = "2.0.2"
VERSION = "2.0.3"
6 changes: 3 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@ pythonpath = ["src"]

[tool.poetry]
name = "cazy-parser"
version = "2.0.2"
version = "2.0.3"
description = "A way to extract specific information from CAZy"
authors = ["Rodrigo V. Honorato <r.vargashonorato@uu.nl>"]
authors = ["Rodrigo V. Honorato <rvhonorato@protonmail.com>"]
readme = "README.md"
packages = [{ include = "cazy_parser" }]
classifiers = [
Expand Down Expand Up @@ -34,7 +34,7 @@ coverage = "^7.2.5"
hypothesis = "^6.75.1"

[tool.poetry.scripts]
cazy-parser = 'cazy_parser.cli:maincli'
cazy-parser = 'cazy_parser.cli:main'

[build-system]
requires = ["poetry-core"]
Expand Down
18 changes: 15 additions & 3 deletions tests/test_html.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import urllib
import urllib.request

import pytest
from bs4 import BeautifulSoup
Expand Down Expand Up @@ -78,7 +78,12 @@ def test_get_data_from_txt():


def test_fetch_links():
observed_links = fetch_links("Carbohydrate-Esterases", family=None, subfamily=None)
observed_links = fetch_links("Carbohydrate-Esterases", characterized=True)

assert "http://www.cazy.org/CE20_characterized.html" in observed_links
assert "http://www.cazy.org/IMG/cazy_data/CE20.txt" not in observed_links

observed_links = fetch_links("Carbohydrate-Esterases", characterized=False)

assert "http://www.cazy.org/CE20_characterized.html" in observed_links
assert "http://www.cazy.org/IMG/cazy_data/CE20.txt" in observed_links
Expand All @@ -99,8 +104,15 @@ def test_fetch_species():

def test_retrieve_genbank_ids():
observed_id_list = retrieve_genbank_ids(
enzyme_name="Glycoside-Hydrolases", family=5, subfamily=1
enzyme_name="Glycoside-Hydrolases", family=5, subfamily=1, characterized=False
)

assert observed_id_list
assert len(observed_id_list) >= 1223

observed_id_list = retrieve_genbank_ids(
enzyme_name="Glycoside-Hydrolases", family=5, subfamily=1, characterized=True
)

assert observed_id_list
assert 36 <= len(observed_id_list) <= 1000

0 comments on commit f267586

Please sign in to comment.