Merge pull request #29 from BojarLab/dev

merge for version 1.0 update
BojarLab · Dec 4, 2023 · 044e18d · 044e18d
2 parents 3a5a537 + 19bf1bd
commit 044e18d
Show file tree

Hide file tree

Showing 62 changed files with 81,123 additions and 92,800 deletions.
diff --git a/00_core.ipynb b/00_core.ipynb
diff --git a/01_glycan_data.ipynb b/01_glycan_data.ipynb
diff --git a/02_ml.ipynb b/02_ml.ipynb
diff --git a/03_motif.ipynb b/03_motif.ipynb
diff --git a/04_network.ipynb b/04_network.ipynb
diff --git a/05_examples.ipynb b/05_examples.ipynb
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -36,9 +36,8 @@ nbdev_prepare
 * Docs are automatically created from the notebooks in the nbs folder.
 
 
-## Wishlist for future glycowork updates (last update: 2023-04-21)
+## Wishlist for future glycowork updates (last update: 2023-12-04)
 #### Urgent
-* add functions to convert other nomenclatures (WURCS, GlycoCT, etc.) into IUPAC-condensed
 * more, and more informative, error messages
 
 
@@ -47,3 +46,4 @@ nbdev_prepare
 * characterize_monosaccharide only factors in subsequent sequence context; make it possible (as an option) to also consider upstream sequence context
 * allow users to specify their own deep learning architecture in ml.models
 * implement multiple sequence alignment and other substitution matrices
+* parallelize motif matching
diff --git a/README.md b/README.md
diff --git a/_proc/.quarto/idx/00_core.ipynb.json b/_proc/.quarto/idx/00_core.ipynb.json
diff --git a/_proc/.quarto/xref/2c7d6167 b/_proc/.quarto/xref/2c7d6167
diff --git a/_proc/.quarto/xref/INDEX b/_proc/.quarto/xref/INDEX
@@ -1,5 +1,5 @@
 {
   "index.ipynb": {
-    "README.md": "2c7d6167"
+    "README.md": "f518931d"
   }
 }
diff --git a/_proc/00_core.ipynb b/_proc/00_core.ipynb
diff --git a/_proc/01_glycan_data.ipynb b/_proc/01_glycan_data.ipynb
diff --git a/_proc/02_ml.ipynb b/_proc/02_ml.ipynb
diff --git a/_proc/03_motif.ipynb b/_proc/03_motif.ipynb
diff --git a/_proc/04_network.ipynb b/_proc/04_network.ipynb
diff --git a/_proc/05_examples.ipynb b/_proc/05_examples.ipynb
diff --git a/_proc/CONTRIBUTING.md b/_proc/CONTRIBUTING.md
@@ -36,9 +36,8 @@ nbdev_prepare
 * Docs are automatically created from the notebooks in the nbs folder.
 
 
-## Wishlist for future glycowork updates (last update: 2023-04-21)
+## Wishlist for future glycowork updates (last update: 2023-12-04)
 #### Urgent
-* add functions to convert other nomenclatures (WURCS, GlycoCT, etc.) into IUPAC-condensed
 * more, and more informative, error messages
 
 
@@ -47,3 +46,4 @@ nbdev_prepare
 * characterize_monosaccharide only factors in subsequent sequence context; make it possible (as an option) to also consider upstream sequence context
 * allow users to specify their own deep learning architecture in ml.models
 * implement multiple sequence alignment and other substitution matrices
+* parallelize motif matching
diff --git a/_proc/_docs/index_files/figure-commonmark/cell-3-output-1.svg b/_proc/_docs/index_files/figure-commonmark/cell-3-output-1.svg
diff --git a/_proc/index.ipynb b/_proc/index.ipynb
diff --git a/_proc/settings.ini b/_proc/settings.ini
@@ -3,7 +3,7 @@ lib_name = glycowork
 repo_name = glycowork
 host = github
 user = BojarLab
-version = 0.8.0
+version = 1.0.0
 min_python = 3.8
 audience = Developers
 custom_sidebar = False

diff --git a/_proc/setup b/_proc/setup
@@ -5,7 +5,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
 
 setuptools.setup(
     name="glycowork",
-    version="0.8.0",
+    version="1.0.0",
     author="Daniel Bojar",
     author_email="daniel.bojar@gu.se",
     description="Package for processing and analyzing glycans",

diff --git a/build/lib/glycowork/__init__.py b/build/lib/glycowork/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.8.0"
+__version__ = "1.0.0"
 #from .glycowork import *
 
 __all__ = ['alignment', 'ml', 'motif', 'glycan_data']
diff --git a/build/lib/glycowork/glycan_data/data_entry.py b/build/lib/glycowork/glycan_data/data_entry.py
@@ -1,9 +1,9 @@
 from glycowork.glycan_data.loader import lib
-from glycowork.motif.tokenization import check_nomenclature
+from glycowork.motif.processing import check_nomenclature
 from glycowork.motif.graph import glycan_to_nxGraph, compare_glycans
 
 
-def check_presence(glycan, df, colname = 'target', libr = None,
+def check_presence(glycan, df, colname = 'glycan', libr = None,
                    name = None, rank = 'Species', fast = False):
   """checks whether glycan (of that species) is already present in dataset\n
   | Arguments:
@@ -20,20 +20,20 @@ def check_presence(glycan, df, colname = 'target', libr = None,
   """
   if libr is None:
     libr = lib
-  if check_nomenclature(glycan):
-    if name is not None:
-      name = name.replace(" ", "_")
-      df = df[df[rank] == name]
-      if len(df) == 0:
-        print("This is the best: %s is not in dataset" % name)
-    if fast:
-      ggraph = glycan_to_nxGraph(glycan, libr = libr)
-      check_all = [compare_glycans(ggraph, k, libr = libr) for k in df.graph]
-    else:
-      check_all = [compare_glycans(glycan, k, libr = libr) for k in df[colname]]
-    if any(check_all):
-      print("Glycan already in dataset.")
-    else:
-      print("It's your lucky day, this glycan is new!")
+  if not isinstance(glycan, str) or any(p in glycan for p in ['RES', '=']):  # type check first: `in` raises TypeError on non-iterables
+    check_nomenclature(glycan)
+    return
+  if name is not None:
+    name = name.replace(" ", "_")
+    df = df[df[rank] == name]
+    if len(df) == 0:
+      print("This is the best: %s is not in dataset" % name)
+  if fast:
+    ggraph = glycan_to_nxGraph(glycan, libr = libr)
+    check_all = [compare_glycans(ggraph, k, libr = libr) for k in df.graph]
   else:
-    print("Glycan not correctly formatted.")
+    check_all = [compare_glycans(glycan, k, libr = libr) for k in df[colname]]
+  if any(check_all):
+    print("Glycan already in dataset.")
+  else:
+    print("It's your lucky day, this glycan is new!")
diff --git a/build/lib/glycowork/glycan_data/glycan_motifs.csv b/build/lib/glycowork/glycan_data/glycan_motifs.csv
diff --git a/build/lib/glycowork/glycan_data/loader.py b/build/lib/glycowork/glycan_data/loader.py
@@ -1,19 +1,21 @@
 import pandas as pd
+import re
 import os
-import ast
 import pickle
 import itertools
 import pkg_resources
 
-io = pkg_resources.resource_stream(__name__, "v8_sugarbase.csv")
-df_glycan = pd.read_csv(io)
+io = pkg_resources.resource_stream(__name__, "v9_df_species.csv")
+df_species = pd.read_csv(io)
 io = pkg_resources.resource_stream(__name__, "glycan_motifs.csv")
 motif_list = pd.read_csv(io)
 io = pkg_resources.resource_stream(__name__, "glycan_binding.csv")
 glycan_binding = pd.read_csv(io)
 this_dir, this_filename = os.path.split(__file__)  # Get path of data.pkl
 data_path = os.path.join(this_dir, 'lib_v8.pkl')
 lib = pickle.load(open(data_path, 'rb'))
+data_path = os.path.join(this_dir, 'v9_sugarbase.pkl')
+df_glycan = pickle.load(open(data_path, 'rb'))
 
 linkages = {
   '1-4', '1-6', 'a1-1', 'a1-2', 'a1-3', 'a1-4', 'a1-5', 'a1-6', 'a1-7', 'a1-8', 'a1-9', 'a1-11', 'a1-?', 'a2-1', 'a2-2', 'a2-3', 'a2-4', 'a2-5', 'a2-6', 'a2-7', 'a2-8', 'a2-9',
@@ -52,6 +54,71 @@ def find_nth(haystack, needle, n):
   return start
 
 
+def find_nth_reverse(string, substring, n, ignore_branches = False):
+  """Returns the start index of the n-th non-overlapping occurrence of substring counted from the end of string, or -1 if there are fewer than n; if ignore_branches is True and a non-empty bracketed branch ends directly before the match, the index of that branch's opening '[' is returned instead"""
+  reversed_string = string[::-1]
+  reversed_substring = substring[::-1]
+  # Position in the reversed string from which the next search starts
+  start_index = 0
+  # Each pass locates one more occurrence, moving from the end of string towards its start
+  for i in range(n):
+    # Searching the reversed string left-to-right equals searching string right-to-left
+    idx = reversed_string.find(reversed_substring, start_index)
+    # Fewer than n occurrences exist
+    if idx == -1:
+      return -1
+    # Resume directly after this match, so counted matches never overlap
+    start_index = idx + len(substring)
+  # The end of the match in the reversed string mirrors to the start of the match in string
+  original_start_index = len(string) - start_index
+  if ignore_branches:
+    # Only act if a ']' sits directly before the match and does not close an empty '[]'
+    branch_end_idx = original_start_index - 1
+    if branch_end_idx > 0 and string[branch_end_idx] == ']' and string[branch_end_idx - 1] != '[':
+      # Walk left, tracking nesting depth, until the ']' at branch_end_idx is balanced by its '['
+      bracket_count = 1
+      for i in range(branch_end_idx - 1, -1, -1):
+        if string[i] == ']':
+          bracket_count += 1
+        elif string[i] == '[':
+          bracket_count -= 1
+        if bracket_count == 0:
+          original_start_index = i
+          break
+  return original_start_index
+
+
+def remove_unmatched_brackets(s):
+  """Removes every '[' that is never closed and every ']' that was never opened from the string s.\n
+  | Arguments:
+  | :-
+  | s (string): glycan string in IUPAC-condensed\n
+  | Returns:
+  | :-
+  | Returns glycan without unmatched brackets; all other characters keep their order
+  """
+  while True:
+    # stack holds the indexes of '[' that are still waiting for their closing ']'
+    stack = []
+    unmatched_open = set()
+    unmatched_close = set()
+    for i, char in enumerate(s):
+      if char == '[':
+        stack.append(i)
+      elif char == ']':
+        if stack:
+          stack.pop()  # this ']' closes the most recently opened '['
+        else:
+          unmatched_close.add(i)  # no '[' is open, so this ']' is unmatched
+    unmatched_open.update(stack)  # whatever is left on the stack was never closed
+    # Stop once a full scan finds no unmatched bracket
+    if not unmatched_open and not unmatched_close:
+      break
+    # Rebuild the string, dropping the characters at the unmatched indexes
+    s = ''.join([char for i, char in enumerate(s) if i not in unmatched_open and i not in unmatched_close])
+  return s
+
+
 def reindex(df_new, df_old, out_col, ind_col, inp_col):
   """Returns columns values in order of new dataframe rows\n
   | Arguments:
@@ -80,7 +147,7 @@ def stringify_dict(dicty):
   | Returns string of type key:value for sorted items
   """
   dicty = dict(sorted(dicty.items()))
-  return ''.join(str(key) + str(value) for key, value in dicty.items())
+  return ''.join(f"{key}{value}" for key, value in dicty.items())
 
 
 def replace_every_second(string, old_char, new_char):
@@ -95,17 +162,14 @@ def replace_every_second(string, old_char, new_char):
   | Returns string with replaced characters
   """
   count = 0
-  result = ""
+  result = []
   for char in string:
     if char == old_char:
       count += 1
-      if count % 2 == 0:
-        result += new_char
-      else:
-        result += char
+      result.append(new_char if count % 2 == 0 else char)
     else:
-      result += char
-  return result
+      result.append(char)
+  return ''.join(result)
 
 
 def multireplace(string, remove_dic):
@@ -144,14 +208,9 @@ def build_custom_df(df, kind = 'df_species'):
   cols = kind_to_cols.get(kind, None)
   if cols is None:
     raise ValueError("Invalid value for 'kind' argument, only df_species, df_tissue, and df_disease are supported.")
-  df = df.loc[df[cols[1]].str.len() > 2, cols]
+  df = df.loc[df[cols[1]].str.len() > 0, cols]
   df.set_index('glycan', inplace = True)
-  df.index.name = 'target'
-  df = df.applymap(ast.literal_eval)
   df = df.explode(cols[1:]).reset_index()
-  df.sort_values([cols[1], 'target'], ascending = [True, True], inplace = True)
+  df.sort_values([cols[1], 'glycan'], ascending = [True, True], inplace = True)
   df.reset_index(drop = True, inplace = True)
   return df
-
-
-df_species = build_custom_df(df_glycan, kind = 'df_species')
diff --git a/build/lib/glycowork/ml/processing.py b/build/lib/glycowork/ml/processing.py
@@ -28,8 +28,8 @@ def dataset_to_graphs(glycan_list, labels, libr = None, label_type = torch.long)
   # Converting graphs to Pytorch Geometric Data objects
   data = [from_networkx(k) for k in glycan_graphs]
   # Adding graph labels
-  for k in range(len(labels)):
-    data[k].y = torch.tensor(labels[k])
+  for data_obj, label in zip(data, labels):
+    data_obj.y = torch.tensor(label, dtype = label_type)
   return data
 
 
@@ -58,13 +58,10 @@ def dataset_to_dataloader(glycan_list, labels, libr = None, batch_size = 32,
                                     libr = libr, label_type = label_type)
   # Adding (optional) extra feature to the Data objects
   if extra_feature is not None:
-    for k in range(len(glycan_graphs)):
-      glycan_graphs[k].train_idx = torch.tensor(extra_feature[k],
-                                                dtype = torch.float)
+    for graph, feature in zip(glycan_graphs, extra_feature):
+      graph.train_idx = torch.tensor(feature, dtype = torch.float)
   # Generating the dataloader from the data objects
-  glycan_loader = DataLoader(glycan_graphs, batch_size = batch_size,
-                             shuffle = shuffle, drop_last = drop_last)
-  return glycan_loader
+  return DataLoader(glycan_graphs, batch_size = batch_size, shuffle = shuffle, drop_last = drop_last)
 
 
 def split_data_to_train(glycan_list_train, glycan_list_val,

diff --git a/build/lib/glycowork/ml/train_test_split.py b/build/lib/glycowork/ml/train_test_split.py
@@ -18,23 +18,19 @@ def seed_wildcard_hierarchy(glycans, labels, wildcard_list,
   | :-
   | Returns list of glycans (strings) and labels (flexible) where some glycan parts have been replaced with wildcard_name
   """
-  added_glycans = []
-  added_labels = []
-  # Each loop has the chance of exchanging glycan parts with each wildcard
-  for k in range(len(glycans)):
-    temp = glycans[k]
-    for j in wildcard_list:
-      if j in temp:
-        if random.uniform(0, 1) < r:
-          added_glycans.append(temp.replace(j, wildcard_name))
-          added_labels.append(labels[k])
-  glycans += added_glycans
-  labels += added_labels
+  added_glycans_labels = [(glycan.replace(j, wildcard_name), label) 
+                             for glycan, label in zip(glycans, labels) 
+                             for j in wildcard_list 
+                             if j in glycan and random.uniform(0, 1) < r]
+  if added_glycans_labels:
+    added_glycans, added_labels = zip(*added_glycans_labels)
+    return glycans + list(added_glycans), labels + list(added_labels)
+
   return glycans, labels
 
 
 def hierarchy_filter(df_in, rank = 'Domain', min_seq = 5, wildcard_seed = False, wildcard_list = None,
-                     wildcard_name = None, r = 0.1, col = 'target'):
+                     wildcard_name = None, r = 0.1, col = 'glycan'):
   """stratified data split in train/test at the taxonomic level, removing duplicate glycans and infrequent classes\n
   | Arguments:
   | :-
@@ -45,7 +41,7 @@ def hierarchy_filter(df_in, rank = 'Domain', min_seq = 5, wildcard_seed = False,
   | wildcard_list (list): list which glycoletters a wildcard encompasses
   | wildcard_name (string): how the wildcard should be named in the IUPAC-condensed nomenclature
   | r (float): rate of replacement, default:0.1 or 10%
-  | col (string): column name for glycan sequences; default:target\n
+  | col (string): column name for glycan sequences; default:glycan\n
   | Returns:
   | :-
   | Returns train_x, val_x (lists of glycans (strings) after stratified shuffle split)
@@ -68,7 +64,7 @@ def hierarchy_filter(df_in, rank = 'Domain', min_seq = 5, wildcard_seed = False,
   # For each class in rank, get unique set of glycans
   for i in range(len(class_list)):
     t = df[df[rank] == class_list[i]]
-    t = t.drop_duplicates('target', keep = 'first')
+    t = t.drop_duplicates('glycan', keep = 'first')
     temp.append(t)
   df = pd.concat(temp).reset_index(drop = True)
 
@@ -121,13 +117,13 @@ def general_split(glycans, labels, test_size = 0.2):
                           test_size = test_size, random_state = 42)
 
 
-def prepare_multilabel(df, rank = 'Species', glycan_col = 'target'):
+def prepare_multilabel(df, rank = 'Species', glycan_col = 'glycan'):
   """converts a one row per glycan-species/tissue/disease association file to a format of one glycan - all associations\n
   | Arguments:
   | :-
   | df (dataframe): dataframe where each row is one glycan - species association
   | rank (string): which label column should be used; default:Species
-  | glycan_col (string): column name of where the glycan sequences are stored; default:target\n
+  | glycan_col (string): column name of where the glycan sequences are stored; default:glycan\n
   | Returns:
   | :-
   | (1) list of unique glycans in df