From 8a36af0c1af2c2b39e841cea2496f7a367ffdae5 Mon Sep 17 00:00:00 2001
From: Anthony Bretaudeau <anthony.bretaudeau@inria.fr>
Date: Wed, 15 Feb 2023 16:53:12 +0100
Subject: [PATCH] Fix error with biopython 1.81

---
 gff/BCBio/GFF/GFFParser.py | 43 +++++++++++++++++++++++++-------------
 1 file changed, 28 insertions(+), 15 deletions(-)

diff --git a/gff/BCBio/GFF/GFFParser.py b/gff/BCBio/GFF/GFFParser.py
index 3d4bfa8e..d6b5e97d 100644
--- a/gff/BCBio/GFF/GFFParser.py
+++ b/gff/BCBio/GFF/GFFParser.py
@@ -31,7 +31,15 @@
     import _utils
     collections.defaultdict = _utils.defaultdict
 
-from Bio.Seq import UnknownSeq
+unknown_seq_avail = False
+try:
+    from Bio.Seq import UnknownSeq
+    unknown_seq_avail = True
+except ImportError:
+    # UnknownSeq was removed in Biopython 1.81; use Seq(None, length=...) instead
+    from Bio.Seq import _UndefinedSequenceData
+    from Bio.Seq import Seq
+
 from Bio.SeqRecord import SeqRecord
 from Bio import SeqFeature
 from Bio import SeqIO
@@ -69,7 +77,7 @@ def _split_keyvals(keyval_str):
 
         GFF3 has key value pairs like:
           count=9;gene=amx-2;sequence=SAGE:aacggagccg
-        GFF2 and GTF have:           
+        GFF2 and GTF have:
           Sequence "Y74C9A" ; Note "Clone Y74C9A; Genbank AC024206"
           name "fgenesh1_pg.C_chr_1000003"; transcriptId 869
         """
@@ -170,7 +178,7 @@ def _nest_gff2_features(gff_parts):
         should_do = True
         if params.limit_info:
             for limit_name, limit_values in params.limit_info.items():
-                cur_id = tuple([parts[i] for i in 
+                cur_id = tuple([parts[i] for i in
                     params.filter_info[limit_name]])
                 if cur_id not in limit_values:
                     should_do = False
@@ -286,7 +294,7 @@ class _AbstractMapReduceGFF:
     information.
     """
     def __init__(self, create_missing=True):
-        """Initialize GFF parser 
+        """Initialize GFF parser
 
         create_missing - If True, create blank records for GFF ids not in
         the base_dict. If False, an error will be raised.
@@ -305,7 +313,7 @@ def parse(self, gff_files, base_dict=None, limit_info=None):
         limit_info - A dictionary specifying the regions of the GFF file
         which should be extracted. This allows only relevant portions of a file
         to be parsed.
-        
+
         base_dict - A base dictionary of SeqRecord objects which may be
         pre-populated with sequences and other features. The new features from
         the GFF file will be added to this dictionary.
@@ -536,11 +544,16 @@ def _get_rec(self, base, info_dict):
         if match_id:
             cur_rec = base[match_id]
             # update generated unknown sequences with the expected maximum length
-            if isinstance(cur_rec.seq, UnknownSeq):
+            if unknown_seq_avail and isinstance(cur_rec.seq, UnknownSeq):
                 cur_rec.seq._length = max([max_loc, cur_rec.seq._length])
+            elif not unknown_seq_avail and isinstance(cur_rec.seq._data, _UndefinedSequenceData):
+                cur_rec.seq._data._length = max([max_loc, cur_rec.seq._data._length])
             return cur_rec, base
         elif self._create_missing:
-            new_rec = SeqRecord(UnknownSeq(max_loc), info_dict['rec_id'])
+            if unknown_seq_avail:
+                new_rec = SeqRecord(UnknownSeq(max_loc), info_dict['rec_id'])
+            else:
+                new_rec = SeqRecord(Seq(None, length=max_loc), info_dict['rec_id'])
             base[info_dict['rec_id']] = new_rec
             return new_rec, base
         else:
@@ -654,7 +667,7 @@ class GFFParser(_AbstractMapReduceGFF):
     def __init__(self, line_adjust_fn=None, create_missing=True):
         _AbstractMapReduceGFF.__init__(self, create_missing=create_missing)
         self._line_adjust_fn = line_adjust_fn
-    
+
     def _gff_process(self, gff_files, limit_info, target_lines):
         """Process GFF addition without any parallelization.
 
@@ -704,7 +717,7 @@ def _lines_to_out_info(self, line_iter, limit_info=None,
                 yield out_info.get_results()
                 out_info = _GFFParserLocalOut((target_lines is not None and
                         target_lines > 1))
-            if (results and results[0][0] == 'directive' and 
+            if (results and results[0][0] == 'directive' and
                     results[0][1] == 'FASTA'):
                 found_seqs = True
                 break
@@ -741,7 +754,7 @@ class DiscoGFFParser(_AbstractMapReduceGFF):
     """
     def __init__(self, disco_host, create_missing=True):
         """Initialize parser.
-        
+
         disco_host - Web reference to a Disco host which will be used for
         parallelizing the GFF reading job.
         """
@@ -755,7 +768,7 @@ def _gff_process(self, gff_files, limit_info, target_lines=None):
         # make these imports local; only need them when using disco
         import simplejson
         import disco
-        # absolute path names unless they are special disco files 
+        # absolute path names unless they are special disco files
         full_files = []
         for f in gff_files:
             if f.split(":")[0] != "disco":
@@ -829,7 +842,7 @@ class GFFExaminer:
     def __init__(self):
         self._filter_info = dict(gff_id = [0], gff_source_type = [1, 2],
                 gff_source = [1], gff_type = [2])
-    
+
     def _get_local_params(self, limit_info=None):
         class _LocalParams:
             def __init__(self):
@@ -838,13 +851,13 @@ def __init__(self):
         params.limit_info = limit_info
         params.filter_info = self._filter_info
         return params
-    
+
     @_file_or_handle
     def available_limits(self, gff_handle):
         """Return dictionary information on possible limits for this file.
 
         This returns a nested dictionary with the following structure:
-        
+
         keys -- names of items to filter by
         values -- dictionary with:
             keys -- filter choice
@@ -884,7 +897,7 @@ def parent_child_map(self, gff_handle):
 
         keys -- tuple of (source, type) for each parent
         values -- tuple of (source, type) as children of that parent
-        
+
         Not a parallelized map-reduce implementation.
         """
         # collect all of the parent and child types mapped to IDs