
Commit

Merge branch 'main' into handle-view-number
flooie authored Oct 21, 2024
2 parents cd1a2d5 + 96acad0 commit cc47f10
Showing 6 changed files with 118 additions and 96 deletions.
15 changes: 13 additions & 2 deletions CHANGES.md
@@ -16,13 +16,24 @@ Releases are also tagged in git, if that's helpful.

## Current

**2.6.30 - 2024-10-10**
**2.6.31 - 2024-10-21**

Fixes:
- fix `CADC` oral arguments
- `neb` now handles rows with no links
- `coloctapp`: update `cleanup_content`
- fix `la` xpath selector that was skipping some cases

Features:
- new scraper `lactapp_5` for Louisiana Court of Appeal, Fifth Circuit
- now sending a `logger.error` call to Sentry when a scraped date is in the future

## Past

**2.6.30 - 2024-10-10**

Fixes:
- fix `CADC` oral arguments

**2.6.29 - 2024-10-10**

Fixes:
69 changes: 69 additions & 0 deletions juriscraper/lib/utils.py
@@ -1,6 +1,9 @@
import re
from datetime import date, datetime
from itertools import chain, islice, tee

from juriscraper.AbstractSite import logger

from .string_utils import force_unicode


@@ -51,3 +54,69 @@ def clean_court_object(obj):
return re.sub(r"\s+,", ",", s)
else:
return obj


def backscrape_over_paginated_results(
url_template: str,
first_page: int,
last_page: int,
start_date: date,
end_date: date,
date_fmt: str,
site,
) -> list[dict]:
"""
Iterates over consecutive pages, looking for cases in a specific date range
Useful when the page offers no date filters, so one must look through all the pages.
Assumes the page returns results ordered by descending date.
:param url_template: string to apply .format() to, like "url&page={}"
where the argument to pass will be the page number
:param first_page: integer of the first page
:param last_page: integer of the last page (exclusive, as in `range`)
:param start_date: cases with a date greater than this value will be collected
:param end_date: cases with a date less than this value will be collected
:param date_fmt: date format to parse case dates
:param site: the site object
:return: the list of cases between the dates
"""
cases = []

if isinstance(start_date, datetime):
start_date = start_date.date()
if isinstance(end_date, datetime):
end_date = end_date.date()

for page in range(first_page, last_page):
site.cases = [] # reset results container
site.url = url_template.format(page)
site.html = site._download()
site._process_html()

# results are ordered by descending date
earliest = datetime.strptime(site.cases[-1]["date"], date_fmt).date()
latest = datetime.strptime(site.cases[0]["date"], date_fmt).date()
logger.info("Results page has date range %s to %s", earliest, latest)

# no intersection between date ranges
if max(earliest, start_date) >= min(latest, end_date):
# if earliest date from results is earlier than
# the start date, no need to iterate any further
if earliest < start_date:
logger.info(
"Finishing backscrape: earliest results date is %s earlier than start %s",
earliest,
start_date,
)
break
continue

# if there is an intersection, test every case and
# collect the matching cases
for case in site.cases:
case_date = datetime.strptime(case["date"], date_fmt).date()
if start_date < case_date < end_date:
cases.append(case)

return cases
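
The skip/stop logic above is a standard interval-intersection test: the page's date range [earliest, latest] overlaps the target window [start_date, end_date] exactly when max(earliest, start_date) < min(latest, end_date). A minimal sketch of that test (the helper name `ranges_overlap` is illustrative, not part of this diff):

from datetime import date

def ranges_overlap(earliest: date, latest: date, start: date, end: date) -> bool:
    # mirrors the guard above: equal endpoints count as no overlap,
    # matching the strict inequalities used when filtering cases
    return max(earliest, start) < min(latest, end)

# a page spanning Oct 1-15, 2024 overlaps a target window of Oct 10-20...
assert ranges_overlap(date(2024, 10, 1), date(2024, 10, 15),
                      date(2024, 10, 10), date(2024, 10, 20))
# ...but not a window of Nov 1-30, so that page would be skipped
assert not ranges_overlap(date(2024, 10, 1), date(2024, 10, 15),
                          date(2024, 11, 1), date(2024, 11, 30))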
49 changes: 21 additions & 28 deletions juriscraper/opinions/united_states/state/lactapp_1.py
@@ -6,24 +6,31 @@
2019-11-24: Created by mmantel
"""

import math
import re
from datetime import date, datetime

from juriscraper.AbstractSite import logger
from juriscraper.lib.html_utils import (
get_row_column_links,
get_row_column_text,
)
from juriscraper.lib.utils import backscrape_over_paginated_results
from juriscraper.OpinionSiteLinear import OpinionSiteLinear


class Site(OpinionSiteLinear):
first_opinion_date = datetime(2006, 11, 3)
# Ensure the backscrape iterable has a single item
days_interval = (datetime.today() - first_opinion_date).days + 2

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.court_id = self.__module__
self._page_size = 50
self._base_url = f"https://www.la-fcca.org/opiniongrid/opinionpub.php?opinionpage_size={self._page_size}"
self.url = self._base_url
self.back_scrape_iterable = self._generate_back_scrape_range()
page_size = 50
self.base_url = f"https://www.la-fcca.org/opiniongrid/opinionpub.php?opinionpage_size={page_size}"
self.url = self.base_url
self.make_backscrape_iterable(kwargs)
self.is_backscrape = False

# The opinions page does not indicate whether a case is
# published or unpublished. That is only found in the PDF.
@@ -37,7 +44,7 @@ def _process_html(self):
for row in self.html.cssselect("#opinion_contentTable tbody tr"):
self.cases.append(
{
"date": get_row_column_text(row, 1),
"date": get_row_column_text(row, 1).replace(" ", ""),
"docket": self._parse_docket_numbers(row),
"name": get_row_column_text(row, 4),
"url": get_row_column_links(row, 3),
@@ -54,25 +61,11 @@ def _parse_docket_numbers(self, row):
case_numbers = re.findall("[0-9]{4}[A-Z]{2}[0-9]{4}", text)
return ", ".join(case_numbers)

def _generate_back_scrape_range(self):
# This is a generator function, so this code won't run until a
# caller begins iterating, which is necessary because
# otherwise this would run during unit tests and trigger an
# unwanted network request.
last_page = self._get_last_page_number()

yield from range(1, last_page + 1)

def _get_last_page_number(self):
# The link to the last page has an onclick like:
# javascript:opinion_doPostBack('paging','','&opinionsort_field=sortdate&opinionsort_field_by=&opinionsort_field_type=&opinionsort_type=DESC&opinionpage_size=50&opinionp=395')
# where 395 is the last page number.
html = self._get_html_tree_by_url(self._base_url, {})
el = html.cssselect("a[title=last]")[0]
onclick = el.get("onclick")
return int(re.findall(r"\d+", onclick)[-1])

def _download_backwards(self, page):
self.url = self._base_url + ("&opinionp=%d" % page)
self.html = self._download()
self._process_html()
def _download_backwards(self, dates: tuple[date]) -> None:
logger.info("Backscraping for range %s %s", *dates)
url_template = f"{self.base_url}&opinionp={{}}"
start, end = dates
last_page = 500 # Real last page is 467 as of Oct 2024
self.cases = backscrape_over_paginated_results(
url_template, 2, last_page, start, end, "%m/%d/%Y", self
)
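
The `days_interval = (datetime.today() - first_opinion_date).days + 2` line deserves a note: assuming the shared backscrape machinery chunks [first_opinion_date, today] into `days_interval`-sized `(start, end)` windows (that helper is not part of this diff), an interval two days longer than the whole history guarantees exactly one window. A sketch under that assumption:

from datetime import datetime, timedelta

first_opinion_date = datetime(2006, 11, 3)
# two days longer than the full history, so chunking cannot split it
days_interval = (datetime.today() - first_opinion_date).days + 2

windows, start = [], first_opinion_date
while start < datetime.today():
    end = min(start + timedelta(days=days_interval), datetime.today())
    windows.append((start, end))
    start = end

assert len(windows) == 1  # the whole history becomes a single (start, end) pair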
74 changes: 10 additions & 64 deletions juriscraper/opinions/united_states/state/nd.py
@@ -9,6 +9,7 @@

from juriscraper.AbstractSite import logger
from juriscraper.lib.string_utils import normalize_dashes
from juriscraper.lib.utils import backscrape_over_paginated_results
from juriscraper.OpinionSiteLinear import OpinionSiteLinear


@@ -21,7 +22,9 @@ class Site(OpinionSiteLinear):
"nature_of_suit",
"judge",
]
first_opinion_date = datetime(1955, 10, 25).date()
first_opinion_date = datetime(1955, 10, 25)
# Ensure the backscrape iterable has a single item
days_interval = (datetime.today() - first_opinion_date).days + 2

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
@@ -158,69 +161,12 @@ def _download_backwards(self, dates: Tuple[date]) -> None:
:param dates: (start_date, end_date) tuple
:return None
"""
logger.info("Backscraping for range %s %s", *dates)
start, end = dates
date_fmt = "%m/%d/%Y"
# last page is 118 (August 2024)
first_page, last_page = 2, 130
last_page = 130
base_url = self.url
cases = []

for page in range(first_page, last_page):
self.cases = [] # reset results container
self.url = f"{base_url}&page={page}"
self.html = self._download()
self._process_html()

# results are ordered by descending date
earliest = datetime.strptime(
self.cases[-1]["date"], date_fmt
).date()
latest = datetime.strptime(self.cases[0]["date"], date_fmt).date()
logger.info(
"Results page has date range %s to %s", earliest, latest
)

# no intersection between date ranges
if max(earliest, start) >= min(latest, end):
# if earliest date from results is earlier than
# the start date, no need to iterate any further
if earliest < start:
logger.info(
"Finishing backscrape: earliest results date is %s earlier than start %s",
earliest,
start,
)
break
continue

# if there is an intersection, test every case and
# collect the matching cases
for case in self.cases:
case_date = datetime.strptime(case["date"], date_fmt).date()
if case_date < end and case_date > start:
cases.append(case)

self.cases = cases

def make_backscrape_iterable(self, kwargs: dict) -> None:
"""Checks if backscrape start and end arguments have been passed
by caller, and parses them accordingly
:param kwargs: passed when initializing the scraper, may or
may not contain backscrape controlling arguments
:return None
"""
start = kwargs.get("backscrape_start")
end = kwargs.get("backscrape_end")

if start:
start = datetime.strptime(start, "%m/%d/%Y").date()
else:
start = self.first_opinion_date
if end:
end = datetime.strptime(end, "%m/%d/%Y").date()
else:
end = datetime.now().date()

logger.info("Backscraping for cases between %s and %s", start, end)
self.back_scrape_iterable = [(start, end)]
url_template = f"{base_url}&page={{}}"
self.cases = backscrape_over_paginated_results(
url_template, 2, last_page, start, end, "%m/%d/%Y", self
)
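
A related subtlety in this hunk: `first_opinion_date` loses its `.date()` call because the new `days_interval` computation subtracts it from `datetime.today()`, and Python refuses to mix `datetime` and `date` in arithmetic. A quick sketch:

from datetime import date, datetime

first_opinion_date = datetime(1955, 10, 25)
days_interval = (datetime.today() - first_opinion_date).days + 2  # works

try:
    datetime.today() - date(1955, 10, 25)  # the old .date() form
except TypeError as err:
    # unsupported operand type(s) for -: 'datetime.datetime' and 'datetime.date'
    print("mixing datetime and date fails:", err)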
5 changes: 4 additions & 1 deletion juriscraper/opinions/united_states/state/neb.py
@@ -1,3 +1,4 @@
from juriscraper.AbstractSite import logger
from juriscraper.lib.html_utils import fix_links_in_lxml_tree
from juriscraper.OpinionSiteLinear import OpinionSiteLinear

@@ -50,8 +51,10 @@ def _process_html(self):
for row in table.xpath(".//tr[td]"):
c1, c2, c3 = row.xpath(".//td")
docket = c1.xpath(".//text()")[0].strip()
if "A-XX-XXXX" in docket:
if "A-XX-XXXX" in docket or not c3.xpath(".//a"):
logger.info("Skip row %s", row.text_content())
continue

citation = c2.xpath(".//text()")[0].strip()
name = c3.xpath(".//a/text()")[0].strip()
url = c3.xpath(".//a")[0].get("href")
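
For context on the `neb` guard above: before this change, a row whose third cell had no anchor would crash on `c3.xpath(".//a/text()")[0]` with an IndexError. A minimal reproduction with a hypothetical linkless row (lxml required):

from lxml import html

tree = html.fromstring(
    "<table><tr><td>A-24-0001</td><td></td><td>No link yet</td></tr></table>"
)
row = tree.xpath(".//tr[td]")[0]
c1, c2, c3 = row.xpath(".//td")
if not c3.xpath(".//a"):
    # without the guard, c3.xpath(".//a/text()")[0] raises IndexError here
    print("Skip row", row.text_content())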
2 changes: 1 addition & 1 deletion setup.py
@@ -4,7 +4,7 @@
from setuptools import find_packages, setup
from setuptools.command.install import install

VERSION = "2.6.30"
VERSION = "2.6.31"
AUTHOR = "Free Law Project"
EMAIL = "info@free.law"
HERE = os.path.abspath(os.path.dirname(__file__))
