freelawproject · Luis-manzur · Jul 2, 2025 · Jul 2, 2025 · Jul 2, 2025 · Jul 2, 2025
diff --git a/CHANGES.md b/CHANGES.md
@@ -23,6 +23,7 @@ Features:
 Changes:
 - Refactor `ACMSDocketReport` to handle missing "date entered" values gracefully
   and expands the use of raw data fields for reliable date information. #1459
+- Improve `nd` scraper to extract citations from the HTML.
 
 Fixes:
 - Improve `ny` cleanup_content to remove email protection that was causing

diff --git a/juriscraper/opinions/united_states/state/nd.py b/juriscraper/opinions/united_states/state/nd.py
@@ -1,7 +1,10 @@
 # Author: Phil Ardery
 # Contact: https://www.ndcourts.gov/contact-us
+# History:
 # Date created: 2019-02-28
 # Updated: 2024-05-08, grossir: to OpinionSiteLinear and new URL
+# Updated: 2025-07-02, luism: get citation from HTML
+
 import re
 from datetime import date, datetime
 from urllib.parse import urljoin
@@ -15,6 +18,7 @@
 class Site(OpinionSiteLinear):
     base_url = "https://www.ndcourts.gov/"
     ordered_fields = [
+        "citation",
         "name",
         "docket",
         "date",
@@ -50,11 +54,30 @@ def _process_html(self) -> None:
 
             for idx, txt in enumerate(raw_values[:5]):
                 if idx == 0:
-                    txt, _ = self.clean_name(txt)
+                    # Separate case name and citation if present
+                    match = re.match(
+                        r"^(.*?)(\s*((\d{4}\sND\s\d+)|(1 \d\.N\.W\d d\d+)))?\s*$",
+                        txt,
+                    )
+                    if match:
+                        case_name = match.group(1).strip()
+                        # If matched with the second regex (1 \d\.N\.W\d d\d+), set citation to ""
+                        if match.group(5):
+                            citation = ""
+                        else:
+                            citation = (
+                                match.group(2).strip()
+                                if match.group(2)
+                                else ""
+                            )
+                        txt = case_name
+                    else:
+                        citation = ""
+                    txt = self.clean_name(txt)
+                    values.append(citation)
                 else:
                     txt = txt.split(":", 1)[1].strip()
                 values.append(txt)
-
             summary = (
                 " ".join(raw_values[5:]).strip() if len(raw_values) > 5 else ""
             )
@@ -72,7 +95,7 @@ def _process_html(self) -> None:
                 continue
             seen_urls.add(url)
 
-            case = dict(zip(self.ordered_fields, values[:5]))
+            case = dict(zip(self.ordered_fields, values[:6]))
             case.update({"summary": summary, "url": url, "per_curiam": False})
 
             if "per curiam" in case["judge"].lower():
@@ -81,7 +104,7 @@ def _process_html(self) -> None:
 
             self.cases.append(case)
 
-    def clean_name(self, name: str) -> tuple[str, str]:
+    def clean_name(self, name: str) -> str:
         """Cleans case name
 
         Some case names list the consolidated docket or a
@@ -90,14 +113,12 @@ def clean_name(self, name: str) -> tuple[str, str]:
         :param name: raw case name
         :return: cleaned name and extra_docket numbers
         """
-        other_dockets = ""
         if "(consolidated w/" in name:
-            other_dockets = ",".join(re.findall(r"\d{8}", name))
             name = name.split("(consolidated w/")[0]
         if "(CONFIDENTIAL" in name:
             name = name.split("(CONFIDENTIAL")[0]
 
-        return name.strip(), other_dockets
+        return name.strip()
 
     def extract_from_text(self, scraped_text: str) -> dict:
         """Extract model fields from opinion's document text