-
-
Notifications
You must be signed in to change notification settings - Fork 133
1301 nd neutral citation in the html #1477
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
cf246c8
ebe178a
da04fd1
4fb7e64
9331199
a405db9
332b73b
cce7716
60d8472
29559b0
2b3e4fe
e54957e
30b2a71
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,10 @@ | ||
# Author: Phil Ardery | ||
# Contact: https://www.ndcourts.gov/contact-us | ||
# History: | ||
# Date created: 2019-02-28 | ||
# Updated: 2024-05-08, grossir: to OpinionSiteLinear and new URL | ||
# Updated: 2025-07-02, luism: get citation from HTML | ||
|
||
import re | ||
from datetime import date, datetime | ||
from urllib.parse import urljoin | ||
|
@@ -15,6 +18,7 @@ | |
class Site(OpinionSiteLinear): | ||
base_url = "https://www.ndcourts.gov/" | ||
ordered_fields = [ | ||
"citation", | ||
"name", | ||
"docket", | ||
"date", | ||
|
@@ -51,11 +55,22 @@ def _process_html(self) -> None: | |
|
||
for idx, txt in enumerate(raw_values[:5]): | ||
if idx == 0: | ||
txt, _ = self.clean_name(txt) | ||
# Separate case name and citation if present | ||
match = re.match( | ||
r"^(.*?)(\s*(\d{4}\sND\s\d+))\s*$", | ||
txt, | ||
) | ||
if match: | ||
case_name = match.group(1).strip() | ||
citation = match.group(2).strip() | ||
txt = case_name | ||
else: | ||
citation = "" | ||
txt, other_docket = self.clean_name(txt) | ||
values.append(citation) | ||
else: | ||
txt = txt.split(":", 1)[1].strip() | ||
values.append(txt) | ||
|
||
summary = ( | ||
" ".join(raw_values[5:]).strip() if len(raw_values) > 5 else "" | ||
) | ||
|
@@ -73,13 +88,16 @@ def _process_html(self) -> None: | |
continue | ||
seen_urls.add(url) | ||
|
||
case = dict(zip(self.ordered_fields, values[:5])) | ||
case = dict(zip(self.ordered_fields, values[:6])) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think this is the culprit: you assume 6 items in the list, but you only have 5 in one path. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We have 5 raw values, but we added the citation, so we need the first 6 values. |
||
case.update({"summary": summary, "url": url, "per_curiam": False}) | ||
|
||
if "per curiam" in case["judge"].lower(): | ||
case["judge"] = "" | ||
case["per_curiam"] = True | ||
|
||
if other_docket: | ||
case["docket"] = f"{case['docket']}, {other_docket}" | ||
|
||
self.cases.append(case) | ||
|
||
def clean_name(self, name: str) -> tuple[str, str]: | ||
|
Uh oh!
There was an error while loading. Please reload this page.