Skip to content
34 changes: 19 additions & 15 deletions codespell_lib/_codespell.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,12 +227,12 @@ def init_chardet(self) -> None:

self.encdetector = UniversalDetector()

def open(self, filename: str) -> Tuple[List[str], str]:
def open(self, filename: str) -> Tuple[List[str], Optional[str]]:
if self.use_chardet:
return self.open_with_chardet(filename)
return self.open_with_internal(filename)

def open_with_chardet(self, filename: str) -> Tuple[List[str], str]:
def open_with_chardet(self, filename: str) -> Tuple[List[str], Optional[str]]:
self.encdetector.reset()
with open(filename, "rb") as fb:
for line in fb:
Expand All @@ -241,26 +241,30 @@ def open_with_chardet(self, filename: str) -> Tuple[List[str], str]:
break
self.encdetector.close()
encoding = self.encdetector.result["encoding"]

if not encoding:
print(
f"WARNING: Chardet failed to detect encoding for file {filename}.",
file=sys.stderr,
)
try:
f = open(filename, encoding=encoding, newline="")
with open(filename, encoding=encoding, newline="") as f:
lines = self.get_lines(f)
except UnicodeDecodeError:
print(f"ERROR: Could not detect encoding: {filename}", file=sys.stderr)
error_msg = (
f"Failed to decode file {filename} using detected "
f"encoding {encoding}."
)
print(error_msg, file=sys.stderr)
raise
except LookupError:
print(
f"ERROR: Don't know how to handle encoding {encoding}: {filename}",
file=sys.stderr,
)
error_msg = f"Unknown encoding {encoding} detected for file {filename}."
print(error_msg, file=sys.stderr)
raise
else:
lines = self.get_lines(f)
f.close()
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To minimize changes, I would suggest:

        try:
            f = open(filename, encoding=encoding, newline="")
        except LookupError:
            print(
                f"ERROR: Don't know how to handle encoding {encoding}: {filename}",
                file=sys.stderr,
            )
            raise
        else:
            try:
                lines = f.readlines()
            except UnicodeDecodeError:
                print(f"ERROR: Could not detect encoding: {filename}", file=sys.stderr)
                raise
            finally:
                f.close()

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we need to minimize changes? I'm happy to go with whatever you want but let me try to convince you (with Zen of Python):

  • "There should be an obvious way to do it": context managers are the way one should open files. Not use finally, it's messy
  • "Flat is better than nested": Your suggestion has loads of try/else/try/finally nesting, it's hard to grok

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you really think this is not much more readable?

        try:
            with open(filename, encoding=encoding, newline="") as f:
                lines = self.get_lines(f)
        except LookupError:  # Raised by open() if encoding is unknown
            error_msg = f"ERROR: Chardet returned unknown encoding for: {filename}."
            print(error_msg, file=sys.stderr)
            raise
        except UnicodeDecodeError:  # Raised by self.get_lines() if decoding fails
            error_msg = f"ERROR: Failed decoding file: {filename}"
            print(error_msg, file=sys.stderr)
            raise

Also note that you introduced a bug by replacing self.get_lines(f) with f.readlines() in your suggestion 🙈

Copy link
Collaborator

@DimitriPapadopoulos DimitriPapadopoulos Aug 20, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. I agree that in general flat is better, but I would also like to limit the code under try to the strict minimum, as a way to document which exceptions each piece of code is expected to raise. Some linters do enforce that.
    • open only raises LookupError
    • readlines only raises UnicodeDecodeError
  2. I'd like to keep the codebase consistent. See Fix uncaught exception on empty files #2195.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I do agree with using context managers to open files. I just didn't know how to make it compatible with try.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree, nice to keep try local, but it's a tradeoff with nesting. I commented instead to make clear what raises what.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you be more specific regarding your "I would like to keep codebase consistent"? I don't know what exactly you would like me to change. Is anything inconsistent?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

open_with_chardet and open_with_internal (already fixed in #2195) should be kept as similar as possible and even share code if possible at all.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the clarification!


return lines, f.encoding
return lines, encoding

def open_with_internal(self, filename: str) -> Tuple[List[str], str]:
encoding = None
encoding: str
first_try = True
for encoding in ("utf-8", "iso-8859-1"):
if first_try:
Expand Down Expand Up @@ -887,10 +891,10 @@ def parse_file(
bad_count = 0
lines = None
changed = False
encoding: str | None = "utf-8"

if filename == "-":
f = sys.stdin
encoding = "utf-8"
lines = f.readlines()
else:
if options.check_filenames:
Expand Down