Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 85 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,16 @@
import os

from setuptools import Extension, setup
from setuptools.command.build_ext import build_ext
import sys

# Directory holding the Cython wrapper and the directory holding the
# bundled uchardet C++ sources.
cchardet_dir = "src/cchardet/"
uchardet_dir = "src/ext/uchardet/src"

# Every .cpp file that sits directly inside the wrapper directory.
cchardet_sources = glob.glob(f"{cchardet_dir}*.cpp")

# NOTE: ``sources`` is the very same list object as ``cchardet_sources``
# (no copy is made), so in-place additions to one are visible through both.
sources = cchardet_sources

uchardet_sources = [
os.path.join(cchardet_dir, "_cchardet.pyx"),
os.path.join(uchardet_dir, "LangModels/LangArabicModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangBelarusianModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangBulgarianModel.cpp"),
Expand Down Expand Up @@ -75,6 +78,84 @@
os.path.join(uchardet_dir, "uchardet.cpp"),
]
sources += uchardet_sources
print(sources)

class ccardet_build_ext(build_ext):
    """``build_ext`` command that runs Cython on ``.pyx`` sources when needed.

    For every ``.pyx`` source, a ``.c`` file with the same stem is used
    instead when it already exists on disk.  When no such file exists, or
    when ``--cython-always`` is given, all extension modules are passed
    through ``cythonize()`` before the regular ``build_ext`` option handling
    runs.

    Extra command-line options:
        --cython-always      run ``cythonize()`` even if ``.c`` files exist
        --cython-annotate    forwarded to ``cythonize(annotate=...)``
        --cython-directives  comma-separated ``name=value`` pairs forwarded
                             as ``compiler_directives``
    """

    user_options = build_ext.user_options + [
        ("cython-always", None, "run cythonize() even if .c files are present"),
        (
            "cython-annotate",
            None,
            "Produce a colorized HTML version of the Cython source.",
        ),
        ("cython-directives=", None, "Cython compiler directives"),
    ]

    def initialize_options(self):
        """Initialise the inherited options, then the Cython-specific ones."""
        super().initialize_options()
        self.cython_always = False
        self.cython_annotate = False
        self.cython_directives = None

    @staticmethod
    def _parse_directives(spec):
        """Convert ``"a=1,b=true"`` into a ``{name: value}`` dict.

        The values ``"true"`` / ``"false"`` (any letter case) become the
        booleans ``True`` / ``False``; every other value stays a string.
        Empty segments (e.g. from a trailing comma) are skipped.
        """
        directives = {}
        for segment in spec.split(","):
            if not segment:
                continue
            key, _, value = segment.partition("=")
            lowered = value.lower()
            # ``elif`` matters here: once ``value`` has been replaced by a
            # bool it no longer has ``.lower()``, so it must not be tested
            # a second time.
            if lowered == "false":
                value = False
            elif lowered == "true":
                value = True
            directives[key] = value
        return directives

    def finalize_options(self):
        """Decide whether to cythonize, do so if required, then finalize.

        Returns:
            Whatever ``build_ext.finalize_options()`` returns.

        Raises:
            RuntimeError: if cythonizing is required but Cython cannot be
                imported.
        """
        need_cythonize = self.cython_always

        for extension in self.distribution.ext_modules:
            for i, sfile in enumerate(extension.sources):
                if not sfile.endswith(".pyx"):
                    continue

                cfile = os.path.splitext(sfile)[0] + ".c"
                if os.path.exists(cfile) and not self.cython_always:
                    # Reuse the already generated C file.
                    extension.sources[i] = cfile
                else:
                    need_cythonize = True

        if need_cythonize:
            # Double check Cython presence in case setup_requires
            # didn't go into effect (most likely because someone
            # imported Cython before setup_requires injected the
            # correct egg into sys.path).
            try:
                import Cython  # noqa: F401
            except ImportError as exc:
                raise RuntimeError(
                    "please install cython to compile cchardet from source"
                ) from exc

            from Cython.Build import cythonize

            directives = {}
            if isinstance(self.cython_directives, dict):
                # This method stores the parsed dict back on the option
                # (below), so a repeated call sees a dict, not a string.
                directives = self.cython_directives
            elif self.cython_directives:
                directives = self._parse_directives(self.cython_directives)
                self.cython_directives = directives

            self.distribution.ext_modules[:] = cythonize(
                self.distribution.ext_modules,
                compiler_directives=directives,
                annotate=self.cython_annotate,
                emit_linenums=self.debug,
            )

        return super().finalize_options()




setup(
package_dir={"": "src"},
Expand All @@ -87,7 +168,10 @@
sources=sources,
include_dirs=[uchardet_dir],
language="c++",
extra_compile_args=['-std=c++11'],
extra_compile_args=['-std=c++11'] if sys.platform != "win32" else [], # Satisfy MSVC Compiler it should default to C++17
)
],
cmdclass={
"build_ext":ccardet_build_ext
}
)
47 changes: 12 additions & 35 deletions src/cchardet/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
from . import _cchardet

from ._cchardet import UniversalDetector as UniversalDetector, detect_with_confidence as detect_with_confidence
from .typedefs import DecodeResultDict
version = (2, 2, 0, "alpha", 3)
__version__ = "2.2.0a3"


def detect(msg):
def detect(msg: bytes) -> DecodeResultDict:
"""
Args:
msg: bytes
Expand All @@ -14,8 +14,8 @@ def detect(msg):
"confidence": float
}
"""
encoding, confidence = _cchardet.detect_with_confidence(msg)
if isinstance(encoding, bytes):
encoding, confidence = detect_with_confidence(msg)
if encoding is not None:
encoding = encoding.decode()

if encoding == "MAC-CENTRALEUROPE":
Expand All @@ -24,33 +24,10 @@ def detect(msg):
return {"encoding": encoding, "confidence": confidence}


class UniversalDetector:
    """Thin Python wrapper around ``_cchardet.UniversalDetector``.

    Every operation is forwarded to the wrapped detector.  The object can be
    used as a context manager; leaving the ``with`` block calls ``close()``.
    """

    def __init__(self):
        self._detector = _cchardet.UniversalDetector()

    # -- read-only views of the wrapped detector ---------------------------

    @property
    def done(self):
        """The wrapped detector's ``done`` attribute, unchanged."""
        return self._detector.done

    @property
    def result(self):
        """Dict with ``"encoding"`` and ``"confidence"`` keys.

        The wrapped detector yields an ``(encoding, confidence)`` pair; an
        encoding given as ``bytes`` is decoded, anything else is passed
        through as it is.
        """
        raw_name, score = self._detector.result
        name = raw_name.decode() if isinstance(raw_name, bytes) else raw_name
        return {"encoding": name, "confidence": score}

    # -- operations forwarded to the wrapped detector ----------------------

    def reset(self):
        """Forward ``reset()`` to the wrapped detector."""
        self._detector.reset()

    def feed(self, data):
        """Forward ``feed(data)`` to the wrapped detector."""
        self._detector.feed(data)

    def close(self):
        """Forward ``close()`` to the wrapped detector."""
        self._detector.close()

    # -- context-manager protocol ------------------------------------------

    def __enter__(self):
        return self

    def __exit__(self, exception_type, exception_value, traceback):
        # Close unconditionally; returning False lets an exception raised
        # inside the ``with`` block propagate to the caller.
        self.close()
        return False
__all__ = (
"detect",
"detect_with_confidence",
"DecodeResultDict",
"UniversalDetector",
"__version__"
)
72 changes: 72 additions & 0 deletions src/cchardet/_cchardet.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import sys
from typing import Union

from .typedefs import DecodeResultDict

if sys.version_info >= (3, 11):
from typing import Self
else:
from typing_extensions import Self


def detect_with_confidence(msg: bytes) -> Union[tuple[bytes, float], tuple[None, None]]:
    """Same as ``detect``, but returns an ``(encoding, confidence)`` tuple.

    Args:
        msg: The bytes to run detection on.

    Returns:
        Either ``(bytes, float)`` or ``(None, None)``, as declared by the
        return annotation.

    Raises:
        MemoryError: If the internal handle could not be allocated.
    """
    ...

class UniversalDetector:
    """Detects character encodings from an input stream."""
    def __init__(self) -> None:
        ...

    def reset(self) -> None:
        """
        Resets the universal detector, allowing it to handle a
        new stream of data.
        """

    def feed(self, msg: bytes) -> None:
        """
        Feeds a chunk of the stream to the detector.

        Args:
            msg: a chunk of the data stream to pass through

        Raises:

            MemoryError: if memory couldn't be allocated
                during handling of the given data
        """

    def close(self) -> None:
        """Closes the handle and frees internal memory."""

    @property
    def closed(self) -> bool:
        """Whether this UniversalDetector has been closed."""

    @property
    def done(self) -> bool:
        """
        Whether character detection has finished.
        """

    @property
    def result(self) -> DecodeResultDict:
        """
        The result for the given stream.

        Values will be None if the stream is not considered done.
        NOTE(review): the original wording ("or otherwise") suggests there
        are other cases that also yield None -- confirm against the
        implementation.
        """

    def __enter__(self) -> Self:...
    def __exit__(self, *args) -> None:...



Loading