From efd0e3f44422da70d35a4922afe7272a66f21bda Mon Sep 17 00:00:00 2001
From: PyYoshi
Date: Wed, 29 Mar 2017 02:37:12 +0900
Subject: [PATCH] add UniversalDetector

---
Reviewer notes (free-form commentary placed between the "---" line and the
first "diff --git" line; nothing in this section changes the diff itself).

NOTE(review): the line breaks and indentation of this copy were reconstructed
from a whitespace-collapsed source. The number of lines in every hunk matches
its "@@" header, but exact indentation depth is a best reading. In particular,
which `if` the `else:` in the Cython `feed` pairs with, and how many of the
statements in `reset` sit under `if not self._closed:`, should be confirmed
against the repository before relying on this copy.

What the patch does:

* src/cchardet/__init__.py
  - Rewrites the dict literal returned by detect(); only the spacing inside
    the braces changes.
  - Adds a Python-level UniversalDetector class that wraps
    _cchardet.UniversalDetector and forwards reset(), feed(data), close(),
    and the `done` property to it.
  - Defines __enter__/__exit__: __enter__ returns self; __exit__ calls
    close() and returns False, so an exception raised inside the `with`
    body is not suppressed.
  - The `result` property unpacks (encoding, confidence) from the wrapped
    detector, decodes `encoding` when it is bytes, and returns
    {"encoding": ..., "confidence": ...}, the same keys detect() returns.

* src/cchardet/_cchardet.pyx
  - Adds `cdef class UniversalDetector` holding a uchardet_t handle, the
    integer flags _done and _closed, and the last charset/confidence read
    from uchardet.
  - feed() returns immediately once _closed is set. For non-empty input it
    calls uchardet_handle_data; on a non-zero return it marks the object
    closed, deletes the handle and raises Exception("Handle data error").
  - close() (when not already closed) calls uchardet_data_end, stores the
    charset and confidence, deletes the handle and sets _closed.
  - `result` returns (None, None) while the stored charset is empty;
    `done` returns bool(_done).

* src/tests/test.py
  - Every added line is either blank or a `#` comment: test_detector is
    fully commented out, so this patch adds no executed test.

Points to confirm in review:

* The handle is deleted only in close() and in the feed() error path; no
  __dealloc__ is added by this patch. NOTE(review): a detector that is never
  closed presumably keeps its uchardet handle allocated - confirm.
* feed() takes `const_char_ptr msg` and computes `length = len(msg)`.
  NOTE(review): for a C char pointer this presumably stops at the first NUL
  byte, which would truncate input in encodings that contain NUL bytes -
  confirm against the Cython documentation.
* As reconstructed, every successful non-empty feed() sets _done and calls
  uchardet_data_end. NOTE(review): verify this is the intended meaning of
  `done` for callers that feed data incrementally, as the commented-out
  readline loop in the test does.
* As reconstructed, reset() does nothing once the detector is closed, so a
  closed detector cannot be reused - confirm that this is intended.

 src/cchardet/__init__.py   | 33 +++++++++++++++++++-
 src/cchardet/_cchardet.pyx | 63 ++++++++++++++++++++++++++++++++++++++
 src/tests/test.py          | 31 +++++++++++++++++++
 3 files changed, 126 insertions(+), 1 deletion(-)

diff --git a/src/cchardet/__init__.py b/src/cchardet/__init__.py
index 5853777..fa83d73 100644
--- a/src/cchardet/__init__.py
+++ b/src/cchardet/__init__.py
@@ -15,4 +15,35 @@ def detect(msg):
     encoding, confidence = _cchardet.detect_with_confidence(msg)
     if isinstance(encoding, bytes):
         encoding = encoding.decode()
-    return { "encoding": encoding, "confidence": confidence }
+    return {"encoding": encoding, "confidence": confidence}
+
+class UniversalDetector(object):
+    def __init__(self):
+        self._detector = _cchardet.UniversalDetector()
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exception_type, exception_value, traceback):
+        self.close()
+        return False
+
+    def reset(self):
+        self._detector.reset()
+
+    def feed(self, data):
+        self._detector.feed(data)
+
+    def close(self):
+        self._detector.close()
+
+    @property
+    def done(self):
+        return self._detector.done
+
+    @property
+    def result(self):
+        encoding, confidence = self._detector.result
+        if isinstance(encoding, bytes):
+            encoding = encoding.decode()
+        return {"encoding": encoding, "confidence": confidence}
diff --git a/src/cchardet/_cchardet.pyx b/src/cchardet/_cchardet.pyx
index fe6531e..b1e3123 100644
--- a/src/cchardet/_cchardet.pyx
+++ b/src/cchardet/_cchardet.pyx
@@ -31,3 +31,66 @@ def detect_with_confidence(const_char_ptr msg):
         return detected_charset, detected_confidence
 
     return None, None
+
+cdef class UniversalDetector:
+    cdef uchardet_t _ud
+    cdef int _done
+    cdef int _closed
+    cdef bytes _detected_charset
+    cdef float _detected_confidence
+
+    def __init__(self):
+        self._ud = uchardet_new()
+        self._done = 0
+        self._closed = 0
+        self._detected_charset = b""
+        self._detected_confidence = 0.0
+
+    def reset(self):
+        if not self._closed:
+            self._done = 0
+            self._closed = 0
+            self._detected_charset = b""
+            self._detected_confidence = 0.0
+            uchardet_reset(self._ud)
+
+    def feed(self, const_char_ptr msg):
+        cdef int length
+        cdef int result
+
+        if self._closed:
+            return
+
+        length = len(msg)
+        if length > 0:
+            result = uchardet_handle_data(self._ud, msg, length)
+
+            if result != 0:
+                self._closed = 1
+                uchardet_delete(self._ud)
+                raise Exception("Handle data error")
+            else:
+                self._done = 1
+                uchardet_data_end(self._ud)
+                self._detected_charset = uchardet_get_charset(self._ud)
+                self._detected_confidence = uchardet_get_confidence(self._ud)
+
+    def close(self):
+        if not self._closed:
+            uchardet_data_end(self._ud)
+            self._detected_charset = uchardet_get_charset(self._ud)
+            self._detected_confidence = uchardet_get_confidence(self._ud)
+
+            uchardet_delete(self._ud)
+            self._closed = 1
+
+    @property
+    def done(self):
+        return bool(self._done)
+
+    @property
+    def result(self):
+        if len(self._detected_charset):
+            return self._detected_charset, self._detected_confidence
+        else:
+            return None, None
diff --git a/src/tests/test.py b/src/tests/test.py
index 78c1bea..f725127 100644
--- a/src/tests/test.py
+++ b/src/tests/test.py
@@ -46,3 +46,34 @@ class TestCChardet():
                     testfile
                 )
             )
+
+    # def test_detector(self):
+    #     testfiles = glob.glob('tests/testdata/*/*.txt')
+    #     for testfile in testfiles:
+    #         if testfile.replace("\\", "/") in SKIP_LIST:
+    #             continue
+
+    #         base = os.path.basename(testfile)
+    #         expected_charset = os.path.splitext(base)[0]
+
+    #         detector = cchardet.UniversalDetector()
+    #         with open(testfile, 'rb') as f:
+    #             msg = f.read()
+    #             detector.feed(msg)
+    #             # line = f.readline()
+    #             # while line:
+    #             #     detector.feed(line)
+    #             #     if detector.done:
+    #             #         break
+    #             #     line = f.readline()
+    #         detector.close()
+    #         detected_encoding = detector.result
+    #         eq_(
+    #             expected_charset.lower(),
+    #             detected_encoding['encoding'].lower(),
+    #             'Expected %s, but got %s for "%s"' % (
+    #                 expected_charset.lower(),
+    #                 detected_encoding['encoding'].lower(),
+    #                 testfile
+    #             )
+    #         )