diff --git a/src/cchardet/__init__.py b/src/cchardet/__init__.py
index de2a3db..c70bb84 100644
--- a/src/cchardet/__init__.py
+++ b/src/cchardet/__init__.py
@@ -17,3 +17,38 @@ def detect(msg):
     if isinstance(encoding, bytes):
         encoding = encoding.decode()
     return {"encoding": encoding, "confidence": confidence}
+
+
+class Detector(object):
+    """Incremental charset detector fed with successive chunks of bytes.
+
+    Thin wrapper around ``_cchardet.Detector`` that reports its result in
+    the same dict form as ``detect``.
+    """
+
+    def __init__(self):
+        self._detector = _cchardet.Detector()
+
+    def feed(self, data):
+        """Pass the next chunk of input (a bytes object) to the detector."""
+        self._detector.feed(data)
+
+    def close(self):
+        """Signal the end of input so the final result can be computed."""
+        self._detector.close()
+
+    @property
+    def done(self):
+        """True once the detector reported an early detection."""
+        return self._detector.done
+
+    @property
+    def result(self):
+        """Return ``{"encoding": ..., "confidence": ...}``.
+
+        Both values are ``None`` when nothing has been detected.
+        """
+        encoding, confidence = self._detector.result
+        if isinstance(encoding, bytes):
+            encoding = encoding.decode()
+        return {"encoding": encoding, "confidence": confidence}
diff --git a/src/cchardet/_cchardet.pyx b/src/cchardet/_cchardet.pyx
index 3dfc995..c17ddff 100644
--- a/src/cchardet/_cchardet.pyx
+++ b/src/cchardet/_cchardet.pyx
@@ -50,3 +50,67 @@ def detect_with_confidence(char *msg):
         return detected_charset, confidence
     else:
         return None, None
+
+cdef class Detector:
+    """Incremental charset detector: feed chunks, then read the result."""
+    cdef csd_t csd
+    cdef int _done
+    cdef int _closed
+    cdef float _confidence
+    cdef const_char_ptr _detected_charset
+
+    def __cinit__(self):
+        # __cinit__ runs exactly once per object, so __dealloc__ can rely on
+        # the handle having been opened.
+        self.csd = csd_open()
+        self._done = 0
+        self._closed = 0
+        self._confidence = 0.0
+        self._detected_charset = NULL
+
+    def __dealloc__(self):
+        # Release the handle if the caller never called close().
+        if not self._closed:
+            csd_close2(self.csd, &self._confidence)
+
+    def feed(self, bytes msg not None):
+        """Feed the next chunk of bytes; ignored once done or closed."""
+        cdef char *data
+        cdef int length
+        cdef int result
+
+        if not self._done and not self._closed:
+            data = msg
+            # Take the length from the bytes object: strlen() stops at the
+            # first NUL byte and would truncate input such as UTF-16.
+            length = len(msg)
+            result = csd_consider(self.csd, data, length)
+
+            if result == -1: # Error, signal with a negative number
+                raise Exception("Error, signal with a negative number")
+
+            elif result == 0: # Detected early
+                self._done = 1
+                self.close()
+
+            # result == 1 means more data is needed: nothing to do.
+
+    def close(self):
+        """Finish detection and store charset and confidence; idempotent."""
+        if not self._closed:
+            self._detected_charset = csd_close2(self.csd, &self._confidence)
+            self._closed = 1
+
+    @property
+    def done(self):
+        """True once csd_consider reported an early detection."""
+        return bool(self._done)
+
+    @property
+    def result(self):
+        """Return (charset, confidence), or (None, None) if none is set."""
+        # Check for NULL as well as "": len() on a NULL char* would crash.
+        if self._detected_charset != NULL and self._detected_charset[0] != 0:
+            return self._detected_charset, self._confidence
+        else:
+            return None, None
diff --git a/test/tests.py b/test/tests.py
index 14445a3..d813bab 100644
--- a/test/tests.py
+++ b/test/tests.py
@@ -492,3 +492,78 @@ class TestCchardetSpeed():
             cchardet.detect(msg)
             result_cchardet += (time.time() - start_cchardet)
         print('cchardet:',1/(result_cchardet/do_times), 'call(s)/s')
+
+
+class TestDetector():
+    """Tests for the incremental cchardet.Detector API."""
+
+    encodings_map = {
+        r"testdata/bg/ISO-8859-5/wikitop_bg_ISO-8859-5.txt": "ISO-8859-5",
+        r"testdata/bg/UTF-8/wikitop_bg_UTF-8.txt": "UTF-8",
+        r"testdata/bg/WINDOWS-1251/wikitop_bg_WINDOWS-1251.txt": "WINDOWS-1251",
+        r"testdata/cz/ISO-8859-2/wikitop_cz_ISO-8859-2.txt": "ISO-8859-2",
+        r"testdata/cz/UTF-8/wikitop_cz_UTF-8.txt": "UTF-8",
+        r"testdata/de/UTF-8/wikitop_de_UTF-8.txt": "UTF-8",
+        r"testdata/de/WINDOWS-1252/wikitop_de_WINDOWS-1252.txt": "WINDOWS-1252",
+        r"testdata/dk/UTF-8/wikitop_dk_UTF-8.txt": "UTF-8",
+        r"testdata/dk/WINDOWS-1252/wikitop_dk_WINDOWS-1252.txt": "WINDOWS-1252",
+        r"testdata/el/ISO-8859-7/wikitop_el_ISO-8859-7.txt": "ISO-8859-7",
+        r"testdata/el/UTF-8/wikitop_el_UTF-8.txt": "UTF-8",
+        r"testdata/en/UTF-8/wikitop_en_UTF-8.txt": "UTF-8",
+        r"testdata/en/WINDOWS-1252/wikitop_en_WINDOWS-1252.txt": "WINDOWS-1252",
+        r"testdata/es/UTF-8/wikitop_es_UTF-8.txt": "UTF-8",
+        r"testdata/es/WINDOWS-1252/wikitop_es_WINDOWS-1252.txt": "WINDOWS-1252",
+        r"testdata/fi/UTF-8/wikitop_fi_UTF-8.txt": "UTF-8",
+        r"testdata/fi/WINDOWS-1252/wikitop_fi_WINDOWS-1252.txt": "WINDOWS-1252",
+        r"testdata/fr/UTF-8/wikitop_fr_UTF-8.txt": "UTF-8",
+        r"testdata/fr/WINDOWS-1252/wikitop_fr_WINDOWS-1252.txt": "WINDOWS-1252",
+        r"testdata/he/UTF-8/wikitop_he_UTF-8.txt": "UTF-8",
+        r"testdata/he/WINDOWS-1255/wikitop_he_WINDOWS-1255.txt": "WINDOWS-1255",
+        r"testdata/hu/UTF-8/wikitop_hu_UTF-8.txt": "UTF-8",
+        r"testdata/hu/ISO-8859-2/wikitop_hu_ISO-8859-2.txt": "ISO-8859-2",
+        r"testdata/it/UTF-8/wikitop_it_UTF-8.txt": "UTF-8",
+        r"testdata/it/WINDOWS-1252/wikitop_it_WINDOWS-1252.txt": "WINDOWS-1252",
+        r"testdata/nl/UTF-8/wikitop_nl_UTF-8.txt": "UTF-8",
+        r"testdata/nl/WINDOWS-1252/wikitop_nl_WINDOWS-1252.txt": "WINDOWS-1252",
+        r"testdata/no/UTF-8/wikitop_no_UTF-8.txt": "UTF-8",
+        r"testdata/no/WINDOWS-1252/wikitop_no_WINDOWS-1252.txt": "WINDOWS-1252",
+        r"testdata/pl/UTF-8/wikitop_pl_UTF-8.txt": "UTF-8",
+        r"testdata/pl/ISO-8859-2/wikitop_pl_ISO-8859-2.txt": "ISO-8859-2",
+        r"testdata/pt/UTF-8/wikitop_pt_UTF-8.txt": "UTF-8",
+        r"testdata/pt/WINDOWS-1252/wikitop_pt_WINDOWS-1252.txt": "WINDOWS-1252",
+        r"testdata/ru/UTF-8/wikitop_ru_UTF-8.txt": "UTF-8",
+        r"testdata/ru/WINDOWS-1251/wikitop_ru_WINDOWS-1251.txt": "WINDOWS-1251",
+        r"testdata/ru/IBM855/wikitop_ru_IBM855.txt": "IBM855",
+        r"testdata/ru/KOI8-R/wikitop_ru_KOI8-R.txt": "KOI8-R",
+        r"testdata/ru/X-MAC-CYRILLIC/wikitop_ru_MACCYRILLIC.txt": "MAC-CYRILLIC",
+        r"testdata/se/UTF-8/wikitop_se_UTF-8.txt": "UTF-8",
+        r"testdata/se/WINDOWS-1252/wikitop_se_WINDOWS-1252.txt": "WINDOWS-1252",
+        r"testdata/th/UTF-8/wikitop_th_UTF-8.txt": "UTF-8",
+        r"testdata/th/TIS-620/utffool_th_TIS-620.txt": "TIS-620",
+        r"testdata/th/TIS-620/wikitop_th_TIS-620.txt": "TIS-620",
+        r"testdata/tr/UTF-8/wikitop_tr_UTF-8.txt": "UTF-8",
+        r"testdata/tr/ISO-8859-9/wikitop_tr_ISO-8859-9.txt": "ISO-8859-9",
+        r"testdata/zh/UTF-8/wikitop_zh_UTF-8.txt": "UTF-8",
+        r"testdata/zh/GB18030/wikitop_zh_GB18030.txt": "GB18030",
+    }
+
+    def test_detector(self):
+        """Feeding each sample file line by line yields its encoding."""
+        for path, encoding in self.encodings_map.items():
+            detector = cchardet.Detector()
+            with open(path, 'rb') as f:
+                for line in f:
+                    detector.feed(line)
+                    if detector.done:
+                        break
+            detector.close()
+            detected_encoding = detector.result
+            eq_(encoding.lower(), detected_encoding['encoding'].lower())
+
+    def test_detector_noresult(self):
+        """A detector that was never closed reports no result."""
+        detector = cchardet.Detector()
+        # feed() takes bytes; a str literal is rejected on Python 3.
+        detector.feed(b'0')
+        eq_(detector.done, False)
+        eq_(detector.result, {"encoding": None, "confidence": None})