Partial-feed interface

This commit is contained in:
Denis Matiychuk 2016-10-16 23:31:47 +03:00
parent 2ac7e55d1f
commit fe77753362
3 changed files with 145 additions and 0 deletions

View file

@ -17,3 +17,27 @@ def detect(msg):
if isinstance(encoding, bytes): if isinstance(encoding, bytes):
encoding = encoding.decode() encoding = encoding.decode()
return {"encoding": encoding, "confidence": confidence} return {"encoding": encoding, "confidence": confidence}
class Detector(object):
    """Incremental ("feed"-style) front end to ``_cchardet.Detector``.

    Every call is forwarded to the wrapped extension object; only
    ``result`` post-processes what the extension returns.
    """

    def __init__(self):
        # One private extension-level detector per wrapper instance.
        self._detector = _cchardet.Detector()

    def feed(self, data):
        """Forward one chunk of input to the wrapped detector."""
        self._detector.feed(data)

    def close(self):
        """Forward the close request to the wrapped detector."""
        self._detector.close()

    @property
    def done(self):
        """The wrapped detector's ``done`` attribute, passed through."""
        return self._detector.done

    @property
    def result(self):
        """Detection outcome as ``{"encoding": ..., "confidence": ...}``.

        The wrapped detector yields an ``(encoding, confidence)`` pair; an
        encoding that arrives as ``bytes`` is decoded to text first.
        """
        name, score = self._detector.result
        if isinstance(name, bytes):
            name = name.decode()
        return {"encoding": name, "confidence": score}

View file

@ -50,3 +50,51 @@ def detect_with_confidence(char *msg):
return detected_charset, confidence return detected_charset, confidence
else: else:
return None, None return None, None
cdef class Detector:
    """Incremental charset detector built on the ``csd_*`` C calls.

    A handle is obtained with ``csd_open()`` on construction, input is
    pushed through ``csd_consider()`` by :meth:`feed`, and the verdict is
    collected with ``csd_close2()`` by :meth:`close`.
    """
    cdef csd_t csd
    cdef int _done
    cdef int _closed
    cdef float _confidence
    cdef const_char_ptr _detected_charset

    def __cinit__(self):
        # __cinit__ runs exactly once per object, so the handle is opened
        # exactly once and __dealloc__ can rely on it having been opened.
        self.csd = csd_open()
        self._done = 0
        self._closed = 0
        self._confidence = 0.0
        # Bytes literal: valid for a char* under any Cython language level.
        self._detected_charset = b''

    def __dealloc__(self):
        cdef float ignored = 0.0
        # Finalise a handle the caller never closed so it is not leaked.
        # NOTE(review): csd_close2 is the only finalising call visible here;
        # presumably it also releases the handle -- confirm in the C header.
        if not self._closed:
            csd_close2(self.csd, &ignored)
            self._closed = 1

    def feed(self, msg):
        """Feed one chunk of raw bytes to the detector.

        The call is a no-op once detection has finished or the detector
        has been closed. ``msg`` must be coercible to ``char *`` (e.g.
        ``bytes``); anything else raises ``TypeError``.

        Raises:
            Exception: if ``csd_consider`` reports an error (returns -1).
        """
        cdef char *data
        cdef int length
        cdef int result
        # Coerce first so an unsupported argument type is always rejected,
        # even when the chunk itself would be ignored.
        data = msg
        if self._done or self._closed:
            return
        # Use the Python-level length rather than strlen(): the payload is
        # binary and may contain NUL bytes (e.g. UTF-16), which strlen()
        # would treat as the end of the chunk.
        length = len(msg)
        result = csd_consider(self.csd, data, length)
        if result == -1:  # Error, signal with a negative number
            raise Exception("Error, signal with a negative number")
        elif result == 0:  # Detected early
            self._done = 1
            self.close()
        # result == 1: need more data, nothing to do yet.

    def close(self):
        """Finish detection and store the charset and confidence.

        Only the first call has an effect; later calls do nothing.
        """
        if not self._closed:
            self._detected_charset = csd_close2(self.csd, &self._confidence)
            self._closed = 1

    @property
    def done(self):
        """``True`` once ``csd_consider`` has reported an early detection."""
        return self._done != 0

    @property
    def result(self):
        """``(charset, confidence)``, or ``(None, None)`` if nothing is known.

        A NULL or empty charset pointer both count as "no result".
        """
        if self._detected_charset != NULL and self._detected_charset[0] != 0:
            return self._detected_charset, self._confidence
        return None, None

View file

@ -492,3 +492,76 @@ class TestCchardetSpeed():
cchardet.detect(msg) cchardet.detect(msg)
result_cchardet += (time.time() - start_cchardet) result_cchardet += (time.time() - start_cchardet)
print('cchardet:',1/(result_cchardet/do_times), 'call(s)/s') print('cchardet:',1/(result_cchardet/do_times), 'call(s)/s')
class TestDetector():
    """Tests for the incremental ``cchardet.Detector`` feed/close interface."""

    # Sample file path -> encoding the detector is expected to report.
    encodings_map = {
        r"testdata/bg/ISO-8859-5/wikitop_bg_ISO-8859-5.txt": "ISO-8859-5",
        r"testdata/bg/UTF-8/wikitop_bg_UTF-8.txt": "UTF-8",
        r"testdata/bg/WINDOWS-1251/wikitop_bg_WINDOWS-1251.txt": "WINDOWS-1251",
        r"testdata/cz/ISO-8859-2/wikitop_cz_ISO-8859-2.txt": "ISO-8859-2",
        r"testdata/cz/UTF-8/wikitop_cz_UTF-8.txt": "UTF-8",
        r"testdata/de/UTF-8/wikitop_de_UTF-8.txt": "UTF-8",
        r"testdata/de/WINDOWS-1252/wikitop_de_WINDOWS-1252.txt": "WINDOWS-1252",
        r"testdata/dk/UTF-8/wikitop_dk_UTF-8.txt": "UTF-8",
        r"testdata/dk/WINDOWS-1252/wikitop_dk_WINDOWS-1252.txt": "WINDOWS-1252",
        r"testdata/el/ISO-8859-7/wikitop_el_ISO-8859-7.txt": "ISO-8859-7",
        r"testdata/el/UTF-8/wikitop_el_UTF-8.txt": "UTF-8",
        r"testdata/en/UTF-8/wikitop_en_UTF-8.txt": "UTF-8",
        r"testdata/en/WINDOWS-1252/wikitop_en_WINDOWS-1252.txt": "WINDOWS-1252",
        r"testdata/es/UTF-8/wikitop_es_UTF-8.txt": "UTF-8",
        r"testdata/es/WINDOWS-1252/wikitop_es_WINDOWS-1252.txt": "WINDOWS-1252",
        r"testdata/fi/UTF-8/wikitop_fi_UTF-8.txt": "UTF-8",
        r"testdata/fi/WINDOWS-1252/wikitop_fi_WINDOWS-1252.txt": "WINDOWS-1252",
        r"testdata/fr/UTF-8/wikitop_fr_UTF-8.txt": "UTF-8",
        r"testdata/fr/WINDOWS-1252/wikitop_fr_WINDOWS-1252.txt": "WINDOWS-1252",
        r"testdata/he/UTF-8/wikitop_he_UTF-8.txt": "UTF-8",
        r"testdata/he/WINDOWS-1255/wikitop_he_WINDOWS-1255.txt": "WINDOWS-1255",
        r"testdata/hu/UTF-8/wikitop_hu_UTF-8.txt": "UTF-8",
        r"testdata/hu/ISO-8859-2/wikitop_hu_ISO-8859-2.txt": "ISO-8859-2",
        r"testdata/it/UTF-8/wikitop_it_UTF-8.txt": "UTF-8",
        r"testdata/it/WINDOWS-1252/wikitop_it_WINDOWS-1252.txt": "WINDOWS-1252",
        r"testdata/nl/UTF-8/wikitop_nl_UTF-8.txt": "UTF-8",
        r"testdata/nl/WINDOWS-1252/wikitop_nl_WINDOWS-1252.txt": "WINDOWS-1252",
        r"testdata/no/UTF-8/wikitop_no_UTF-8.txt": "UTF-8",
        r"testdata/no/WINDOWS-1252/wikitop_no_WINDOWS-1252.txt": "WINDOWS-1252",
        r"testdata/pl/UTF-8/wikitop_pl_UTF-8.txt": "UTF-8",
        r"testdata/pl/ISO-8859-2/wikitop_pl_ISO-8859-2.txt": "ISO-8859-2",
        r"testdata/pt/UTF-8/wikitop_pt_UTF-8.txt": "UTF-8",
        r"testdata/pt/WINDOWS-1252/wikitop_pt_WINDOWS-1252.txt": "WINDOWS-1252",
        r"testdata/ru/UTF-8/wikitop_ru_UTF-8.txt": "UTF-8",
        r"testdata/ru/WINDOWS-1251/wikitop_ru_WINDOWS-1251.txt": "WINDOWS-1251",
        r"testdata/ru/IBM855/wikitop_ru_IBM855.txt": "IBM855",
        r"testdata/ru/KOI8-R/wikitop_ru_KOI8-R.txt": "KOI8-R",
        r"testdata/ru/X-MAC-CYRILLIC/wikitop_ru_MACCYRILLIC.txt": "MAC-CYRILLIC",
        r"testdata/se/UTF-8/wikitop_se_UTF-8.txt": "UTF-8",
        r"testdata/se/WINDOWS-1252/wikitop_se_WINDOWS-1252.txt": "WINDOWS-1252",
        r"testdata/th/UTF-8/wikitop_th_UTF-8.txt": "UTF-8",
        r"testdata/th/TIS-620/utffool_th_TIS-620.txt": "TIS-620",
        r"testdata/th/TIS-620/wikitop_th_TIS-620.txt": "TIS-620",
        r"testdata/tr/UTF-8/wikitop_tr_UTF-8.txt": "UTF-8",
        r"testdata/tr/ISO-8859-9/wikitop_tr_ISO-8859-9.txt": "ISO-8859-9",
        r"testdata/zh/UTF-8/wikitop_zh_UTF-8.txt": "UTF-8",
        r"testdata/zh/GB18030/wikitop_zh_GB18030.txt": "GB18030",
    }

    def test_detector(self):
        """Feed each sample file line by line and compare the verdict."""
        for path, encoding in self.encodings_map.items():
            detector = cchardet.Detector()
            with open(path, 'rb') as f:
                for line in f:
                    detector.feed(line)
                    # Stop reading as soon as the detector has decided.
                    if detector.done:
                        break
            detector.close()
            detected_encoding = detector.result
            # Fail with the offending file name instead of an opaque
            # AttributeError from calling .lower() on None.
            assert detected_encoding['encoding'] is not None, \
                "no encoding detected for %s" % path
            eq_(encoding.lower(), detected_encoding['encoding'].lower())

    def test_detector_noresult(self):
        """An unclosed detector fed one byte reports no result yet."""
        detector = cchardet.Detector()
        # The detector consumes raw bytes; a text literal is not bytes on
        # Python 3, so spell the input as a bytes literal.
        detector.feed(b'0')
        eq_(detector.done, False)
        eq_(detector.result, {"encoding": None, "confidence": None})