commit
fba4c216ee
3 changed files with 145 additions and 0 deletions
|
@ -17,3 +17,27 @@ def detect(msg):
|
||||||
if isinstance(encoding, bytes):
|
if isinstance(encoding, bytes):
|
||||||
encoding = encoding.decode()
|
encoding = encoding.decode()
|
||||||
return {"encoding": encoding, "confidence": confidence}
|
return {"encoding": encoding, "confidence": confidence}
|
||||||
|
|
||||||
|
|
||||||
|
class Detector(object):
    """Wrap csd_consider with 'feed' feature.

    Thin Python-level wrapper around ``_cchardet.Detector``: data is handed
    over in chunks with :meth:`feed`, the detector is finished with
    :meth:`close`, and the outcome is read from :attr:`result`.

    The object can also be used as a context manager, in which case
    :meth:`close` is called automatically when the ``with`` block exits
    (including when it exits because of an exception)::

        with Detector() as detector:
            detector.feed(chunk)
        detector.result
    """

    def __init__(self):
        self._detector = _cchardet.Detector()

    def __enter__(self):
        """Return the detector itself so it can be bound with ``as``."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the underlying detector; never suppresses exceptions."""
        self.close()
        return False

    def feed(self, data):
        """Pass one chunk of ``data`` on to the underlying detector."""
        self._detector.feed(data)

    def close(self):
        """Tell the underlying detector that no more data will be fed."""
        self._detector.close()

    @property
    def done(self):
        """The ``done`` flag reported by the underlying detector."""
        return self._detector.done

    @property
    def result(self):
        """Detection outcome as ``{"encoding": ..., "confidence": ...}``.

        The underlying detector reports an ``(encoding, confidence)`` pair.
        An encoding name delivered as ``bytes`` is decoded to ``str``; any
        other value (for example ``None``) is passed through unchanged.
        """
        encoding, confidence = self._detector.result
        if isinstance(encoding, bytes):
            encoding = encoding.decode()
        return {"encoding": encoding, "confidence": confidence}
|
||||||
|
|
|
@ -50,3 +50,51 @@ def detect_with_confidence(char *msg):
|
||||||
return detected_charset, confidence
|
return detected_charset, confidence
|
||||||
else:
|
else:
|
||||||
return None, None
|
return None, None
|
||||||
|
|
||||||
|
cdef class Detector:
    """Incremental charset detector built on a native ``csd_t`` handle.

    Data is passed in chunks with :meth:`feed`; :meth:`close` finishes the
    detection and stores the charset name and confidence that
    ``csd_close2`` reports; :attr:`result` exposes them to Python.
    """

    cdef csd_t csd
    cdef int _done        # 1 once feed() has seen csd_consider() return 0
    cdef int _closed      # 1 while there is no open handle to release
    cdef float _confidence
    cdef const_char_ptr _detected_charset

    def __cinit__(self):
        # No handle exists until __init__ has run. Starting out "closed"
        # keeps __dealloc__ and feed() away from an unopened handle.
        self._closed = 1
        self._detected_charset = NULL

    def __init__(self):
        self.csd = csd_open()
        self._done = 0
        self._closed = 0
        self._confidence = 0.0
        # NULL means "nothing detected"; result maps it to (None, None).
        self._detected_charset = NULL

    def __dealloc__(self):
        # Release the native handle if the caller never called close().
        cdef float ignored = 0.0
        if not self._closed:
            csd_close2(self.csd, &ignored)
            self._closed = 1

    def feed(self, msg):
        """Hand one chunk of bytes to the detector.

        Does nothing once detection is done or the detector is closed.

        Raises:
            Exception: if ``csd_consider`` returns a negative number.
        """
        # Coerce up front so a wrong argument type is rejected even when
        # the detector is already finished, as with a typed parameter.
        cdef char *data = msg
        cdef int length
        cdef int result

        if self._done or self._closed:
            return

        # Use the object's own length rather than strlen(): input may
        # contain NUL bytes (e.g. UTF-16/UTF-32 text), and strlen() would
        # silently cut the chunk off at the first one.
        length = len(msg)
        result = csd_consider(self.csd, data, length)

        if result < 0:  # Error, signal with a negative number
            raise Exception("Error, signal with a negative number")

        elif result == 1:  # Need more data
            pass

        elif result == 0:  # Detected early
            # NOTE(review): the meaning of 0 and 1 is taken from the
            # original comments; confirm against the csd_consider header.
            self._done = 1
            self.close()

    def close(self):
        """Finish detection and release the native handle (only once)."""
        if not self._closed:
            self._detected_charset = csd_close2(self.csd, &self._confidence)
            self._closed = 1

    @property
    def done(self):
        """True once feed() has marked the detection as finished."""
        return bool(self._done)

    @property
    def result(self):
        """``(charset, confidence)`` or ``(None, None)`` if nothing is known.

        The charset is the C string stored by close(), converted to a
        Python bytes object.
        """
        # Guard against NULL before looking at the string: taking the
        # length of a NULL pointer would crash the interpreter.
        # NOTE(review): presumably csd_close2 may return NULL when nothing
        # was detected -- confirm against its header.
        if self._detected_charset != NULL and self._detected_charset[0] != 0:
            return self._detected_charset, self._confidence
        else:
            return None, None
|
||||||
|
|
|
@ -492,3 +492,76 @@ class TestCchardetSpeed():
|
||||||
cchardet.detect(msg)
|
cchardet.detect(msg)
|
||||||
result_cchardet += (time.time() - start_cchardet)
|
result_cchardet += (time.time() - start_cchardet)
|
||||||
print('cchardet:',1/(result_cchardet/do_times), 'call(s)/s')
|
print('cchardet:',1/(result_cchardet/do_times), 'call(s)/s')
|
||||||
|
|
||||||
|
|
||||||
|
class TestDetector():
    """Tests for the incremental ``cchardet.Detector`` API."""

    # Sample file path (relative to the working directory) -> the encoding
    # the detector is expected to report for it.
    encodings_map = {
        r"testdata/bg/ISO-8859-5/wikitop_bg_ISO-8859-5.txt": "ISO-8859-5",
        r"testdata/bg/UTF-8/wikitop_bg_UTF-8.txt": "UTF-8",
        r"testdata/bg/WINDOWS-1251/wikitop_bg_WINDOWS-1251.txt": "WINDOWS-1251",
        r"testdata/cz/ISO-8859-2/wikitop_cz_ISO-8859-2.txt": "ISO-8859-2",
        r"testdata/cz/UTF-8/wikitop_cz_UTF-8.txt": "UTF-8",
        r"testdata/de/UTF-8/wikitop_de_UTF-8.txt": "UTF-8",
        r"testdata/de/WINDOWS-1252/wikitop_de_WINDOWS-1252.txt": "WINDOWS-1252",
        r"testdata/dk/UTF-8/wikitop_dk_UTF-8.txt": "UTF-8",
        r"testdata/dk/WINDOWS-1252/wikitop_dk_WINDOWS-1252.txt": "WINDOWS-1252",
        r"testdata/el/ISO-8859-7/wikitop_el_ISO-8859-7.txt": "ISO-8859-7",
        r"testdata/el/UTF-8/wikitop_el_UTF-8.txt": "UTF-8",
        r"testdata/en/UTF-8/wikitop_en_UTF-8.txt": "UTF-8",
        r"testdata/en/WINDOWS-1252/wikitop_en_WINDOWS-1252.txt": "WINDOWS-1252",
        r"testdata/es/UTF-8/wikitop_es_UTF-8.txt": "UTF-8",
        r"testdata/es/WINDOWS-1252/wikitop_es_WINDOWS-1252.txt": "WINDOWS-1252",
        r"testdata/fi/UTF-8/wikitop_fi_UTF-8.txt": "UTF-8",
        r"testdata/fi/WINDOWS-1252/wikitop_fi_WINDOWS-1252.txt": "WINDOWS-1252",
        r"testdata/fr/UTF-8/wikitop_fr_UTF-8.txt": "UTF-8",
        r"testdata/fr/WINDOWS-1252/wikitop_fr_WINDOWS-1252.txt": "WINDOWS-1252",
        r"testdata/he/UTF-8/wikitop_he_UTF-8.txt": "UTF-8",
        r"testdata/he/WINDOWS-1255/wikitop_he_WINDOWS-1255.txt": "WINDOWS-1255",
        r"testdata/hu/UTF-8/wikitop_hu_UTF-8.txt": "UTF-8",
        r"testdata/hu/ISO-8859-2/wikitop_hu_ISO-8859-2.txt": "ISO-8859-2",
        r"testdata/it/UTF-8/wikitop_it_UTF-8.txt": "UTF-8",
        r"testdata/it/WINDOWS-1252/wikitop_it_WINDOWS-1252.txt": "WINDOWS-1252",
        r"testdata/nl/UTF-8/wikitop_nl_UTF-8.txt": "UTF-8",
        r"testdata/nl/WINDOWS-1252/wikitop_nl_WINDOWS-1252.txt": "WINDOWS-1252",
        r"testdata/no/UTF-8/wikitop_no_UTF-8.txt": "UTF-8",
        r"testdata/no/WINDOWS-1252/wikitop_no_WINDOWS-1252.txt": "WINDOWS-1252",
        r"testdata/pl/UTF-8/wikitop_pl_UTF-8.txt": "UTF-8",
        r"testdata/pl/ISO-8859-2/wikitop_pl_ISO-8859-2.txt": "ISO-8859-2",
        r"testdata/pt/UTF-8/wikitop_pt_UTF-8.txt": "UTF-8",
        r"testdata/pt/WINDOWS-1252/wikitop_pt_WINDOWS-1252.txt": "WINDOWS-1252",
        r"testdata/ru/UTF-8/wikitop_ru_UTF-8.txt": "UTF-8",
        r"testdata/ru/WINDOWS-1251/wikitop_ru_WINDOWS-1251.txt": "WINDOWS-1251",
        r"testdata/ru/IBM855/wikitop_ru_IBM855.txt": "IBM855",
        r"testdata/ru/KOI8-R/wikitop_ru_KOI8-R.txt": "KOI8-R",
        r"testdata/ru/X-MAC-CYRILLIC/wikitop_ru_MACCYRILLIC.txt": "MAC-CYRILLIC",
        r"testdata/se/UTF-8/wikitop_se_UTF-8.txt": "UTF-8",
        r"testdata/se/WINDOWS-1252/wikitop_se_WINDOWS-1252.txt": "WINDOWS-1252",
        r"testdata/th/UTF-8/wikitop_th_UTF-8.txt": "UTF-8",
        r"testdata/th/TIS-620/utffool_th_TIS-620.txt": "TIS-620",
        r"testdata/th/TIS-620/wikitop_th_TIS-620.txt": "TIS-620",
        r"testdata/tr/UTF-8/wikitop_tr_UTF-8.txt": "UTF-8",
        r"testdata/tr/ISO-8859-9/wikitop_tr_ISO-8859-9.txt": "ISO-8859-9",
        r"testdata/zh/UTF-8/wikitop_zh_UTF-8.txt": "UTF-8",
        r"testdata/zh/GB18030/wikitop_zh_GB18030.txt": "GB18030",
    }

    def test_detector(self):
        """Feed every sample file line by line and check the detected encoding."""
        for path, encoding in self.encodings_map.items():
            detector = cchardet.Detector()
            with open(path, 'rb') as f:
                for line in f:
                    detector.feed(line)
                    # Stop reading as soon as the detector reports it is done.
                    if detector.done:
                        break
            detector.close()
            detected_encoding = detector.result
            # "or ''" keeps a missing result from raising AttributeError, so
            # the failure is reported by eq_ together with the file path.
            detected_name = detected_encoding['encoding'] or ''
            eq_(
                encoding.lower(),
                detected_name.lower(),
                "%s: expected %r, detected %r" % (
                    path, encoding, detected_encoding['encoding'],
                ),
            )

    def test_detector_noresult(self):
        """A detector that is neither done nor closed reports no result."""
        detector = cchardet.Detector()
        # Bytes literal: feed() takes raw bytes, and a text string is not
        # accepted for that on Python 3.
        detector.feed(b'0')
        eq_(detector.done, False)
        eq_(detector.result, {"encoding": None, "confidence": None})
        # Release the detector now that the assertions are done.
        detector.close()
|
||||||
|
|
Loading…
Reference in a new issue