commit
fba4c216ee
3 changed files with 145 additions and 0 deletions
|
@ -17,3 +17,27 @@ def detect(msg):
|
|||
if isinstance(encoding, bytes):
|
||||
encoding = encoding.decode()
|
||||
return {"encoding": encoding, "confidence": confidence}
|
||||
|
||||
|
||||
class Detector(object):
    """Incremental charset detector wrapping ``_cchardet.Detector``.

    Typical use: call :meth:`feed` with successive chunks of raw input,
    optionally stop once :attr:`done` is true, call :meth:`close`, then
    read :attr:`result`.

    The object can also be used as a context manager; leaving the
    ``with`` block calls :meth:`close`, so the underlying detector is
    finalized even when feeding is interrupted by an exception.
    """

    def __init__(self):
        self._detector = _cchardet.Detector()

    def __enter__(self):
        """Return the detector itself for use inside a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the detector; never suppresses an in-flight exception."""
        self.close()
        return False

    def feed(self, data):
        """Forward one chunk of raw input to the underlying detector."""
        self._detector.feed(data)

    def close(self):
        """Tell the underlying detector that no more input will follow."""
        self._detector.close()

    @property
    def done(self):
        """The ``done`` flag reported by the underlying detector."""
        return self._detector.done

    @property
    def result(self):
        """Detection outcome as ``{"encoding": ..., "confidence": ...}``.

        The underlying detector reports an ``(encoding, confidence)``
        pair; an encoding delivered as ``bytes`` is decoded to text so
        callers get the same shape of value either way.
        """
        encoding, confidence = self._detector.result
        if isinstance(encoding, bytes):
            encoding = encoding.decode()
        return {"encoding": encoding, "confidence": confidence}
|
||||
|
|
|
@ -50,3 +50,51 @@ def detect_with_confidence(char *msg):
|
|||
return detected_charset, confidence
|
||||
else:
|
||||
return None, None
|
||||
|
||||
cdef class Detector:
    """Incremental charset detector built on a ``csd_t`` handle.

    Data is supplied chunk by chunk through ``feed``.  ``close`` finalizes
    the handle and captures the detected charset and confidence, which
    ``result`` then exposes.  ``done`` becomes true when ``csd_consider``
    reports an early detection.
    """
    cdef csd_t csd
    cdef int _done
    cdef int _closed
    cdef float _confidence
    cdef const_char_ptr _detected_charset

    def __cinit__(self):
        # __cinit__ runs exactly once per instance, so the handle opened
        # here is always the one __dealloc__ has to release.
        self.csd = csd_open()
        self._done = 0
        self._closed = 0
        self._confidence = 0.0
        self._detected_charset = b''

    def __dealloc__(self):
        # Release the handle if the caller never called close(); the
        # detection outcome is of no interest at this point.
        cdef float ignored = 0.0
        if not self._closed:
            csd_close2(self.csd, &ignored)
            self._closed = 1

    def feed(self, msg):
        """Hand one chunk of raw bytes to the detector.

        Does nothing once detection has finished or the detector has been
        closed.  Raises ``Exception`` when ``csd_consider`` signals an
        error (-1).
        """
        cdef char *data
        cdef Py_ssize_t length
        cdef int result

        if self._done or self._closed:
            return

        # Borrow the object's buffer and use its real length: strlen()
        # would stop at the first NUL byte and silently drop the rest of
        # the chunk (NUL bytes are common in UTF-16/UTF-32 input).
        data = msg
        length = len(msg)
        result = csd_consider(self.csd, data, <int>length)

        if result == -1:  # Error, signal with a negative number
            raise Exception("Error, signal with a negative number")
        elif result == 0:  # Detected early
            self._done = 1
            self.close()
        # result == 1 means more data is needed: nothing to do.

    def close(self):
        """Finalize detection and record charset and confidence (idempotent)."""
        if not self._closed:
            # NOTE(review): the returned pointer is kept and read later by
            # `result`; presumably it refers to storage that outlives the
            # handle -- confirm against the csd_close2 contract.
            self._detected_charset = csd_close2(self.csd, &self._confidence)
            self._closed = 1

    @property
    def done(self):
        """True once ``csd_consider`` has reported an early detection."""
        return bool(self._done)

    @property
    def result(self):
        """``(charset, confidence)``, or ``(None, None)`` if nothing is recorded."""
        # Check for NULL before looking at the first byte so that a NULL
        # pointer is reported as "no result" instead of being dereferenced.
        if self._detected_charset != NULL and self._detected_charset[0] != 0:
            return self._detected_charset, self._confidence
        return None, None
|
||||
|
|
|
@ -492,3 +492,76 @@ class TestCchardetSpeed():
|
|||
cchardet.detect(msg)
|
||||
result_cchardet += (time.time() - start_cchardet)
|
||||
print('cchardet:',1/(result_cchardet/do_times), 'call(s)/s')
|
||||
|
||||
|
||||
class TestDetector():
    """Tests for the incremental ``cchardet.Detector`` interface."""

    # Sample file path -> charset name the detector is expected to report.
    encodings_map = {
        r"testdata/bg/ISO-8859-5/wikitop_bg_ISO-8859-5.txt": "ISO-8859-5",
        r"testdata/bg/UTF-8/wikitop_bg_UTF-8.txt": "UTF-8",
        r"testdata/bg/WINDOWS-1251/wikitop_bg_WINDOWS-1251.txt": "WINDOWS-1251",
        r"testdata/cz/ISO-8859-2/wikitop_cz_ISO-8859-2.txt": "ISO-8859-2",
        r"testdata/cz/UTF-8/wikitop_cz_UTF-8.txt": "UTF-8",
        r"testdata/de/UTF-8/wikitop_de_UTF-8.txt": "UTF-8",
        r"testdata/de/WINDOWS-1252/wikitop_de_WINDOWS-1252.txt": "WINDOWS-1252",
        r"testdata/dk/UTF-8/wikitop_dk_UTF-8.txt": "UTF-8",
        r"testdata/dk/WINDOWS-1252/wikitop_dk_WINDOWS-1252.txt": "WINDOWS-1252",
        r"testdata/el/ISO-8859-7/wikitop_el_ISO-8859-7.txt": "ISO-8859-7",
        r"testdata/el/UTF-8/wikitop_el_UTF-8.txt": "UTF-8",
        r"testdata/en/UTF-8/wikitop_en_UTF-8.txt": "UTF-8",
        r"testdata/en/WINDOWS-1252/wikitop_en_WINDOWS-1252.txt": "WINDOWS-1252",
        r"testdata/es/UTF-8/wikitop_es_UTF-8.txt": "UTF-8",
        r"testdata/es/WINDOWS-1252/wikitop_es_WINDOWS-1252.txt": "WINDOWS-1252",
        r"testdata/fi/UTF-8/wikitop_fi_UTF-8.txt": "UTF-8",
        r"testdata/fi/WINDOWS-1252/wikitop_fi_WINDOWS-1252.txt": "WINDOWS-1252",
        r"testdata/fr/UTF-8/wikitop_fr_UTF-8.txt": "UTF-8",
        r"testdata/fr/WINDOWS-1252/wikitop_fr_WINDOWS-1252.txt": "WINDOWS-1252",
        r"testdata/he/UTF-8/wikitop_he_UTF-8.txt": "UTF-8",
        r"testdata/he/WINDOWS-1255/wikitop_he_WINDOWS-1255.txt": "WINDOWS-1255",
        r"testdata/hu/UTF-8/wikitop_hu_UTF-8.txt": "UTF-8",
        r"testdata/hu/ISO-8859-2/wikitop_hu_ISO-8859-2.txt": "ISO-8859-2",
        r"testdata/it/UTF-8/wikitop_it_UTF-8.txt": "UTF-8",
        r"testdata/it/WINDOWS-1252/wikitop_it_WINDOWS-1252.txt": "WINDOWS-1252",
        r"testdata/nl/UTF-8/wikitop_nl_UTF-8.txt": "UTF-8",
        r"testdata/nl/WINDOWS-1252/wikitop_nl_WINDOWS-1252.txt": "WINDOWS-1252",
        r"testdata/no/UTF-8/wikitop_no_UTF-8.txt": "UTF-8",
        r"testdata/no/WINDOWS-1252/wikitop_no_WINDOWS-1252.txt": "WINDOWS-1252",
        r"testdata/pl/UTF-8/wikitop_pl_UTF-8.txt": "UTF-8",
        r"testdata/pl/ISO-8859-2/wikitop_pl_ISO-8859-2.txt": "ISO-8859-2",
        r"testdata/pt/UTF-8/wikitop_pt_UTF-8.txt": "UTF-8",
        r"testdata/pt/WINDOWS-1252/wikitop_pt_WINDOWS-1252.txt": "WINDOWS-1252",
        r"testdata/ru/UTF-8/wikitop_ru_UTF-8.txt": "UTF-8",
        r"testdata/ru/WINDOWS-1251/wikitop_ru_WINDOWS-1251.txt": "WINDOWS-1251",
        r"testdata/ru/IBM855/wikitop_ru_IBM855.txt": "IBM855",
        r"testdata/ru/KOI8-R/wikitop_ru_KOI8-R.txt": "KOI8-R",
        r"testdata/ru/X-MAC-CYRILLIC/wikitop_ru_MACCYRILLIC.txt": "MAC-CYRILLIC",
        r"testdata/se/UTF-8/wikitop_se_UTF-8.txt": "UTF-8",
        r"testdata/se/WINDOWS-1252/wikitop_se_WINDOWS-1252.txt": "WINDOWS-1252",
        r"testdata/th/UTF-8/wikitop_th_UTF-8.txt": "UTF-8",
        r"testdata/th/TIS-620/utffool_th_TIS-620.txt": "TIS-620",
        r"testdata/th/TIS-620/wikitop_th_TIS-620.txt": "TIS-620",
        r"testdata/tr/UTF-8/wikitop_tr_UTF-8.txt": "UTF-8",
        r"testdata/tr/ISO-8859-9/wikitop_tr_ISO-8859-9.txt": "ISO-8859-9",
        r"testdata/zh/UTF-8/wikitop_zh_UTF-8.txt": "UTF-8",
        r"testdata/zh/GB18030/wikitop_zh_GB18030.txt": "GB18030",
    }

    def test_detector(self):
        """Feed each sample file line by line and compare the detected charset."""
        for path, encoding in self.encodings_map.items():
            detector = cchardet.Detector()
            with open(path, 'rb') as f:
                for line in f:
                    detector.feed(line)
                    # Stop reading as soon as the detector has decided.
                    if detector.done:
                        break
            detector.close()
            detected_encoding = detector.result
            eq_(encoding.lower(), detected_encoding['encoding'].lower())

    def test_detector_noresult(self):
        """A detector that was fed but never closed reports no result."""
        detector = cchardet.Detector()
        # Bytes literal: the detector consumes raw bytes, and a plain ''
        # literal is text rather than bytes on Python 3.
        detector.feed(b'0')
        eq_(detector.done, False)
        eq_(detector.result, {"encoding": None, "confidence": None})
|
||||
|
|
Loading…
Reference in a new issue