improve detection accuracy
This commit is contained in:
parent
be2b0affff
commit
e6177725fe
3 changed files with 27 additions and 37 deletions
|
@ -17,7 +17,7 @@ def detect_with_confidence(const_char_ptr msg):
|
|||
cdef uchardet_t ud = uchardet_new()
|
||||
|
||||
cdef int result = uchardet_handle_data(ud, msg, length)
|
||||
if result != 0:
|
||||
if result == -1:
|
||||
uchardet_delete(ud)
|
||||
raise Exception("Handle data error")
|
||||
|
||||
|
@ -65,13 +65,13 @@ cdef class UniversalDetector:
|
|||
if length > 0:
|
||||
result = uchardet_handle_data(self._ud, msg, length)
|
||||
|
||||
if result != 0:
|
||||
if result == -1:
|
||||
self._closed = 1
|
||||
uchardet_delete(self._ud)
|
||||
raise Exception("Handle data error")
|
||||
else:
|
||||
elif result == 0:
|
||||
self._done = 1
|
||||
uchardet_data_end(self._ud)
|
||||
|
||||
self._detected_charset = uchardet_get_charset(self._ud)
|
||||
self._detected_confidence = uchardet_get_confidence(self._ud)
|
||||
|
||||
|
|
|
@ -1 +1 @@
|
|||
Subproject commit 4361f97af2693e88bd0bc1de76e545b73112d0ce
|
||||
Subproject commit 2cc0b9aa38605960d98459e64033836cf8b4507a
|
|
@ -47,33 +47,23 @@ class TestCChardet():
|
|||
)
|
||||
)
|
||||
|
||||
# def test_detector(self):
|
||||
# testfiles = glob.glob('tests/testdata/*/*.txt')
|
||||
# for testfile in testfiles:
|
||||
# if testfile.replace("\\", "/") in SKIP_LIST:
|
||||
# continue
|
||||
|
||||
# base = os.path.basename(testfile)
|
||||
# expected_charset = os.path.splitext(base)[0]
|
||||
|
||||
# detector = cchardet.UniversalDetector()
|
||||
# with open(testfile, 'rb') as f:
|
||||
# msg = f.read()
|
||||
# detector.feed(msg)
|
||||
# # line = f.readline()
|
||||
# # while line:
|
||||
# # detector.feed(line)
|
||||
# # if detector.done:
|
||||
# # break
|
||||
# # line = f.readline()
|
||||
# detector.close()
|
||||
# detected_encoding = detector.result
|
||||
# eq_(
|
||||
# expected_charset.lower(),
|
||||
# detected_encoding['encoding'].lower(),
|
||||
# 'Expected %s, but got %s for "%s"' % (
|
||||
# expected_charset.lower(),
|
||||
# detected_encoding['encoding'].lower(),
|
||||
# testfile
|
||||
# )
|
||||
# )
|
||||
def test_detector(self):
    """Check that UniversalDetector recognises a Shift_JIS sample fed line by line.

    The sample file is streamed into the detector one line at a time, and
    feeding stops as soon as the detector reports it is done. The detected
    encoding name is then compared case-insensitively with "shift_jis".
    """
    expected_charset = "shift_jis"
    sample_path = "tests/samples/wikipediaJa_One_Thousand_and_One_Nights_SJIS.txt"

    detector = cchardet.UniversalDetector()
    with open(sample_path, 'rb') as f:
        # Iterating the binary file yields the same lines readline() would.
        for line in f:
            detector.feed(line)
            if detector.done:
                break
    detector.close()

    encoding = detector.result['encoding']
    # NOTE(review): presumably the detector can report no encoding (None);
    # guard so that case fails the assertion below with a readable message
    # instead of raising AttributeError on .lower().
    detected_charset = encoding.lower() if encoding else encoding
    eq_(
        expected_charset,
        detected_charset,
        'Expected %s, but got %s' % (
            expected_charset,
            detected_charset
        )
    )
|
||||
|
|
Loading…
Reference in a new issue