improve detection accuracy

This commit is contained in:
PyYoshi 2017-03-29 12:46:32 +09:00
parent be2b0affff
commit e6177725fe
3 changed files with 27 additions and 37 deletions

View file

@ -17,7 +17,7 @@ def detect_with_confidence(const_char_ptr msg):
cdef uchardet_t ud = uchardet_new()
cdef int result = uchardet_handle_data(ud, msg, length)
if result != 0:
if result == -1:
uchardet_delete(ud)
raise Exception("Handle data error")
@ -65,13 +65,13 @@ cdef class UniversalDetector:
if length > 0:
result = uchardet_handle_data(self._ud, msg, length)
if result != 0:
if result == -1:
self._closed = 1
uchardet_delete(self._ud)
raise Exception("Handle data error")
else:
elif result == 0:
self._done = 1
uchardet_data_end(self._ud)
self._detected_charset = uchardet_get_charset(self._ud)
self._detected_confidence = uchardet_get_confidence(self._ud)

@ -1 +1 @@
Subproject commit 4361f97af2693e88bd0bc1de76e545b73112d0ce
Subproject commit 2cc0b9aa38605960d98459e64033836cf8b4507a

View file

@ -47,33 +47,23 @@ class TestCChardet():
)
)
# def test_detector(self):
# testfiles = glob.glob('tests/testdata/*/*.txt')
# for testfile in testfiles:
# if testfile.replace("\\", "/") in SKIP_LIST:
# continue
# base = os.path.basename(testfile)
# expected_charset = os.path.splitext(base)[0]
# detector = cchardet.UniversalDetector()
# with open(testfile, 'rb') as f:
# msg = f.read()
# detector.feed(msg)
# # line = f.readline()
# # while line:
# # detector.feed(line)
# # if detector.done:
# # break
# # line = f.readline()
# detector.close()
# detected_encoding = detector.result
# eq_(
# expected_charset.lower(),
# detected_encoding['encoding'].lower(),
# 'Expected %s, but got %s for "%s"' % (
# expected_charset.lower(),
# detected_encoding['encoding'].lower(),
# testfile
# )
# )
def test_detector(self):
    """Check incremental detection of a Shift_JIS sample file.

    Feeds the sample file to a ``cchardet.UniversalDetector`` one line at
    a time, stopping early once the detector reports ``done``, then closes
    the detector and asserts that the lower-cased detected encoding is
    ``"shift_jis"``.
    """
    expected = "shift_jis"
    detector = cchardet.UniversalDetector()
    with open("tests/samples/wikipediaJa_One_Thousand_and_One_Nights_SJIS.txt", 'rb') as f:
        # Iterating the binary file yields the same lines as repeated
        # readline() calls and ends at EOF.
        for line in f:
            detector.feed(line)
            if detector.done:
                # The detector has already reached a verdict; no need to
                # read the rest of the file.
                break
    # close() is called before reading the result, whether the loop
    # ended by break or by EOF.
    detector.close()
    detected = detector.result['encoding'].lower()
    eq_(
        expected,
        detected,
        'Expected %s, but got %s' % (expected, detected)
    )