improve detection accuracy

This commit is contained in:
PyYoshi 2017-03-29 12:46:32 +09:00
parent be2b0affff
commit e6177725fe
3 changed files with 27 additions and 37 deletions

View file

@@ -17,7 +17,7 @@ def detect_with_confidence(const_char_ptr msg):
cdef uchardet_t ud = uchardet_new() cdef uchardet_t ud = uchardet_new()
cdef int result = uchardet_handle_data(ud, msg, length) cdef int result = uchardet_handle_data(ud, msg, length)
if result != 0: if result == -1:
uchardet_delete(ud) uchardet_delete(ud)
raise Exception("Handle data error") raise Exception("Handle data error")
@@ -65,15 +65,15 @@ cdef class UniversalDetector:
if length > 0: if length > 0:
result = uchardet_handle_data(self._ud, msg, length) result = uchardet_handle_data(self._ud, msg, length)
if result != 0: if result == -1:
self._closed = 1 self._closed = 1
uchardet_delete(self._ud) uchardet_delete(self._ud)
raise Exception("Handle data error") raise Exception("Handle data error")
else: elif result == 0:
self._done = 1 self._done = 1
uchardet_data_end(self._ud)
self._detected_charset = uchardet_get_charset(self._ud) self._detected_charset = uchardet_get_charset(self._ud)
self._detected_confidence = uchardet_get_confidence(self._ud) self._detected_confidence = uchardet_get_confidence(self._ud)
def close(self): def close(self):
if not self._closed: if not self._closed:

@@ -1 +1 @@
Subproject commit 4361f97af2693e88bd0bc1de76e545b73112d0ce Subproject commit 2cc0b9aa38605960d98459e64033836cf8b4507a

View file

@@ -47,33 +47,23 @@ class TestCChardet():
) )
) )
# def test_detector(self): def test_detector(self):
# testfiles = glob.glob('tests/testdata/*/*.txt') detector = cchardet.UniversalDetector()
# for testfile in testfiles: with open("tests/samples/wikipediaJa_One_Thousand_and_One_Nights_SJIS.txt", 'rb') as f:
# if testfile.replace("\\", "/") in SKIP_LIST: print("===============================")
# continue line = f.readline()
while line:
# base = os.path.basename(testfile) detector.feed(line)
# expected_charset = os.path.splitext(base)[0] if detector.done:
break
# detector = cchardet.UniversalDetector() line = f.readline()
# with open(testfile, 'rb') as f: detector.close()
# msg = f.read() detected_encoding = detector.result
# detector.feed(msg) eq_(
# # line = f.readline() "shift_jis",
# # while line: detected_encoding['encoding'].lower(),
# # detector.feed(line) 'Expected %s, but got %s' % (
# # if detector.done: "shift_jis",
# # break detected_encoding['encoding'].lower()
# # line = f.readline() )
# detector.close() )
# detected_encoding = detector.result
# eq_(
# expected_charset.lower(),
# detected_encoding['encoding'].lower(),
# 'Expected %s, but got %s for "%s"' % (
# expected_charset.lower(),
# detected_encoding['encoding'].lower(),
# testfile
# )
# )