From e6177725fe0a9defb4e0cae8623ad4d6773d6cf0 Mon Sep 17 00:00:00 2001 From: PyYoshi Date: Wed, 29 Mar 2017 12:46:32 +0900 Subject: [PATCH] improve detection accuracy --- src/cchardet/_cchardet.pyx | 12 ++++----- src/ext/uchardet | 2 +- src/tests/test.py | 50 +++++++++++++++----------------------- 3 files changed, 27 insertions(+), 37 deletions(-) diff --git a/src/cchardet/_cchardet.pyx b/src/cchardet/_cchardet.pyx index b1e3123..690d5f3 100644 --- a/src/cchardet/_cchardet.pyx +++ b/src/cchardet/_cchardet.pyx @@ -17,7 +17,7 @@ def detect_with_confidence(const_char_ptr msg): cdef uchardet_t ud = uchardet_new() cdef int result = uchardet_handle_data(ud, msg, length) - if result != 0: + if result == -1: uchardet_delete(ud) raise Exception("Handle data error") @@ -65,15 +65,15 @@ cdef class UniversalDetector: if length > 0: result = uchardet_handle_data(self._ud, msg, length) - if result != 0: + if result == -1: self._closed = 1 uchardet_delete(self._ud) raise Exception("Handle data error") - else: + elif result == 0: self._done = 1 - uchardet_data_end(self._ud) - self._detected_charset = uchardet_get_charset(self._ud) - self._detected_confidence = uchardet_get_confidence(self._ud) + + self._detected_charset = uchardet_get_charset(self._ud) + self._detected_confidence = uchardet_get_confidence(self._ud) def close(self): if not self._closed: diff --git a/src/ext/uchardet b/src/ext/uchardet index 4361f97..2cc0b9a 160000 --- a/src/ext/uchardet +++ b/src/ext/uchardet @@ -1 +1 @@ -Subproject commit 4361f97af2693e88bd0bc1de76e545b73112d0ce +Subproject commit 2cc0b9aa38605960d98459e64033836cf8b4507a diff --git a/src/tests/test.py b/src/tests/test.py index f725127..ccb0ceb 100644 --- a/src/tests/test.py +++ b/src/tests/test.py @@ -47,33 +47,23 @@ class TestCChardet(): ) ) - # def test_detector(self): - # testfiles = glob.glob('tests/testdata/*/*.txt') - # for testfile in testfiles: - # if testfile.replace("\\", "/") in SKIP_LIST: - # continue - - # base = os.path.basename(testfile) - # expected_charset = os.path.splitext(base)[0] - - # detector = cchardet.UniversalDetector() - # with open(testfile, 'rb') as f: - # msg = f.read() - # detector.feed(msg) - # # line = f.readline() - # # while line: - # # detector.feed(line) - # # if detector.done: - # # break - # # line = f.readline() - # detector.close() - # detected_encoding = detector.result - # eq_( - # expected_charset.lower(), - # detected_encoding['encoding'].lower(), - # 'Expected %s, but got %s for "%s"' % ( - # expected_charset.lower(), - # detected_encoding['encoding'].lower(), - # testfile - # ) - # ) + def test_detector(self): + detector = cchardet.UniversalDetector() + with open("tests/samples/wikipediaJa_One_Thousand_and_One_Nights_SJIS.txt", 'rb') as f: + print("===============================") + line = f.readline() + while line: + detector.feed(line) + if detector.done: + break + line = f.readline() + detector.close() + detected_encoding = detector.result + eq_( + "shift_jis", + detected_encoding['encoding'].lower(), + 'Expected %s, but got %s' % ( + "shift_jis", + detected_encoding['encoding'].lower() + ) + )