improve detection accuracy
This commit is contained in:
parent
be2b0affff
commit
e6177725fe
3 changed files with 27 additions and 37 deletions
|
@ -17,7 +17,7 @@ def detect_with_confidence(const_char_ptr msg):
|
||||||
cdef uchardet_t ud = uchardet_new()
|
cdef uchardet_t ud = uchardet_new()
|
||||||
|
|
||||||
cdef int result = uchardet_handle_data(ud, msg, length)
|
cdef int result = uchardet_handle_data(ud, msg, length)
|
||||||
if result != 0:
|
if result == -1:
|
||||||
uchardet_delete(ud)
|
uchardet_delete(ud)
|
||||||
raise Exception("Handle data error")
|
raise Exception("Handle data error")
|
||||||
|
|
||||||
|
@ -65,13 +65,13 @@ cdef class UniversalDetector:
|
||||||
if length > 0:
|
if length > 0:
|
||||||
result = uchardet_handle_data(self._ud, msg, length)
|
result = uchardet_handle_data(self._ud, msg, length)
|
||||||
|
|
||||||
if result != 0:
|
if result == -1:
|
||||||
self._closed = 1
|
self._closed = 1
|
||||||
uchardet_delete(self._ud)
|
uchardet_delete(self._ud)
|
||||||
raise Exception("Handle data error")
|
raise Exception("Handle data error")
|
||||||
else:
|
elif result == 0:
|
||||||
self._done = 1
|
self._done = 1
|
||||||
uchardet_data_end(self._ud)
|
|
||||||
self._detected_charset = uchardet_get_charset(self._ud)
|
self._detected_charset = uchardet_get_charset(self._ud)
|
||||||
self._detected_confidence = uchardet_get_confidence(self._ud)
|
self._detected_confidence = uchardet_get_confidence(self._ud)
|
||||||
|
|
||||||
|
|
|
@ -1 +1 @@
|
||||||
Subproject commit 4361f97af2693e88bd0bc1de76e545b73112d0ce
|
Subproject commit 2cc0b9aa38605960d98459e64033836cf8b4507a
|
|
@ -47,33 +47,23 @@ class TestCChardet():
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
# def test_detector(self):
|
def test_detector(self):
|
||||||
# testfiles = glob.glob('tests/testdata/*/*.txt')
|
detector = cchardet.UniversalDetector()
|
||||||
# for testfile in testfiles:
|
with open("tests/samples/wikipediaJa_One_Thousand_and_One_Nights_SJIS.txt", 'rb') as f:
|
||||||
# if testfile.replace("\\", "/") in SKIP_LIST:
|
print("===============================")
|
||||||
# continue
|
line = f.readline()
|
||||||
|
while line:
|
||||||
# base = os.path.basename(testfile)
|
detector.feed(line)
|
||||||
# expected_charset = os.path.splitext(base)[0]
|
if detector.done:
|
||||||
|
break
|
||||||
# detector = cchardet.UniversalDetector()
|
line = f.readline()
|
||||||
# with open(testfile, 'rb') as f:
|
detector.close()
|
||||||
# msg = f.read()
|
detected_encoding = detector.result
|
||||||
# detector.feed(msg)
|
eq_(
|
||||||
# # line = f.readline()
|
"shift_jis",
|
||||||
# # while line:
|
detected_encoding['encoding'].lower(),
|
||||||
# # detector.feed(line)
|
'Expected %s, but got %s' % (
|
||||||
# # if detector.done:
|
"shift_jis",
|
||||||
# # break
|
detected_encoding['encoding'].lower()
|
||||||
# # line = f.readline()
|
)
|
||||||
# detector.close()
|
)
|
||||||
# detected_encoding = detector.result
|
|
||||||
# eq_(
|
|
||||||
# expected_charset.lower(),
|
|
||||||
# detected_encoding['encoding'].lower(),
|
|
||||||
# 'Expected %s, but got %s for "%s"' % (
|
|
||||||
# expected_charset.lower(),
|
|
||||||
# detected_encoding['encoding'].lower(),
|
|
||||||
# testfile
|
|
||||||
# )
|
|
||||||
# )
|
|
||||||
|
|
Loading…
Reference in a new issue