This commit is contained in:
PyYoshi 2018-09-27 19:20:19 +09:00
parent d385fdaa9f
commit f42e91a616
5 changed files with 34 additions and 4 deletions

View file

@ -1,6 +1,12 @@
CHANGES CHANGES
======= =======
2.1.x
-----
- update language models (uchardet)
- add iso8859-2 test (currently disabled)
2.1.4 (2018-09-27) 2.1.4 (2018-09-27)
------------------ ------------------

View file

@ -17,6 +17,7 @@ def detect(msg):
encoding = encoding.decode() encoding = encoding.decode()
return {"encoding": encoding, "confidence": confidence} return {"encoding": encoding, "confidence": confidence}
class UniversalDetector(object): class UniversalDetector(object):
def __init__(self): def __init__(self):
self._detector = _cchardet.UniversalDetector() self._detector = _cchardet.UniversalDetector()

View file

@ -16,7 +16,8 @@ def main():
start_chardet = time.time() start_chardet = time.time()
chardet.detect(msg) chardet.detect(msg)
result_chardet += (time.time() - start_chardet) result_chardet += (time.time() - start_chardet)
print('chardet v%s:' % (chardet.__version__), 1 / (result_chardet / do_times), 'call(s)/s') print('chardet v%s:' % (chardet.__version__), 1 /
(result_chardet / do_times), 'call(s)/s')
# Test cchardet # Test cchardet
result_cchardet = 0 result_cchardet = 0
@ -24,7 +25,9 @@ def main():
start_cchardet = time.time() start_cchardet = time.time()
cchardet.detect(msg) cchardet.detect(msg)
result_cchardet += (time.time() - start_cchardet) result_cchardet += (time.time() - start_cchardet)
print('cchardet v%s:' % (cchardet.__version__), 1 / (result_cchardet / do_times), 'call(s)/s') print('cchardet v%s:' % (cchardet.__version__),
1 / (result_cchardet / do_times), 'call(s)/s')
if __name__ == '__main__': if __name__ == '__main__':
main() main()

View file

@ -0,0 +1,3 @@
id,name
1,english
2,©
1 id name
2 1 english
3 2 ©

View file

@ -14,6 +14,7 @@ SKIP_LIST = [
'tests/testdata/he/iso-8859-8.txt' 'tests/testdata/he/iso-8859-8.txt'
] ]
# Encodings that Python can't decode # Encodings that Python can't decode
SKIP_LIST_02 = [ SKIP_LIST_02 = [
'tests/testdata/vi/viscii.txt', 'tests/testdata/vi/viscii.txt',
@ -21,6 +22,7 @@ SKIP_LIST_02 = [
] ]
SKIP_LIST_02.extend(SKIP_LIST) SKIP_LIST_02.extend(SKIP_LIST)
class TestCChardet(): class TestCChardet():
def test_ascii(self): def test_ascii(self):
detected_encoding = cchardet.detect(b'abcdefghijklmnopqrstuvwxyz') detected_encoding = cchardet.detect(b'abcdefghijklmnopqrstuvwxyz')
@ -73,7 +75,7 @@ class TestCChardet():
detected_encoding['encoding'].lower() detected_encoding['encoding'].lower()
) )
) )
def test_github_issue_20(self): def test_github_issue_20(self):
""" """
https://github.com/PyYoshi/cChardet/issues/20 https://github.com/PyYoshi/cChardet/issues/20
@ -100,7 +102,8 @@ class TestCChardet():
try: try:
msg.decode(detected_encoding["encoding"]) msg.decode(detected_encoding["encoding"])
except LookupError as e: except LookupError as e:
print("LookupError: { file=%s, encoding=%s }" % (testfile, detected_encoding["encoding"])) print("LookupError: { file=%s, encoding=%s }" % (
testfile, detected_encoding["encoding"]))
raise e raise e
def test_utf8_with_bom(self): def test_utf8_with_bom(self):
@ -126,3 +129,17 @@ class TestCChardet():
detected_encoding['encoding'] detected_encoding['encoding']
) )
) )
# def test_iso8859_2_csv(self):
# testfile = 'tests/samples/iso8859-2.csv'
# with open(testfile, 'rb') as f:
# msg = f.read()
# detected_encoding = cchardet.detect(msg)
# eq_(
# "iso8859-2",
# detected_encoding['encoding'].lower(),
# 'Expected %s, but got %s' % (
# "iso8859-2",
# detected_encoding['encoding'].lower()
# )
# )