diff --git a/CHANGES.rst b/CHANGES.rst index edcf1da..6fec0b8 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,6 +1,12 @@ CHANGES ======= +2.1.x +----- + +- update language models (uchardet) +- add iso8859-2 test but disabled it + 2.1.4 (2018-09-27) ------------------ diff --git a/src/cchardet/__init__.py b/src/cchardet/__init__.py index fa83d73..c6db442 100644 --- a/src/cchardet/__init__.py +++ b/src/cchardet/__init__.py @@ -17,6 +17,7 @@ def detect(msg): encoding = encoding.decode() return {"encoding": encoding, "confidence": confidence} + class UniversalDetector(object): def __init__(self): self._detector = _cchardet.UniversalDetector() diff --git a/src/tests/bench.py b/src/tests/bench.py index a111856..53db2cb 100644 --- a/src/tests/bench.py +++ b/src/tests/bench.py @@ -16,7 +16,8 @@ def main(): start_chardet = time.time() chardet.detect(msg) result_chardet += (time.time() - start_chardet) - print('chardet v%s:' % (chardet.__version__), 1 / (result_chardet / do_times), 'call(s)/s') + print('chardet v%s:' % (chardet.__version__), 1 / + (result_chardet / do_times), 'call(s)/s') # Test cchardet result_cchardet = 0 @@ -24,7 +25,9 @@ def main(): start_cchardet = time.time() cchardet.detect(msg) result_cchardet += (time.time() - start_cchardet) - print('cchardet v%s:' % (cchardet.__version__), 1 / (result_cchardet / do_times), 'call(s)/s') + print('cchardet v%s:' % (cchardet.__version__), + 1 / (result_cchardet / do_times), 'call(s)/s') + if __name__ == '__main__': main() diff --git a/src/tests/samples/iso8859-2.csv b/src/tests/samples/iso8859-2.csv new file mode 100644 index 0000000..c24ffa2 --- /dev/null +++ b/src/tests/samples/iso8859-2.csv @@ -0,0 +1,3 @@ +id,name +1,english +2,© diff --git a/src/tests/test.py b/src/tests/test.py index 2723ce0..b0b94b5 100644 --- a/src/tests/test.py +++ b/src/tests/test.py @@ -14,6 +14,7 @@ SKIP_LIST = [ 'tests/testdata/he/iso-8859-8.txt' ] + # Python can't decode encoding SKIP_LIST_02 = [ 'tests/testdata/vi/viscii.txt', @@ -21,6 +22,7 @@ SKIP_LIST_02 = [ ] SKIP_LIST_02.extend(SKIP_LIST) + class TestCChardet(): def test_ascii(self): detected_encoding = cchardet.detect(b'abcdefghijklmnopqrstuvwxyz') @@ -73,7 +75,7 @@ class TestCChardet(): detected_encoding['encoding'].lower() ) ) - + def test_github_issue_20(self): """ https://github.com/PyYoshi/cChardet/issues/20 @@ -100,7 +102,8 @@ class TestCChardet(): try: msg.decode(detected_encoding["encoding"]) except LookupError as e: - print("LookupError: { file=%s, encoding=%s }" % (testfile, detected_encoding["encoding"])) + print("LookupError: { file=%s, encoding=%s }" % ( + testfile, detected_encoding["encoding"])) raise e def test_utf8_with_bom(self): @@ -126,3 +129,17 @@ class TestCChardet(): detected_encoding['encoding'] ) ) + + # def test_iso8859_2_csv(self): + # testfile = 'tests/samples/iso8859-2.csv' + # with open(testfile, 'rb') as f: + # msg = f.read() + # detected_encoding = cchardet.detect(msg) + # eq_( + # "iso8859-2", + # detected_encoding['encoding'].lower(), + # 'Expected %s, but got %s' % ( + # "iso8859-2", + # detected_encoding['encoding'].lower() + # ) + # )