add test
This commit is contained in:
parent
d385fdaa9f
commit
f42e91a616
5 changed files with 34 additions and 4 deletions
|
@ -1,6 +1,12 @@
|
||||||
CHANGES
|
CHANGES
|
||||||
=======
|
=======
|
||||||
|
|
||||||
|
2.1.x
|
||||||
|
-----
|
||||||
|
|
||||||
|
- update language models (uchardet)
|
||||||
|
- add iso8859-2 test but disabled it
|
||||||
|
|
||||||
2.1.4 (2018-09-27)
|
2.1.4 (2018-09-27)
|
||||||
------------------
|
------------------
|
||||||
|
|
||||||
|
|
|
@ -17,6 +17,7 @@ def detect(msg):
|
||||||
encoding = encoding.decode()
|
encoding = encoding.decode()
|
||||||
return {"encoding": encoding, "confidence": confidence}
|
return {"encoding": encoding, "confidence": confidence}
|
||||||
|
|
||||||
|
|
||||||
class UniversalDetector(object):
|
class UniversalDetector(object):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self._detector = _cchardet.UniversalDetector()
|
self._detector = _cchardet.UniversalDetector()
|
||||||
|
|
|
@ -16,7 +16,8 @@ def main():
|
||||||
start_chardet = time.time()
|
start_chardet = time.time()
|
||||||
chardet.detect(msg)
|
chardet.detect(msg)
|
||||||
result_chardet += (time.time() - start_chardet)
|
result_chardet += (time.time() - start_chardet)
|
||||||
print('chardet v%s:' % (chardet.__version__), 1 / (result_chardet / do_times), 'call(s)/s')
|
print('chardet v%s:' % (chardet.__version__), 1 /
|
||||||
|
(result_chardet / do_times), 'call(s)/s')
|
||||||
|
|
||||||
# Test cchardet
|
# Test cchardet
|
||||||
result_cchardet = 0
|
result_cchardet = 0
|
||||||
|
@ -24,7 +25,9 @@ def main():
|
||||||
start_cchardet = time.time()
|
start_cchardet = time.time()
|
||||||
cchardet.detect(msg)
|
cchardet.detect(msg)
|
||||||
result_cchardet += (time.time() - start_cchardet)
|
result_cchardet += (time.time() - start_cchardet)
|
||||||
print('cchardet v%s:' % (cchardet.__version__), 1 / (result_cchardet / do_times), 'call(s)/s')
|
print('cchardet v%s:' % (cchardet.__version__),
|
||||||
|
1 / (result_cchardet / do_times), 'call(s)/s')
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
main()
|
main()
|
||||||
|
|
3
src/tests/samples/iso8859-2.csv
Normal file
3
src/tests/samples/iso8859-2.csv
Normal file
|
@ -0,0 +1,3 @@
|
||||||
|
id,name
|
||||||
|
1,english
|
||||||
|
2,©
|
|
|
@ -14,6 +14,7 @@ SKIP_LIST = [
|
||||||
'tests/testdata/he/iso-8859-8.txt'
|
'tests/testdata/he/iso-8859-8.txt'
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
# Python can't decode encoding
|
# Python can't decode encoding
|
||||||
SKIP_LIST_02 = [
|
SKIP_LIST_02 = [
|
||||||
'tests/testdata/vi/viscii.txt',
|
'tests/testdata/vi/viscii.txt',
|
||||||
|
@ -21,6 +22,7 @@ SKIP_LIST_02 = [
|
||||||
]
|
]
|
||||||
SKIP_LIST_02.extend(SKIP_LIST)
|
SKIP_LIST_02.extend(SKIP_LIST)
|
||||||
|
|
||||||
|
|
||||||
class TestCChardet():
|
class TestCChardet():
|
||||||
def test_ascii(self):
|
def test_ascii(self):
|
||||||
detected_encoding = cchardet.detect(b'abcdefghijklmnopqrstuvwxyz')
|
detected_encoding = cchardet.detect(b'abcdefghijklmnopqrstuvwxyz')
|
||||||
|
@ -100,7 +102,8 @@ class TestCChardet():
|
||||||
try:
|
try:
|
||||||
msg.decode(detected_encoding["encoding"])
|
msg.decode(detected_encoding["encoding"])
|
||||||
except LookupError as e:
|
except LookupError as e:
|
||||||
print("LookupError: { file=%s, encoding=%s }" % (testfile, detected_encoding["encoding"]))
|
print("LookupError: { file=%s, encoding=%s }" % (
|
||||||
|
testfile, detected_encoding["encoding"]))
|
||||||
raise e
|
raise e
|
||||||
|
|
||||||
def test_utf8_with_bom(self):
|
def test_utf8_with_bom(self):
|
||||||
|
@ -126,3 +129,17 @@ class TestCChardet():
|
||||||
detected_encoding['encoding']
|
detected_encoding['encoding']
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# def test_iso8859_2_csv(self):
|
||||||
|
# testfile = 'tests/samples/iso8859-2.csv'
|
||||||
|
# with open(testfile, 'rb') as f:
|
||||||
|
# msg = f.read()
|
||||||
|
# detected_encoding = cchardet.detect(msg)
|
||||||
|
# eq_(
|
||||||
|
# "iso8859-2",
|
||||||
|
# detected_encoding['encoding'].lower(),
|
||||||
|
# 'Expected %s, but got %s' % (
|
||||||
|
# "iso8859-2",
|
||||||
|
# detected_encoding['encoding'].lower()
|
||||||
|
# )
|
||||||
|
# )
|
||||||
|
|
Loading…
Reference in a new issue