support py3k
This commit is contained in:
parent
b871390a3a
commit
54602f6348
3 changed files with 65 additions and 68 deletions
|
@ -13,5 +13,7 @@ def detect(msg):
|
||||||
"confidence": float
|
"confidence": float
|
||||||
}
|
}
|
||||||
"""
|
"""
|
||||||
# TODO: Unicode t = u"あいうえお" があった時の対処 "isinstance(t,unicode) == True"
|
encoding, confidence = _cchardet.detect_with_confidence(msg)
|
||||||
return _cchardet.detect_with_confidence(msg)
|
if isinstance(encoding, bytes):
|
||||||
|
encoding = encoding.decode()
|
||||||
|
return {"encoding": encoding, "confidence": confidence}
|
||||||
|
|
|
@ -41,17 +41,9 @@ def detect_with_confidence(char *msg):
|
||||||
# ref: charsetdetect.cpp
|
# ref: charsetdetect.cpp
|
||||||
if result == 1: # Need more data
|
if result == 1: # Need more data
|
||||||
detected_charset = csd_close2(csd, &confidence)
|
detected_charset = csd_close2(csd, &confidence)
|
||||||
ret = {
|
|
||||||
"encoding":detected_charset,
|
|
||||||
"confidence":confidence
|
|
||||||
}
|
|
||||||
elif result == 0: # Detected early
|
elif result == 0: # Detected early
|
||||||
detected_charset = csd_close2(csd, &confidence)
|
detected_charset = csd_close2(csd, &confidence)
|
||||||
ret = {
|
|
||||||
"encoding":detected_charset,
|
|
||||||
"confidence":confidence
|
|
||||||
}
|
|
||||||
else: # Error, signal with a negative number
|
else: # Error, signal with a negative number
|
||||||
raise Exception("Error, signal with a negative number")
|
raise Exception("Error, signal with a negative number")
|
||||||
return ret
|
return detected_charset,confidence
|
||||||
|
|
||||||
|
|
107
test/tests.py
107
test/tests.py
|
@ -47,16 +47,15 @@ class TestCchardet():
|
||||||
def test_detect_bg_iso88595(self):
|
def test_detect_bg_iso88595(self):
|
||||||
encoding = "ISO-8859-5"
|
encoding = "ISO-8859-5"
|
||||||
path = r"testdata/bg/ISO-8859-5/wikitop_bg_ISO-8859-5.txt"
|
path = r"testdata/bg/ISO-8859-5/wikitop_bg_ISO-8859-5.txt"
|
||||||
with open(path) as f:
|
with open(path, 'rb') as f:
|
||||||
msg = f.read()
|
msg = f.read()
|
||||||
detected_encoding = cchardet.detect(msg)
|
detected_encoding = cchardet.detect(msg)
|
||||||
print(detected_encoding)
|
|
||||||
eq_(encoding.lower(),detected_encoding['encoding'].lower())
|
eq_(encoding.lower(),detected_encoding['encoding'].lower())
|
||||||
|
|
||||||
def test_detect_bg_utf8(self):
|
def test_detect_bg_utf8(self):
|
||||||
encoding = "UTF-8"
|
encoding = "UTF-8"
|
||||||
path = r"testdata/bg/UTF-8/wikitop_bg_UTF-8.txt"
|
path = r"testdata/bg/UTF-8/wikitop_bg_UTF-8.txt"
|
||||||
with open(path) as f:
|
with open(path, 'rb') as f:
|
||||||
msg = f.read()
|
msg = f.read()
|
||||||
detected_encoding = cchardet.detect(msg)
|
detected_encoding = cchardet.detect(msg)
|
||||||
print(detected_encoding)
|
print(detected_encoding)
|
||||||
|
@ -65,7 +64,7 @@ class TestCchardet():
|
||||||
def test_detect_bg_windows1251(self):
|
def test_detect_bg_windows1251(self):
|
||||||
encoding = "WINDOWS-1251"
|
encoding = "WINDOWS-1251"
|
||||||
path = r"testdata/bg/WINDOWS-1251/wikitop_bg_WINDOWS-1251.txt"
|
path = r"testdata/bg/WINDOWS-1251/wikitop_bg_WINDOWS-1251.txt"
|
||||||
with open(path) as f:
|
with open(path, 'rb') as f:
|
||||||
msg = f.read()
|
msg = f.read()
|
||||||
detected_encoding = cchardet.detect(msg)
|
detected_encoding = cchardet.detect(msg)
|
||||||
print(detected_encoding)
|
print(detected_encoding)
|
||||||
|
@ -74,7 +73,7 @@ class TestCchardet():
|
||||||
def test_detect_cz_iso88592(self):
|
def test_detect_cz_iso88592(self):
|
||||||
encoding = "ISO-8859-2"
|
encoding = "ISO-8859-2"
|
||||||
path = r"testdata/cz/ISO-8859-2/wikitop_cz_ISO-8859-2.txt"
|
path = r"testdata/cz/ISO-8859-2/wikitop_cz_ISO-8859-2.txt"
|
||||||
with open(path) as f:
|
with open(path, 'rb') as f:
|
||||||
msg = f.read()
|
msg = f.read()
|
||||||
detected_encoding = cchardet.detect(msg)
|
detected_encoding = cchardet.detect(msg)
|
||||||
print(detected_encoding)
|
print(detected_encoding)
|
||||||
|
@ -83,7 +82,7 @@ class TestCchardet():
|
||||||
def test_detect_cz_utf8(self):
|
def test_detect_cz_utf8(self):
|
||||||
encoding = "UTF-8"
|
encoding = "UTF-8"
|
||||||
path = r"testdata/cz/UTF-8/wikitop_cz_UTF-8.txt"
|
path = r"testdata/cz/UTF-8/wikitop_cz_UTF-8.txt"
|
||||||
with open(path) as f:
|
with open(path, 'rb') as f:
|
||||||
msg = f.read()
|
msg = f.read()
|
||||||
detected_encoding = cchardet.detect(msg)
|
detected_encoding = cchardet.detect(msg)
|
||||||
print(detected_encoding)
|
print(detected_encoding)
|
||||||
|
@ -92,7 +91,7 @@ class TestCchardet():
|
||||||
def test_detect_de_utf8(self):
|
def test_detect_de_utf8(self):
|
||||||
encoding = "UTF-8"
|
encoding = "UTF-8"
|
||||||
path = r"testdata/de/UTF-8/wikitop_de_UTF-8.txt"
|
path = r"testdata/de/UTF-8/wikitop_de_UTF-8.txt"
|
||||||
with open(path) as f:
|
with open(path, 'rb') as f:
|
||||||
msg = f.read()
|
msg = f.read()
|
||||||
detected_encoding = cchardet.detect(msg)
|
detected_encoding = cchardet.detect(msg)
|
||||||
print(detected_encoding)
|
print(detected_encoding)
|
||||||
|
@ -101,7 +100,7 @@ class TestCchardet():
|
||||||
def test_detect_de_windows1252(self):
|
def test_detect_de_windows1252(self):
|
||||||
encoding = "WINDOWS-1252"
|
encoding = "WINDOWS-1252"
|
||||||
path = r"testdata/de/WINDOWS-1252/wikitop_de_WINDOWS-1252.txt"
|
path = r"testdata/de/WINDOWS-1252/wikitop_de_WINDOWS-1252.txt"
|
||||||
with open(path) as f:
|
with open(path, 'rb') as f:
|
||||||
msg = f.read()
|
msg = f.read()
|
||||||
detected_encoding = cchardet.detect(msg)
|
detected_encoding = cchardet.detect(msg)
|
||||||
print(detected_encoding)
|
print(detected_encoding)
|
||||||
|
@ -110,7 +109,7 @@ class TestCchardet():
|
||||||
def test_detect_dk_utf8(self):
|
def test_detect_dk_utf8(self):
|
||||||
encoding = "UTF-8"
|
encoding = "UTF-8"
|
||||||
path = r"testdata/dk/UTF-8/wikitop_dk_UTF-8.txt"
|
path = r"testdata/dk/UTF-8/wikitop_dk_UTF-8.txt"
|
||||||
with open(path) as f:
|
with open(path, 'rb') as f:
|
||||||
msg = f.read()
|
msg = f.read()
|
||||||
detected_encoding = cchardet.detect(msg)
|
detected_encoding = cchardet.detect(msg)
|
||||||
print(detected_encoding)
|
print(detected_encoding)
|
||||||
|
@ -119,7 +118,7 @@ class TestCchardet():
|
||||||
def test_detect_dk_windows1252(self):
|
def test_detect_dk_windows1252(self):
|
||||||
encoding = "WINDOWS-1252"
|
encoding = "WINDOWS-1252"
|
||||||
path = r"testdata/dk/WINDOWS-1252/wikitop_dk_WINDOWS-1252.txt"
|
path = r"testdata/dk/WINDOWS-1252/wikitop_dk_WINDOWS-1252.txt"
|
||||||
with open(path) as f:
|
with open(path, 'rb') as f:
|
||||||
msg = f.read()
|
msg = f.read()
|
||||||
detected_encoding = cchardet.detect(msg)
|
detected_encoding = cchardet.detect(msg)
|
||||||
print(detected_encoding)
|
print(detected_encoding)
|
||||||
|
@ -128,7 +127,7 @@ class TestCchardet():
|
||||||
def test_detect_el_iso88597(self):
|
def test_detect_el_iso88597(self):
|
||||||
encoding = "ISO-8859-7"
|
encoding = "ISO-8859-7"
|
||||||
path = r"testdata/el/ISO-8859-7/wikitop_el_ISO-8859-7.txt"
|
path = r"testdata/el/ISO-8859-7/wikitop_el_ISO-8859-7.txt"
|
||||||
with open(path) as f:
|
with open(path, 'rb') as f:
|
||||||
msg = f.read()
|
msg = f.read()
|
||||||
detected_encoding = cchardet.detect(msg)
|
detected_encoding = cchardet.detect(msg)
|
||||||
print(detected_encoding)
|
print(detected_encoding)
|
||||||
|
@ -137,7 +136,7 @@ class TestCchardet():
|
||||||
def test_detect_el_utf8(self):
|
def test_detect_el_utf8(self):
|
||||||
encoding = "UTF-8"
|
encoding = "UTF-8"
|
||||||
path = r"testdata/el/UTF-8/wikitop_el_UTF-8.txt"
|
path = r"testdata/el/UTF-8/wikitop_el_UTF-8.txt"
|
||||||
with open(path) as f:
|
with open(path, 'rb') as f:
|
||||||
msg = f.read()
|
msg = f.read()
|
||||||
detected_encoding = cchardet.detect(msg)
|
detected_encoding = cchardet.detect(msg)
|
||||||
print(detected_encoding)
|
print(detected_encoding)
|
||||||
|
@ -146,7 +145,7 @@ class TestCchardet():
|
||||||
def test_detect_en_utf8(self):
|
def test_detect_en_utf8(self):
|
||||||
encoding = "UTF-8"
|
encoding = "UTF-8"
|
||||||
path = r"testdata/en/UTF-8/wikitop_en_UTF-8.txt"
|
path = r"testdata/en/UTF-8/wikitop_en_UTF-8.txt"
|
||||||
with open(path) as f:
|
with open(path, 'rb') as f:
|
||||||
msg = f.read()
|
msg = f.read()
|
||||||
detected_encoding = cchardet.detect(msg)
|
detected_encoding = cchardet.detect(msg)
|
||||||
print(detected_encoding)
|
print(detected_encoding)
|
||||||
|
@ -155,7 +154,7 @@ class TestCchardet():
|
||||||
def test_detect_en_windows1252(self):
|
def test_detect_en_windows1252(self):
|
||||||
encoding = "WINDOWS-1252"
|
encoding = "WINDOWS-1252"
|
||||||
path = r"testdata/en/WINDOWS-1252/wikitop_en_WINDOWS-1252.txt"
|
path = r"testdata/en/WINDOWS-1252/wikitop_en_WINDOWS-1252.txt"
|
||||||
with open(path) as f:
|
with open(path, 'rb') as f:
|
||||||
msg = f.read()
|
msg = f.read()
|
||||||
detected_encoding = cchardet.detect(msg)
|
detected_encoding = cchardet.detect(msg)
|
||||||
print(detected_encoding)
|
print(detected_encoding)
|
||||||
|
@ -164,7 +163,7 @@ class TestCchardet():
|
||||||
def test_detect_es_utf8(self):
|
def test_detect_es_utf8(self):
|
||||||
encoding = "UTF-8"
|
encoding = "UTF-8"
|
||||||
path = r"testdata/es/UTF-8/wikitop_es_UTF-8.txt"
|
path = r"testdata/es/UTF-8/wikitop_es_UTF-8.txt"
|
||||||
with open(path) as f:
|
with open(path, 'rb') as f:
|
||||||
msg = f.read()
|
msg = f.read()
|
||||||
detected_encoding = cchardet.detect(msg)
|
detected_encoding = cchardet.detect(msg)
|
||||||
print(detected_encoding)
|
print(detected_encoding)
|
||||||
|
@ -173,7 +172,7 @@ class TestCchardet():
|
||||||
def test_detect_es_windows1252(self):
|
def test_detect_es_windows1252(self):
|
||||||
encoding = "WINDOWS-1252"
|
encoding = "WINDOWS-1252"
|
||||||
path = r"testdata/es/WINDOWS-1252/wikitop_es_WINDOWS-1252.txt"
|
path = r"testdata/es/WINDOWS-1252/wikitop_es_WINDOWS-1252.txt"
|
||||||
with open(path) as f:
|
with open(path, 'rb') as f:
|
||||||
msg = f.read()
|
msg = f.read()
|
||||||
detected_encoding = cchardet.detect(msg)
|
detected_encoding = cchardet.detect(msg)
|
||||||
print(detected_encoding)
|
print(detected_encoding)
|
||||||
|
@ -182,7 +181,7 @@ class TestCchardet():
|
||||||
def test_detect_fi_utf8(self):
|
def test_detect_fi_utf8(self):
|
||||||
encoding = "UTF-8"
|
encoding = "UTF-8"
|
||||||
path = r"testdata/fi/UTF-8/wikitop_fi_UTF-8.txt"
|
path = r"testdata/fi/UTF-8/wikitop_fi_UTF-8.txt"
|
||||||
with open(path) as f:
|
with open(path, 'rb') as f:
|
||||||
msg = f.read()
|
msg = f.read()
|
||||||
detected_encoding = cchardet.detect(msg)
|
detected_encoding = cchardet.detect(msg)
|
||||||
print(detected_encoding)
|
print(detected_encoding)
|
||||||
|
@ -191,7 +190,7 @@ class TestCchardet():
|
||||||
def test_detect_fi_windows1252(self):
|
def test_detect_fi_windows1252(self):
|
||||||
encoding = "WINDOWS-1252"
|
encoding = "WINDOWS-1252"
|
||||||
path = r"testdata/fi/WINDOWS-1252/wikitop_fi_WINDOWS-1252.txt"
|
path = r"testdata/fi/WINDOWS-1252/wikitop_fi_WINDOWS-1252.txt"
|
||||||
with open(path) as f:
|
with open(path, 'rb') as f:
|
||||||
msg = f.read()
|
msg = f.read()
|
||||||
detected_encoding = cchardet.detect(msg)
|
detected_encoding = cchardet.detect(msg)
|
||||||
print(detected_encoding)
|
print(detected_encoding)
|
||||||
|
@ -200,7 +199,7 @@ class TestCchardet():
|
||||||
def test_detect_fr_utf8(self):
|
def test_detect_fr_utf8(self):
|
||||||
encoding = "UTF-8"
|
encoding = "UTF-8"
|
||||||
path = r"testdata/fr/UTF-8/wikitop_fr_UTF-8.txt"
|
path = r"testdata/fr/UTF-8/wikitop_fr_UTF-8.txt"
|
||||||
with open(path) as f:
|
with open(path, 'rb') as f:
|
||||||
msg = f.read()
|
msg = f.read()
|
||||||
detected_encoding = cchardet.detect(msg)
|
detected_encoding = cchardet.detect(msg)
|
||||||
print(detected_encoding)
|
print(detected_encoding)
|
||||||
|
@ -209,7 +208,7 @@ class TestCchardet():
|
||||||
def test_detect_fr_windows1252(self):
|
def test_detect_fr_windows1252(self):
|
||||||
encoding = "WINDOWS-1252"
|
encoding = "WINDOWS-1252"
|
||||||
path = r"testdata/fr/WINDOWS-1252/wikitop_fr_WINDOWS-1252.txt"
|
path = r"testdata/fr/WINDOWS-1252/wikitop_fr_WINDOWS-1252.txt"
|
||||||
with open(path) as f:
|
with open(path, 'rb') as f:
|
||||||
msg = f.read()
|
msg = f.read()
|
||||||
detected_encoding = cchardet.detect(msg)
|
detected_encoding = cchardet.detect(msg)
|
||||||
print(detected_encoding)
|
print(detected_encoding)
|
||||||
|
@ -218,7 +217,7 @@ class TestCchardet():
|
||||||
def test_detect_he_utf8(self):
|
def test_detect_he_utf8(self):
|
||||||
encoding = "UTF-8"
|
encoding = "UTF-8"
|
||||||
path = r"testdata/he/UTF-8/wikitop_he_UTF-8.txt"
|
path = r"testdata/he/UTF-8/wikitop_he_UTF-8.txt"
|
||||||
with open(path) as f:
|
with open(path, 'rb') as f:
|
||||||
msg = f.read()
|
msg = f.read()
|
||||||
detected_encoding = cchardet.detect(msg)
|
detected_encoding = cchardet.detect(msg)
|
||||||
print(detected_encoding)
|
print(detected_encoding)
|
||||||
|
@ -227,7 +226,7 @@ class TestCchardet():
|
||||||
def test_detect_he_windows1255(self):
|
def test_detect_he_windows1255(self):
|
||||||
encoding = "WINDOWS-1255"
|
encoding = "WINDOWS-1255"
|
||||||
path = r"testdata/he/WINDOWS-1255/wikitop_he_WINDOWS-1255.txt"
|
path = r"testdata/he/WINDOWS-1255/wikitop_he_WINDOWS-1255.txt"
|
||||||
with open(path) as f:
|
with open(path, 'rb') as f:
|
||||||
msg = f.read()
|
msg = f.read()
|
||||||
detected_encoding = cchardet.detect(msg)
|
detected_encoding = cchardet.detect(msg)
|
||||||
print(detected_encoding)
|
print(detected_encoding)
|
||||||
|
@ -236,7 +235,7 @@ class TestCchardet():
|
||||||
def test_detect_hu_utf8(self):
|
def test_detect_hu_utf8(self):
|
||||||
encoding = "UTF-8"
|
encoding = "UTF-8"
|
||||||
path = r"testdata/hu/UTF-8/wikitop_hu_UTF-8.txt"
|
path = r"testdata/hu/UTF-8/wikitop_hu_UTF-8.txt"
|
||||||
with open(path) as f:
|
with open(path, 'rb') as f:
|
||||||
msg = f.read()
|
msg = f.read()
|
||||||
detected_encoding = cchardet.detect(msg)
|
detected_encoding = cchardet.detect(msg)
|
||||||
print(detected_encoding)
|
print(detected_encoding)
|
||||||
|
@ -245,7 +244,7 @@ class TestCchardet():
|
||||||
def test_detect_hu_iso55892(self):
|
def test_detect_hu_iso55892(self):
|
||||||
encoding = "ISO-8859-2"
|
encoding = "ISO-8859-2"
|
||||||
path = r"testdata/hu/ISO-8859-2/wikitop_hu_ISO-8859-2.txt"
|
path = r"testdata/hu/ISO-8859-2/wikitop_hu_ISO-8859-2.txt"
|
||||||
with open(path) as f:
|
with open(path, 'rb') as f:
|
||||||
msg = f.read()
|
msg = f.read()
|
||||||
detected_encoding = cchardet.detect(msg)
|
detected_encoding = cchardet.detect(msg)
|
||||||
print(detected_encoding)
|
print(detected_encoding)
|
||||||
|
@ -254,7 +253,7 @@ class TestCchardet():
|
||||||
def test_detect_it_utf8(self):
|
def test_detect_it_utf8(self):
|
||||||
encoding = "UTF-8"
|
encoding = "UTF-8"
|
||||||
path = r"testdata/it/UTF-8/wikitop_it_UTF-8.txt"
|
path = r"testdata/it/UTF-8/wikitop_it_UTF-8.txt"
|
||||||
with open(path) as f:
|
with open(path, 'rb') as f:
|
||||||
msg = f.read()
|
msg = f.read()
|
||||||
detected_encoding = cchardet.detect(msg)
|
detected_encoding = cchardet.detect(msg)
|
||||||
print(detected_encoding)
|
print(detected_encoding)
|
||||||
|
@ -263,7 +262,7 @@ class TestCchardet():
|
||||||
def test_detect_it_windows1252(self):
|
def test_detect_it_windows1252(self):
|
||||||
encoding = "WINDOWS-1252"
|
encoding = "WINDOWS-1252"
|
||||||
path = r"testdata/it/WINDOWS-1252/wikitop_it_WINDOWS-1252.txt"
|
path = r"testdata/it/WINDOWS-1252/wikitop_it_WINDOWS-1252.txt"
|
||||||
with open(path) as f:
|
with open(path, 'rb') as f:
|
||||||
msg = f.read()
|
msg = f.read()
|
||||||
detected_encoding = cchardet.detect(msg)
|
detected_encoding = cchardet.detect(msg)
|
||||||
print(detected_encoding)
|
print(detected_encoding)
|
||||||
|
@ -272,7 +271,7 @@ class TestCchardet():
|
||||||
def test_detect_nl_utf8(self):
|
def test_detect_nl_utf8(self):
|
||||||
encoding = "UTF-8"
|
encoding = "UTF-8"
|
||||||
path = r"testdata/nl/UTF-8/wikitop_nl_UTF-8.txt"
|
path = r"testdata/nl/UTF-8/wikitop_nl_UTF-8.txt"
|
||||||
with open(path) as f:
|
with open(path, 'rb') as f:
|
||||||
msg = f.read()
|
msg = f.read()
|
||||||
detected_encoding = cchardet.detect(msg)
|
detected_encoding = cchardet.detect(msg)
|
||||||
print(detected_encoding)
|
print(detected_encoding)
|
||||||
|
@ -281,7 +280,7 @@ class TestCchardet():
|
||||||
def test_detect_nl_windows1252(self):
|
def test_detect_nl_windows1252(self):
|
||||||
encoding = "WINDOWS-1252"
|
encoding = "WINDOWS-1252"
|
||||||
path = r"testdata/nl/WINDOWS-1252/wikitop_nl_WINDOWS-1252.txt"
|
path = r"testdata/nl/WINDOWS-1252/wikitop_nl_WINDOWS-1252.txt"
|
||||||
with open(path) as f:
|
with open(path, 'rb') as f:
|
||||||
msg = f.read()
|
msg = f.read()
|
||||||
detected_encoding = cchardet.detect(msg)
|
detected_encoding = cchardet.detect(msg)
|
||||||
print(detected_encoding)
|
print(detected_encoding)
|
||||||
|
@ -290,7 +289,7 @@ class TestCchardet():
|
||||||
def test_detect_no_utf8(self):
|
def test_detect_no_utf8(self):
|
||||||
encoding = "UTF-8"
|
encoding = "UTF-8"
|
||||||
path = r"testdata/no/UTF-8/wikitop_no_UTF-8.txt"
|
path = r"testdata/no/UTF-8/wikitop_no_UTF-8.txt"
|
||||||
with open(path) as f:
|
with open(path, 'rb') as f:
|
||||||
msg = f.read()
|
msg = f.read()
|
||||||
detected_encoding = cchardet.detect(msg)
|
detected_encoding = cchardet.detect(msg)
|
||||||
print(detected_encoding)
|
print(detected_encoding)
|
||||||
|
@ -299,7 +298,7 @@ class TestCchardet():
|
||||||
def test_detect_no_windows1252(self):
|
def test_detect_no_windows1252(self):
|
||||||
encoding = "WINDOWS-1252"
|
encoding = "WINDOWS-1252"
|
||||||
path = r"testdata/no/WINDOWS-1252/wikitop_no_WINDOWS-1252.txt"
|
path = r"testdata/no/WINDOWS-1252/wikitop_no_WINDOWS-1252.txt"
|
||||||
with open(path) as f:
|
with open(path, 'rb') as f:
|
||||||
msg = f.read()
|
msg = f.read()
|
||||||
detected_encoding = cchardet.detect(msg)
|
detected_encoding = cchardet.detect(msg)
|
||||||
print(detected_encoding)
|
print(detected_encoding)
|
||||||
|
@ -308,7 +307,7 @@ class TestCchardet():
|
||||||
def test_detect_pl_utf8(self):
|
def test_detect_pl_utf8(self):
|
||||||
encoding = "UTF-8"
|
encoding = "UTF-8"
|
||||||
path = r"testdata/pl/UTF-8/wikitop_pl_UTF-8.txt"
|
path = r"testdata/pl/UTF-8/wikitop_pl_UTF-8.txt"
|
||||||
with open(path) as f:
|
with open(path, 'rb') as f:
|
||||||
msg = f.read()
|
msg = f.read()
|
||||||
detected_encoding = cchardet.detect(msg)
|
detected_encoding = cchardet.detect(msg)
|
||||||
print(detected_encoding)
|
print(detected_encoding)
|
||||||
|
@ -317,7 +316,7 @@ class TestCchardet():
|
||||||
def test_detect_pl_iso88592(self):
|
def test_detect_pl_iso88592(self):
|
||||||
encoding = "ISO-8859-2"
|
encoding = "ISO-8859-2"
|
||||||
path = r"testdata/pl/ISO-8859-2/wikitop_pl_ISO-8859-2.txt"
|
path = r"testdata/pl/ISO-8859-2/wikitop_pl_ISO-8859-2.txt"
|
||||||
with open(path) as f:
|
with open(path, 'rb') as f:
|
||||||
msg = f.read()
|
msg = f.read()
|
||||||
detected_encoding = cchardet.detect(msg)
|
detected_encoding = cchardet.detect(msg)
|
||||||
print(detected_encoding)
|
print(detected_encoding)
|
||||||
|
@ -326,7 +325,7 @@ class TestCchardet():
|
||||||
def test_detect_pt_utf8(self):
|
def test_detect_pt_utf8(self):
|
||||||
encoding = "UTF-8"
|
encoding = "UTF-8"
|
||||||
path = r"testdata/pt/UTF-8/wikitop_pt_UTF-8.txt"
|
path = r"testdata/pt/UTF-8/wikitop_pt_UTF-8.txt"
|
||||||
with open(path) as f:
|
with open(path, 'rb') as f:
|
||||||
msg = f.read()
|
msg = f.read()
|
||||||
detected_encoding = cchardet.detect(msg)
|
detected_encoding = cchardet.detect(msg)
|
||||||
print(detected_encoding)
|
print(detected_encoding)
|
||||||
|
@ -335,7 +334,7 @@ class TestCchardet():
|
||||||
def test_detect_pt_windows1252(self):
|
def test_detect_pt_windows1252(self):
|
||||||
encoding = "WINDOWS-1252"
|
encoding = "WINDOWS-1252"
|
||||||
path = r"testdata/pt/WINDOWS-1252/wikitop_pt_WINDOWS-1252.txt"
|
path = r"testdata/pt/WINDOWS-1252/wikitop_pt_WINDOWS-1252.txt"
|
||||||
with open(path) as f:
|
with open(path, 'rb') as f:
|
||||||
msg = f.read()
|
msg = f.read()
|
||||||
detected_encoding = cchardet.detect(msg)
|
detected_encoding = cchardet.detect(msg)
|
||||||
print(detected_encoding)
|
print(detected_encoding)
|
||||||
|
@ -344,7 +343,7 @@ class TestCchardet():
|
||||||
def test_detect_ru_utf8(self):
|
def test_detect_ru_utf8(self):
|
||||||
encoding = "UTF-8"
|
encoding = "UTF-8"
|
||||||
path = r"testdata/ru/UTF-8/wikitop_ru_UTF-8.txt"
|
path = r"testdata/ru/UTF-8/wikitop_ru_UTF-8.txt"
|
||||||
with open(path) as f:
|
with open(path, 'rb') as f:
|
||||||
msg = f.read()
|
msg = f.read()
|
||||||
detected_encoding = cchardet.detect(msg)
|
detected_encoding = cchardet.detect(msg)
|
||||||
print(detected_encoding)
|
print(detected_encoding)
|
||||||
|
@ -353,7 +352,7 @@ class TestCchardet():
|
||||||
def test_detect_ru_windows1251(self):
|
def test_detect_ru_windows1251(self):
|
||||||
encoding = "WINDOWS-1251"
|
encoding = "WINDOWS-1251"
|
||||||
path = r"testdata/ru/WINDOWS-1251/wikitop_ru_WINDOWS-1251.txt"
|
path = r"testdata/ru/WINDOWS-1251/wikitop_ru_WINDOWS-1251.txt"
|
||||||
with open(path) as f:
|
with open(path, 'rb') as f:
|
||||||
msg = f.read()
|
msg = f.read()
|
||||||
detected_encoding = cchardet.detect(msg)
|
detected_encoding = cchardet.detect(msg)
|
||||||
print(detected_encoding)
|
print(detected_encoding)
|
||||||
|
@ -362,7 +361,7 @@ class TestCchardet():
|
||||||
def test_detect_ru_ibm855(self):
|
def test_detect_ru_ibm855(self):
|
||||||
encoding = "IBM855"
|
encoding = "IBM855"
|
||||||
path = r"testdata/ru/IBM855/wikitop_ru_IBM855.txt"
|
path = r"testdata/ru/IBM855/wikitop_ru_IBM855.txt"
|
||||||
with open(path) as f:
|
with open(path, 'rb') as f:
|
||||||
msg = f.read()
|
msg = f.read()
|
||||||
detected_encoding = cchardet.detect(msg)
|
detected_encoding = cchardet.detect(msg)
|
||||||
print(detected_encoding)
|
print(detected_encoding)
|
||||||
|
@ -371,7 +370,7 @@ class TestCchardet():
|
||||||
def test_detect_ru_koi8r(self):
|
def test_detect_ru_koi8r(self):
|
||||||
encoding = "KOI8-R"
|
encoding = "KOI8-R"
|
||||||
path = r"testdata/ru/KOI8-R/wikitop_ru_KOI8-R.txt"
|
path = r"testdata/ru/KOI8-R/wikitop_ru_KOI8-R.txt"
|
||||||
with open(path) as f:
|
with open(path, 'rb') as f:
|
||||||
msg = f.read()
|
msg = f.read()
|
||||||
detected_encoding = cchardet.detect(msg)
|
detected_encoding = cchardet.detect(msg)
|
||||||
print(detected_encoding)
|
print(detected_encoding)
|
||||||
|
@ -380,7 +379,7 @@ class TestCchardet():
|
||||||
def test_detect_se_utf8(self):
|
def test_detect_se_utf8(self):
|
||||||
encoding = "UTF-8"
|
encoding = "UTF-8"
|
||||||
path = r"testdata/se/UTF-8/wikitop_se_UTF-8.txt"
|
path = r"testdata/se/UTF-8/wikitop_se_UTF-8.txt"
|
||||||
with open(path) as f:
|
with open(path, 'rb') as f:
|
||||||
msg = f.read()
|
msg = f.read()
|
||||||
detected_encoding = cchardet.detect(msg)
|
detected_encoding = cchardet.detect(msg)
|
||||||
print(detected_encoding)
|
print(detected_encoding)
|
||||||
|
@ -389,7 +388,7 @@ class TestCchardet():
|
||||||
def test_detect_se_windows1252(self):
|
def test_detect_se_windows1252(self):
|
||||||
encoding = "WINDOWS-1252"
|
encoding = "WINDOWS-1252"
|
||||||
path = r"testdata/se/WINDOWS-1252/wikitop_se_WINDOWS-1252.txt"
|
path = r"testdata/se/WINDOWS-1252/wikitop_se_WINDOWS-1252.txt"
|
||||||
with open(path) as f:
|
with open(path, 'rb') as f:
|
||||||
msg = f.read()
|
msg = f.read()
|
||||||
detected_encoding = cchardet.detect(msg)
|
detected_encoding = cchardet.detect(msg)
|
||||||
print(detected_encoding)
|
print(detected_encoding)
|
||||||
|
@ -398,7 +397,7 @@ class TestCchardet():
|
||||||
def test_detect_th_utf8(self):
|
def test_detect_th_utf8(self):
|
||||||
encoding = "UTF-8"
|
encoding = "UTF-8"
|
||||||
path = r"testdata/th/UTF-8/wikitop_th_UTF-8.txt"
|
path = r"testdata/th/UTF-8/wikitop_th_UTF-8.txt"
|
||||||
with open(path) as f:
|
with open(path, 'rb') as f:
|
||||||
msg = f.read()
|
msg = f.read()
|
||||||
detected_encoding = cchardet.detect(msg)
|
detected_encoding = cchardet.detect(msg)
|
||||||
print(detected_encoding)
|
print(detected_encoding)
|
||||||
|
@ -407,7 +406,7 @@ class TestCchardet():
|
||||||
def test_detect_th_tis620_1(self):
|
def test_detect_th_tis620_1(self):
|
||||||
encoding = "TIS-620"
|
encoding = "TIS-620"
|
||||||
path = r"testdata/th/TIS-620/utffool_th_TIS-620.txt"
|
path = r"testdata/th/TIS-620/utffool_th_TIS-620.txt"
|
||||||
with open(path) as f:
|
with open(path, 'rb') as f:
|
||||||
msg = f.read()
|
msg = f.read()
|
||||||
detected_encoding = cchardet.detect(msg)
|
detected_encoding = cchardet.detect(msg)
|
||||||
print(detected_encoding)
|
print(detected_encoding)
|
||||||
|
@ -416,7 +415,7 @@ class TestCchardet():
|
||||||
def test_detect_th_tis620_2(self):
|
def test_detect_th_tis620_2(self):
|
||||||
encoding = "TIS-620"
|
encoding = "TIS-620"
|
||||||
path = r"testdata/th/TIS-620/wikitop_th_TIS-620.txt"
|
path = r"testdata/th/TIS-620/wikitop_th_TIS-620.txt"
|
||||||
with open(path) as f:
|
with open(path, 'rb') as f:
|
||||||
msg = f.read()
|
msg = f.read()
|
||||||
detected_encoding = cchardet.detect(msg)
|
detected_encoding = cchardet.detect(msg)
|
||||||
print(detected_encoding)
|
print(detected_encoding)
|
||||||
|
@ -425,7 +424,7 @@ class TestCchardet():
|
||||||
def test_detect_tr_utf8(self):
|
def test_detect_tr_utf8(self):
|
||||||
encoding = "UTF-8"
|
encoding = "UTF-8"
|
||||||
path = r"testdata/tr/UTF-8/wikitop_tr_UTF-8.txt"
|
path = r"testdata/tr/UTF-8/wikitop_tr_UTF-8.txt"
|
||||||
with open(path) as f:
|
with open(path, 'rb') as f:
|
||||||
msg = f.read()
|
msg = f.read()
|
||||||
detected_encoding = cchardet.detect(msg)
|
detected_encoding = cchardet.detect(msg)
|
||||||
print(detected_encoding)
|
print(detected_encoding)
|
||||||
|
@ -434,7 +433,7 @@ class TestCchardet():
|
||||||
def test_detect_tr_iso88599(self):
|
def test_detect_tr_iso88599(self):
|
||||||
encoding = "ISO-8859-9"
|
encoding = "ISO-8859-9"
|
||||||
path = r"testdata/tr/ISO-8859-9/wikitop_tr_ISO-8859-9.txt"
|
path = r"testdata/tr/ISO-8859-9/wikitop_tr_ISO-8859-9.txt"
|
||||||
with open(path) as f:
|
with open(path, 'rb') as f:
|
||||||
msg = f.read()
|
msg = f.read()
|
||||||
detected_encoding = cchardet.detect(msg)
|
detected_encoding = cchardet.detect(msg)
|
||||||
print(detected_encoding)
|
print(detected_encoding)
|
||||||
|
@ -443,7 +442,7 @@ class TestCchardet():
|
||||||
def test_detect_zh_utf8(self):
|
def test_detect_zh_utf8(self):
|
||||||
encoding = "UTF-8"
|
encoding = "UTF-8"
|
||||||
path = r"testdata/zh/UTF-8/wikitop_zh_UTF-8.txt"
|
path = r"testdata/zh/UTF-8/wikitop_zh_UTF-8.txt"
|
||||||
with open(path) as f:
|
with open(path, 'rb') as f:
|
||||||
msg = f.read()
|
msg = f.read()
|
||||||
detected_encoding = cchardet.detect(msg)
|
detected_encoding = cchardet.detect(msg)
|
||||||
print(detected_encoding)
|
print(detected_encoding)
|
||||||
|
@ -452,31 +451,35 @@ class TestCchardet():
|
||||||
def test_detect_zh_gb18030(self):
|
def test_detect_zh_gb18030(self):
|
||||||
encoding = "GB18030"
|
encoding = "GB18030"
|
||||||
path = r"testdata/zh/GB18030/wikitop_zh_GB18030.txt"
|
path = r"testdata/zh/GB18030/wikitop_zh_GB18030.txt"
|
||||||
with open(path) as f:
|
with open(path, 'rb') as f:
|
||||||
msg = f.read()
|
msg = f.read()
|
||||||
detected_encoding = cchardet.detect(msg)
|
detected_encoding = cchardet.detect(msg)
|
||||||
print(detected_encoding)
|
|
||||||
eq_(encoding.lower(),detected_encoding['encoding'].lower())
|
eq_(encoding.lower(),detected_encoding['encoding'].lower())
|
||||||
|
|
||||||
class TestCchardetSpeed():
|
class TestCchardetSpeed():
|
||||||
def test_speed(self):
|
def test_speed(self):
|
||||||
|
try:
|
||||||
import chardet
|
import chardet
|
||||||
|
has_chardet = True
|
||||||
|
except ImportError:
|
||||||
|
has_chardet = False
|
||||||
import time
|
import time
|
||||||
do_times = 5
|
do_times = 5
|
||||||
path = r"testdata/wikipediaJa_One_Thousand_and_One_Nights_SJIS.txt"
|
path = r"testdata/wikipediaJa_One_Thousand_and_One_Nights_SJIS.txt"
|
||||||
with open(path) as f:
|
with open(path, 'rb') as f:
|
||||||
msg = f.read()
|
msg = f.read()
|
||||||
# Test chardet
|
# Test chardet
|
||||||
|
if has_chardet:
|
||||||
result_chardet = 0
|
result_chardet = 0
|
||||||
for i in xrange(do_times):
|
for i in range(do_times):
|
||||||
start_chardet = time.time()
|
start_chardet = time.time()
|
||||||
chardet.detect(msg)
|
chardet.detect(msg)
|
||||||
result_chardet += (time.time() - start_chardet)
|
result_chardet += (time.time() - start_chardet)
|
||||||
|
print('chardet:',1/(result_chardet/do_times), 'call(s)/s')
|
||||||
# Test cchardet
|
# Test cchardet
|
||||||
result_cchardet = 0
|
result_cchardet = 0
|
||||||
for i in xrange(do_times):
|
for i in range(do_times):
|
||||||
start_cchardet = time.time()
|
start_cchardet = time.time()
|
||||||
cchardet.detect(msg)
|
cchardet.detect(msg)
|
||||||
result_cchardet += (time.time() - start_cchardet)
|
result_cchardet += (time.time() - start_cchardet)
|
||||||
print('chardet:',1/(result_chardet/do_times), 'call(s)/s')
|
|
||||||
print('cchardet:',1/(result_cchardet/do_times), 'call(s)/s')
|
print('cchardet:',1/(result_cchardet/do_times), 'call(s)/s')
|
Loading…
Reference in a new issue