tests: change the speed test and the way files are opened

This commit is contained in:
PyYoshi 2013-05-08 12:24:28 +09:00
parent a69598fe75
commit 9879cf926a

View file

@ -47,7 +47,8 @@ class TestCchardet():
def test_detect_bg_iso88595(self): def test_detect_bg_iso88595(self):
encoding = "ISO-8859-5" encoding = "ISO-8859-5"
path = r"testdata/bg/ISO-8859-5/wikitop_bg_ISO-8859-5.txt" path = r"testdata/bg/ISO-8859-5/wikitop_bg_ISO-8859-5.txt"
msg =file(path).read() with open(path) as f:
msg = f.read()
detected_encoding = cchardet.detect(msg) detected_encoding = cchardet.detect(msg)
print(detected_encoding) print(detected_encoding)
eq_(encoding.lower(),detected_encoding['encoding'].lower()) eq_(encoding.lower(),detected_encoding['encoding'].lower())
@ -55,7 +56,8 @@ class TestCchardet():
def test_detect_bg_utf8(self): def test_detect_bg_utf8(self):
encoding = "UTF-8" encoding = "UTF-8"
path = r"testdata/bg/UTF-8/wikitop_bg_UTF-8.txt" path = r"testdata/bg/UTF-8/wikitop_bg_UTF-8.txt"
msg =file(path).read() with open(path) as f:
msg = f.read()
detected_encoding = cchardet.detect(msg) detected_encoding = cchardet.detect(msg)
print(detected_encoding) print(detected_encoding)
eq_(encoding.lower(),detected_encoding['encoding'].lower()) eq_(encoding.lower(),detected_encoding['encoding'].lower())
@ -63,7 +65,8 @@ class TestCchardet():
def test_detect_bg_windows1251(self): def test_detect_bg_windows1251(self):
encoding = "WINDOWS-1251" encoding = "WINDOWS-1251"
path = r"testdata/bg/WINDOWS-1251/wikitop_bg_WINDOWS-1251.txt" path = r"testdata/bg/WINDOWS-1251/wikitop_bg_WINDOWS-1251.txt"
msg =file(path).read() with open(path) as f:
msg = f.read()
detected_encoding = cchardet.detect(msg) detected_encoding = cchardet.detect(msg)
print(detected_encoding) print(detected_encoding)
eq_(encoding.lower(),detected_encoding['encoding'].lower()) eq_(encoding.lower(),detected_encoding['encoding'].lower())
@ -71,7 +74,8 @@ class TestCchardet():
def test_detect_cz_iso88592(self): def test_detect_cz_iso88592(self):
encoding = "ISO-8859-2" encoding = "ISO-8859-2"
path = r"testdata/cz/ISO-8859-2/wikitop_cz_ISO-8859-2.txt" path = r"testdata/cz/ISO-8859-2/wikitop_cz_ISO-8859-2.txt"
msg =file(path).read() with open(path) as f:
msg = f.read()
detected_encoding = cchardet.detect(msg) detected_encoding = cchardet.detect(msg)
print(detected_encoding) print(detected_encoding)
eq_(encoding.lower(),detected_encoding['encoding'].lower()) eq_(encoding.lower(),detected_encoding['encoding'].lower())
@ -79,7 +83,8 @@ class TestCchardet():
def test_detect_cz_utf8(self): def test_detect_cz_utf8(self):
encoding = "UTF-8" encoding = "UTF-8"
path = r"testdata/cz/UTF-8/wikitop_cz_UTF-8.txt" path = r"testdata/cz/UTF-8/wikitop_cz_UTF-8.txt"
msg =file(path).read() with open(path) as f:
msg = f.read()
detected_encoding = cchardet.detect(msg) detected_encoding = cchardet.detect(msg)
print(detected_encoding) print(detected_encoding)
eq_(encoding.lower(),detected_encoding['encoding'].lower()) eq_(encoding.lower(),detected_encoding['encoding'].lower())
@ -87,7 +92,8 @@ class TestCchardet():
def test_detect_de_utf8(self): def test_detect_de_utf8(self):
encoding = "UTF-8" encoding = "UTF-8"
path = r"testdata/de/UTF-8/wikitop_de_UTF-8.txt" path = r"testdata/de/UTF-8/wikitop_de_UTF-8.txt"
msg =file(path).read() with open(path) as f:
msg = f.read()
detected_encoding = cchardet.detect(msg) detected_encoding = cchardet.detect(msg)
print(detected_encoding) print(detected_encoding)
eq_(encoding.lower(),detected_encoding['encoding'].lower()) eq_(encoding.lower(),detected_encoding['encoding'].lower())
@ -95,7 +101,8 @@ class TestCchardet():
def test_detect_de_windows1252(self): def test_detect_de_windows1252(self):
encoding = "WINDOWS-1252" encoding = "WINDOWS-1252"
path = r"testdata/de/WINDOWS-1252/wikitop_de_WINDOWS-1252.txt" path = r"testdata/de/WINDOWS-1252/wikitop_de_WINDOWS-1252.txt"
msg =file(path).read() with open(path) as f:
msg = f.read()
detected_encoding = cchardet.detect(msg) detected_encoding = cchardet.detect(msg)
print(detected_encoding) print(detected_encoding)
eq_(encoding.lower(),detected_encoding['encoding'].lower()) eq_(encoding.lower(),detected_encoding['encoding'].lower())
@ -103,7 +110,8 @@ class TestCchardet():
def test_detect_dk_utf8(self): def test_detect_dk_utf8(self):
encoding = "UTF-8" encoding = "UTF-8"
path = r"testdata/dk/UTF-8/wikitop_dk_UTF-8.txt" path = r"testdata/dk/UTF-8/wikitop_dk_UTF-8.txt"
msg =file(path).read() with open(path) as f:
msg = f.read()
detected_encoding = cchardet.detect(msg) detected_encoding = cchardet.detect(msg)
print(detected_encoding) print(detected_encoding)
eq_(encoding.lower(),detected_encoding['encoding'].lower()) eq_(encoding.lower(),detected_encoding['encoding'].lower())
@ -111,7 +119,8 @@ class TestCchardet():
def test_detect_dk_windows1252(self): def test_detect_dk_windows1252(self):
encoding = "WINDOWS-1252" encoding = "WINDOWS-1252"
path = r"testdata/dk/WINDOWS-1252/wikitop_dk_WINDOWS-1252.txt" path = r"testdata/dk/WINDOWS-1252/wikitop_dk_WINDOWS-1252.txt"
msg =file(path).read() with open(path) as f:
msg = f.read()
detected_encoding = cchardet.detect(msg) detected_encoding = cchardet.detect(msg)
print(detected_encoding) print(detected_encoding)
eq_(encoding.lower(),detected_encoding['encoding'].lower()) eq_(encoding.lower(),detected_encoding['encoding'].lower())
@ -119,7 +128,8 @@ class TestCchardet():
def test_detect_el_iso88597(self): def test_detect_el_iso88597(self):
encoding = "ISO-8859-7" encoding = "ISO-8859-7"
path = r"testdata/el/ISO-8859-7/wikitop_el_ISO-8859-7.txt" path = r"testdata/el/ISO-8859-7/wikitop_el_ISO-8859-7.txt"
msg =file(path).read() with open(path) as f:
msg = f.read()
detected_encoding = cchardet.detect(msg) detected_encoding = cchardet.detect(msg)
print(detected_encoding) print(detected_encoding)
eq_(encoding.lower(),detected_encoding['encoding'].lower()) eq_(encoding.lower(),detected_encoding['encoding'].lower())
@ -127,7 +137,8 @@ class TestCchardet():
def test_detect_el_utf8(self): def test_detect_el_utf8(self):
encoding = "UTF-8" encoding = "UTF-8"
path = r"testdata/el/UTF-8/wikitop_el_UTF-8.txt" path = r"testdata/el/UTF-8/wikitop_el_UTF-8.txt"
msg =file(path).read() with open(path) as f:
msg = f.read()
detected_encoding = cchardet.detect(msg) detected_encoding = cchardet.detect(msg)
print(detected_encoding) print(detected_encoding)
eq_(encoding.lower(),detected_encoding['encoding'].lower()) eq_(encoding.lower(),detected_encoding['encoding'].lower())
@ -135,7 +146,8 @@ class TestCchardet():
def test_detect_en_utf8(self): def test_detect_en_utf8(self):
encoding = "UTF-8" encoding = "UTF-8"
path = r"testdata/en/UTF-8/wikitop_en_UTF-8.txt" path = r"testdata/en/UTF-8/wikitop_en_UTF-8.txt"
msg =file(path).read() with open(path) as f:
msg = f.read()
detected_encoding = cchardet.detect(msg) detected_encoding = cchardet.detect(msg)
print(detected_encoding) print(detected_encoding)
eq_(encoding.lower(),detected_encoding['encoding'].lower()) eq_(encoding.lower(),detected_encoding['encoding'].lower())
@ -143,7 +155,8 @@ class TestCchardet():
def test_detect_en_windows1252(self): def test_detect_en_windows1252(self):
encoding = "WINDOWS-1252" encoding = "WINDOWS-1252"
path = r"testdata/en/WINDOWS-1252/wikitop_en_WINDOWS-1252.txt" path = r"testdata/en/WINDOWS-1252/wikitop_en_WINDOWS-1252.txt"
msg =file(path).read() with open(path) as f:
msg = f.read()
detected_encoding = cchardet.detect(msg) detected_encoding = cchardet.detect(msg)
print(detected_encoding) print(detected_encoding)
eq_(encoding.lower(),detected_encoding['encoding'].lower()) eq_(encoding.lower(),detected_encoding['encoding'].lower())
@ -151,7 +164,8 @@ class TestCchardet():
def test_detect_es_utf8(self): def test_detect_es_utf8(self):
encoding = "UTF-8" encoding = "UTF-8"
path = r"testdata/es/UTF-8/wikitop_es_UTF-8.txt" path = r"testdata/es/UTF-8/wikitop_es_UTF-8.txt"
msg =file(path).read() with open(path) as f:
msg = f.read()
detected_encoding = cchardet.detect(msg) detected_encoding = cchardet.detect(msg)
print(detected_encoding) print(detected_encoding)
eq_(encoding.lower(),detected_encoding['encoding'].lower()) eq_(encoding.lower(),detected_encoding['encoding'].lower())
@ -159,7 +173,8 @@ class TestCchardet():
def test_detect_es_windows1252(self): def test_detect_es_windows1252(self):
encoding = "WINDOWS-1252" encoding = "WINDOWS-1252"
path = r"testdata/es/WINDOWS-1252/wikitop_es_WINDOWS-1252.txt" path = r"testdata/es/WINDOWS-1252/wikitop_es_WINDOWS-1252.txt"
msg =file(path).read() with open(path) as f:
msg = f.read()
detected_encoding = cchardet.detect(msg) detected_encoding = cchardet.detect(msg)
print(detected_encoding) print(detected_encoding)
eq_(encoding.lower(),detected_encoding['encoding'].lower()) eq_(encoding.lower(),detected_encoding['encoding'].lower())
@ -167,7 +182,8 @@ class TestCchardet():
def test_detect_fi_utf8(self): def test_detect_fi_utf8(self):
encoding = "UTF-8" encoding = "UTF-8"
path = r"testdata/fi/UTF-8/wikitop_fi_UTF-8.txt" path = r"testdata/fi/UTF-8/wikitop_fi_UTF-8.txt"
msg =file(path).read() with open(path) as f:
msg = f.read()
detected_encoding = cchardet.detect(msg) detected_encoding = cchardet.detect(msg)
print(detected_encoding) print(detected_encoding)
eq_(encoding.lower(),detected_encoding['encoding'].lower()) eq_(encoding.lower(),detected_encoding['encoding'].lower())
@ -175,7 +191,8 @@ class TestCchardet():
def test_detect_fi_windows1252(self): def test_detect_fi_windows1252(self):
encoding = "WINDOWS-1252" encoding = "WINDOWS-1252"
path = r"testdata/fi/WINDOWS-1252/wikitop_fi_WINDOWS-1252.txt" path = r"testdata/fi/WINDOWS-1252/wikitop_fi_WINDOWS-1252.txt"
msg =file(path).read() with open(path) as f:
msg = f.read()
detected_encoding = cchardet.detect(msg) detected_encoding = cchardet.detect(msg)
print(detected_encoding) print(detected_encoding)
eq_(encoding.lower(),detected_encoding['encoding'].lower()) eq_(encoding.lower(),detected_encoding['encoding'].lower())
@ -183,7 +200,8 @@ class TestCchardet():
def test_detect_fr_utf8(self): def test_detect_fr_utf8(self):
encoding = "UTF-8" encoding = "UTF-8"
path = r"testdata/fr/UTF-8/wikitop_fr_UTF-8.txt" path = r"testdata/fr/UTF-8/wikitop_fr_UTF-8.txt"
msg =file(path).read() with open(path) as f:
msg = f.read()
detected_encoding = cchardet.detect(msg) detected_encoding = cchardet.detect(msg)
print(detected_encoding) print(detected_encoding)
eq_(encoding.lower(),detected_encoding['encoding'].lower()) eq_(encoding.lower(),detected_encoding['encoding'].lower())
@ -191,7 +209,8 @@ class TestCchardet():
def test_detect_fr_windows1252(self): def test_detect_fr_windows1252(self):
encoding = "WINDOWS-1252" encoding = "WINDOWS-1252"
path = r"testdata/fr/WINDOWS-1252/wikitop_fr_WINDOWS-1252.txt" path = r"testdata/fr/WINDOWS-1252/wikitop_fr_WINDOWS-1252.txt"
msg =file(path).read() with open(path) as f:
msg = f.read()
detected_encoding = cchardet.detect(msg) detected_encoding = cchardet.detect(msg)
print(detected_encoding) print(detected_encoding)
eq_(encoding.lower(),detected_encoding['encoding'].lower()) eq_(encoding.lower(),detected_encoding['encoding'].lower())
@ -199,7 +218,8 @@ class TestCchardet():
def test_detect_he_utf8(self): def test_detect_he_utf8(self):
encoding = "UTF-8" encoding = "UTF-8"
path = r"testdata/he/UTF-8/wikitop_he_UTF-8.txt" path = r"testdata/he/UTF-8/wikitop_he_UTF-8.txt"
msg =file(path).read() with open(path) as f:
msg = f.read()
detected_encoding = cchardet.detect(msg) detected_encoding = cchardet.detect(msg)
print(detected_encoding) print(detected_encoding)
eq_(encoding.lower(),detected_encoding['encoding'].lower()) eq_(encoding.lower(),detected_encoding['encoding'].lower())
@ -207,7 +227,8 @@ class TestCchardet():
def test_detect_he_windows1255(self): def test_detect_he_windows1255(self):
encoding = "WINDOWS-1255" encoding = "WINDOWS-1255"
path = r"testdata/he/WINDOWS-1255/wikitop_he_WINDOWS-1255.txt" path = r"testdata/he/WINDOWS-1255/wikitop_he_WINDOWS-1255.txt"
msg =file(path).read() with open(path) as f:
msg = f.read()
detected_encoding = cchardet.detect(msg) detected_encoding = cchardet.detect(msg)
print(detected_encoding) print(detected_encoding)
eq_(encoding.lower(),detected_encoding['encoding'].lower()) eq_(encoding.lower(),detected_encoding['encoding'].lower())
@ -215,7 +236,8 @@ class TestCchardet():
def test_detect_hu_utf8(self): def test_detect_hu_utf8(self):
encoding = "UTF-8" encoding = "UTF-8"
path = r"testdata/hu/UTF-8/wikitop_hu_UTF-8.txt" path = r"testdata/hu/UTF-8/wikitop_hu_UTF-8.txt"
msg =file(path).read() with open(path) as f:
msg = f.read()
detected_encoding = cchardet.detect(msg) detected_encoding = cchardet.detect(msg)
print(detected_encoding) print(detected_encoding)
eq_(encoding.lower(),detected_encoding['encoding'].lower()) eq_(encoding.lower(),detected_encoding['encoding'].lower())
@ -223,7 +245,8 @@ class TestCchardet():
def test_detect_hu_iso55892(self): def test_detect_hu_iso55892(self):
encoding = "ISO-8859-2" encoding = "ISO-8859-2"
path = r"testdata/hu/ISO-8859-2/wikitop_hu_ISO-8859-2.txt" path = r"testdata/hu/ISO-8859-2/wikitop_hu_ISO-8859-2.txt"
msg =file(path).read() with open(path) as f:
msg = f.read()
detected_encoding = cchardet.detect(msg) detected_encoding = cchardet.detect(msg)
print(detected_encoding) print(detected_encoding)
eq_(encoding.lower(),detected_encoding['encoding'].lower()) eq_(encoding.lower(),detected_encoding['encoding'].lower())
@ -231,7 +254,8 @@ class TestCchardet():
def test_detect_it_utf8(self): def test_detect_it_utf8(self):
encoding = "UTF-8" encoding = "UTF-8"
path = r"testdata/it/UTF-8/wikitop_it_UTF-8.txt" path = r"testdata/it/UTF-8/wikitop_it_UTF-8.txt"
msg =file(path).read() with open(path) as f:
msg = f.read()
detected_encoding = cchardet.detect(msg) detected_encoding = cchardet.detect(msg)
print(detected_encoding) print(detected_encoding)
eq_(encoding.lower(),detected_encoding['encoding'].lower()) eq_(encoding.lower(),detected_encoding['encoding'].lower())
@ -239,7 +263,8 @@ class TestCchardet():
def test_detect_it_windows1252(self): def test_detect_it_windows1252(self):
encoding = "WINDOWS-1252" encoding = "WINDOWS-1252"
path = r"testdata/it/WINDOWS-1252/wikitop_it_WINDOWS-1252.txt" path = r"testdata/it/WINDOWS-1252/wikitop_it_WINDOWS-1252.txt"
msg =file(path).read() with open(path) as f:
msg = f.read()
detected_encoding = cchardet.detect(msg) detected_encoding = cchardet.detect(msg)
print(detected_encoding) print(detected_encoding)
eq_(encoding.lower(),detected_encoding['encoding'].lower()) eq_(encoding.lower(),detected_encoding['encoding'].lower())
@ -247,7 +272,8 @@ class TestCchardet():
def test_detect_nl_utf8(self): def test_detect_nl_utf8(self):
encoding = "UTF-8" encoding = "UTF-8"
path = r"testdata/nl/UTF-8/wikitop_nl_UTF-8.txt" path = r"testdata/nl/UTF-8/wikitop_nl_UTF-8.txt"
msg =file(path).read() with open(path) as f:
msg = f.read()
detected_encoding = cchardet.detect(msg) detected_encoding = cchardet.detect(msg)
print(detected_encoding) print(detected_encoding)
eq_(encoding.lower(),detected_encoding['encoding'].lower()) eq_(encoding.lower(),detected_encoding['encoding'].lower())
@ -255,7 +281,8 @@ class TestCchardet():
def test_detect_nl_windows1252(self): def test_detect_nl_windows1252(self):
encoding = "WINDOWS-1252" encoding = "WINDOWS-1252"
path = r"testdata/nl/WINDOWS-1252/wikitop_nl_WINDOWS-1252.txt" path = r"testdata/nl/WINDOWS-1252/wikitop_nl_WINDOWS-1252.txt"
msg =file(path).read() with open(path) as f:
msg = f.read()
detected_encoding = cchardet.detect(msg) detected_encoding = cchardet.detect(msg)
print(detected_encoding) print(detected_encoding)
eq_(encoding.lower(),detected_encoding['encoding'].lower()) eq_(encoding.lower(),detected_encoding['encoding'].lower())
@ -263,7 +290,8 @@ class TestCchardet():
def test_detect_no_utf8(self): def test_detect_no_utf8(self):
encoding = "UTF-8" encoding = "UTF-8"
path = r"testdata/no/UTF-8/wikitop_no_UTF-8.txt" path = r"testdata/no/UTF-8/wikitop_no_UTF-8.txt"
msg =file(path).read() with open(path) as f:
msg = f.read()
detected_encoding = cchardet.detect(msg) detected_encoding = cchardet.detect(msg)
print(detected_encoding) print(detected_encoding)
eq_(encoding.lower(),detected_encoding['encoding'].lower()) eq_(encoding.lower(),detected_encoding['encoding'].lower())
@ -271,7 +299,8 @@ class TestCchardet():
def test_detect_no_windows1252(self): def test_detect_no_windows1252(self):
encoding = "WINDOWS-1252" encoding = "WINDOWS-1252"
path = r"testdata/no/WINDOWS-1252/wikitop_no_WINDOWS-1252.txt" path = r"testdata/no/WINDOWS-1252/wikitop_no_WINDOWS-1252.txt"
msg =file(path).read() with open(path) as f:
msg = f.read()
detected_encoding = cchardet.detect(msg) detected_encoding = cchardet.detect(msg)
print(detected_encoding) print(detected_encoding)
eq_(encoding.lower(),detected_encoding['encoding'].lower()) eq_(encoding.lower(),detected_encoding['encoding'].lower())
@ -279,7 +308,8 @@ class TestCchardet():
def test_detect_pl_utf8(self): def test_detect_pl_utf8(self):
encoding = "UTF-8" encoding = "UTF-8"
path = r"testdata/pl/UTF-8/wikitop_pl_UTF-8.txt" path = r"testdata/pl/UTF-8/wikitop_pl_UTF-8.txt"
msg =file(path).read() with open(path) as f:
msg = f.read()
detected_encoding = cchardet.detect(msg) detected_encoding = cchardet.detect(msg)
print(detected_encoding) print(detected_encoding)
eq_(encoding.lower(),detected_encoding['encoding'].lower()) eq_(encoding.lower(),detected_encoding['encoding'].lower())
@ -287,7 +317,8 @@ class TestCchardet():
def test_detect_pl_iso88592(self): def test_detect_pl_iso88592(self):
encoding = "ISO-8859-2" encoding = "ISO-8859-2"
path = r"testdata/pl/ISO-8859-2/wikitop_pl_ISO-8859-2.txt" path = r"testdata/pl/ISO-8859-2/wikitop_pl_ISO-8859-2.txt"
msg =file(path).read() with open(path) as f:
msg = f.read()
detected_encoding = cchardet.detect(msg) detected_encoding = cchardet.detect(msg)
print(detected_encoding) print(detected_encoding)
eq_(encoding.lower(),detected_encoding['encoding'].lower()) eq_(encoding.lower(),detected_encoding['encoding'].lower())
@ -295,7 +326,8 @@ class TestCchardet():
def test_detect_pt_utf8(self): def test_detect_pt_utf8(self):
encoding = "UTF-8" encoding = "UTF-8"
path = r"testdata/pt/UTF-8/wikitop_pt_UTF-8.txt" path = r"testdata/pt/UTF-8/wikitop_pt_UTF-8.txt"
msg =file(path).read() with open(path) as f:
msg = f.read()
detected_encoding = cchardet.detect(msg) detected_encoding = cchardet.detect(msg)
print(detected_encoding) print(detected_encoding)
eq_(encoding.lower(),detected_encoding['encoding'].lower()) eq_(encoding.lower(),detected_encoding['encoding'].lower())
@ -303,7 +335,8 @@ class TestCchardet():
def test_detect_pt_windows1252(self): def test_detect_pt_windows1252(self):
encoding = "WINDOWS-1252" encoding = "WINDOWS-1252"
path = r"testdata/pt/WINDOWS-1252/wikitop_pt_WINDOWS-1252.txt" path = r"testdata/pt/WINDOWS-1252/wikitop_pt_WINDOWS-1252.txt"
msg =file(path).read() with open(path) as f:
msg = f.read()
detected_encoding = cchardet.detect(msg) detected_encoding = cchardet.detect(msg)
print(detected_encoding) print(detected_encoding)
eq_(encoding.lower(),detected_encoding['encoding'].lower()) eq_(encoding.lower(),detected_encoding['encoding'].lower())
@ -311,7 +344,8 @@ class TestCchardet():
def test_detect_ru_utf8(self): def test_detect_ru_utf8(self):
encoding = "UTF-8" encoding = "UTF-8"
path = r"testdata/ru/UTF-8/wikitop_ru_UTF-8.txt" path = r"testdata/ru/UTF-8/wikitop_ru_UTF-8.txt"
msg =file(path).read() with open(path) as f:
msg = f.read()
detected_encoding = cchardet.detect(msg) detected_encoding = cchardet.detect(msg)
print(detected_encoding) print(detected_encoding)
eq_(encoding.lower(),detected_encoding['encoding'].lower()) eq_(encoding.lower(),detected_encoding['encoding'].lower())
@ -319,7 +353,8 @@ class TestCchardet():
def test_detect_ru_windows1251(self): def test_detect_ru_windows1251(self):
encoding = "WINDOWS-1251" encoding = "WINDOWS-1251"
path = r"testdata/ru/WINDOWS-1251/wikitop_ru_WINDOWS-1251.txt" path = r"testdata/ru/WINDOWS-1251/wikitop_ru_WINDOWS-1251.txt"
msg =file(path).read() with open(path) as f:
msg = f.read()
detected_encoding = cchardet.detect(msg) detected_encoding = cchardet.detect(msg)
print(detected_encoding) print(detected_encoding)
eq_(encoding.lower(),detected_encoding['encoding'].lower()) eq_(encoding.lower(),detected_encoding['encoding'].lower())
@ -327,7 +362,8 @@ class TestCchardet():
def test_detect_ru_ibm855(self): def test_detect_ru_ibm855(self):
encoding = "IBM855" encoding = "IBM855"
path = r"testdata/ru/IBM855/wikitop_ru_IBM855.txt" path = r"testdata/ru/IBM855/wikitop_ru_IBM855.txt"
msg =file(path).read() with open(path) as f:
msg = f.read()
detected_encoding = cchardet.detect(msg) detected_encoding = cchardet.detect(msg)
print(detected_encoding) print(detected_encoding)
eq_(encoding.lower(),detected_encoding['encoding'].lower()) eq_(encoding.lower(),detected_encoding['encoding'].lower())
@ -335,7 +371,8 @@ class TestCchardet():
def test_detect_ru_koi8r(self): def test_detect_ru_koi8r(self):
encoding = "KOI8-R" encoding = "KOI8-R"
path = r"testdata/ru/KOI8-R/wikitop_ru_KOI8-R.txt" path = r"testdata/ru/KOI8-R/wikitop_ru_KOI8-R.txt"
msg =file(path).read() with open(path) as f:
msg = f.read()
detected_encoding = cchardet.detect(msg) detected_encoding = cchardet.detect(msg)
print(detected_encoding) print(detected_encoding)
eq_(encoding.lower(),detected_encoding['encoding'].lower()) eq_(encoding.lower(),detected_encoding['encoding'].lower())
@ -343,7 +380,8 @@ class TestCchardet():
def test_detect_se_utf8(self): def test_detect_se_utf8(self):
encoding = "UTF-8" encoding = "UTF-8"
path = r"testdata/se/UTF-8/wikitop_se_UTF-8.txt" path = r"testdata/se/UTF-8/wikitop_se_UTF-8.txt"
msg =file(path).read() with open(path) as f:
msg = f.read()
detected_encoding = cchardet.detect(msg) detected_encoding = cchardet.detect(msg)
print(detected_encoding) print(detected_encoding)
eq_(encoding.lower(),detected_encoding['encoding'].lower()) eq_(encoding.lower(),detected_encoding['encoding'].lower())
@ -351,7 +389,8 @@ class TestCchardet():
def test_detect_se_windows1252(self): def test_detect_se_windows1252(self):
encoding = "WINDOWS-1252" encoding = "WINDOWS-1252"
path = r"testdata/se/WINDOWS-1252/wikitop_se_WINDOWS-1252.txt" path = r"testdata/se/WINDOWS-1252/wikitop_se_WINDOWS-1252.txt"
msg =file(path).read() with open(path) as f:
msg = f.read()
detected_encoding = cchardet.detect(msg) detected_encoding = cchardet.detect(msg)
print(detected_encoding) print(detected_encoding)
eq_(encoding.lower(),detected_encoding['encoding'].lower()) eq_(encoding.lower(),detected_encoding['encoding'].lower())
@ -359,7 +398,8 @@ class TestCchardet():
def test_detect_th_utf8(self): def test_detect_th_utf8(self):
encoding = "UTF-8" encoding = "UTF-8"
path = r"testdata/th/UTF-8/wikitop_th_UTF-8.txt" path = r"testdata/th/UTF-8/wikitop_th_UTF-8.txt"
msg =file(path).read() with open(path) as f:
msg = f.read()
detected_encoding = cchardet.detect(msg) detected_encoding = cchardet.detect(msg)
print(detected_encoding) print(detected_encoding)
eq_(encoding.lower(),detected_encoding['encoding'].lower()) eq_(encoding.lower(),detected_encoding['encoding'].lower())
@ -367,7 +407,8 @@ class TestCchardet():
def test_detect_th_tis620_1(self): def test_detect_th_tis620_1(self):
encoding = "TIS-620" encoding = "TIS-620"
path = r"testdata/th/TIS-620/utffool_th_TIS-620.txt" path = r"testdata/th/TIS-620/utffool_th_TIS-620.txt"
msg =file(path).read() with open(path) as f:
msg = f.read()
detected_encoding = cchardet.detect(msg) detected_encoding = cchardet.detect(msg)
print(detected_encoding) print(detected_encoding)
eq_(encoding.lower(),detected_encoding['encoding'].lower()) eq_(encoding.lower(),detected_encoding['encoding'].lower())
@ -375,7 +416,8 @@ class TestCchardet():
def test_detect_th_tis620_2(self): def test_detect_th_tis620_2(self):
encoding = "TIS-620" encoding = "TIS-620"
path = r"testdata/th/TIS-620/wikitop_th_TIS-620.txt" path = r"testdata/th/TIS-620/wikitop_th_TIS-620.txt"
msg =file(path).read() with open(path) as f:
msg = f.read()
detected_encoding = cchardet.detect(msg) detected_encoding = cchardet.detect(msg)
print(detected_encoding) print(detected_encoding)
eq_(encoding.lower(),detected_encoding['encoding'].lower()) eq_(encoding.lower(),detected_encoding['encoding'].lower())
@ -383,7 +425,8 @@ class TestCchardet():
def test_detect_tr_utf8(self): def test_detect_tr_utf8(self):
encoding = "UTF-8" encoding = "UTF-8"
path = r"testdata/tr/UTF-8/wikitop_tr_UTF-8.txt" path = r"testdata/tr/UTF-8/wikitop_tr_UTF-8.txt"
msg =file(path).read() with open(path) as f:
msg = f.read()
detected_encoding = cchardet.detect(msg) detected_encoding = cchardet.detect(msg)
print(detected_encoding) print(detected_encoding)
eq_(encoding.lower(),detected_encoding['encoding'].lower()) eq_(encoding.lower(),detected_encoding['encoding'].lower())
@ -391,7 +434,8 @@ class TestCchardet():
def test_detect_tr_iso88599(self): def test_detect_tr_iso88599(self):
encoding = "ISO-8859-9" encoding = "ISO-8859-9"
path = r"testdata/tr/ISO-8859-9/wikitop_tr_ISO-8859-9.txt" path = r"testdata/tr/ISO-8859-9/wikitop_tr_ISO-8859-9.txt"
msg =file(path).read() with open(path) as f:
msg = f.read()
detected_encoding = cchardet.detect(msg) detected_encoding = cchardet.detect(msg)
print(detected_encoding) print(detected_encoding)
eq_(encoding.lower(),detected_encoding['encoding'].lower()) eq_(encoding.lower(),detected_encoding['encoding'].lower())
@ -399,7 +443,8 @@ class TestCchardet():
def test_detect_zh_utf8(self): def test_detect_zh_utf8(self):
encoding = "UTF-8" encoding = "UTF-8"
path = r"testdata/zh/UTF-8/wikitop_zh_UTF-8.txt" path = r"testdata/zh/UTF-8/wikitop_zh_UTF-8.txt"
msg =file(path).read() with open(path) as f:
msg = f.read()
detected_encoding = cchardet.detect(msg) detected_encoding = cchardet.detect(msg)
print(detected_encoding) print(detected_encoding)
eq_(encoding.lower(),detected_encoding['encoding'].lower()) eq_(encoding.lower(),detected_encoding['encoding'].lower())
@ -407,7 +452,8 @@ class TestCchardet():
def test_detect_zh_gb18030(self): def test_detect_zh_gb18030(self):
encoding = "GB18030" encoding = "GB18030"
path = r"testdata/zh/GB18030/wikitop_zh_GB18030.txt" path = r"testdata/zh/GB18030/wikitop_zh_GB18030.txt"
msg =file(path).read() with open(path) as f:
msg = f.read()
detected_encoding = cchardet.detect(msg) detected_encoding = cchardet.detect(msg)
print(detected_encoding) print(detected_encoding)
eq_(encoding.lower(),detected_encoding['encoding'].lower()) eq_(encoding.lower(),detected_encoding['encoding'].lower())
@ -416,18 +462,21 @@ class TestCchardetSpeed():
def test_speed(self): def test_speed(self):
import chardet import chardet
import time import time
do_times = 5
path = r"testdata/wikipediaJa_One_Thousand_and_One_Nights_SJIS.txt" path = r"testdata/wikipediaJa_One_Thousand_and_One_Nights_SJIS.txt"
msg =file(path).read() with open(path) as f:
msg = f.read()
# Test chardet # Test chardet
start_chardet = time.time() result_chardet = 0
detected_encoding_chardet = chardet.detect(msg) for i in xrange(do_times):
end_chardet = time.time() start_chardet = time.time()
chardet.detect(msg)
result_chardet += (time.time() - start_chardet)
# Test cchardet # Test cchardet
start_cchardet = time.time() result_cchardet = 0
detected_encoding_cchardet = cchardet.detect(msg) for i in xrange(do_times):
end_cchardet = time.time() start_cchardet = time.time()
# print result cchardet.detect(msg)
result_chardet = end_chardet - start_chardet result_cchardet += (time.time() - start_cchardet)
result_cchardet = end_cchardet - start_cchardet print('chardet:',1/(result_chardet/do_times), 'call(s)/s')
print("chardet:",result_chardet,"detected charset:", detected_encoding_chardet['encoding'].lower(), "confidence:", detected_encoding_chardet['confidence']) print('chardet:',1/(result_cchardet/do_times), 'call(s)/s')
print("cchardet:",result_cchardet,"detected charset:",detected_encoding_cchardet['encoding'].lower(), "confidence:", detected_encoding_cchardet['confidence'])