From 27f61923be5bf49198f146b6121c81f593351cdb Mon Sep 17 00:00:00 2001 From: PyYoshi Date: Thu, 5 Jul 2012 12:05:11 +0900 Subject: [PATCH] add "cchardet.detect_with_confidence" method. --- readme.md | 2 + setup.py | 3 +- src/cchardet/__init__.py | 22 ++ src/cchardet/cchardet.pyx | 27 +++ src/ext/libcharsetdetect/charsetdetect.cpp | 26 ++- src/ext/libcharsetdetect/charsetdetect.h | 4 +- src/ext/libcharsetdetect/charsetdetectPriv.h | 2 + .../src/base/nsUniversalDetector.cpp | 53 ++++- .../src/base/nsUniversalDetector.h | 2 + test/tests.py | 193 +++++++++--------- 10 files changed, 235 insertions(+), 99 deletions(-) diff --git a/readme.md b/readme.md index 6d97606..51c6b09 100644 --- a/readme.md +++ b/readme.md @@ -61,6 +61,8 @@ import cchardet msg = file(r"test/testdata/wikipediaJa_One_Thousand_and_One_Nights_SJIS.txt").read() result = cchardet.detect(msg) print(result) +result2 = cchardet.detect_with_confidence(msg) +print(result2) ``` # Test diff --git a/setup.py b/setup.py index 991e6c5..aea9df6 100644 --- a/setup.py +++ b/setup.py @@ -82,7 +82,8 @@ setup( long_description= """This library is high speed universal character encoding detector. - binding to charsetdetect. This library is faster than chardet. """, - version = '0.1', + version = '0.2', + license = 'MIT License', classifiers = [ # http://pypi.python.org/pypi?:action=list_classifiers 'Development Status :: 1 - Planning', 'License :: OSI Approved :: MIT License', diff --git a/src/cchardet/__init__.py b/src/cchardet/__init__.py index 3a29029..1b7fcab 100644 --- a/src/cchardet/__init__.py +++ b/src/cchardet/__init__.py @@ -4,4 +4,26 @@ from cchardet import _cchardet def detect(msg): + """ + Args: + msg: str + Returns: + charset: str + Warnings: + UserWarning + """ return _cchardet.detect(msg) + +def detect_with_confidence(msg): + """ + Args: + msg: str + Returns: + { + "encoding": str, + "confidence": float + } + Warnings: + UserWarning + """ + return _cchardet.detect_with_confidence(msg) \ No newline at end of file diff --git a/src/cchardet/cchardet.pyx b/src/cchardet/cchardet.pyx index 5483d73..f59b138 100644 --- a/src/cchardet/cchardet.pyx +++ b/src/cchardet/cchardet.pyx @@ -17,6 +17,7 @@ cdef extern from "charsetdetect.h": cdef csd_t csd_open() cdef int csd_consider(csd_t csd, char* data, int length) cdef const_char_ptr csd_close(csd_t csd) + cdef const_char_ptr csd_close2(csd_t csd, float *confidence) def detect(char *msg): cdef csd_t csd = csd_open() @@ -30,3 +31,29 @@ def detect(char *msg): return csd_close(csd) elif result == 0: # Detected early return csd_close(csd) + +def detect_with_confidence(char *msg): + cdef csd_t csd = csd_open() + cdef int length = strlen(msg) + cdef int result = csd_consider(csd, msg, length) + cdef float confidence + cdef const_char_ptr detected_charset + # ref: charsetdetect.cpp + if result == -1: # Error, signal with a negative number + raise Exception("Error, signal with a negative number") + elif result == 1: # Need more data + warnings.warn("Need more data",UserWarning) + detected_charset = csd_close2(csd, &confidence) + ret = { + "encoding":detected_charset, + "confidence":confidence + } + return ret + elif result == 0: # Detected early + detected_charset = csd_close2(csd, &confidence) + ret = { + "encoding":detected_charset, + "confidence":confidence + } + return ret + diff --git a/src/ext/libcharsetdetect/charsetdetect.cpp b/src/ext/libcharsetdetect/charsetdetect.cpp index 7b9de6d..4117652 100644 --- a/src/ext/libcharsetdetect/charsetdetect.cpp +++ b/src/ext/libcharsetdetect/charsetdetect.cpp @@ -48,7 +48,6 @@ const char *Detector::Close(void) { return mDetectedCharset; } - // // C API to the character set detector (we actually export this) // @@ -68,3 +67,28 @@ const char *csd_close(csd_t csd) { delete ((Detector*)csd); return result; } + +///* +const char *Detector::Close2(float *confidence) { + DataEnd2(confidence); + + if (!mDone) { + if (mInputState == eEscAscii) { + return "ibm850"; + } + else if (mInputState == ePureAscii) { + return "ASCII"; + } + + return NULL; + } + + return mDetectedCharset; +} + +const char *csd_close2(csd_t csd,float *confidence) { + const char *result = ((Detector*)csd)->Close2(confidence); + delete ((Detector*)csd); + return result; +} +//*/ \ No newline at end of file diff --git a/src/ext/libcharsetdetect/charsetdetect.h b/src/ext/libcharsetdetect/charsetdetect.h index c93f2e0..6cfd1d0 100644 --- a/src/ext/libcharsetdetect/charsetdetect.h +++ b/src/ext/libcharsetdetect/charsetdetect.h @@ -21,7 +21,9 @@ int csd_consider(csd_t csd, const char *data, int length); // Closes the character set detector and returns the detected character set name as an ASCII string. // Returns NULL if detection failed. const char *csd_close(csd_t csd); - + +const char *csd_close2(csd_t csd,float *confidence); + #ifdef __cplusplus } #endif diff --git a/src/ext/libcharsetdetect/charsetdetectPriv.h b/src/ext/libcharsetdetect/charsetdetectPriv.h index 50a9d55..c3c2ecc 100644 --- a/src/ext/libcharsetdetect/charsetdetectPriv.h +++ b/src/ext/libcharsetdetect/charsetdetectPriv.h @@ -11,6 +11,8 @@ public: Detector(PRUint32 aLanguageFilter) : nsUniversalDetector(aLanguageFilter) {}; int Consider(const char *data, int length); const char *Close(void); + + const char *Close2(float *confidence); protected: void Report(const char* aCharset); const char *mDetectedCharset; diff --git a/src/ext/libcharsetdetect/mozilla/extensions/universalchardet/src/base/nsUniversalDetector.cpp b/src/ext/libcharsetdetect/mozilla/extensions/universalchardet/src/base/nsUniversalDetector.cpp index f3c63d2..d9e6bc4 100644 --- a/src/ext/libcharsetdetect/mozilla/extensions/universalchardet/src/base/nsUniversalDetector.cpp +++ b/src/ext/libcharsetdetect/mozilla/extensions/universalchardet/src/base/nsUniversalDetector.cpp @@ -228,7 +228,6 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) return NS_OK; } - //--------------------------------------------------------------------- void nsUniversalDetector::DataEnd() { @@ -278,3 +277,55 @@ void nsUniversalDetector::DataEnd() } return; } + +///* +void nsUniversalDetector::DataEnd2(float *confidence) +{ + if (!mGotData) + { + // we haven't got any data yet, return immediately + // caller program sometimes call DataEnd before anything has been sent to detector + return; + } + + if (mDetectedCharset) + { + mDone = PR_TRUE; + Report(mDetectedCharset); + return; + } + + switch (mInputState) + { + case eHighbyte: + { + float proberConfidence; + float maxProberConfidence = (float)0.0; + PRInt32 maxProber = 0; + + for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++) + { + if (mCharSetProbers[i]) + { + proberConfidence = mCharSetProbers[i]->GetConfidence(); + if (proberConfidence > maxProberConfidence) + { + maxProberConfidence = proberConfidence; + maxProber = i; + } + } + } + //do not report anything because we are not confident of it, that's in fact a negative answer + if (maxProberConfidence > MINIMUM_THRESHOLD) + Report(mCharSetProbers[maxProber]->GetCharSetName()); + *confidence = maxProberConfidence; + } + break; + case eEscAscii: + break; + default: + ; + } + return; +} +//*/ \ No newline at end of file diff --git a/src/ext/libcharsetdetect/mozilla/extensions/universalchardet/src/base/nsUniversalDetector.h b/src/ext/libcharsetdetect/mozilla/extensions/universalchardet/src/base/nsUniversalDetector.h index 525f722..598b813 100644 --- a/src/ext/libcharsetdetect/mozilla/extensions/universalchardet/src/base/nsUniversalDetector.h +++ b/src/ext/libcharsetdetect/mozilla/extensions/universalchardet/src/base/nsUniversalDetector.h @@ -68,6 +68,8 @@ public: virtual nsresult HandleData(const char* aBuf, PRUint32 aLen); virtual void DataEnd(void); + virtual void DataEnd2(float *confidence); + protected: virtual void Report(const char* aCharset) = 0; virtual void Reset(); diff --git a/test/tests.py b/test/tests.py index da2b3f0..9a55513 100644 --- a/test/tests.py +++ b/test/tests.py @@ -48,369 +48,369 @@ class TestCchardet(): encoding = "ISO-8859-5" path = r"testdata/bg/ISO-8859-5/wikitop_bg_ISO-8859-5.txt" msg =file(path).read() - detected_encoding = cchardet.detect(msg) + detected_encoding = cchardet.detect_with_confidence(msg) print(detected_encoding) - eq_(encoding.lower(),detected_encoding.lower()) + eq_(encoding.lower(),detected_encoding['encoding'].lower()) def test_detect_bg_utf8(self): encoding = "UTF-8" path = r"testdata/bg/UTF-8/wikitop_bg_UTF-8.txt" msg =file(path).read() - detected_encoding = cchardet.detect(msg) + detected_encoding = cchardet.detect_with_confidence(msg) print(detected_encoding) - eq_(encoding.lower(),detected_encoding.lower()) + eq_(encoding.lower(),detected_encoding['encoding'].lower()) def test_detect_bg_windows1251(self): encoding = "WINDOWS-1251" path = r"testdata/bg/WINDOWS-1251/wikitop_bg_WINDOWS-1251.txt" msg =file(path).read() - detected_encoding = cchardet.detect(msg) + detected_encoding = cchardet.detect_with_confidence(msg) print(detected_encoding) - eq_(encoding.lower(),detected_encoding.lower()) + eq_(encoding.lower(),detected_encoding['encoding'].lower()) def test_detect_cz_iso88592(self): encoding = "ISO-8859-2" path = r"testdata/cz/ISO-8859-2/wikitop_cz_ISO-8859-2.txt" msg =file(path).read() - detected_encoding = cchardet.detect(msg) + detected_encoding = cchardet.detect_with_confidence(msg) print(detected_encoding) - eq_(encoding.lower(),detected_encoding.lower()) + eq_(encoding.lower(),detected_encoding['encoding'].lower()) def test_detect_cz_utf8(self): encoding = "UTF-8" path = r"testdata/cz/UTF-8/wikitop_cz_UTF-8.txt" msg =file(path).read() - detected_encoding = cchardet.detect(msg) + detected_encoding = cchardet.detect_with_confidence(msg) print(detected_encoding) - eq_(encoding.lower(),detected_encoding.lower()) + eq_(encoding.lower(),detected_encoding['encoding'].lower()) def test_detect_de_utf8(self): encoding = "UTF-8" path = r"testdata/de/UTF-8/wikitop_de_UTF-8.txt" msg =file(path).read() - detected_encoding = cchardet.detect(msg) + detected_encoding = cchardet.detect_with_confidence(msg) print(detected_encoding) - eq_(encoding.lower(),detected_encoding.lower()) + eq_(encoding.lower(),detected_encoding['encoding'].lower()) def test_detect_de_windows1252(self): encoding = "WINDOWS-1252" path = r"testdata/de/WINDOWS-1252/wikitop_de_WINDOWS-1252.txt" msg =file(path).read() - detected_encoding = cchardet.detect(msg) + detected_encoding = cchardet.detect_with_confidence(msg) print(detected_encoding) - eq_(encoding.lower(),detected_encoding.lower()) + eq_(encoding.lower(),detected_encoding['encoding'].lower()) def test_detect_dk_utf8(self): encoding = "UTF-8" path = r"testdata/dk/UTF-8/wikitop_dk_UTF-8.txt" msg =file(path).read() - detected_encoding = cchardet.detect(msg) + detected_encoding = cchardet.detect_with_confidence(msg) print(detected_encoding) - eq_(encoding.lower(),detected_encoding.lower()) + eq_(encoding.lower(),detected_encoding['encoding'].lower()) def test_detect_dk_windows1252(self): encoding = "WINDOWS-1252" path = r"testdata/dk/WINDOWS-1252/wikitop_dk_WINDOWS-1252.txt" msg =file(path).read() - detected_encoding = cchardet.detect(msg) + detected_encoding = cchardet.detect_with_confidence(msg) print(detected_encoding) - eq_(encoding.lower(),detected_encoding.lower()) + eq_(encoding.lower(),detected_encoding['encoding'].lower()) def test_detect_el_iso88597(self): encoding = "ISO-8859-7" path = r"testdata/el/ISO-8859-7/wikitop_el_ISO-8859-7.txt" msg =file(path).read() - detected_encoding = cchardet.detect(msg) + detected_encoding = cchardet.detect_with_confidence(msg) print(detected_encoding) - eq_(encoding.lower(),detected_encoding.lower()) + eq_(encoding.lower(),detected_encoding['encoding'].lower()) def test_detect_el_utf8(self): encoding = "UTF-8" path = r"testdata/el/UTF-8/wikitop_el_UTF-8.txt" msg =file(path).read() - detected_encoding = cchardet.detect(msg) + detected_encoding = cchardet.detect_with_confidence(msg) print(detected_encoding) - eq_(encoding.lower(),detected_encoding.lower()) + eq_(encoding.lower(),detected_encoding['encoding'].lower()) def test_detect_en_utf8(self): encoding = "UTF-8" path = r"testdata/en/UTF-8/wikitop_en_UTF-8.txt" msg =file(path).read() - detected_encoding = cchardet.detect(msg) + detected_encoding = cchardet.detect_with_confidence(msg) print(detected_encoding) - eq_(encoding.lower(),detected_encoding.lower()) + eq_(encoding.lower(),detected_encoding['encoding'].lower()) def test_detect_en_windows1252(self): encoding = "WINDOWS-1252" path = r"testdata/en/WINDOWS-1252/wikitop_en_WINDOWS-1252.txt" msg =file(path).read() - detected_encoding = cchardet.detect(msg) + detected_encoding = cchardet.detect_with_confidence(msg) print(detected_encoding) - eq_(encoding.lower(),detected_encoding.lower()) + eq_(encoding.lower(),detected_encoding['encoding'].lower()) def test_detect_es_utf8(self): encoding = "UTF-8" path = r"testdata/es/UTF-8/wikitop_es_UTF-8.txt" msg =file(path).read() - detected_encoding = cchardet.detect(msg) + detected_encoding = cchardet.detect_with_confidence(msg) print(detected_encoding) - eq_(encoding.lower(),detected_encoding.lower()) + eq_(encoding.lower(),detected_encoding['encoding'].lower()) def test_detect_es_windows1252(self): encoding = "WINDOWS-1252" path = r"testdata/es/WINDOWS-1252/wikitop_es_WINDOWS-1252.txt" msg =file(path).read() - detected_encoding = cchardet.detect(msg) + detected_encoding = cchardet.detect_with_confidence(msg) print(detected_encoding) - eq_(encoding.lower(),detected_encoding.lower()) + eq_(encoding.lower(),detected_encoding['encoding'].lower()) def test_detect_fi_utf8(self): encoding = "UTF-8" path = r"testdata/fi/UTF-8/wikitop_fi_UTF-8.txt" msg =file(path).read() - detected_encoding = cchardet.detect(msg) + detected_encoding = cchardet.detect_with_confidence(msg) print(detected_encoding) - eq_(encoding.lower(),detected_encoding.lower()) + eq_(encoding.lower(),detected_encoding['encoding'].lower()) def test_detect_fi_windows1252(self): encoding = "WINDOWS-1252" path = r"testdata/fi/WINDOWS-1252/wikitop_fi_WINDOWS-1252.txt" msg =file(path).read() - detected_encoding = cchardet.detect(msg) + detected_encoding = cchardet.detect_with_confidence(msg) print(detected_encoding) - eq_(encoding.lower(),detected_encoding.lower()) + eq_(encoding.lower(),detected_encoding['encoding'].lower()) def test_detect_fr_utf8(self): encoding = "UTF-8" path = r"testdata/fr/UTF-8/wikitop_fr_UTF-8.txt" msg =file(path).read() - detected_encoding = cchardet.detect(msg) + detected_encoding = cchardet.detect_with_confidence(msg) print(detected_encoding) - eq_(encoding.lower(),detected_encoding.lower()) + eq_(encoding.lower(),detected_encoding['encoding'].lower()) def test_detect_fr_windows1252(self): encoding = "WINDOWS-1252" path = r"testdata/fr/WINDOWS-1252/wikitop_fr_WINDOWS-1252.txt" msg =file(path).read() - detected_encoding = cchardet.detect(msg) + detected_encoding = cchardet.detect_with_confidence(msg) print(detected_encoding) - eq_(encoding.lower(),detected_encoding.lower()) + eq_(encoding.lower(),detected_encoding['encoding'].lower()) def test_detect_he_utf8(self): encoding = "UTF-8" path = r"testdata/he/UTF-8/wikitop_he_UTF-8.txt" msg =file(path).read() - detected_encoding = cchardet.detect(msg) + detected_encoding = cchardet.detect_with_confidence(msg) print(detected_encoding) - eq_(encoding.lower(),detected_encoding.lower()) + eq_(encoding.lower(),detected_encoding['encoding'].lower()) def test_detect_he_windows1255(self): encoding = "WINDOWS-1255" path = r"testdata/he/WINDOWS-1255/wikitop_he_WINDOWS-1255.txt" msg =file(path).read() - detected_encoding = cchardet.detect(msg) + detected_encoding = cchardet.detect_with_confidence(msg) print(detected_encoding) - eq_(encoding.lower(),detected_encoding.lower()) + eq_(encoding.lower(),detected_encoding['encoding'].lower()) def test_detect_hu_utf8(self): encoding = "UTF-8" path = r"testdata/hu/UTF-8/wikitop_hu_UTF-8.txt" msg =file(path).read() - detected_encoding = cchardet.detect(msg) + detected_encoding = cchardet.detect_with_confidence(msg) print(detected_encoding) - eq_(encoding.lower(),detected_encoding.lower()) + eq_(encoding.lower(),detected_encoding['encoding'].lower()) def test_detect_hu_iso55892(self): encoding = "ISO-8859-2" path = r"testdata/hu/ISO-8859-2/wikitop_hu_ISO-8859-2.txt" msg =file(path).read() - detected_encoding = cchardet.detect(msg) + detected_encoding = cchardet.detect_with_confidence(msg) print(detected_encoding) - eq_(encoding.lower(),detected_encoding.lower()) + eq_(encoding.lower(),detected_encoding['encoding'].lower()) def test_detect_it_utf8(self): encoding = "UTF-8" path = r"testdata/it/UTF-8/wikitop_it_UTF-8.txt" msg =file(path).read() - detected_encoding = cchardet.detect(msg) + detected_encoding = cchardet.detect_with_confidence(msg) print(detected_encoding) - eq_(encoding.lower(),detected_encoding.lower()) + eq_(encoding.lower(),detected_encoding['encoding'].lower()) def test_detect_it_windows1252(self): encoding = "WINDOWS-1252" path = r"testdata/it/WINDOWS-1252/wikitop_it_WINDOWS-1252.txt" msg =file(path).read() - detected_encoding = cchardet.detect(msg) + detected_encoding = cchardet.detect_with_confidence(msg) print(detected_encoding) - eq_(encoding.lower(),detected_encoding.lower()) + eq_(encoding.lower(),detected_encoding['encoding'].lower()) def test_detect_nl_utf8(self): encoding = "UTF-8" path = r"testdata/nl/UTF-8/wikitop_nl_UTF-8.txt" msg =file(path).read() - detected_encoding = cchardet.detect(msg) + detected_encoding = cchardet.detect_with_confidence(msg) print(detected_encoding) - eq_(encoding.lower(),detected_encoding.lower()) + eq_(encoding.lower(),detected_encoding['encoding'].lower()) def test_detect_nl_windows1252(self): encoding = "WINDOWS-1252" path = r"testdata/nl/WINDOWS-1252/wikitop_nl_WINDOWS-1252.txt" msg =file(path).read() - detected_encoding = cchardet.detect(msg) + detected_encoding = cchardet.detect_with_confidence(msg) print(detected_encoding) - eq_(encoding.lower(),detected_encoding.lower()) + eq_(encoding.lower(),detected_encoding['encoding'].lower()) def test_detect_no_utf8(self): encoding = "UTF-8" path = r"testdata/no/UTF-8/wikitop_no_UTF-8.txt" msg =file(path).read() - detected_encoding = cchardet.detect(msg) + detected_encoding = cchardet.detect_with_confidence(msg) print(detected_encoding) - eq_(encoding.lower(),detected_encoding.lower()) + eq_(encoding.lower(),detected_encoding['encoding'].lower()) def test_detect_no_windows1252(self): encoding = "WINDOWS-1252" path = r"testdata/no/WINDOWS-1252/wikitop_no_WINDOWS-1252.txt" msg =file(path).read() - detected_encoding = cchardet.detect(msg) + detected_encoding = cchardet.detect_with_confidence(msg) print(detected_encoding) - eq_(encoding.lower(),detected_encoding.lower()) + eq_(encoding.lower(),detected_encoding['encoding'].lower()) def test_detect_pl_utf8(self): encoding = "UTF-8" path = r"testdata/pl/UTF-8/wikitop_pl_UTF-8.txt" msg =file(path).read() - detected_encoding = cchardet.detect(msg) + detected_encoding = cchardet.detect_with_confidence(msg) print(detected_encoding) - eq_(encoding.lower(),detected_encoding.lower()) + eq_(encoding.lower(),detected_encoding['encoding'].lower()) def test_detect_pl_iso88592(self): encoding = "ISO-8859-2" path = r"testdata/pl/ISO-8859-2/wikitop_pl_ISO-8859-2.txt" msg =file(path).read() - detected_encoding = cchardet.detect(msg) + detected_encoding = cchardet.detect_with_confidence(msg) print(detected_encoding) - eq_(encoding.lower(),detected_encoding.lower()) + eq_(encoding.lower(),detected_encoding['encoding'].lower()) def test_detect_pt_utf8(self): encoding = "UTF-8" path = r"testdata/pt/UTF-8/wikitop_pt_UTF-8.txt" msg =file(path).read() - detected_encoding = cchardet.detect(msg) + detected_encoding = cchardet.detect_with_confidence(msg) print(detected_encoding) - eq_(encoding.lower(),detected_encoding.lower()) + eq_(encoding.lower(),detected_encoding['encoding'].lower()) def test_detect_pt_windows1252(self): encoding = "WINDOWS-1252" path = r"testdata/pt/WINDOWS-1252/wikitop_pt_WINDOWS-1252.txt" msg =file(path).read() - detected_encoding = cchardet.detect(msg) + detected_encoding = cchardet.detect_with_confidence(msg) print(detected_encoding) - eq_(encoding.lower(),detected_encoding.lower()) + eq_(encoding.lower(),detected_encoding['encoding'].lower()) def test_detect_ru_utf8(self): encoding = "UTF-8" path = r"testdata/ru/UTF-8/wikitop_ru_UTF-8.txt" msg =file(path).read() - detected_encoding = cchardet.detect(msg) + detected_encoding = cchardet.detect_with_confidence(msg) print(detected_encoding) - eq_(encoding.lower(),detected_encoding.lower()) + eq_(encoding.lower(),detected_encoding['encoding'].lower()) def test_detect_ru_windows1251(self): encoding = "WINDOWS-1251" path = r"testdata/ru/WINDOWS-1251/wikitop_ru_WINDOWS-1251.txt" msg =file(path).read() - detected_encoding = cchardet.detect(msg) + detected_encoding = cchardet.detect_with_confidence(msg) print(detected_encoding) - eq_(encoding.lower(),detected_encoding.lower()) + eq_(encoding.lower(),detected_encoding['encoding'].lower()) def test_detect_ru_ibm855(self): encoding = "IBM855" path = r"testdata/ru/IBM855/wikitop_ru_IBM855.txt" msg =file(path).read() - detected_encoding = cchardet.detect(msg) + detected_encoding = cchardet.detect_with_confidence(msg) print(detected_encoding) - eq_(encoding.lower(),detected_encoding.lower()) + eq_(encoding.lower(),detected_encoding['encoding'].lower()) def test_detect_ru_koi8r(self): encoding = "KOI8-R" path = r"testdata/ru/KOI8-R/wikitop_ru_KOI8-R.txt" msg =file(path).read() - detected_encoding = cchardet.detect(msg) + detected_encoding = cchardet.detect_with_confidence(msg) print(detected_encoding) - eq_(encoding.lower(),detected_encoding.lower()) + eq_(encoding.lower(),detected_encoding['encoding'].lower()) def test_detect_se_utf8(self): encoding = "UTF-8" path = r"testdata/se/UTF-8/wikitop_se_UTF-8.txt" msg =file(path).read() - detected_encoding = cchardet.detect(msg) + detected_encoding = cchardet.detect_with_confidence(msg) print(detected_encoding) - eq_(encoding.lower(),detected_encoding.lower()) + eq_(encoding.lower(),detected_encoding['encoding'].lower()) def test_detect_se_windows1252(self): encoding = "WINDOWS-1252" path = r"testdata/se/WINDOWS-1252/wikitop_se_WINDOWS-1252.txt" msg =file(path).read() - detected_encoding = cchardet.detect(msg) + detected_encoding = cchardet.detect_with_confidence(msg) print(detected_encoding) - eq_(encoding.lower(),detected_encoding.lower()) + eq_(encoding.lower(),detected_encoding['encoding'].lower()) def test_detect_th_utf8(self): encoding = "UTF-8" path = r"testdata/th/UTF-8/wikitop_th_UTF-8.txt" msg =file(path).read() - detected_encoding = cchardet.detect(msg) + detected_encoding = cchardet.detect_with_confidence(msg) print(detected_encoding) - eq_(encoding.lower(),detected_encoding.lower()) + eq_(encoding.lower(),detected_encoding['encoding'].lower()) def test_detect_th_tis620_1(self): encoding = "TIS-620" path = r"testdata/th/TIS-620/utffool_th_TIS-620.txt" msg =file(path).read() - detected_encoding = cchardet.detect(msg) + detected_encoding = cchardet.detect_with_confidence(msg) print(detected_encoding) - eq_(encoding.lower(),detected_encoding.lower()) + eq_(encoding.lower(),detected_encoding['encoding'].lower()) def test_detect_th_tis620_2(self): encoding = "TIS-620" path = r"testdata/th/TIS-620/wikitop_th_TIS-620.txt" msg =file(path).read() - detected_encoding = cchardet.detect(msg) + detected_encoding = cchardet.detect_with_confidence(msg) print(detected_encoding) - eq_(encoding.lower(),detected_encoding.lower()) + eq_(encoding.lower(),detected_encoding['encoding'].lower()) def test_detect_tr_utf8(self): encoding = "UTF-8" path = r"testdata/tr/UTF-8/wikitop_tr_UTF-8.txt" msg =file(path).read() - detected_encoding = cchardet.detect(msg) + detected_encoding = cchardet.detect_with_confidence(msg) print(detected_encoding) - eq_(encoding.lower(),detected_encoding.lower()) + eq_(encoding.lower(),detected_encoding['encoding'].lower()) def test_detect_tr_iso88599(self): encoding = "ISO-8859-9" path = r"testdata/tr/ISO-8859-9/wikitop_tr_ISO-8859-9.txt" msg =file(path).read() - detected_encoding = cchardet.detect(msg) + detected_encoding = cchardet.detect_with_confidence(msg) print(detected_encoding) - eq_(encoding.lower(),detected_encoding.lower()) + eq_(encoding.lower(),detected_encoding['encoding'].lower()) def test_detect_zh_utf8(self): encoding = "UTF-8" path = r"testdata/zh/UTF-8/wikitop_zh_UTF-8.txt" msg =file(path).read() - detected_encoding = cchardet.detect(msg) + detected_encoding = cchardet.detect_with_confidence(msg) print(detected_encoding) - eq_(encoding.lower(),detected_encoding.lower()) + eq_(encoding.lower(),detected_encoding['encoding'].lower()) def test_detect_zh_gb18030(self): encoding = "GB18030" path = r"testdata/zh/GB18030/wikitop_zh_GB18030.txt" msg =file(path).read() - detected_encoding = cchardet.detect(msg) + detected_encoding = cchardet.detect_with_confidence(msg) print(detected_encoding) - eq_(encoding.lower(),detected_encoding.lower()) + eq_(encoding.lower(),detected_encoding['encoding'].lower()) class TestCchardetSpeed(): def test_speed(self): @@ -424,10 +424,13 @@ class TestCchardetSpeed(): end_chardet = time.time() # Test cchardet start_cchardet = time.time() - detected_encoding_cchardet = cchardet.detect(msg) + detected_encoding_cchardet = cchardet.detect_with_confidence(msg) end_cchardet = time.time() # print result result_chardet = end_chardet - start_chardet result_cchardet = end_cchardet - start_cchardet - print("chardet:",result_chardet,"detected charset:", detected_encoding_chardet['encoding'].lower()) - print("cchardet:",result_cchardet,"detected charset:",detected_encoding_cchardet.lower()) \ No newline at end of file + print() + print(detected_encoding_cchardet) + print() + print("chardet:",result_chardet,"detected charset:", detected_encoding_chardet['encoding'].lower(), "confidence:", detected_encoding_chardet['confidence']) + print("cchardet:",result_cchardet,"detected charset:",detected_encoding_cchardet['encoding'].lower(), "confidence:", detected_encoding_cchardet['confidence'])