Merge pull request #16 from moden-py/master

Partially-feed interface
2016-10-17 10:47:17 +09:00 · 2016-10-17 10:47:17 +09:00 · fba4c216ee
commit fba4c216ee
parent 2ac7e55d1f fe77753362
3 changed files with 145 additions and 0 deletions
--- a/src/cchardet/__init__.py
+++ b/src/cchardet/__init__.py
@@ -17,3 +17,27 @@ def detect(msg):
    if isinstance(encoding, bytes):
        encoding = encoding.decode()
    return {"encoding": encoding, "confidence": confidence}
 class Detector(object):
    """Incremental charset detector that accepts its input piece by piece.

    Thin wrapper around ``_cchardet.Detector``: call :meth:`feed` with
    successive chunks of data, optionally stop early once :attr:`done` is
    true, call :meth:`close`, then read :attr:`result`.

    Instances can also be used as context managers, in which case
    :meth:`close` is called automatically when the ``with`` block exits.
    """

    def __init__(self):
        self._detector = _cchardet.Detector()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Always finalize the underlying detector. Returning None lets any
        # exception raised inside the ``with`` block propagate unchanged.
        self.close()

    def feed(self, data):
        """Pass the next chunk of data to the underlying detector."""
        self._detector.feed(data)

    def close(self):
        """Tell the underlying detector that no more data will be fed."""
        self._detector.close()

    @property
    def done(self):
        """The underlying detector's ``done`` flag."""
        return self._detector.done

    @property
    def result(self):
        """Dict with ``"encoding"`` and ``"confidence"`` keys.

        The encoding reported by the underlying detector is decoded to
        ``str`` when it is ``bytes``; otherwise both values are passed
        through unchanged.
        """
        encoding, confidence = self._detector.result
        if isinstance(encoding, bytes):
            encoding = encoding.decode()
        return {"encoding": encoding, "confidence": confidence}
--- a/src/cchardet/_cchardet.pyx
+++ b/src/cchardet/_cchardet.pyx
@@ -50,3 +50,51 @@ def detect_with_confidence(char *msg):
        return detected_charset, confidence
    else:
        return None, None
 cdef class Detector:
    """Incremental charset detector built on the csd_* C API.

    Data is supplied in pieces through ``feed``. Detection is finalized by
    ``close`` (called automatically by ``feed`` when csd_consider reports an
    early detection), after which ``result`` holds the detected charset and
    its confidence.
    """
    cdef csd_t csd
    cdef int _done
    cdef int _closed
    cdef float _confidence
    cdef const_char_ptr _detected_charset

    def __cinit__(self):
        # __cinit__ runs exactly once per object, so the handle opened here
        # is always in a defined state by the time __dealloc__ runs.
        self.csd = csd_open()
        self._done = 0
        self._closed = 0
        self._confidence = 0.0
        self._detected_charset = NULL

    def __dealloc__(self):
        # Release the native handle if the caller never called close().
        if not self._closed:
            csd_close2(self.csd, &self._confidence)
            self._closed = 1

    def feed(self, msg):
        """Feed one chunk of data to the detector.

        ``msg`` goes through the same object -> ``char *`` coercion that a
        ``char *`` argument would, so the accepted types are unchanged.
        Calls made after detection has finished, or after ``close``, are
        ignored.

        Raises ``Exception`` when csd_consider reports an error.
        """
        cdef char *data = msg
        cdef int length
        cdef int result

        if self._done or self._closed:
            return

        # Use the object's own length rather than strlen(): strlen() stops
        # at the first NUL byte and would truncate UTF-16/UTF-32 or other
        # data that contains embedded zero bytes.
        length = len(msg)
        result = csd_consider(self.csd, data, length)
        if result == -1: # Error, signal with a negative number
            raise Exception("Error, signal with a negative number")
        elif result == 0: # Detected early
            self._done = 1
            self.close()
        # result == 1: need more data, nothing to do yet

    def close(self):
        """Finalize detection and record charset and confidence (idempotent)."""
        if not self._closed:
            self._detected_charset = csd_close2(self.csd, &self._confidence)
            self._closed = 1

    @property
    def done(self):
        """True once csd_consider has reported an early detection."""
        return bool(self._done)

    @property
    def result(self):
        """``(charset, confidence)``, or ``(None, None)`` if nothing is recorded."""
        # Guard against NULL before touching the string: converting a NULL
        # char* to a Python object would dereference it.
        if self._detected_charset != NULL and self._detected_charset[0] != 0:
            return self._detected_charset, self._confidence
        return None, None
--- a/test/tests.py
+++ b/test/tests.py
@@ -492,3 +492,76 @@ class TestCchardetSpeed():
            cchardet.detect(msg)
            result_cchardet += (time.time() - start_cchardet)
        print('cchardet:',1/(result_cchardet/do_times), 'call(s)/s')
 class TestDetector():
    """Tests for the incremental ``cchardet.Detector`` interface."""

    # Maps each sample file to the encoding the detector is expected to report.
    encodings_map = {
        r"testdata/bg/ISO-8859-5/wikitop_bg_ISO-8859-5.txt": "ISO-8859-5",
        r"testdata/bg/UTF-8/wikitop_bg_UTF-8.txt": "UTF-8",
        r"testdata/bg/WINDOWS-1251/wikitop_bg_WINDOWS-1251.txt": "WINDOWS-1251",
        r"testdata/cz/ISO-8859-2/wikitop_cz_ISO-8859-2.txt": "ISO-8859-2",
        r"testdata/cz/UTF-8/wikitop_cz_UTF-8.txt": "UTF-8",
        r"testdata/de/UTF-8/wikitop_de_UTF-8.txt": "UTF-8",
        r"testdata/de/WINDOWS-1252/wikitop_de_WINDOWS-1252.txt": "WINDOWS-1252",
        r"testdata/dk/UTF-8/wikitop_dk_UTF-8.txt": "UTF-8",
        r"testdata/dk/WINDOWS-1252/wikitop_dk_WINDOWS-1252.txt": "WINDOWS-1252",
        r"testdata/el/ISO-8859-7/wikitop_el_ISO-8859-7.txt": "ISO-8859-7",
        r"testdata/el/UTF-8/wikitop_el_UTF-8.txt": "UTF-8",
        r"testdata/en/UTF-8/wikitop_en_UTF-8.txt": "UTF-8",
        r"testdata/en/WINDOWS-1252/wikitop_en_WINDOWS-1252.txt": "WINDOWS-1252",
        r"testdata/es/UTF-8/wikitop_es_UTF-8.txt": "UTF-8",
        r"testdata/es/WINDOWS-1252/wikitop_es_WINDOWS-1252.txt": "WINDOWS-1252",
        r"testdata/fi/UTF-8/wikitop_fi_UTF-8.txt": "UTF-8",
        r"testdata/fi/WINDOWS-1252/wikitop_fi_WINDOWS-1252.txt": "WINDOWS-1252",
        r"testdata/fr/UTF-8/wikitop_fr_UTF-8.txt": "UTF-8",
        r"testdata/fr/WINDOWS-1252/wikitop_fr_WINDOWS-1252.txt": "WINDOWS-1252",
        r"testdata/he/UTF-8/wikitop_he_UTF-8.txt": "UTF-8",
        r"testdata/he/WINDOWS-1255/wikitop_he_WINDOWS-1255.txt": "WINDOWS-1255",
        r"testdata/hu/UTF-8/wikitop_hu_UTF-8.txt": "UTF-8",
        r"testdata/hu/ISO-8859-2/wikitop_hu_ISO-8859-2.txt": "ISO-8859-2",
        r"testdata/it/UTF-8/wikitop_it_UTF-8.txt": "UTF-8",
        r"testdata/it/WINDOWS-1252/wikitop_it_WINDOWS-1252.txt": "WINDOWS-1252",
        r"testdata/nl/UTF-8/wikitop_nl_UTF-8.txt": "UTF-8",
        r"testdata/nl/WINDOWS-1252/wikitop_nl_WINDOWS-1252.txt": "WINDOWS-1252",
        r"testdata/no/UTF-8/wikitop_no_UTF-8.txt": "UTF-8",
        r"testdata/no/WINDOWS-1252/wikitop_no_WINDOWS-1252.txt": "WINDOWS-1252",
        r"testdata/pl/UTF-8/wikitop_pl_UTF-8.txt": "UTF-8",
        r"testdata/pl/ISO-8859-2/wikitop_pl_ISO-8859-2.txt": "ISO-8859-2",
        r"testdata/pt/UTF-8/wikitop_pt_UTF-8.txt": "UTF-8",
        r"testdata/pt/WINDOWS-1252/wikitop_pt_WINDOWS-1252.txt": "WINDOWS-1252",
        r"testdata/ru/UTF-8/wikitop_ru_UTF-8.txt": "UTF-8",
        r"testdata/ru/WINDOWS-1251/wikitop_ru_WINDOWS-1251.txt": "WINDOWS-1251",
        r"testdata/ru/IBM855/wikitop_ru_IBM855.txt": "IBM855",
        r"testdata/ru/KOI8-R/wikitop_ru_KOI8-R.txt": "KOI8-R",
        r"testdata/ru/X-MAC-CYRILLIC/wikitop_ru_MACCYRILLIC.txt": "MAC-CYRILLIC",
        r"testdata/se/UTF-8/wikitop_se_UTF-8.txt": "UTF-8",
        r"testdata/se/WINDOWS-1252/wikitop_se_WINDOWS-1252.txt": "WINDOWS-1252",
        r"testdata/th/UTF-8/wikitop_th_UTF-8.txt": "UTF-8",
        r"testdata/th/TIS-620/utffool_th_TIS-620.txt": "TIS-620",
        r"testdata/th/TIS-620/wikitop_th_TIS-620.txt": "TIS-620",
        r"testdata/tr/UTF-8/wikitop_tr_UTF-8.txt": "UTF-8",
        r"testdata/tr/ISO-8859-9/wikitop_tr_ISO-8859-9.txt": "ISO-8859-9",
        r"testdata/zh/UTF-8/wikitop_zh_UTF-8.txt": "UTF-8",
        r"testdata/zh/GB18030/wikitop_zh_GB18030.txt": "GB18030",
    }

    def test_detector(self):
        """Feed every sample file line by line and compare the detected encoding."""
        for path, encoding in self.encodings_map.items():
            detector = cchardet.Detector()
            try:
                with open(path, 'rb') as f:
                    for line in f:
                        detector.feed(line)
                        # Stop reading as soon as the detector is confident.
                        if detector.done:
                            break
            finally:
                # Close even if reading fails, so the detector is always
                # finalized before the error propagates.
                detector.close()
            detected_encoding = detector.result
            eq_(encoding.lower(), detected_encoding['encoding'].lower())

    def test_detector_noresult(self):
        """An unfinished, unclosed detector reports no encoding."""
        detector = cchardet.Detector()
        try:
            # Bytes literal: the underlying feed() takes a C ``char *``,
            # which rejects text strings on Python 3.
            detector.feed(b'0')
            eq_(detector.done, False)
            eq_(detector.result, {"encoding": None, "confidence": None})
        finally:
            # Close only after the assertions: they check the state of a
            # detector that has not been finalized yet.
            detector.close()