Switch cchardet from the charsetdetect (csd_*) backend to uchardet.

Removes the streaming Detector wrapper from __init__.py and the Detector
extension class from _cchardet.pyx, and reimplements
detect_with_confidence() on top of the uchardet C API.

Review notes:
- detect_with_confidence() now takes the length from the Python object
  instead of calling len() on a C string (which compiles to strlen() and
  stops at the first NUL byte).
- The whitespace-only rewrite of detect()'s return line was dropped.
- Layout was reconstructed from a whitespace-collapsed copy, so the
  indentation of context/removed lines is inferred. The "index" hashes are
  those of the original patch and do not reflect the reviewed "+" lines.

diff --git a/src/cchardet/__init__.py b/src/cchardet/__init__.py
index 3659a4a..5853777 100644
--- a/src/cchardet/__init__.py
+++ b/src/cchardet/__init__.py
@@ -15,28 +15,4 @@ def detect(msg):
     encoding, confidence = _cchardet.detect_with_confidence(msg)
     if isinstance(encoding, bytes):
         encoding = encoding.decode()
     return {"encoding": encoding, "confidence": confidence}
-
-
-class Detector(object):
-    """Wrap csd_consider with 'feed' feature."""
-
-    def __init__(self):
-        self._detector = _cchardet.Detector()
-
-    def feed(self, data):
-        self._detector.feed(data)
-
-    def close(self):
-        self._detector.close()
-
-    @property
-    def done(self):
-        return self._detector.done
-
-    @property
-    def result(self):
-        encoding, confidence = self._detector.result
-        if isinstance(encoding, bytes):
-            encoding = encoding.decode()
-        return {"encoding": encoding, "confidence": confidence}
diff --git a/src/cchardet/_cchardet.pyx b/src/cchardet/_cchardet.pyx
index f03f1f9..fe6531e 100644
--- a/src/cchardet/_cchardet.pyx
+++ b/src/cchardet/_cchardet.pyx
@@ -1,81 +1,47 @@
 cdef extern from *:
     ctypedef char* const_char_ptr "const char*"
 
-cdef extern from "charsetdetect.h":
-    ctypedef void* csd_t
-    cdef csd_t csd_open()
-    cdef int csd_consider(csd_t csd, char* data, int length)
-    cdef const_char_ptr csd_close2(csd_t csd, float *confidence)
+# Declarations of the uchardet C API.
+cdef extern from "uchardet.h":
+    ctypedef void* uchardet_t
+    cdef uchardet_t uchardet_new()
+    cdef void uchardet_delete(uchardet_t ud)
+    cdef int uchardet_handle_data(uchardet_t ud, const_char_ptr data, int length)
+    cdef void uchardet_data_end(uchardet_t ud)
+    cdef void uchardet_reset(uchardet_t ud)
+    cdef const_char_ptr uchardet_get_charset(uchardet_t ud)
+    cdef float uchardet_get_confidence(uchardet_t ud)
 
-def detect_with_confidence(char *msg):
-    cdef csd_t csd = csd_open()
-
-    # すでにカウント済みの長さへアクセス
-    # strlenでは再度カウントすることになる
-    # https://github.com/python/cpython/blob/c30098c8c6014f3340a369a31df9c74bdbacc269/Include/bytesobject.h#L82
-    # https://github.com/python/cpython/blob/c30098c8c6014f3340a369a31df9c74bdbacc269/Objects/bytesobject.c#L2490
-    # https://github.com/python/cpython/blob/c30098c8c6014f3340a369a31df9c74bdbacc269/Include/object.h#L346
-    # https://github.com/python/cpython/blob/c30098c8c6014f3340a369a31df9c74bdbacc269/Objects/bytesobject.c#L2410
+def detect_with_confidence(msg):
+    """Detect the character encoding of ``msg`` with uchardet.
+
+    ``msg`` must be convertible to a C string (e.g. ``bytes``).
+
+    Returns ``(charset, confidence)`` with ``charset`` as ``bytes``, or
+    ``(None, None)`` when uchardet reports an empty charset name.
+
+    Raises ``Exception`` when ``uchardet_handle_data`` returns non-zero.
+    """
+    # Take the length from the Python object: len() on a C string compiles
+    # to strlen(), which would stop at the first NUL byte in the data.
+    cdef const_char_ptr data = msg
     cdef int length = len(msg)
+
+    cdef uchardet_t ud = uchardet_new()
 
-    cdef int result = csd_consider(csd, msg, length)
-    cdef float confidence = 0.0
-    cdef const_char_ptr detected_charset
+    cdef int result = uchardet_handle_data(ud, data, length)
+    if result != 0:
+        uchardet_delete(ud)
+        raise Exception("Handle data error")
 
-    if result == 1: # Need more data
-        detected_charset = csd_close2(csd, &confidence)
-    elif result == 0: # Detected early
-        detected_charset = csd_close2(csd, &confidence)
-    else: # Error, signal with a negative number
-        raise Exception("Error, signal with a negative number")
+    uchardet_data_end(ud)
+
+    # Convert the results to Python values before releasing the handle.
+    cdef bytes detected_charset = uchardet_get_charset(ud)
+    cdef float detected_confidence = uchardet_get_confidence(ud)
+    uchardet_delete(ud)
 
     if detected_charset:
-        return detected_charset, confidence
+        return detected_charset, detected_confidence
+
     return None, None
-
-cdef class Detector:
-    cdef csd_t csd
-    cdef int _done
-    cdef int _closed
-    cdef float _confidence
-    cdef const_char_ptr _detected_charset
-
-    def __init__(self):
-        self.csd = csd_open()
-        self._done = 0
-        self._closed = 0
-        self._confidence = 0.0
-        self._detected_charset = ''
-
-    def feed(self, char *msg):
-        cdef int length
-        cdef int result
-
-        if not self.done and not self._closed:
-            length = len(msg)
-            result = csd_consider(self.csd, msg, length)
-
-            if result == -1: # Error, signal with a negative number
-                raise Exception("Error, signal with a negative number")
-
-            elif result == 1: # Need more data
-                pass
-
-            elif result == 0: # Detected early
-                self._done = 1
-                self.close()
-
-    def close(self):
-        if not self._closed:
-            self._detected_charset = csd_close2(self.csd, &self._confidence)
-            self._closed = 1
-
-    @property
-    def done(self):
-        return bool(self._done)
-
-    @property
-    def result(self):
-        if len(self._detected_charset):
-            return self._detected_charset, self._confidence
-        return None, None