use uchardet api

This commit is contained in:
PyYoshi 2017-03-28 00:35:15 +09:00
parent 1bb9b04fda
commit be0815bc38
2 changed files with 24 additions and 96 deletions

View file

@ -15,28 +15,4 @@ def detect(msg):
encoding, confidence = _cchardet.detect_with_confidence(msg) encoding, confidence = _cchardet.detect_with_confidence(msg)
if isinstance(encoding, bytes): if isinstance(encoding, bytes):
encoding = encoding.decode() encoding = encoding.decode()
return {"encoding": encoding, "confidence": confidence} return { "encoding": encoding, "confidence": confidence }
class Detector(object):
    """Incremental front end for the native ``_cchardet.Detector``.

    Input is supplied chunk by chunk through :meth:`feed`; the outcome is
    read from :attr:`result` in the same dictionary shape that the
    module-level ``detect`` function returns.
    """

    def __init__(self):
        # All detection work is delegated to the extension-module object.
        self._detector = _cchardet.Detector()

    def feed(self, data):
        """Forward one chunk of ``data`` to the underlying detector."""
        self._detector.feed(data)

    def close(self):
        """Close the underlying detector."""
        self._detector.close()

    @property
    def done(self):
        """The ``done`` flag reported by the underlying detector."""
        return self._detector.done

    @property
    def result(self):
        """Return ``{"encoding": ..., "confidence": ...}`` for the fed data.

        The underlying detector reports an ``(encoding, confidence)`` pair.
        A ``bytes`` encoding name is decoded to ``str``; any other value
        (for example ``None``) is passed through unchanged.
        """
        encoding, confidence = self._detector.result
        if isinstance(encoding, bytes):
            encoding = encoding.decode()
        return {"encoding": encoding, "confidence": confidence}

View file

@ -1,81 +1,33 @@
cdef extern from *: cdef extern from *:
ctypedef char* const_char_ptr "const char*" ctypedef char* const_char_ptr "const char*"
cdef extern from "charsetdetect.h": cdef extern from "uchardet.h":
ctypedef void* csd_t ctypedef void* uchardet_t
cdef csd_t csd_open() cdef uchardet_t uchardet_new()
cdef int csd_consider(csd_t csd, char* data, int length) cdef void uchardet_delete(uchardet_t ud)
cdef const_char_ptr csd_close2(csd_t csd, float *confidence) cdef int uchardet_handle_data(uchardet_t ud, const_char_ptr data, int length)
cdef void uchardet_data_end(uchardet_t ud)
cdef void uchardet_reset(uchardet_t ud)
cdef const_char_ptr uchardet_get_charset(uchardet_t ud)
cdef float uchardet_get_confidence(uchardet_t ud)
def detect_with_confidence(char *msg): def detect_with_confidence(const_char_ptr msg):
cdef csd_t csd = csd_open()
# Access the length that has already been counted.
# Using strlen would count it all over again.
# https://github.com/python/cpython/blob/c30098c8c6014f3340a369a31df9c74bdbacc269/Include/bytesobject.h#L82
# https://github.com/python/cpython/blob/c30098c8c6014f3340a369a31df9c74bdbacc269/Objects/bytesobject.c#L2490
# https://github.com/python/cpython/blob/c30098c8c6014f3340a369a31df9c74bdbacc269/Include/object.h#L346
# https://github.com/python/cpython/blob/c30098c8c6014f3340a369a31df9c74bdbacc269/Objects/bytesobject.c#L2410
cdef int length = len(msg) cdef int length = len(msg)
cdef int result = csd_consider(csd, msg, length) cdef uchardet_t ud = uchardet_new()
cdef float confidence = 0.0
cdef const_char_ptr detected_charset
if result == 1: # Need more data cdef int result = uchardet_handle_data(ud, msg, length)
detected_charset = csd_close2(csd, &confidence) if result != 0:
elif result == 0: # Detected early uchardet_delete(ud)
detected_charset = csd_close2(csd, &confidence) raise Exception("Handle data error")
else: # Error, signal with a negative number
raise Exception("Error, signal with a negative number") uchardet_data_end(ud)
cdef bytes detected_charset = uchardet_get_charset(ud)
cdef float detected_confidence = uchardet_get_confidence(ud)
uchardet_delete(ud)
if detected_charset: if detected_charset:
return detected_charset, confidence return detected_charset, detected_confidence
return None, None
cdef class Detector:
cdef csd_t csd
cdef int _done
cdef int _closed
cdef float _confidence
cdef const_char_ptr _detected_charset
def __init__(self):
self.csd = csd_open()
self._done = 0
self._closed = 0
self._confidence = 0.0
self._detected_charset = ''
def feed(self, char *msg):
cdef int length
cdef int result
if not self.done and not self._closed:
length = len(msg)
result = csd_consider(self.csd, msg, length)
if result == -1: # Error, signal with a negative number
raise Exception("Error, signal with a negative number")
elif result == 1: # Need more data
pass
elif result == 0: # Detected early
self._done = 1
self.close()
def close(self):
if not self._closed:
self._detected_charset = csd_close2(self.csd, &self._confidence)
self._closed = 1
@property
def done(self):
return bool(self._done)
@property
def result(self):
if len(self._detected_charset):
return self._detected_charset, self._confidence
return None, None return None, None