use uchardet api
This commit is contained in:
parent
1bb9b04fda
commit
be0815bc38
2 changed files with 24 additions and 96 deletions
|
@ -15,28 +15,4 @@ def detect(msg):
|
||||||
encoding, confidence = _cchardet.detect_with_confidence(msg)
|
encoding, confidence = _cchardet.detect_with_confidence(msg)
|
||||||
if isinstance(encoding, bytes):
|
if isinstance(encoding, bytes):
|
||||||
encoding = encoding.decode()
|
encoding = encoding.decode()
|
||||||
return {"encoding": encoding, "confidence": confidence}
|
return { "encoding": encoding, "confidence": confidence }
|
||||||
|
|
||||||
|
|
||||||
class Detector(object):
|
|
||||||
"""Wrap csd_consider with 'feed' feature."""
|
|
||||||
|
|
||||||
def __init__(self):
|
|
||||||
self._detector = _cchardet.Detector()
|
|
||||||
|
|
||||||
def feed(self, data):
|
|
||||||
self._detector.feed(data)
|
|
||||||
|
|
||||||
def close(self):
|
|
||||||
self._detector.close()
|
|
||||||
|
|
||||||
@property
|
|
||||||
def done(self):
|
|
||||||
return self._detector.done
|
|
||||||
|
|
||||||
@property
|
|
||||||
def result(self):
|
|
||||||
encoding, confidence = self._detector.result
|
|
||||||
if isinstance(encoding, bytes):
|
|
||||||
encoding = encoding.decode()
|
|
||||||
return {"encoding": encoding, "confidence": confidence}
|
|
||||||
|
|
|
@ -1,81 +1,33 @@
|
||||||
cdef extern from *:
|
cdef extern from *:
|
||||||
ctypedef char* const_char_ptr "const char*"
|
ctypedef char* const_char_ptr "const char*"
|
||||||
|
|
||||||
cdef extern from "charsetdetect.h":
|
cdef extern from "uchardet.h":
|
||||||
ctypedef void* csd_t
|
ctypedef void* uchardet_t
|
||||||
cdef csd_t csd_open()
|
cdef uchardet_t uchardet_new()
|
||||||
cdef int csd_consider(csd_t csd, char* data, int length)
|
cdef void uchardet_delete(uchardet_t ud)
|
||||||
cdef const_char_ptr csd_close2(csd_t csd, float *confidence)
|
cdef int uchardet_handle_data(uchardet_t ud, const_char_ptr data, int length)
|
||||||
|
cdef void uchardet_data_end(uchardet_t ud)
|
||||||
|
cdef void uchardet_reset(uchardet_t ud)
|
||||||
|
cdef const_char_ptr uchardet_get_charset(uchardet_t ud)
|
||||||
|
cdef float uchardet_get_confidence(uchardet_t ud)
|
||||||
|
|
||||||
def detect_with_confidence(char *msg):
|
def detect_with_confidence(const_char_ptr msg):
|
||||||
cdef csd_t csd = csd_open()
|
|
||||||
|
|
||||||
# Access the length that has already been counted
|
|
||||||
# Using strlen would count the length all over again
|
|
||||||
# https://github.com/python/cpython/blob/c30098c8c6014f3340a369a31df9c74bdbacc269/Include/bytesobject.h#L82
|
|
||||||
# https://github.com/python/cpython/blob/c30098c8c6014f3340a369a31df9c74bdbacc269/Objects/bytesobject.c#L2490
|
|
||||||
# https://github.com/python/cpython/blob/c30098c8c6014f3340a369a31df9c74bdbacc269/Include/object.h#L346
|
|
||||||
# https://github.com/python/cpython/blob/c30098c8c6014f3340a369a31df9c74bdbacc269/Objects/bytesobject.c#L2410
|
|
||||||
cdef int length = len(msg)
|
cdef int length = len(msg)
|
||||||
|
|
||||||
cdef int result = csd_consider(csd, msg, length)
|
cdef uchardet_t ud = uchardet_new()
|
||||||
cdef float confidence = 0.0
|
|
||||||
cdef const_char_ptr detected_charset
|
|
||||||
|
|
||||||
if result == 1: # Need more data
|
cdef int result = uchardet_handle_data(ud, msg, length)
|
||||||
detected_charset = csd_close2(csd, &confidence)
|
if result != 0:
|
||||||
elif result == 0: # Detected early
|
uchardet_delete(ud)
|
||||||
detected_charset = csd_close2(csd, &confidence)
|
raise Exception("Handle data error")
|
||||||
else: # Error, signal with a negative number
|
|
||||||
raise Exception("Error, signal with a negative number")
|
uchardet_data_end(ud)
|
||||||
|
|
||||||
|
cdef bytes detected_charset = uchardet_get_charset(ud)
|
||||||
|
cdef float detected_confidence = uchardet_get_confidence(ud)
|
||||||
|
uchardet_delete(ud)
|
||||||
|
|
||||||
if detected_charset:
|
if detected_charset:
|
||||||
return detected_charset, confidence
|
return detected_charset, detected_confidence
|
||||||
|
|
||||||
return None, None
|
return None, None
|
||||||
|
|
||||||
cdef class Detector:
|
|
||||||
cdef csd_t csd
|
|
||||||
cdef int _done
|
|
||||||
cdef int _closed
|
|
||||||
cdef float _confidence
|
|
||||||
cdef const_char_ptr _detected_charset
|
|
||||||
|
|
||||||
def __init__(self):
|
|
||||||
self.csd = csd_open()
|
|
||||||
self._done = 0
|
|
||||||
self._closed = 0
|
|
||||||
self._confidence = 0.0
|
|
||||||
self._detected_charset = ''
|
|
||||||
|
|
||||||
def feed(self, char *msg):
|
|
||||||
cdef int length
|
|
||||||
cdef int result
|
|
||||||
|
|
||||||
if not self.done and not self._closed:
|
|
||||||
length = len(msg)
|
|
||||||
result = csd_consider(self.csd, msg, length)
|
|
||||||
|
|
||||||
if result == -1: # Error, signal with a negative number
|
|
||||||
raise Exception("Error, signal with a negative number")
|
|
||||||
|
|
||||||
elif result == 1: # Need more data
|
|
||||||
pass
|
|
||||||
|
|
||||||
elif result == 0: # Detected early
|
|
||||||
self._done = 1
|
|
||||||
self.close()
|
|
||||||
|
|
||||||
def close(self):
|
|
||||||
if not self._closed:
|
|
||||||
self._detected_charset = csd_close2(self.csd, &self._confidence)
|
|
||||||
self._closed = 1
|
|
||||||
|
|
||||||
@property
|
|
||||||
def done(self):
|
|
||||||
return bool(self._done)
|
|
||||||
|
|
||||||
@property
|
|
||||||
def result(self):
|
|
||||||
if len(self._detected_charset):
|
|
||||||
return self._detected_charset, self._confidence
|
|
||||||
return None, None
|
|
||||||
|
|
Loading…
Reference in a new issue