diff --git a/.gitignore b/.gitignore
index 71dbe6d..83986f0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -32,7 +32,9 @@ nosetests.xml
 # PyGooglePlus
 .idea/
 ext__
-*.c
+ext/libcharsetdetect/mozilla_
+src/cchardet/*.c
+src/cchardet/*.cpp
 libcharsetdetect.dll
 charsetdetect.h
 MANIFEST
\ No newline at end of file
diff --git a/ext/libcharsetdetect/trchardet.c b/ext/libcharsetdetect/trchardet.c
new file mode 100644
index 0000000..aba0b97
--- /dev/null
+++ b/ext/libcharsetdetect/trchardet.c
@@ -0,0 +1,46 @@
+#include "charsetdetect.h"
+#include <stdio.h>
+
+/* Parenthesised so the macro expands safely inside larger expressions. */
+#define BUFFER_SIZE (100*1024)
+
+/*
+ * Test driver: feeds the first buffer read from stdin to the detector and
+ * prints the detected charset name.
+ * Exit status: 0 detected, 1 csd_open failed, 2 unknown charset,
+ * 3 csd_consider failed.
+ */
+int main(int argc, const char * argv[]) {
+    csd_t csd = csd_open();
+    if (csd == (csd_t)-1) {
+        printf("csd_open failed\n");
+        return 1;
+    }
+
+    size_t size; /* fread() returns size_t, not int */
+    char buf[BUFFER_SIZE] = {0};
+
+    while ((size = fread(buf, 1, sizeof(buf), stdin)) != 0) {
+        printf("CLIENT SENDING More data\n");
+        /* size <= BUFFER_SIZE, so the narrowing cast cannot overflow */
+        int result = csd_consider(csd, buf, (int)size);
+        if (result < 0) {
+            printf("csd_consider failed\n");
+            return 3;
+        } else if (result == 0) {
+            // Already have enough data
+            break;
+        }
+        // Only send one buffer actually, for testing
+        break;
+    }
+
+    const char *result = csd_close(csd);
+    if (result == NULL) {
+        printf("Unknown character set\n");
+        return 2;
+    } else {
+        printf("%s\n", result);
+        return 0;
+    }
+}
diff --git a/setup2.py b/setup2.py
new file mode 100644
index 0000000..75ab9b0
--- /dev/null
+++ b/setup2.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+import ez_setup
+ez_setup.use_setuptools()
+import os,sys,platform,shutil
+import subprocess
+from setuptools import setup, Extension
+import distutils.spawn as ds
+from Cython.Distutils import build_ext
+
+# Resolve paths relative to this script so the build works from any cwd.
+root = os.path.dirname(os.path.abspath(__file__))
+ext_dir = os.path.join(root,'ext_')
+src_dir = os.path.join(root,'src')
+build_dir = os.path.join(root,'build')
+cchardet_dir = os.path.join(src_dir,'cchardet')
+cchardet_source = os.path.join(cchardet_dir,"cchardet2.pyx")
+charsetdetect_dir = os.path.join(ext_dir, 'libcharsetdetect')
+charsetdetect_build_dir = os.path.join(charsetdetect_dir,'build')
+
+
+cchardet_module = Extension("_cchardet",
+    sources = [cchardet_source],
+    #libraries = ['charsetdetect'],
+    #include_dirs = [charsetdetect_dir],
+    #library_dirs = [charsetdetect_build_dir],
+    # cchardet2.pyx declares a `cdef cppclass`, which needs Cython's C++ mode.
+    language = "c++",
+)
+
+setup(
+    name = 'cchardet',
+    author= 'PyYoshi',
+    url = r"https://github.com/PyYoshi/cChardet",
+    description = 'Universal encoding detector',
+    long_description= """This library is high speed universal character encoding detector. - binding to charsetdetect.
+This library is faster than chardet.
+""",
+    version = '0.1',
+    classifiers = [ # http://pypi.python.org/pypi?:action=list_classifiers
+        'Development Status :: 1 - Planning',
+        'License :: OSI Approved :: MIT License',
+        'Programming Language :: Cython',
+        'Programming Language :: Python',
+        'Topic :: Software Development :: Libraries',
+    ],
+    keywords = [
+        'cython',
+        'chardet',
+        'universal character encoding detector',
+        'charsetdetect'
+    ],
+    ext_package='cchardet',
+    package_dir = {'':src_dir},
+    packages = ['cchardet'],
+    cmdclass = {'build_ext': build_ext},
+    ext_modules = [
+        cchardet_module
+    ],
+)
\ No newline at end of file
diff --git a/src/cchardet/cchardet2.pyx b/src/cchardet/cchardet2.pyx
new file mode 100644
index 0000000..13b35fc
--- /dev/null
+++ b/src/cchardet/cchardet2.pyx
@@ -0,0 +1,119 @@
+# coding:utf8
+
+from libc.stdlib cimport malloc, free
+
+# The typedefs and constants below are referenced unqualified, so cimport them by name.
+from prtypes cimport PRUint32, PRInt32, PRBool, PR_TRUE
+from nscore cimport nsresult, NS_ERROR_OUT_OF_MEMORY
+
+cdef extern from *:
+    ctypedef char* const_char_ptr "const char*"
+
+cdef extern from "nsUniversalDetector.h":
+    cdef cppclass nsCharSetProber
+    cdef enum:
+        NUM_OF_CHARSET_PROBERS = 3
+    cdef enum nsInputState:
+        ePureAscii = 0
+        eEscAscii = 1
+        eHighbyte = 2
+
+    cdef unsigned int NS_FILTER_CHINESE_SIMPLIFIED = 0x01
+    cdef unsigned int NS_FILTER_CHINESE_TRADITIONAL = 0x02
+    cdef unsigned int NS_FILTER_JAPANESE = 0x04
+    cdef unsigned int NS_FILTER_KOREAN = 0x08
+    cdef unsigned int NS_FILTER_NON_CJK = 0x10
+    cdef unsigned int NS_FILTER_ALL = 0x1F
+    # CHINESE covers only the two Chinese bits; the four-language mask is CJK.
+    cdef unsigned int NS_FILTER_CHINESE = NS_FILTER_CHINESE_SIMPLIFIED | NS_FILTER_CHINESE_TRADITIONAL
+    cdef unsigned int NS_FILTER_CJK = NS_FILTER_CHINESE_SIMPLIFIED | NS_FILTER_CHINESE_TRADITIONAL | NS_FILTER_JAPANESE | NS_FILTER_KOREAN
+
+    cdef class nsUniversalDetector:
+        cdef nsUniversalDetector(self, PRUint32 aLanguageFilter)
+        cdef nsresult HandleData(self, const_char_ptr aBuf, PRUint32 aLen)
+        cdef void DataEnd(self,)
+
+        cdef void _Report(self,const_char_ptr aCharset)
+        cdef void _Reset(self)
+        cdef nsInputState _mInputState
+        cdef PRBool _mDone
+        cdef PRBool _mInTag
+        cdef PRBool _mStart
+        cdef PRBool _mGotData
+        cdef char _mLastChar
+        cdef const_char_ptr _mDetectedCharset
+        cdef PRInt32 _mBestGuess
+        cdef PRUint32 _mLanguageFilter
+
+        cdef nsCharSetProber *_mCharsetProber[NUM_OF_CHARSET_PROBERS]
+        cdef nsCharSetProber *_mEscCharSetProber
+
+"""
+cdef extern from *:
+    cdef class Detector(nsUniversalDetector):
+        cdef Detector(self, PRUint32 aLanguageFilter):
+            nsUniversalDetector(self, aLanguageFilter)
+        cdef int Consider(self, const_char_ptr data, int length)
+        cdef const_char_ptr Close(self, )
+
+        cdef void _Report(self, const_char_ptr aCharset)
+        cdef const_char_ptr *_mDetectedCharset"""
+
+cdef class Detector(nsUniversalDetector):
+    cdef Detector(self, PRUint32 aLanguageFilter):
+        nsUniversalDetector(self, aLanguageFilter)
+
+    cdef void Report(self, const_char_ptr aCharset):
+        # Mark detection as finished and remember the reported charset name.
+        self._mDone = PR_TRUE
+        self._mDetectedCharset = aCharset
+
+    cdef int Consider(self, const_char_ptr data, int length):
+        # Returns -1 on out-of-memory, 0 once detection is done, 1 when more data is needed.
+        if self.HandleData(data, length) == NS_ERROR_OUT_OF_MEMORY:
+            # Error, signal with a negative number
+            return -1
+
+        if self._mDone:
+            # Detected early
+            return 0
+
+        # Need more data!
+        return 1
+
+    cdef const_char_ptr Close(self):
+        # Finish detection; returns the charset name, or NULL if none was detected.
+        self.DataEnd()
+
+        if not self._mDone:
+            if self._mInputState == eEscAscii:
+                return "ibm850"
+            elif self._mInputState == ePureAscii:
+                return "ASCII"
+
+            # A char* function cannot return Python None; C callers test for NULL.
+            return NULL
+
+        return self._mDetectedCharset
+
+cdef extern from *:
+    ctypedef void* csd_t
+    cdef csd_t csd_open()
+    cdef int csd_consider(csd_t csd, const_char_ptr data, int length)
+    cdef const_char_ptr csd_close(csd_t csd)
+
+# NOTE(review): csd_t is a void*, so the attribute calls on `csd` below look like they
+# need a typed cast and explicit ownership of the Detector object — TODO confirm design.
+cdef csd_t csd_open():
+    # TODO: capture exceptions thrown by "new" and return -1 in that case
+    # TODO: provide C-land with access to the language filter constructor argument
+    return Detector(NS_FILTER_ALL)
+
+cdef int csd_consider(csd_t csd, const_char_ptr data, int length):
+    # return ((Detector*)csd)->Consider(data, length);
+    return csd.Consider(data, length)
+
+cdef const_char_ptr csd_close(csd_t csd):
+    cdef const_char_ptr result = csd.Close()
+    del csd
+    return result
\ No newline at end of file
diff --git a/src/cchardet/nscore.pxd b/src/cchardet/nscore.pxd
new file mode 100644
index 0000000..00b48de
--- /dev/null
+++ b/src/cchardet/nscore.pxd
@@ -0,0 +1,12 @@
+# coding:utf8
+
+# PRUint32 is declared in prtypes.pxd; it must be cimported to be visible here.
+from prtypes cimport PRUint32
+
+cdef extern from "nscore.h":
+    # base: https://github.com/kmshi/miro/blob/5d7cdd679830169590a677632cd88a2fa27f81f5/tv/windows/plat/frontends/widgets/XULRunnerBrowser/xulrunnerbrowser.pyx
+    ctypedef PRUint32 nsresult
+    ctypedef PRUint32 PRBool
+    cdef enum:
+        NS_OK = 0
+    cdef PRUint32 NS_ERROR_OUT_OF_MEMORY = 0x8007000eL
\ No newline at end of file
diff --git a/src/cchardet/prtypes.pxd b/src/cchardet/prtypes.pxd
new file mode 100644
index 0000000..74d8dee
--- /dev/null
+++ b/src/cchardet/prtypes.pxd
@@ -0,0 +1,10 @@
+# coding:utf8
+
+cdef extern from "prtypes.h":
+    ctypedef unsigned int PRUint32
+    ctypedef int PRInt32
+    ctypedef int PRIntn
+    ctypedef PRIntn PRBool
+    cdef enum:
+        PR_TRUE = 1
+        PR_FALSE = 0
\ No newline at end of file
diff --git a/src/cchardet/python.pxd b/src/cchardet/python.pxd
new file mode 100644
index 0000000..7e2b21d
--- /dev/null
+++ b/src/cchardet/python.pxd
@@ -0,0 +1,5 @@
+# coding:utf8
+
+cdef extern from "Python.h":
+    void * PyMem_Malloc(size_t)
+    void PyMem_Free(void *)
\ No newline at end of file
diff --git a/src/cchardet/string.pxd b/src/cchardet/string.pxd
new file mode 100644
index 0000000..72594f8
--- /dev/null
+++ b/src/cchardet/string.pxd
@@ -0,0 +1,4 @@
+# coding:utf8
+
+cdef extern from "string.h":
+    cdef size_t strlen(char *s)
\ No newline at end of file