This commit is contained in:
PyYoshi 2012-06-26 09:50:30 +09:00
parent b68dbdaf9d
commit cf82f75bb3
8 changed files with 236 additions and 1 deletions

4
.gitignore vendored
View file

@ -32,7 +32,9 @@ nosetests.xml
# PyGooglePlus
.idea/
ext__
*.c
ext/libcharsetdetect/mozilla_
src/cchardet/*.c
src/cchardet/*.cpp
libcharsetdetect.dll
charsetdetect.h
MANIFEST

View file

@ -0,0 +1,38 @@
#include "charsetdetect.h"
#include "stdio.h"
/* Size in bytes of the single read buffer.  Parenthesised so the macro
 * expands safely when used inside a larger expression. */
#define BUFFER_SIZE (100 * 1024)

/*
 * Command-line smoke test for the charsetdetect C API.
 *
 * Reads at most one buffer (up to BUFFER_SIZE bytes) from stdin, hands it to
 * a detector and prints the resulting charset name on stdout.
 *
 * Exit status:
 *   0  a charset name was returned and printed
 *   1  csd_open() failed
 *   2  csd_close() returned NULL (unknown character set)
 *   3  csd_consider() reported an error
 */
int main(int argc, const char * argv[]) {
    (void)argc;
    (void)argv;

    csd_t csd = csd_open();
    if (csd == (csd_t)-1) {
        printf("csd_open failed\n");
        return 1;
    }

    /* fread() returns size_t, so keep the count in a size_t instead of
     * squeezing it through an int. */
    size_t size;
    char buf[BUFFER_SIZE] = {0};
    while ((size = fread(buf, 1, sizeof(buf), stdin)) != 0) {
        printf("CLIENT SENDING More data\n");
        /* size never exceeds BUFFER_SIZE, so the narrowing cast is safe. */
        int status = csd_consider(csd, buf, (int)size);
        if (status < 0) {
            printf("csd_consider failed\n");
            /* Close the handle on the error path too, so the detector
             * obtained from csd_open() is not left open. */
            csd_close(csd);
            return 3;
        } else if (status == 0) {
            /* Already have enough data */
            break;
        }
        /* Only send one buffer actually, for testing */
        break;
    }

    const char *charset = csd_close(csd);
    if (charset == NULL) {
        printf("Unknown character set\n");
        return 2;
    }
    printf("%s\n", charset);
    return 0;
}

59
setup2.py Normal file
View file

@ -0,0 +1,59 @@
#!/usr/bin/env python
# coding: utf-8
import ez_setup
ez_setup.use_setuptools()
import os,sys,platform,shutil
import subprocess
from setuptools import setup, Extension
import distutils.spawn as ds
from Cython.Distutils import build_ext
# Every location below is resolved against the directory the script is
# launched from.
root = os.getcwd()
ext_dir, src_dir, build_dir = (
    os.path.join(root, leaf) for leaf in ('ext_', 'src', 'build')
)

# Package directory holding the Cython wrapper and its single .pyx source.
cchardet_dir = os.path.join(src_dir, 'cchardet')
cchardet_source = os.path.join(cchardet_dir, 'cchardet2.pyx')

# libcharsetdetect directory under ext_ and its build subdirectory.
charsetdetect_dir = os.path.join(ext_dir, 'libcharsetdetect')
charsetdetect_build_dir = os.path.join(charsetdetect_dir, 'build')
# Extension module compiled from the .pyx source.  The options that would
# link against a charsetdetect library are kept below but stay disabled.
cchardet_module = Extension(
    "_cchardet",
    [cchardet_source],
    # libraries=['charsetdetect'],
    # include_dirs=[charsetdetect_dir],
    # library_dirs=[charsetdetect_build_dir],
    language="c",
)
# Package metadata and build configuration handed to setuptools.
setup(
    # Identity
    name='cchardet',
    version='0.1',
    author='PyYoshi',
    url=r"https://github.com/PyYoshi/cChardet",

    # Descriptions
    description='Universal encoding detector',
    long_description=(
        "This library is high speed universal character encoding detector. - binding to charsetdetect.\n"
        "This library is faster than chardet.\n"
    ),
    # Classifier list: http://pypi.python.org/pypi?:action=list_classifiers
    classifiers=[
        'Development Status :: 1 - Planning',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Cython',
        'Programming Language :: Python',
        'Topic :: Software Development :: Libraries',
    ],
    keywords=[
        'cython',
        'chardet',
        'universal character encoding detector',
        'charsetdetect',
    ],

    # Package layout: the pure-Python package lives under src_dir and the
    # compiled extension is installed inside the ``cchardet`` package.
    package_dir={'': src_dir},
    packages=['cchardet'],
    ext_package='cchardet',

    # Build: let Cython's build_ext compile the .pyx extension.
    cmdclass={'build_ext': build_ext},
    ext_modules=[cchardet_module],
)

109
src/cchardet/cchardet2.pyx Normal file
View file

@ -0,0 +1,109 @@
# coding:utf8
from libc.stdlib cimport malloc, free
cimport prtypes, src.cchardet.nscore
# Alias for a read-only C string: Cython sees ``char*`` while the quoted
# C name makes the generated code spell the type as ``const char*``.
cdef extern from *:
    ctypedef char* const_char_ptr "const char*"
# Declarations taken from the C++ header nsUniversalDetector.h.
cdef extern from "nsUniversalDetector.h":
    # Opaque prober type; only pointers to it are stored below.
    cdef cppclass nsCharSetProber

    cdef enum:
        NUM_OF_CHARSET_PROBERS = 3

    # State of the input seen so far, as tracked by the detector.
    cdef enum nsInputState:
        ePureAscii = 0
        eEscAscii = 1
        eHighbyte = 2

    # Language-filter bit flags accepted by the detector constructor.
    cdef unsigned int NS_FILTER_CHINESE_SIMPLIFIED = 0x01
    cdef unsigned int NS_FILTER_CHINESE_TRADITIONAL = 0x02
    cdef unsigned int NS_FILTER_JAPANESE = 0x04
    cdef unsigned int NS_FILTER_KOREAN = 0x08
    cdef unsigned int NS_FILTER_NON_CJK = 0x10
    cdef unsigned int NS_FILTER_ALL = 0x1F
    # NS_FILTER_CHINESE covers only the two Chinese flags; the four-flag
    # combination (Chinese + Japanese + Korean) is NS_FILTER_CJK.
    cdef unsigned int NS_FILTER_CHINESE = NS_FILTER_CHINESE_SIMPLIFIED | NS_FILTER_CHINESE_TRADITIONAL
    cdef unsigned int NS_FILTER_CJK = NS_FILTER_CHINESE_SIMPLIFIED | NS_FILTER_CHINESE_TRADITIONAL | NS_FILTER_JAPANESE | NS_FILTER_KOREAN

    # NOTE(review): declared as ``cdef class`` although the prober above is
    # a ``cppclass`` -- confirm which kind of declaration is intended here.
    cdef class nsUniversalDetector:
        cdef nsUniversalDetector(self, PRUint32 aLanguageFilter)
        cdef nsresult HandleData(self, const_char_ptr aBuf, PRUint32 aLen)
        cdef void DataEnd(self,)
        cdef void _Report(self,const_char_ptr aCharset)
        cdef void _Reset(self)

        cdef nsInputState _mInputState
        cdef PRBool _mDone
        cdef PRBool _mInTag
        cdef PRBool _mStart
        cdef PRBool _mGotData
        cdef char _mLastChar
        cdef const_char_ptr _mDetectedCharset
        # Was spelled ``PRUInt32``, a name declared nowhere; prtypes.pxd
        # declares ``PRUint32``.
        # NOTE(review): the upstream header may declare this member with a
        # signed 32-bit type -- confirm against nsUniversalDetector.h.
        cdef PRUint32 _mBestGuess
        cdef PRUint32 _mLanguageFilter
        cdef nsCharSetProber *_mCharsetProber[NUM_OF_CHARSET_PROBERS]
        cdef nsCharSetProber *_mEscCharSetProber
"""
cdef extern from *:
cdef class Detector(nsUniversalDetector):
cdef Detector(self, PRUint32 aLanguageFilter):
nsUniversalDetector(self, aLanguageFilter)
cdef int Consider(self, const_char_ptr data, int length)
cdef const_char_ptr Close(self, )
cdef void _Report(self, const_char_ptr aCharset)
cdef const_char_ptr *_mDetectedCharset"""
# Detector built on nsUniversalDetector that remembers the reported charset
# and exposes the Consider/Close pair used by the csd_* functions.
cdef class Detector(nsUniversalDetector):
    # NOTE(review): constructor written in C++ style; confirm how the base
    # class is meant to be initialised from Cython.
    cdef Detector(self, PRUint32 aLanguageFilter):
        nsUniversalDetector(self, aLanguageFilter)

    # Record the charset handed in and mark detection as finished.
    # NOTE(review): the base declaration in this file is named ``_Report``;
    # confirm which spelling is intended.
    cdef void Report(self, const_char_ptr aCharset):
        self._mDone = PR_TRUE
        self._mDetectedCharset = aCharset

    # Feed ``length`` bytes of ``data`` to the detector.
    # Returns -1 on out-of-memory, 0 once a charset has been detected,
    # and 1 while more data is still needed.
    cdef int Consider(self, const_char_ptr data, int length):
        # HandleData is a method, so it has to be called on self.
        if self.HandleData(data, length) == NS_ERROR_OUT_OF_MEMORY:
            # Error, signal with a negative number
            return -1
        if self._mDone:
            # Detected early
            return 0
        # Need more data!
        return 1

    # Finish detection and return the charset name.
    # When nothing was reported, falls back to "ibm850" for escaped-ASCII
    # input and "ASCII" for pure-ASCII input; otherwise returns NULL.
    cdef const_char_ptr Close(self):
        self.DataEnd()
        if not self._mDone:
            if self._mInputState == eEscAscii:
                return "ibm850"
            elif self._mInputState == ePureAscii:
                return "ASCII"
            # The return type is a C pointer, so "unknown" must be NULL
            # (the C caller compares the result against NULL), not None.
            return NULL
        return self._mDetectedCharset
# C-style handle API: an opaque handle type plus the open/consider/close
# functions defined further down in this file.
cdef extern from *:
    ctypedef void* csd_t
    cdef csd_t csd_open()
    # ``data`` uses const_char_ptr so the declaration matches the
    # definition of csd_consider below.
    cdef int csd_consider(csd_t csd, const_char_ptr data, int length)
    cdef const_char_ptr csd_close(csd_t csd)
# Create a detector for the C-style API and return it as a csd_t handle.
# The detector is constructed with NS_FILTER_ALL.
# NOTE(review): a Detector is returned directly where the declared return
# type is csd_t (void*); confirm how the conversion and the lifetime of the
# object are meant to work.
cdef csd_t csd_open():
# TODO: capture exceptions thrown by "new" and return -1 in that case
# TODO: provide C-land with access to the language filter constructor argument
return Detector(NS_FILTER_ALL)
# Feed ``length`` bytes of ``data`` to the detector behind ``csd`` and
# return the result of Detector.Consider.
cdef int csd_consider(csd_t csd, const_char_ptr data, int length):
    # C++ equivalent: ((Detector*)csd)->Consider(data, length);
    # The cast is parenthesised because ``<T>x.f()`` casts the value
    # returned by the call, not ``x`` itself.
    # NOTE(review): Detector is declared with ``cdef class`` in this file;
    # confirm that a pointer cast is the intended form.
    return (<Detector*>csd).Consider(data, length)
# Finish detection on ``csd``, dispose of the detector and return the
# charset name produced by Detector.Close (which may be a null pointer).
cdef const_char_ptr csd_close(csd_t csd):
    # Cast the handle once into a local: ``<T>x.f()`` would cast the call
    # result rather than ``x``, and ``del`` needs a plain variable.
    # NOTE(review): Detector is declared with ``cdef class`` in this file;
    # confirm that a pointer cast and ``del`` are the intended form.
    cdef Detector* detector = <Detector*>csd
    cdef const_char_ptr result = detector.Close()
    del detector
    return result

9
src/cchardet/nscore.pxd Normal file
View file

@ -0,0 +1,9 @@
# coding:utf8
# Declarations taken from the C header nscore.h.
# base: https://github.com/kmshi/miro/blob/5d7cdd679830169590a677632cd88a2fa27f81f5/tv/windows/plat/frontends/widgets/XULRunnerBrowser/xulrunnerbrowser.pyx
#
# NOTE(review): PRUint32 is used here without a cimport in this file, and
# PRBool is also declared in prtypes.pxd (there as PRIntn) -- confirm which
# definitions should win.
cdef extern from "nscore.h":
    # Result code type.
    ctypedef PRUint32 nsresult
    ctypedef PRUint32 PRBool

    cdef enum:
        NS_OK = 0

    cdef PRUint32 NS_ERROR_OUT_OF_MEMORY = <nsresult>0x8007000eL

9
src/cchardet/prtypes.pxd Normal file
View file

@ -0,0 +1,9 @@
# coding:utf8
# Basic integer/boolean typedefs and boolean constants taken from the C
# header prtypes.h.
cdef extern from "prtypes.h":
    ctypedef unsigned int PRUint32
    ctypedef int PRIntn
    # Boolean type, declared as an alias of PRIntn.
    ctypedef PRIntn PRBool

    cdef enum:
        PR_FALSE = 0
        PR_TRUE = 1

5
src/cchardet/python.pxd Normal file
View file

@ -0,0 +1,5 @@
# coding:utf8
# Memory allocation functions declared from Python.h.
cdef extern from "Python.h":
    void* PyMem_Malloc(size_t nbytes)
    void PyMem_Free(void* ptr)

4
src/cchardet/string.pxd Normal file
View file

@ -0,0 +1,4 @@
# coding:utf8
# C string-length function declared from string.h.
cdef extern from "string.h":
    # strlen() returns size_t in C; declaring it as int would narrow the
    # result and make it signed.
    cdef size_t strlen(char *s)