This commit is contained in:
PyYoshi 2012-06-26 10:02:49 +09:00
parent dbf1b9c316
commit 93aed088f2
8 changed files with 2 additions and 200 deletions

1
.gitignore vendored
View file

@ -34,6 +34,7 @@ nosetests.xml
ext__
ext/libcharsetdetect/mozilla_
src/cchardet/*.c
src/cchardet/*.o
src/cchardet/*.cpp
libcharsetdetect.dll
charsetdetect.h

View file

@ -24,7 +24,3 @@ CMakeCache.txt
CMakeFiles/
cmake_install.cmake
install_manifest.txt
# prebuild junk
configure.bat
exec_make.bat

View file

@ -1,59 +0,0 @@
#!/usr/bin/env python
# coding: utf-8
"""Build script for the ``cchardet`` package.

Bootstraps setuptools through ``ez_setup`` and builds the ``_cchardet``
extension from ``src/cchardet/cchardet2.pyx`` using Cython's ``build_ext``
command.
"""
import ez_setup
# Kept ahead of the setuptools import so the bootstrap runs first.
ez_setup.use_setuptools()
import os
import sys
import platform
import shutil
import subprocess
from setuptools import setup, Extension
import distutils.spawn as ds
from Cython.Distutils import build_ext

# Anchor every path to the directory holding this script rather than to the
# current working directory, so the build does not depend on where the
# command happens to be invoked from.
root = os.path.dirname(os.path.abspath(__file__))
ext_dir = os.path.join(root, 'ext_')
src_dir = os.path.join(root, 'src')
build_dir = os.path.join(root, 'build')
cchardet_dir = os.path.join(src_dir, 'cchardet')
cchardet_source = os.path.join(cchardet_dir, "cchardet2.pyx")
charsetdetect_dir = os.path.join(ext_dir, 'libcharsetdetect')
charsetdetect_build_dir = os.path.join(charsetdetect_dir, 'build')

# The extension is currently built from the .pyx source alone; the settings
# for linking against libcharsetdetect are left disabled below.
cchardet_module = Extension(
    "_cchardet",
    sources = [cchardet_source],
    #libraries = ['charsetdetect'],
    #include_dirs = [charsetdetect_dir],
    #library_dirs = [charsetdetect_build_dir],
    language = "c",
)

setup(
    name = 'cchardet',
    author= 'PyYoshi',
    url = r"https://github.com/PyYoshi/cChardet",
    description = 'Universal encoding detector',
    long_description= """This library is high speed universal character encoding detector. - binding to charsetdetect.
This library is faster than chardet.
""",
    version = '0.1',
    classifiers = [ # http://pypi.python.org/pypi?:action=list_classifiers
        'Development Status :: 1 - Planning',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Cython',
        'Programming Language :: Python',
        'Topic :: Software Development :: Libraries',
    ],
    keywords = [
        'cython',
        'chardet',
        'universal character encoding detector',
        'charsetdetect'
    ],
    # The extension module is placed inside the cchardet package.
    ext_package='cchardet',
    package_dir = {'':src_dir},
    packages = ['cchardet'],
    # Route build_ext through Cython so the .pyx source gets compiled.
    cmdclass = {'build_ext': build_ext},
    ext_modules = [
        cchardet_module
    ],
)

View file

@ -1,109 +0,0 @@
# coding:utf8
from libc.stdlib cimport malloc, free
cimport prtypes, src.cchardet.nscore
# Declares const_char_ptr: treated as a plain char* on the Cython side, with
# the C name given as "const char*" in the trailing string.
cdef extern from *:
ctypedef char* const_char_ptr "const char*"
# Declarations taken from "nsUniversalDetector.h": the prober type, the
# input-state enum, the language-filter bit flags and the detector base class.
cdef extern from "nsUniversalDetector.h":
    cdef cppclass nsCharSetProber
    cdef enum:
        NUM_OF_CHARSET_PROBERS = 3
    cdef enum nsInputState:
        ePureAscii = 0
        eEscAscii = 1
        eHighbyte = 2
    # Language-filter bit flags; each language group owns one bit and
    # NS_FILTER_ALL sets all five.
    cdef unsigned int NS_FILTER_CHINESE_SIMPLIFIED = 0x01
    cdef unsigned int NS_FILTER_CHINESE_TRADITIONAL = 0x02
    cdef unsigned int NS_FILTER_JAPANESE = 0x04
    cdef unsigned int NS_FILTER_KOREAN = 0x08
    cdef unsigned int NS_FILTER_NON_CJK = 0x10
    cdef unsigned int NS_FILTER_ALL = 0x1F
    # Only the two Chinese bits belong in the Chinese mask; the four-way
    # Chinese/Japanese/Korean combination is exposed separately as
    # NS_FILTER_CJK.
    cdef unsigned int NS_FILTER_CHINESE = NS_FILTER_CHINESE_SIMPLIFIED | NS_FILTER_CHINESE_TRADITIONAL
    cdef unsigned int NS_FILTER_CJK = NS_FILTER_CHINESE_SIMPLIFIED | NS_FILTER_CHINESE_TRADITIONAL | NS_FILTER_JAPANESE | NS_FILTER_KOREAN
    cdef class nsUniversalDetector:
        # Constructor taking the language-filter bit mask.
        cdef nsUniversalDetector(self, PRUint32 aLanguageFilter)
        # Feeds aLen bytes from aBuf to the detector; returns an nsresult.
        cdef nsresult HandleData(self, const_char_ptr aBuf, PRUint32 aLen)
        cdef void DataEnd(self,)
        cdef void _Report(self,const_char_ptr aCharset)
        cdef void _Reset(self)
        cdef nsInputState _mInputState
        cdef PRBool _mDone
        cdef PRBool _mInTag
        cdef PRBool _mStart
        cdef PRBool _mGotData
        cdef char _mLastChar
        cdef const_char_ptr _mDetectedCharset
        # Spelled PRUint32 to match the declared typedef (was "PRUInt32").
        # NOTE(review): confirm the signedness of this field against the header.
        cdef PRUint32 _mBestGuess
        cdef PRUint32 _mLanguageFilter
        cdef nsCharSetProber *_mCharsetProber[NUM_OF_CHARSET_PROBERS]
        cdef nsCharSetProber *_mEscCharSetProber

# The string literal below holds an earlier, declaration-only draft of the
# Detector class; as a bare string expression it declares nothing.
"""
cdef extern from *:
cdef class Detector(nsUniversalDetector):
cdef Detector(self, PRUint32 aLanguageFilter):
nsUniversalDetector(self, aLanguageFilter)
cdef int Consider(self, const_char_ptr data, int length)
cdef const_char_ptr Close(self, )
cdef void _Report(self, const_char_ptr aCharset)
cdef const_char_ptr *_mDetectedCharset"""
# Detector built on nsUniversalDetector: records the reported charset and
# exposes a feed/close interface (Consider / Close).
cdef class Detector(nsUniversalDetector):
    # Forwards the language filter to the base-class constructor.
    cdef Detector(self, PRUint32 aLanguageFilter):
        nsUniversalDetector(self, aLanguageFilter)

    # Stores the reported charset and marks detection as finished.
    # NOTE(review): the base class declares this hook as _Report; confirm
    # which name the detector actually invokes.
    cdef void Report(self, const_char_ptr aCharset):
        self._mDone = PR_TRUE
        self._mDetectedCharset = aCharset

    # Feeds `length` bytes of `data` to the detector.
    # Returns -1 on out-of-memory, 0 once a charset has been detected,
    # and 1 while more data is still needed.
    cdef int Consider(self, const_char_ptr data, int length):
        # HandleData is a method of the base class, so it has to be called
        # through self rather than as a free function.
        if self.HandleData(data, length) == NS_ERROR_OUT_OF_MEMORY:
            # Error, signal with a negative number
            return -1
        if self._mDone:
            # Detected early
            return 0
        # Need more data!
        return 1

    # Ends detection and returns the charset name, or NULL when nothing
    # was detected.
    cdef const_char_ptr Close(self):
        self.DataEnd()
        if not self._mDone:
            # No charset was reported: fall back on the observed input state.
            if self._mInputState == eEscAscii:
                return "ibm850"
            elif self._mInputState == ePureAscii:
                return "ASCII"
            # The return type is a C string pointer, so "no result" is NULL
            # rather than the Python object None.
            return NULL
        return self._mDetectedCharset
# Opaque handle type and prototypes for the csd_* entry points.
cdef extern from *:
    ctypedef void* csd_t
    cdef csd_t csd_open()
    # data is const_char_ptr so the prototype agrees with the definition of
    # csd_consider in this file.
    cdef int csd_consider(csd_t csd, const_char_ptr data, int length)
    cdef const_char_ptr csd_close(csd_t csd)
# Creates a detector accepting every language group and returns it as an
# opaque csd_t handle; csd_close releases it.
cdef csd_t csd_open():
    # TODO: capture exceptions thrown by "new" and return -1 in that case
    # TODO: provide C-land with access to the language filter constructor argument
    # Allocate with `new` so the handle outlives this call and pairs with
    # the `del` performed in csd_close.
    return <csd_t>(new Detector(NS_FILTER_ALL))
# Feeds `length` bytes of `data` to the detector behind `csd` and returns
# the result of Detector.Consider.
cdef int csd_consider(csd_t csd, const_char_ptr data, int length):
    # return ((Detector*)csd)->Consider(data, length);
    # The cast is parenthesized so that it applies to the handle itself and
    # not to the value returned by the call.
    return (<Detector*>csd).Consider(data, length)
# Finishes detection on the handle, releases the detector and returns the
# charset name produced by Detector.Close.
cdef const_char_ptr csd_close(csd_t csd):
    # Cast the handle once; casting inline would bind to the call result
    # instead of to `csd`.
    cdef Detector* detector = <Detector*>csd
    cdef const_char_ptr result = detector.Close()
    del detector
    return result

View file

@ -1,9 +0,0 @@
# coding:utf8
# External declarations attributed to "nscore.h": the nsresult and PRBool
# typedefs plus two result codes.
# NOTE(review): PRUint32 is used below but nothing in this file brings it
# into scope - confirm where it is expected to come from.
cdef extern from "nscore.h":
# base: https://github.com/kmshi/miro/blob/5d7cdd679830169590a677632cd88a2fa27f81f5/tv/windows/plat/frontends/widgets/XULRunnerBrowser/xulrunnerbrowser.pyx
ctypedef PRUint32 nsresult
# NOTE(review): the prtypes.h declarations in this commit define PRBool as
# PRIntn, while here it is PRUint32 - confirm which is intended.
ctypedef PRUint32 PRBool
cdef enum:
NS_OK = 0
# Out-of-memory result code, cast to nsresult.
cdef PRUint32 NS_ERROR_OUT_OF_MEMORY = <nsresult>0x8007000eL

View file

@ -1,9 +0,0 @@
# coding:utf8
# External declarations attributed to "prtypes.h": integer typedefs and the
# PR_TRUE / PR_FALSE constants.
cdef extern from "prtypes.h":
ctypedef unsigned int PRUint32
ctypedef int PRIntn
# PRBool is an alias of PRIntn, i.e. a plain int.
ctypedef PRIntn PRBool
cdef enum:
PR_TRUE = 1
PR_FALSE = 0

View file

@ -1,5 +0,0 @@
# coding:utf8
# Declarations of two allocator functions from "Python.h":
# PyMem_Malloc takes a size_t and returns a void*; PyMem_Free takes a void*.
cdef extern from "Python.h":
void * PyMem_Malloc(size_t)
void PyMem_Free(void *)

View file

@ -1,4 +0,0 @@
# coding:utf8
# Declaration of strlen from "string.h", declared here as taking a char*
# and returning an int.
# NOTE(review): standard C declares strlen as returning size_t - confirm
# that int is wide enough for every caller.
cdef extern from "string.h":
cdef int strlen(char *s)