refresh
This commit is contained in:
parent
dbf1b9c316
commit
93aed088f2
8 changed files with 2 additions and 200 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -34,6 +34,7 @@ nosetests.xml
|
||||||
ext__
|
ext__
|
||||||
ext/libcharsetdetect/mozilla_
|
ext/libcharsetdetect/mozilla_
|
||||||
src/cchardet/*.c
|
src/cchardet/*.c
|
||||||
|
src/cchardet/*.o
|
||||||
src/cchardet/*.cpp
|
src/cchardet/*.cpp
|
||||||
libcharsetdetect.dll
|
libcharsetdetect.dll
|
||||||
charsetdetect.h
|
charsetdetect.h
|
||||||
|
|
4
ext/libcharsetdetect/.gitignore
vendored
4
ext/libcharsetdetect/.gitignore
vendored
|
@ -24,7 +24,3 @@ CMakeCache.txt
|
||||||
CMakeFiles/
|
CMakeFiles/
|
||||||
cmake_install.cmake
|
cmake_install.cmake
|
||||||
install_manifest.txt
|
install_manifest.txt
|
||||||
|
|
||||||
# prebuild junk
|
|
||||||
configure.bat
|
|
||||||
exec_make.bat
|
|
59
setup2.py
59
setup2.py
|
@ -1,59 +0,0 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
# coding: utf-8
|
|
||||||
|
|
||||||
import ez_setup
|
|
||||||
ez_setup.use_setuptools()
|
|
||||||
import os,sys,platform,shutil
|
|
||||||
import subprocess
|
|
||||||
from setuptools import setup, Extension
|
|
||||||
import distutils.spawn as ds
|
|
||||||
from Cython.Distutils import build_ext
|
|
||||||
|
|
||||||
root = os.getcwd()
|
|
||||||
ext_dir = os.path.join(root,'ext_')
|
|
||||||
src_dir = os.path.join(root,'src')
|
|
||||||
build_dir = os.path.join(root,'build')
|
|
||||||
cchardet_dir = os.path.join(src_dir,'cchardet')
|
|
||||||
cchardet_source = os.path.join(cchardet_dir,"cchardet2.pyx")
|
|
||||||
charsetdetect_dir = os.path.join(ext_dir, 'libcharsetdetect')
|
|
||||||
charsetdetect_build_dir = os.path.join(charsetdetect_dir,'build')
|
|
||||||
|
|
||||||
|
|
||||||
cchardet_module = Extension("_cchardet",
|
|
||||||
sources = [cchardet_source],
|
|
||||||
#libraries = ['charsetdetect'],
|
|
||||||
#include_dirs = [charsetdetect_dir],
|
|
||||||
#library_dirs = [charsetdetect_build_dir],
|
|
||||||
language = "c",
|
|
||||||
)
|
|
||||||
|
|
||||||
setup(
|
|
||||||
name = 'cchardet',
|
|
||||||
author= 'PyYoshi',
|
|
||||||
url = r"https://github.com/PyYoshi/cChardet",
|
|
||||||
description = 'Universal encoding detector',
|
|
||||||
long_description= """This library is high speed universal character encoding detector. - binding to charsetdetect.
|
|
||||||
This library is faster than chardet.
|
|
||||||
""",
|
|
||||||
version = '0.1',
|
|
||||||
classifiers = [ # http://pypi.python.org/pypi?:action=list_classifiers
|
|
||||||
'Development Status :: 1 - Planning',
|
|
||||||
'License :: OSI Approved :: MIT License',
|
|
||||||
'Programming Language :: Cython',
|
|
||||||
'Programming Language :: Python',
|
|
||||||
'Topic :: Software Development :: Libraries',
|
|
||||||
],
|
|
||||||
keywords = [
|
|
||||||
'cython',
|
|
||||||
'chardet',
|
|
||||||
'universal character encoding detector',
|
|
||||||
'charsetdetect'
|
|
||||||
],
|
|
||||||
ext_package='cchardet',
|
|
||||||
package_dir = {'':src_dir},
|
|
||||||
packages = ['cchardet'],
|
|
||||||
cmdclass = {'build_ext': build_ext},
|
|
||||||
ext_modules = [
|
|
||||||
cchardet_module
|
|
||||||
],
|
|
||||||
)
|
|
|
@ -1,109 +0,0 @@
|
||||||
# coding:utf8
|
|
||||||
|
|
||||||
from libc.stdlib cimport malloc, free
|
|
||||||
|
|
||||||
cimport prtypes, src.cchardet.nscore
|
|
||||||
|
|
||||||
cdef extern from *:
|
|
||||||
ctypedef char* const_char_ptr "const char*"
|
|
||||||
|
|
||||||
cdef extern from "nsUniversalDetector.h":
|
|
||||||
cdef cppclass nsCharSetProber
|
|
||||||
cdef enum:
|
|
||||||
NUM_OF_CHARSET_PROBERS = 3
|
|
||||||
cdef enum nsInputState:
|
|
||||||
ePureAscii = 0
|
|
||||||
eEscAscii = 1
|
|
||||||
eHighbyte = 2
|
|
||||||
|
|
||||||
cdef unsigned int NS_FILTER_CHINESE_SIMPLIFIED = 0x01
|
|
||||||
cdef unsigned int NS_FILTER_CHINESE_TRADITIONAL = 0x02
|
|
||||||
cdef unsigned int NS_FILTER_JAPANESE = 0x04
|
|
||||||
cdef unsigned int NS_FILTER_KOREAN = 0x08
|
|
||||||
cdef unsigned int NS_FILTER_NON_CJK = 0x10
|
|
||||||
cdef unsigned int NS_FILTER_ALL = 0x1F
|
|
||||||
cdef unsigned int NS_FILTER_CHINESE = NS_FILTER_CHINESE_SIMPLIFIED | NS_FILTER_CHINESE_TRADITIONAL | NS_FILTER_JAPANESE | NS_FILTER_KOREAN
|
|
||||||
|
|
||||||
cdef class nsUniversalDetector:
|
|
||||||
cdef nsUniversalDetector(self, PRUint32 aLanguageFilter)
|
|
||||||
cdef nsresult HandleData(self, const_char_ptr aBuf, PRUint32 aLen)
|
|
||||||
cdef void DataEnd(self,)
|
|
||||||
|
|
||||||
cdef void _Report(self,const_char_ptr aCharset)
|
|
||||||
cdef void _Reset(self)
|
|
||||||
cdef nsInputState _mInputState
|
|
||||||
cdef PRBool _mDone
|
|
||||||
cdef PRBool _mInTag
|
|
||||||
cdef PRBool _mStart
|
|
||||||
cdef PRBool _mGotData
|
|
||||||
cdef char _mLastChar
|
|
||||||
cdef const_char_ptr _mDetectedCharset
|
|
||||||
cdef PRUInt32 _mBestGuess
|
|
||||||
cdef PRUint32 _mLanguageFilter
|
|
||||||
|
|
||||||
cdef nsCharSetProber *_mCharsetProber[NUM_OF_CHARSET_PROBERS]
|
|
||||||
cdef nsCharSetProber *_mEscCharSetProber
|
|
||||||
|
|
||||||
"""
|
|
||||||
cdef extern from *:
|
|
||||||
cdef class Detector(nsUniversalDetector):
|
|
||||||
cdef Detector(self, PRUint32 aLanguageFilter):
|
|
||||||
nsUniversalDetector(self, aLanguageFilter)
|
|
||||||
cdef int Consider(self, const_char_ptr data, int length)
|
|
||||||
cdef const_char_ptr Close(self, )
|
|
||||||
|
|
||||||
cdef void _Report(self, const_char_ptr aCharset)
|
|
||||||
cdef const_char_ptr *_mDetectedCharset"""
|
|
||||||
|
|
||||||
cdef class Detector(nsUniversalDetector):
|
|
||||||
cdef Detector(self, PRUint32 aLanguageFilter):
|
|
||||||
nsUniversalDetector(self, aLanguageFilter)
|
|
||||||
|
|
||||||
cdef void Report(self, const_char_ptr aCharset):
|
|
||||||
self._mDone = PR_TRUE
|
|
||||||
self._mDetectedCharset = aCharset
|
|
||||||
|
|
||||||
cdef int Consider(self, const_char_ptr data, int length):
|
|
||||||
if HandleData(data,length) == NS_ERROR_OUT_OF_MEMORY:
|
|
||||||
# Error, signal with a negative number
|
|
||||||
return -1
|
|
||||||
|
|
||||||
if self._mDone:
|
|
||||||
# Detected early
|
|
||||||
return 0
|
|
||||||
|
|
||||||
# Need more data!
|
|
||||||
return 1
|
|
||||||
|
|
||||||
cdef const_char_ptr Close(self):
|
|
||||||
self.DataEnd()
|
|
||||||
|
|
||||||
if not self._mDone:
|
|
||||||
if self._mInputState == eEscAscii:
|
|
||||||
return "ibm850"
|
|
||||||
elif self._mInputState == ePureAscii:
|
|
||||||
return "ASCII"
|
|
||||||
|
|
||||||
return None
|
|
||||||
|
|
||||||
return self._mDetectedCharset
|
|
||||||
|
|
||||||
cdef extern from *:
|
|
||||||
ctypedef void* csd_t
|
|
||||||
cdef csd_t csd_open()
|
|
||||||
cdef int csd_consider(csd_t csd, char* data, int length)
|
|
||||||
cdef const_char_ptr csd_close(csd_t csd)
|
|
||||||
|
|
||||||
cdef csd_t csd_open():
|
|
||||||
# TODO: capture exceptions thrown by "new" and return -1 in that case
|
|
||||||
# TODO: provide C-land with access to the language filter constructor argument
|
|
||||||
return Detector(NS_FILTER_ALL)
|
|
||||||
|
|
||||||
cdef int csd_consider(csd_t csd, const_char_ptr data, int length):
|
|
||||||
# return ((Detector*)csd)->Consider(data, length);
|
|
||||||
return <Detector*>csd.Consider(data, length)
|
|
||||||
|
|
||||||
cdef const_char_ptr csd_close(csd_t csd):
|
|
||||||
cdef const_char_ptr result = <Detector*>csd.Close()
|
|
||||||
del <Detector*>csd
|
|
||||||
return result
|
|
|
@ -1,9 +0,0 @@
|
||||||
# coding:utf8
|
|
||||||
|
|
||||||
cdef extern from "nscore.h":
|
|
||||||
# base: https://github.com/kmshi/miro/blob/5d7cdd679830169590a677632cd88a2fa27f81f5/tv/windows/plat/frontends/widgets/XULRunnerBrowser/xulrunnerbrowser.pyx
|
|
||||||
ctypedef PRUint32 nsresult
|
|
||||||
ctypedef PRUint32 PRBool
|
|
||||||
cdef enum:
|
|
||||||
NS_OK = 0
|
|
||||||
cdef PRUint32 NS_ERROR_OUT_OF_MEMORY = <nsresult>0x8007000eL
|
|
|
@ -1,9 +0,0 @@
|
||||||
# coding:utf8
|
|
||||||
|
|
||||||
cdef extern from "prtypes.h":
|
|
||||||
ctypedef unsigned int PRUint32
|
|
||||||
ctypedef int PRIntn
|
|
||||||
ctypedef PRIntn PRBool
|
|
||||||
cdef enum:
|
|
||||||
PR_TRUE = 1
|
|
||||||
PR_FALSE = 0
|
|
|
@ -1,5 +0,0 @@
|
||||||
# coding:utf8
|
|
||||||
|
|
||||||
cdef extern from "Python.h":
|
|
||||||
void * PyMem_Malloc(size_t)
|
|
||||||
void PyMem_Free(void *)
|
|
|
@ -1,4 +0,0 @@
|
||||||
# coding:utf8
|
|
||||||
|
|
||||||
cdef extern from "string.h":
|
|
||||||
cdef int strlen(char *s)
|
|
Loading…
Reference in a new issue