refresh
This commit is contained in:
parent
b68dbdaf9d
commit
cf82f75bb3
8 changed files with 236 additions and 1 deletions
4
.gitignore
vendored
4
.gitignore
vendored
|
@ -32,7 +32,9 @@ nosetests.xml
|
|||
# PyGooglePlus
|
||||
.idea/
|
||||
ext__
|
||||
*.c
|
||||
ext/libcharsetdetect/mozilla_
|
||||
src/cchardet/*.c
|
||||
src/cchardet/*.cpp
|
||||
libcharsetdetect.dll
|
||||
charsetdetect.h
|
||||
MANIFEST
|
38
ext/libcharsetdetect/trchardet.c
Normal file
38
ext/libcharsetdetect/trchardet.c
Normal file
|
@ -0,0 +1,38 @@
|
|||
#include "charsetdetect.h"
|
||||
#include "stdio.h"
|
||||
|
||||
/* Size in bytes of the stdin read buffer (100 KiB). Parenthesized so the
 * macro keeps its value when expanded inside a larger expression. */
#define BUFFER_SIZE (100 * 1024)
|
||||
|
||||
int main(int argc, const char * argv[]) {
|
||||
csd_t csd = csd_open();
|
||||
if (csd == (csd_t)-1) {
|
||||
printf("csd_open failed\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
int size;
|
||||
char buf[BUFFER_SIZE] = {0};
|
||||
|
||||
while ((size = fread(buf, 1, sizeof(buf), stdin)) != 0) {
|
||||
printf("CLIENT SENDING More data\n");
|
||||
int result = csd_consider(csd, buf, size);
|
||||
if (result < 0) {
|
||||
printf("csd_consider failed\n");
|
||||
return 3;
|
||||
} else if (result == 0) {
|
||||
// Already have enough data
|
||||
break;
|
||||
}
|
||||
// Only send one buffer actually, for testing
|
||||
break;
|
||||
}
|
||||
|
||||
const char *result = csd_close(csd);
|
||||
if (result == NULL) {
|
||||
printf("Unknown character set\n");
|
||||
return 2;
|
||||
} else {
|
||||
printf("%s\n", result);
|
||||
return 0;
|
||||
}
|
||||
}
|
59
setup2.py
Normal file
59
setup2.py
Normal file
|
@ -0,0 +1,59 @@
|
|||
#!/usr/bin/env python
|
||||
# coding: utf-8
|
||||
|
||||
import ez_setup
|
||||
ez_setup.use_setuptools()
|
||||
import os,sys,platform,shutil
|
||||
import subprocess
|
||||
from setuptools import setup, Extension
|
||||
import distutils.spawn as ds
|
||||
from Cython.Distutils import build_ext
|
||||
|
||||
# Resolve every path from the directory that contains this setup script
# rather than from the current working directory, so the build also works
# when the script is invoked from elsewhere
# (e.g. ``python path/to/setup2.py build``).
root = os.path.dirname(os.path.abspath(__file__))

# Top-level directories of the project tree.
ext_dir = os.path.join(root, 'ext_')
src_dir = os.path.join(root, 'src')
build_dir = os.path.join(root, 'build')

# Location of the Cython binding source.
cchardet_dir = os.path.join(src_dir, 'cchardet')
cchardet_source = os.path.join(cchardet_dir, "cchardet2.pyx")

# libcharsetdetect directory and its build output directory.
charsetdetect_dir = os.path.join(ext_dir, 'libcharsetdetect')
charsetdetect_build_dir = os.path.join(charsetdetect_dir, 'build')
|
||||
|
||||
|
||||
# Extension module built from the Cython source.
#
# The .pyx declares a C++ class (``cdef cppclass``) and uses ``del`` on a
# pointer, both of which Cython only accepts in C++ mode, so the extension
# language must be "c++" rather than "c".
#
# NOTE(review): the extension is named "_cchardet" while its only source is
# cchardet2.pyx; Cython derives the module init function from the .pyx file
# name, so confirm the two are meant to differ.
cchardet_module = Extension(
    "_cchardet",
    sources=[cchardet_source],
    #libraries = ['charsetdetect'],
    #include_dirs = [charsetdetect_dir],
    #library_dirs = [charsetdetect_build_dir],
    language="c++",
)
|
||||
|
||||
# Long description passed to setup() verbatim.
_description_text = """This library is high speed universal character encoding detector. - binding to charsetdetect.
This library is faster than chardet.
"""

# Trove classifiers; see http://pypi.python.org/pypi?:action=list_classifiers
_trove_classifiers = [
    'Development Status :: 1 - Planning',
    'License :: OSI Approved :: MIT License',
    'Programming Language :: Cython',
    'Programming Language :: Python',
    'Topic :: Software Development :: Libraries',
]

# Search keywords for the package index.
_search_keywords = [
    'cython',
    'chardet',
    'universal character encoding detector',
    'charsetdetect',
]

# Register the ``cchardet`` package and build its extension module with
# Cython's ``build_ext`` command.
setup(
    name='cchardet',
    version='0.1',
    author='PyYoshi',
    url=r"https://github.com/PyYoshi/cChardet",
    description='Universal encoding detector',
    long_description=_description_text,
    classifiers=_trove_classifiers,
    keywords=_search_keywords,
    package_dir={'': src_dir},
    packages=['cchardet'],
    ext_package='cchardet',
    ext_modules=[cchardet_module],
    cmdclass={'build_ext': build_ext},
)
|
109
src/cchardet/cchardet2.pyx
Normal file
109
src/cchardet/cchardet2.pyx
Normal file
|
@ -0,0 +1,109 @@
|
|||
# coding:utf8
|
||||
|
||||
from cpython.ref cimport Py_DECREF, Py_INCREF
from libc.stdlib cimport malloc, free

cimport prtypes, src.cchardet.nscore
|
||||
|
||||
cdef extern from *:
|
||||
ctypedef char* const_char_ptr "const char*"
|
||||
|
||||
cdef extern from "nsUniversalDetector.h":
|
||||
cdef cppclass nsCharSetProber
|
||||
cdef enum:
|
||||
NUM_OF_CHARSET_PROBERS = 3
|
||||
cdef enum nsInputState:
|
||||
ePureAscii = 0
|
||||
eEscAscii = 1
|
||||
eHighbyte = 2
|
||||
|
||||
cdef unsigned int NS_FILTER_CHINESE_SIMPLIFIED = 0x01
|
||||
cdef unsigned int NS_FILTER_CHINESE_TRADITIONAL = 0x02
|
||||
cdef unsigned int NS_FILTER_JAPANESE = 0x04
|
||||
cdef unsigned int NS_FILTER_KOREAN = 0x08
|
||||
cdef unsigned int NS_FILTER_NON_CJK = 0x10
|
||||
cdef unsigned int NS_FILTER_ALL = 0x1F
|
||||
cdef unsigned int NS_FILTER_CHINESE = NS_FILTER_CHINESE_SIMPLIFIED | NS_FILTER_CHINESE_TRADITIONAL | NS_FILTER_JAPANESE | NS_FILTER_KOREAN
|
||||
|
||||
cdef class nsUniversalDetector:
|
||||
cdef nsUniversalDetector(self, PRUint32 aLanguageFilter)
|
||||
cdef nsresult HandleData(self, const_char_ptr aBuf, PRUint32 aLen)
|
||||
cdef void DataEnd(self,)
|
||||
|
||||
cdef void _Report(self,const_char_ptr aCharset)
|
||||
cdef void _Reset(self)
|
||||
cdef nsInputState _mInputState
|
||||
cdef PRBool _mDone
|
||||
cdef PRBool _mInTag
|
||||
cdef PRBool _mStart
|
||||
cdef PRBool _mGotData
|
||||
cdef char _mLastChar
|
||||
cdef const_char_ptr _mDetectedCharset
|
||||
cdef PRUInt32 _mBestGuess
|
||||
cdef PRUint32 _mLanguageFilter
|
||||
|
||||
cdef nsCharSetProber *_mCharsetProber[NUM_OF_CHARSET_PROBERS]
|
||||
cdef nsCharSetProber *_mEscCharSetProber
|
||||
|
||||
"""
|
||||
cdef extern from *:
|
||||
cdef class Detector(nsUniversalDetector):
|
||||
cdef Detector(self, PRUint32 aLanguageFilter):
|
||||
nsUniversalDetector(self, aLanguageFilter)
|
||||
cdef int Consider(self, const_char_ptr data, int length)
|
||||
cdef const_char_ptr Close(self, )
|
||||
|
||||
cdef void _Report(self, const_char_ptr aCharset)
|
||||
cdef const_char_ptr *_mDetectedCharset"""
|
||||
|
||||
cdef class Detector(nsUniversalDetector):
    # Detector that remembers the charset reported to it and exposes a
    # feed (Consider) / finish (Close) pair on top of nsUniversalDetector.

    # NOTE(review): in Cython a ``cdef`` method named after the class is an
    # ordinary method, not a constructor, and the call below builds and
    # discards a new object rather than initialising ``self``. Confirm the
    # intended construction path (``__cinit__`` is the usual hook).
    cdef Detector(self, PRUint32 aLanguageFilter):
        nsUniversalDetector(self, aLanguageFilter)

    cdef void Report(self, const_char_ptr aCharset):
        # Record the reported charset and mark detection as finished.
        # NOTE(review): the base declaration names this hook ``_Report``;
        # confirm which spelling the base actually invokes.
        self._mDone = PR_TRUE
        self._mDetectedCharset = aCharset

    cdef int Consider(self, const_char_ptr data, int length):
        # Feed ``length`` bytes of ``data`` to the detector.
        # Returns -1 on out-of-memory, 0 once a charset has been detected,
        # and 1 while more data is still needed.
        #
        # HandleData is a method, so it must be called through ``self``.
        if self.HandleData(data, length) == NS_ERROR_OUT_OF_MEMORY:
            # Error, signal with a negative number
            return -1

        if self._mDone:
            # Detected early
            return 0

        # Need more data!
        return 1

    cdef const_char_ptr Close(self):
        # Signal end of input and return the charset name, or NULL when no
        # charset could be determined.
        self.DataEnd()

        if not self._mDone:
            if self._mInputState == eEscAscii:
                return "ibm850"
            elif self._mInputState == ePureAscii:
                return "ASCII"

            # The return type is a C string pointer, so "no result" is
            # NULL; the Python object None cannot be returned here.
            return NULL

        return self._mDetectedCharset
|
||||
|
||||
cdef extern from *:
|
||||
ctypedef void* csd_t
|
||||
cdef csd_t csd_open()
|
||||
cdef int csd_consider(csd_t csd, char* data, int length)
|
||||
cdef const_char_ptr csd_close(csd_t csd)
|
||||
|
||||
# C-style handle API around Detector. A handle (csd_t) is a raw pointer to a
# Detector object; csd_open takes a reference on behalf of the handle and
# csd_close releases it.
#
# NOTE(review): these names are also declared in the ``cdef extern from *``
# block just above, there with ``char* data`` for csd_consider; confirm
# whether that extern declaration should remain alongside these definitions.

cdef csd_t csd_open():
    # Create a detector with every language filter enabled and return it as
    # an opaque handle.
    # TODO: capture exceptions thrown by "new" and return -1 in that case
    # TODO: provide C-land with access to the language filter constructor argument
    cdef Detector detector = Detector(NS_FILTER_ALL)
    # The handle is a raw pointer that reference counting cannot see, so hold
    # an explicit reference until csd_close; otherwise the object would be
    # released when this function returns.
    Py_INCREF(detector)
    return <csd_t><void*>detector

cdef int csd_consider(csd_t csd, const_char_ptr data, int length):
    # Feed ``length`` bytes of ``data`` to the detector behind ``csd`` and
    # return Detector.Consider's result.
    #
    # The cast is parenthesised because ``<T>a.b()`` casts the result of the
    # call, not ``a``. Detector is an extension type, so the handle is cast
    # to ``Detector`` itself rather than to a pointer to it.
    return (<Detector>csd).Consider(data, length)

cdef const_char_ptr csd_close(csd_t csd):
    # Finish detection, release the handle and return the charset name
    # (NULL when unknown). ``csd`` must not be used after this call.
    cdef Detector detector = <Detector>csd
    cdef const_char_ptr result = detector.Close()
    # Drop the reference that csd_open took on behalf of the handle.
    Py_DECREF(detector)
    return result
|
9
src/cchardet/nscore.pxd
Normal file
9
src/cchardet/nscore.pxd
Normal file
|
@ -0,0 +1,9 @@
|
|||
# coding:utf8
|
||||
|
||||
cdef extern from "nscore.h":
|
||||
# base: https://github.com/kmshi/miro/blob/5d7cdd679830169590a677632cd88a2fa27f81f5/tv/windows/plat/frontends/widgets/XULRunnerBrowser/xulrunnerbrowser.pyx
|
||||
ctypedef PRUint32 nsresult
|
||||
ctypedef PRUint32 PRBool
|
||||
cdef enum:
|
||||
NS_OK = 0
|
||||
cdef PRUint32 NS_ERROR_OUT_OF_MEMORY = <nsresult>0x8007000eL
|
9
src/cchardet/prtypes.pxd
Normal file
9
src/cchardet/prtypes.pxd
Normal file
|
@ -0,0 +1,9 @@
|
|||
# coding:utf8
|
||||
|
||||
cdef extern from "prtypes.h":
|
||||
ctypedef unsigned int PRUint32
|
||||
ctypedef int PRIntn
|
||||
ctypedef PRIntn PRBool
|
||||
cdef enum:
|
||||
PR_TRUE = 1
|
||||
PR_FALSE = 0
|
5
src/cchardet/python.pxd
Normal file
5
src/cchardet/python.pxd
Normal file
|
@ -0,0 +1,5 @@
|
|||
# coding:utf8
|
||||
|
||||
cdef extern from "Python.h":
|
||||
void * PyMem_Malloc(size_t)
|
||||
void PyMem_Free(void *)
|
4
src/cchardet/string.pxd
Normal file
4
src/cchardet/string.pxd
Normal file
|
@ -0,0 +1,4 @@
|
|||
# coding:utf8
|
||||
|
||||
cdef extern from "string.h":
    # Standard C strlen: the result type is size_t, not int.
    size_t strlen(char *s)
|
Loading…
Reference in a new issue