refresh
This commit is contained in:
parent
b68dbdaf9d
commit
cf82f75bb3
8 changed files with 236 additions and 1 deletions
4
.gitignore
vendored
4
.gitignore
vendored
|
@ -32,7 +32,9 @@ nosetests.xml
|
||||||
# PyGooglePlus
|
# PyGooglePlus
|
||||||
.idea/
|
.idea/
|
||||||
ext__
|
ext__
|
||||||
*.c
|
ext/libcharsetdetect/mozilla_
|
||||||
|
src/cchardet/*.c
|
||||||
|
src/cchardet/*.cpp
|
||||||
libcharsetdetect.dll
|
libcharsetdetect.dll
|
||||||
charsetdetect.h
|
charsetdetect.h
|
||||||
MANIFEST
|
MANIFEST
|
38
ext/libcharsetdetect/trchardet.c
Normal file
38
ext/libcharsetdetect/trchardet.c
Normal file
|
@ -0,0 +1,38 @@
|
||||||
|
#include "charsetdetect.h"
|
||||||
|
#include "stdio.h"
|
||||||
|
|
||||||
|
// Size in bytes of the stdin read buffer (100 KiB). The expansion is
// parenthesised so the macro behaves as a single value inside larger
// expressions such as `x / BUFFER_SIZE`.
#define BUFFER_SIZE (100 * 1024)
|
||||||
|
|
||||||
|
int main(int argc, const char * argv[]) {
|
||||||
|
csd_t csd = csd_open();
|
||||||
|
if (csd == (csd_t)-1) {
|
||||||
|
printf("csd_open failed\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
int size;
|
||||||
|
char buf[BUFFER_SIZE] = {0};
|
||||||
|
|
||||||
|
while ((size = fread(buf, 1, sizeof(buf), stdin)) != 0) {
|
||||||
|
printf("CLIENT SENDING More data\n");
|
||||||
|
int result = csd_consider(csd, buf, size);
|
||||||
|
if (result < 0) {
|
||||||
|
printf("csd_consider failed\n");
|
||||||
|
return 3;
|
||||||
|
} else if (result == 0) {
|
||||||
|
// Already have enough data
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
// Only send one buffer actually, for testing
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
const char *result = csd_close(csd);
|
||||||
|
if (result == NULL) {
|
||||||
|
printf("Unknown character set\n");
|
||||||
|
return 2;
|
||||||
|
} else {
|
||||||
|
printf("%s\n", result);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
59
setup2.py
Normal file
59
setup2.py
Normal file
|
@ -0,0 +1,59 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
# coding: utf-8
|
||||||
|
|
||||||
|
import ez_setup
|
||||||
|
ez_setup.use_setuptools()
|
||||||
|
import os,sys,platform,shutil
|
||||||
|
import subprocess
|
||||||
|
from setuptools import setup, Extension
|
||||||
|
import distutils.spawn as ds
|
||||||
|
from Cython.Distutils import build_ext
|
||||||
|
|
||||||
|
# Resolve every path from this script's own location rather than from the
# current working directory, so the build also works when the script is
# invoked from another directory.
root = os.path.dirname(os.path.abspath(__file__))
ext_dir = os.path.join(root, 'ext_')
src_dir = os.path.join(root, 'src')
build_dir = os.path.join(root, 'build')
cchardet_dir = os.path.join(src_dir, 'cchardet')
cchardet_source = os.path.join(cchardet_dir, "cchardet2.pyx")
charsetdetect_dir = os.path.join(ext_dir, 'libcharsetdetect')
charsetdetect_build_dir = os.path.join(charsetdetect_dir, 'build')


# Extension built from the Cython source. The .pyx wraps a C++ header and
# uses C++-only constructs (cppclass, del), so Cython must generate C++.
cchardet_module = Extension(
    "_cchardet",
    sources=[cchardet_source],
    # Linking against a prebuilt libcharsetdetect is currently disabled:
    #libraries = ['charsetdetect'],
    #include_dirs = [charsetdetect_dir],
    #library_dirs = [charsetdetect_build_dir],
    language="c++",
)

setup(
    name='cchardet',
    author='PyYoshi',
    url=r"https://github.com/PyYoshi/cChardet",
    description='Universal encoding detector',
    long_description="""This library is high speed universal character encoding detector. - binding to charsetdetect.
This library is faster than chardet.
""",
    version='0.1',
    classifiers=[  # http://pypi.python.org/pypi?:action=list_classifiers
        'Development Status :: 1 - Planning',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Cython',
        'Programming Language :: Python',
        'Topic :: Software Development :: Libraries',
    ],
    keywords=[
        'cython',
        'chardet',
        'universal character encoding detector',
        'charsetdetect'
    ],
    ext_package='cchardet',
    package_dir={'': src_dir},
    packages=['cchardet'],
    cmdclass={'build_ext': build_ext},
    ext_modules=[
        cchardet_module
    ],
)
|
109
src/cchardet/cchardet2.pyx
Normal file
109
src/cchardet/cchardet2.pyx
Normal file
|
@ -0,0 +1,109 @@
|
||||||
|
# coding:utf8
|
||||||
|
|
||||||
|
from libc.stdlib cimport malloc, free
|
||||||
|
|
||||||
|
cimport prtypes, src.cchardet.nscore
|
||||||
|
|
||||||
|
# Declares const_char_ptr as a char* on the Cython side; the quoted C name
# makes the generated code spell the type as `const char*`.
cdef extern from *:
    ctypedef char* const_char_ptr "const char*"
|
||||||
|
|
||||||
|
# Declarations for the universal charset detector header.
#
# NOTE(review): the header describes a C++ class; Cython normally wants
# `cdef cppclass` (members without `cdef`/`self`) and no initialisers inside
# an extern block. The declaration style below is kept as written; only the
# two concrete defects described in the comments are corrected.
cdef extern from "nsUniversalDetector.h":
    cdef cppclass nsCharSetProber

    cdef enum:
        NUM_OF_CHARSET_PROBERS = 3

    cdef enum nsInputState:
        ePureAscii = 0
        eEscAscii = 1
        eHighbyte = 2

    # Language filter bit masks accepted by the detector constructor.
    cdef unsigned int NS_FILTER_CHINESE_SIMPLIFIED = 0x01
    cdef unsigned int NS_FILTER_CHINESE_TRADITIONAL = 0x02
    cdef unsigned int NS_FILTER_JAPANESE = 0x04
    cdef unsigned int NS_FILTER_KOREAN = 0x08
    cdef unsigned int NS_FILTER_NON_CJK = 0x10
    cdef unsigned int NS_FILTER_ALL = 0x1F
    # CHINESE covers only the two Chinese variants; the four-way combination
    # that also includes Japanese and Korean is the separate CJK mask.
    cdef unsigned int NS_FILTER_CHINESE = NS_FILTER_CHINESE_SIMPLIFIED | NS_FILTER_CHINESE_TRADITIONAL
    cdef unsigned int NS_FILTER_CJK = NS_FILTER_CHINESE_SIMPLIFIED | NS_FILTER_CHINESE_TRADITIONAL | NS_FILTER_JAPANESE | NS_FILTER_KOREAN

    cdef class nsUniversalDetector:
        cdef nsUniversalDetector(self, PRUint32 aLanguageFilter)
        cdef nsresult HandleData(self, const_char_ptr aBuf, PRUint32 aLen)
        cdef void DataEnd(self,)

        cdef void _Report(self, const_char_ptr aCharset)
        cdef void _Reset(self)
        cdef nsInputState _mInputState
        cdef PRBool _mDone
        cdef PRBool _mInTag
        cdef PRBool _mStart
        cdef PRBool _mGotData
        cdef char _mLastChar
        cdef const_char_ptr _mDetectedCharset
        # Was spelled `PRUInt32`, which is not declared anywhere.
        # NOTE(review): the upstream header may declare this member as a
        # signed 32-bit integer - confirm against nsUniversalDetector.h.
        cdef PRUint32 _mBestGuess
        cdef PRUint32 _mLanguageFilter

        cdef nsCharSetProber *_mCharsetProber[NUM_OF_CHARSET_PROBERS]
        cdef nsCharSetProber *_mEscCharSetProber
|
||||||
|
|
||||||
|
"""
|
||||||
|
cdef extern from *:
|
||||||
|
cdef class Detector(nsUniversalDetector):
|
||||||
|
cdef Detector(self, PRUint32 aLanguageFilter):
|
||||||
|
nsUniversalDetector(self, aLanguageFilter)
|
||||||
|
cdef int Consider(self, const_char_ptr data, int length)
|
||||||
|
cdef const_char_ptr Close(self, )
|
||||||
|
|
||||||
|
cdef void _Report(self, const_char_ptr aCharset)
|
||||||
|
cdef const_char_ptr *_mDetectedCharset"""
|
||||||
|
|
||||||
|
# Detector built on nsUniversalDetector that remembers the charset reported
# by the base class and exposes a small Consider/Close interface.
#
# NOTE(review): a Cython `cdef class` cannot derive from a C++ class, and the
# hook is named `Report` here but `_Report` in the base declaration - the
# overall design still needs confirming.
cdef class Detector(nsUniversalDetector):
    cdef Detector(self, PRUint32 aLanguageFilter):
        nsUniversalDetector(self, aLanguageFilter)

    cdef void Report(self, const_char_ptr aCharset):
        # Record the reported charset and mark detection as finished.
        self._mDone = PR_TRUE
        self._mDetectedCharset = aCharset

    cdef int Consider(self, const_char_ptr data, int length):
        # Feed `length` bytes of `data` to the detector.
        # Returns -1 on error, 0 once a charset has been detected, and 1
        # when more data is needed.
        # HandleData is a method, so it must be called through self.
        if self.HandleData(data, length) == NS_ERROR_OUT_OF_MEMORY:
            # Error, signal with a negative number
            return -1

        if self._mDone:
            # Detected early
            return 0

        # Need more data!
        return 1

    cdef const_char_ptr Close(self):
        # Finish detection and return the charset name, or NULL if unknown.
        self.DataEnd()

        if not self._mDone:
            if self._mInputState == eEscAscii:
                return "ibm850"
            elif self._mInputState == ePureAscii:
                return "ASCII"

            # The return type is a C pointer, so "no result" is NULL, not None.
            return NULL

        return self._mDetectedCharset
|
||||||
|
|
||||||
|
# C-level interface of the detector: an opaque handle plus open / consider /
# close functions.
cdef extern from *:
    ctypedef void* csd_t
    cdef csd_t csd_open()
    # `data` is declared as const_char_ptr so the prototype agrees with the
    # csd_consider definition in this file.
    cdef int csd_consider(csd_t csd, const_char_ptr data, int length)
    cdef const_char_ptr csd_close(csd_t csd)
|
||||||
|
|
||||||
|
# Create a detector with every language filter enabled and return it as an
# opaque handle.
cdef csd_t csd_open():
    # TODO: capture exceptions thrown by "new" and return -1 in that case
    # TODO: provide C-land with access to the language filter constructor argument
    # Allocate with `new` so the handle is a heap pointer that csd_close can
    # release with `del`.
    # NOTE(review): requires Detector to be a C++-style class - confirm.
    return <csd_t>new Detector(NS_FILTER_ALL)
|
||||||
|
|
||||||
|
# Forward `length` bytes of `data` to the detector behind the handle `csd`
# and return the result of its Consider method.
cdef int csd_consider(csd_t csd, const_char_ptr data, int length):
    # Equivalent of the C++ `((Detector*)csd)->Consider(data, length)`.
    # The cast is parenthesised: without it, the cast applies to the result
    # of the call instead of to `csd`.
    return (<Detector*>csd).Consider(data, length)
|
||||||
|
|
||||||
|
# Finish detection on the handle `csd`, destroy the detector and return the
# value produced by its Close method.
cdef const_char_ptr csd_close(csd_t csd):
    # Cast once into a typed local: this avoids casting the call result
    # instead of `csd`, and gives `del` a plain name to operate on.
    cdef Detector* detector = <Detector*>csd
    cdef const_char_ptr result = detector.Close()
    del detector
    return result
|
9
src/cchardet/nscore.pxd
Normal file
9
src/cchardet/nscore.pxd
Normal file
|
@ -0,0 +1,9 @@
|
||||||
|
# coding:utf8
|
||||||
|
|
||||||
|
cdef extern from "nscore.h":
    # base: https://github.com/kmshi/miro/blob/5d7cdd679830169590a677632cd88a2fa27f81f5/tv/windows/plat/frontends/widgets/XULRunnerBrowser/xulrunnerbrowser.pyx
    # PRUint32 is not in scope in this .pxd, so the typedefs are spelled with
    # built-in C types of the same kind; the header supplies the real ones.
    ctypedef unsigned int nsresult
    # Declared as int so it agrees with prtypes.pxd (PRBool -> PRIntn -> int).
    ctypedef int PRBool
    cdef enum:
        NS_OK = 0
    # Declared without an initialiser: extern declarations take their value
    # from the header.
    # NOTE(review): assumes nscore.h defines NS_ERROR_OUT_OF_MEMORY
    # (0x8007000e) - confirm.
    nsresult NS_ERROR_OUT_OF_MEMORY
|
9
src/cchardet/prtypes.pxd
Normal file
9
src/cchardet/prtypes.pxd
Normal file
|
@ -0,0 +1,9 @@
|
||||||
|
# coding:utf8
|
||||||
|
|
||||||
|
# Integer and boolean typedefs, plus the boolean constants, taken from
# prtypes.h.
cdef extern from "prtypes.h":
    ctypedef unsigned int PRUint32
    ctypedef int PRIntn
    # PRBool is an alias of PRIntn.
    ctypedef PRIntn PRBool

    cdef enum:
        PR_TRUE = 1
        PR_FALSE = 0
|
5
src/cchardet/python.pxd
Normal file
5
src/cchardet/python.pxd
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
# coding:utf8
|
||||||
|
|
||||||
|
# Memory allocation functions declared from Python.h.
cdef extern from "Python.h":
    void * PyMem_Malloc(size_t)
    void PyMem_Free(void *)
|
4
src/cchardet/string.pxd
Normal file
4
src/cchardet/string.pxd
Normal file
|
@ -0,0 +1,4 @@
|
||||||
|
# coding:utf8
|
||||||
|
|
||||||
|
# C string length function declared from string.h.
cdef extern from "string.h":
    # The C prototype returns size_t, not int.
    cdef size_t strlen(char *s)
|
Loading…
Reference in a new issue