Add cchardetect
CLI script, with similar output to chardet's chardetect
This commit is contained in:
parent
c991f86f6f
commit
b5a00f7abe
4 changed files with 33 additions and 2 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -10,7 +10,6 @@ dist
|
||||||
build
|
build
|
||||||
eggs
|
eggs
|
||||||
parts
|
parts
|
||||||
bin
|
|
||||||
var
|
var
|
||||||
sdist
|
sdist
|
||||||
develop-eggs
|
develop-eggs
|
||||||
|
|
1
TODO.md
1
TODO.md
|
@ -1,2 +1 @@
|
||||||
- Implement cli tool (like chardet cli)
|
|
||||||
- Improve docs
|
- Improve docs
|
||||||
|
|
32
bin/cchardetect
Executable file
32
bin/cchardetect
Executable file
|
@ -0,0 +1,32 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
from __future__ import print_function, unicode_literals
|
||||||
|
import argparse
|
||||||
|
import cchardet
|
||||||
|
|
||||||
|
|
||||||
|
def read_chunks(f, chunk_size):
    """Yield successive pieces of ``f``, each read with ``f.read(chunk_size)``.

    Iteration ends at the first read that returns an empty (falsy) value,
    which is how file objects signal end-of-file. Nothing is yielded for an
    empty file.
    """
    while True:
        piece = f.read(chunk_size)
        if not piece:
            # An empty read means there is nothing left to consume.
            return
        yield piece
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Print the detected encoding and confidence for each file on the command line.

    Command-line arguments:
        files: one or more paths whose encoding should be detected.
        --chunk-size: number of bytes read per ``feed`` call (default 256 KiB).

    For every file a fresh ``cchardet.UniversalDetector`` is fed chunk by
    chunk, closed, and its result printed as
    ``<name>: <encoding> with confidence <confidence>``.
    """
    parser = argparse.ArgumentParser()
    # Open in binary mode: the detector must see the raw bytes. Text mode
    # would decode the data with the locale encoding first (Python 3), which
    # can raise UnicodeDecodeError on the very files this tool is meant for.
    parser.add_argument('files', nargs='+', help="Files to detect encoding of", type=argparse.FileType('rb'))
    parser.add_argument('--chunk-size', type=int, default=(256 * 1024))
    args = parser.parse_args()

    for f in args.files:
        detector = cchardet.UniversalDetector()
        try:
            for chunk in read_chunks(f, args.chunk_size):
                detector.feed(chunk)
        finally:
            # argparse opened the handle for us; release it once the file
            # has been consumed instead of leaking it until process exit.
            f.close()
        detector.close()
        print('{file.name}: {result[encoding]} with confidence {result[confidence]}'.format(
            file=f,
            result=detector.result
        ))
|
||||||
|
|
||||||
|
|
||||||
|
# Run the command-line interface only when this file is executed as a script.
if __name__ == '__main__':
|
||||||
|
main()
|
1
setup.py
1
setup.py
|
@ -137,6 +137,7 @@ setup(
|
||||||
cmdclass={'build_ext': build_ext},
|
cmdclass={'build_ext': build_ext},
|
||||||
package_dir={'': 'src'},
|
package_dir={'': 'src'},
|
||||||
packages=['cchardet', ],
|
packages=['cchardet', ],
|
||||||
|
scripts=['bin/cchardetect'],
|
||||||
ext_modules=[
|
ext_modules=[
|
||||||
cchardet_module
|
cchardet_module
|
||||||
],
|
],
|
||||||
|
|
Loading…
Reference in a new issue