From b5a00f7abedac7504015e718bd57cd0609e11a84 Mon Sep 17 00:00:00 2001
From: Craig de Stigter
Date: Wed, 10 May 2017 11:15:42 +1200
Subject: [PATCH] Add `cchardetect` CLI script, with similar output to chardet's `chardetect`

---
 .gitignore      |  1 -
 TODO.md         |  1 -
 bin/cchardetect | 41 +++++++++++++++++++++++++++++++++++++++++
 setup.py        |  1 +
 4 files changed, 42 insertions(+), 2 deletions(-)
 create mode 100755 bin/cchardetect

diff --git a/.gitignore b/.gitignore
index 459af73..4124f23 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,7 +10,6 @@ dist
 build
 eggs
 parts
-bin
 var
 sdist
 develop-eggs
diff --git a/TODO.md b/TODO.md
index 1bf4c34..1796bac 100644
--- a/TODO.md
+++ b/TODO.md
@@ -1,2 +1 @@
-- Implement cli tool (like chardet cli)
 - Improve docs
diff --git a/bin/cchardetect b/bin/cchardetect
new file mode 100755
index 0000000..dbad06f
--- /dev/null
+++ b/bin/cchardetect
@@ -0,0 +1,41 @@
+#!/usr/bin/env python
+"""Command-line encoding detector built on cchardet."""
+from __future__ import print_function, unicode_literals
+import argparse
+import cchardet
+
+
+def read_chunks(f, chunk_size):
+    """Yield successive reads of up to ``chunk_size`` from ``f`` until a read returns nothing."""
+    chunk = f.read(chunk_size)
+    while chunk:
+        yield chunk
+        chunk = f.read(chunk_size)
+
+
+def main():
+    """Print the detected encoding and confidence for each file given on the command line."""
+    parser = argparse.ArgumentParser()
+    # Open in binary mode: the detector needs the raw bytes, and text mode
+    # would decode them first (and can fail on the very files being probed).
+    parser.add_argument('files', nargs='+', help="Files to detect encoding of", type=argparse.FileType('rb'))
+    parser.add_argument('--chunk-size', type=int, default=(256 * 1024))
+    args = parser.parse_args()
+
+    for f in args.files:
+        detector = cchardet.UniversalDetector()
+        try:
+            for chunk in read_chunks(f, args.chunk_size):
+                detector.feed(chunk)
+        finally:
+            # argparse opened the file for us; release the handle promptly.
+            f.close()
+        detector.close()
+        print('{file.name}: {result[encoding]} with confidence {result[confidence]}'.format(
+            file=f,
+            result=detector.result
+        ))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/setup.py b/setup.py
index 1639fae..7251d8f 100644
--- a/setup.py
+++ b/setup.py
@@ -137,6 +137,7 @@ setup(
     cmdclass={'build_ext': build_ext},
     package_dir={'': 'src'},
     packages=['cchardet', ],
+    scripts=['bin/cchardetect'],
     ext_modules=[
         cchardet_module
     ],