From f729e115367be262f244a3f9cdd58216528452c9 Mon Sep 17 00:00:00 2001
From: PyYoshi
Date: Fri, 14 Apr 2017 10:33:26 +0900
Subject: [PATCH] fix an issue where UTF-8 with a BOM would not be detected as UTF-8-SIG ( fix #28 )

---
 src/ext/uchardet  |  2 +-
 src/tests/test.py | 12 ++++++++++++
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/src/ext/uchardet b/src/ext/uchardet
index 795e982..a2d24a5 160000
--- a/src/ext/uchardet
+++ b/src/ext/uchardet
@@ -1 +1 @@
-Subproject commit 795e982965404568e3a8354e69b08e71137ee6ff
+Subproject commit a2d24a50fa23d71f543501168bfe9b3a859c83cd
diff --git a/src/tests/test.py b/src/tests/test.py
index 88919b7..879fae5 100644
--- a/src/tests/test.py
+++ b/src/tests/test.py
@@ -102,3 +102,15 @@ class TestCChardet():
                 except LookupError as e:
                     print("LookupError: { file=%s, encoding=%s }" % (testfile, detected_encoding["encoding"]))
                     raise e
+
+    def test_utf8_with_bom(self):
+        sample = b'\xEF\xBB\xBF'
+        detected_encoding = cchardet.detect(sample)
+        eq_(
+            "utf-8-sig",
+            detected_encoding['encoding'].lower(),
+            'Expected %s, but got %s' % (
+                "utf-8-sig",
+                detected_encoding['encoding'].lower()
+            )
+        )