From 66ee23d3929dc5d5ece99f62cf6ba63e0f5fd87f Mon Sep 17 00:00:00 2001 From: mahdi Date: Wed, 3 May 2017 17:51:22 +0430 Subject: [PATCH] gzip pages decoder added --- src/boilerpipe/extract/__init__.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/boilerpipe/extract/__init__.py b/src/boilerpipe/extract/__init__.py index 0f8a2a5..fb225f0 100644 --- a/src/boilerpipe/extract/__init__.py +++ b/src/boilerpipe/extract/__init__.py @@ -41,6 +41,16 @@ def __init__(self, extractor='DefaultExtractor', **kwargs): encoding = connection.headers['content-type'].lower().split('charset=')[-1] if encoding.lower() == 'text/html': encoding = chardet.detect(self.data)['encoding'] + try: + import gzip + import StringIO + data = StringIO.StringIO(self.data) + gzipper = gzip.GzipFile(fileobj=data) + self.data = gzipper.read() + #self.data = gzip.decompress(self.data) + except Exception as inst: + #print inst + pass try: self.data = unicode(self.data, encoding) except NameError: