diff --git a/blogofile/cache.py b/blogofile/cache.py
index 2906fe0..aac7b31 100644
--- a/blogofile/cache.py
+++ b/blogofile/cache.py
@@ -43,9 +43,9 @@ class HierarchicalCache(Cache):
     >>> c.sub.d['one.value.stuff'] = "whatever2"
     >>> c.sub.d.one.value.stuff
     'whatever2'
-    >>> c.sub.d.one.value.items()
+    >>> list(c.sub.d.one.value.items())
     [('stuff', 'whatever2')]
-    >>> c.sub.d.has_key("doesn't have this")
+    >>> "doesn't have this" in c.sub.d
     False
     """
     def __getattr__(self, attr):
diff --git a/blogofile/site_init/blog_features/_controllers/blog/post.py b/blogofile/site_init/blog_features/_controllers/blog/post.py
index 6e1f434..b8c7f89 100644
--- a/blogofile/site_init/blog_features/_controllers/blog/post.py
+++ b/blogofile/site_init/blog_features/_controllers/blog/post.py
@@ -20,6 +20,8 @@
 import pytz
 import yaml
 import logging
+import lxml.html
+import lxml.html.clean
 import blogofile_bf as bf
 
 
@@ -142,7 +144,29 @@ def __parse_post_excerpting(self):
     # This is the only portion of blogofile that depends on it,
     # so commenting out for now.. to be rewritten later
     def __excerpt(self, num_words=50):
-        return "Blogofile post excerpting is broken right now, sorry."
+        """Return a plain-text excerpt of the post content.
+
+        The excerpt holds at most ``num_words`` whitespace-separated
+        words followed by '...'. An excerpt that is already set is
+        returned unchanged.
+        """
+        if self.excerpt:
+            return self.excerpt
+        doc = lxml.html.fromstring(self.content)
+        # Drop script/style/noscript elements at any depth; their text
+        # is not part of the readable content.
+        for tag in ('style', 'noscript', 'script'):
+            for tree in doc.findall('.//' + tag):
+                tree.drop_tree()
+        clean = lxml.html.clean.clean_html(doc)
+        # Headers are titles, not body text: keep them out of the excerpt.
+        for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
+            for tree in clean.findall('.//' + tag):
+                tree.drop_tree()
+        # split() without arguments collapses every run of whitespace,
+        # so no empty "words" are produced.
+        words = clean.text_content().split()
+        return ' '.join(words[:num_words]) + '...'
 
     # def __excerpt(self, num_words=50):
     #     if len(self.excerpt) == 0:
diff --git a/blogofile/tests/test_content.py b/blogofile/tests/test_content.py
index 58e18bb..5eadf1d 100644
--- a/blogofile/tests/test_content.py
+++ b/blogofile/tests/test_content.py
@@ -2,7 +2,8 @@
 import tempfile
 import shutil
 import os
-import BeautifulSoup
+import lxml.html
+import lxml.etree
 from .. import main
 
 
@@ -147,8 +148,10 @@ def testFeedLinksAreURLs(self):
         main.main("build")
         feed = open(os.path.join(self.build_path,"_site","blog","feed",
                                  "index.xml")).read()
-        soup = BeautifulSoup.BeautifulStoneSoup(feed)
-        for link in soup.findAll("link"):
-            assert(link.contents[0].startswith("http://"))
+        feed_xml = lxml.etree.fromstring(feed.encode('utf-8'))
+        # iter() walks the whole tree, like BeautifulSoup's findAll() did;
+        # findall("link") would only see direct children of the root.
+        for link in feed_xml.iter("link"):
+            assert (link.text or "").startswith("http://")
 
     def testCategoryLinksInPosts(self):
@@ -173,10 +176,12 @@ def testCategoryLinksInPosts(self):
         #Open up one of the permapages:
         page = open(os.path.join(self.build_path,"_site","blog","2009",
                                  "08","16","this-is-a-test-post","index.html")).read()
-        soup = BeautifulSoup.BeautifulStoneSoup(page)
-        print(soup.findAll("a"))
-        assert soup.find("a",attrs={'href':'/blog/category/category-1'})
-        assert soup.find("a",attrs={'href':'/blog/category/category-2'})
+        html = lxml.html.fromstring(page)
+        # XPath is built into lxml; cssselect() needs the separate
+        # "cssselect" package, which setup.py does not install.
+        print(html.xpath("//a"))
+        assert html.xpath("//a[@href='/blog/category/category-1']")
+        assert html.xpath("//a[@href='/blog/category/category-2']")
 
     def testReStructuredFilter(self):
         """Test to make sure reStructuredTest work well"""
diff --git a/setup.py b/setup.py
index 9575731..ae8d3b5 100644
--- a/setup.py
+++ b/setup.py
@@ -16,14 +16,14 @@
       packages=["blogofile", "blogofile/site_init"],
       package_data = {"blogofile/site_init": ["*.zip"]},
       install_requires =['mako',
-                         'BeautifulSoup',
                          'pytz',
                          'pyyaml',
                          'textile',
                          'markdown',
                          'argparse',
                          'pygments',
-                         'docutils'],
+                         'docutils',
+                         'lxml'],
       entry_points="""
       [console_scripts]
       blogofile = blogofile.main:main