This repository has been archived by the owner on Apr 16, 2019. It is now read-only.

Replace BeautifulSoup with lxml in Python 3 branch. #59

Open
wants to merge 2 commits into py3k

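For readers less familiar with lxml, the substitutions throughout this diff follow the usual BeautifulSoup-to-lxml mapping. A minimal sketch of that mapping (the HTML snippet and variable names below are illustrative only, not part of the patch):

    import lxml.html

    page = "<p><a href='/blog/category/category-1'>Category 1</a></p>"

    # BeautifulSoup.BeautifulSoup(page)  ->  lxml.html.fromstring(page)
    doc = lxml.html.fromstring(page)

    # soup.findAll("a")  ->  doc.findall(".//a")
    # (findall needs an explicit descendant path; a bare tag name only matches direct children)
    links = doc.findall(".//a")

    # soup.find("a", attrs={"href": "..."})  ->  doc.cssselect("a[href='...']")
    category = doc.cssselect("a[href='/blog/category/category-1']")

    print(links[0].text, bool(category))
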
4 changes: 2 additions & 2 deletions blogofile/cache.py
@@ -43,9 +43,9 @@ class HierarchicalCache(Cache):
     >>> c.sub.d['one.value.stuff'] = "whatever2"
     >>> c.sub.d.one.value.stuff
     'whatever2'
-    >>> c.sub.d.one.value.items()
+    >>> list(c.sub.d.one.value.items())
     [('stuff', 'whatever2')]
-    >>> c.sub.d.has_key("doesn't have this")
+    >>> "doesn't have this" in c.sub.d
     False
     """
     def __getattr__(self, attr):
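The two doctest changes above exist because Python 3's dict.items() returns a view object rather than a list, and dict.has_key() was removed in favor of the in operator. A quick illustration with a plain dict (not the Cache class itself):

    d = {'stuff': 'whatever2'}

    print(d.items())        # Python 3: dict_items([('stuff', 'whatever2')])
    print(list(d.items()))  # [('stuff', 'whatever2')]  -- the form the doctest expects
    print('missing' in d)   # False; d.has_key('missing') no longer exists in Python 3
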
17 changes: 16 additions & 1 deletion blogofile/site_init/blog_features/_controllers/blog/post.py
@@ -20,6 +20,8 @@
 import pytz
 import yaml
 import logging
+import lxml.html
+import lxml.html.clean

 import blogofile_bf as bf

@@ -142,7 +144,20 @@ def __parse_post_excerpting(self):
     # This is the only portion of blogofile that depends on it,
     # so commenting out for now.. to be rewritten later
     def __excerpt(self, num_words=50):
-        return "Blogofile post excerpting is broken right now, sorry."
+        """ Retrieve excerpt from article """
+        if len(self.excerpt) == 0:
+            doc = lxml.html.fromstring(self.content)
+            # Kill scripts, CSS, forms, frames etc.
+            [[tree.drop_tree() for tree in doc.findall(elem)] for elem in (
+                'style', 'noscript', 'script')]
+            clean = lxml.html.clean.clean_html(doc)
+            # Remove headers
+            [[tree.drop_tree() for tree in clean.findall(elem)] for elem in (
+                'h1', 'h2', 'h3', 'h4', 'h5', 'h6')]
+            text = clean.text_content().replace('\n', ' ').split(' ')
+
+            text = [word for word in text if word != '']
+            return ' '.join(text[:num_words]) + '...'

     # def __excerpt(self, num_words=50):
     #     if len(self.excerpt) == 0:
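The new excerpt logic amounts to: parse the post HTML, drop script/style/heading elements, run lxml's cleaner, and keep the first num_words words of what remains. A standalone sketch of that approach, using a made-up HTML fragment rather than a real post:

    import lxml.html
    import lxml.html.clean

    html = ("<h1>Title</h1><p>First paragraph of the post.</p>"
            "<script>alert('x')</script><p>Second paragraph.</p>")
    num_words = 5

    doc = lxml.html.fromstring(html)
    doc = lxml.html.clean.clean_html(doc)    # strips scripts, styles, forms, etc.
    for heading in doc.findall('.//h1'):     # drop headings so they stay out of the excerpt
        heading.drop_tree()

    words = [w for w in doc.text_content().replace('\n', ' ').split(' ') if w]
    print(' '.join(words[:num_words]) + '...')  # first num_words words of the visible text
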
15 changes: 8 additions & 7 deletions blogofile/tests/test_content.py
@@ -2,7 +2,8 @@
 import tempfile
 import shutil
 import os
-import BeautifulSoup
+import lxml.html
+import lxml.etree
 from .. import main


@@ -147,8 +148,8 @@ def testFeedLinksAreURLs(self):
         main.main("build")
         feed = open(os.path.join(self.build_path,"_site","blog","feed",
                                  "index.xml")).read()
-        soup = BeautifulSoup.BeautifulStoneSoup(feed)
-        for link in soup.findAll("link"):
+        feed_xml = lxml.etree.fromstring(feed.encode('utf-8'))
+        for link in feed_xml.findall(".//link"):
             assert(link.text.startswith("http://"))

def testCategoryLinksInPosts(self):
@@ -173,10 +174,10 @@ def testCategoryLinksInPosts(self):
         #Open up one of the permapages:
         page = open(os.path.join(self.build_path,"_site","blog","2009",
                                  "08","16","this-is-a-test-post","index.html")).read()
-        soup = BeautifulSoup.BeautifulStoneSoup(page)
-        print(soup.findAll("a"))
-        assert soup.find("a",attrs={'href':'/blog/category/category-1'})
-        assert soup.find("a",attrs={'href':'/blog/category/category-2'})
+        html = lxml.html.fromstring(page)
+        print(html.findall(".//a"))
+        assert html.cssselect("a[href='/blog/category/category-1']")
+        assert html.cssselect("a[href='/blog/category/category-2']")

     def testReStructuredFilter(self):
         """Test to make sure reStructuredTest work well"""
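One detail in testFeedLinksAreURLs worth calling out: the feed file is read as a str, and lxml.etree.fromstring() rejects unicode strings that carry an XML encoding declaration, which is why the new code encodes back to bytes first. A small sketch of that behavior (the feed snippet is invented):

    import lxml.etree

    feed = ("<?xml version='1.0' encoding='utf-8'?>"
            "<rss><channel><link>http://example.com/</link></channel></rss>")

    try:
        lxml.etree.fromstring(feed)              # str with an encoding declaration
    except ValueError as e:
        print(e)                                 # "Unicode strings with encoding declaration are not supported..."

    root = lxml.etree.fromstring(feed.encode('utf-8'))
    for link in root.findall(".//link"):
        print(link.text)                         # http://example.com/
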
4 changes: 2 additions & 2 deletions setup.py
@@ -16,14 +16,14 @@
       packages=["blogofile", "blogofile/site_init"],
       package_data = {"blogofile/site_init": ["*.zip"]},
       install_requires =['mako',
-                         'BeautifulSoup',
                          'pytz',
                          'pyyaml',
                          'textile',
                          'markdown',
                          'argparse',
                          'pygments',
-                         'docutils'],
+                         'docutils',
+                         'lxml'],
       entry_points="""
       [console_scripts]
       blogofile = blogofile.main:main