This repository has been archived by the owner on Apr 16, 2019. It is now read-only.

Replace BeautifulSoup with lxml in Python 3 branch. #59

Open
wants to merge 2 commits into py3k

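For readers less familiar with lxml, the substitutions throughout this diff follow the usual BeautifulSoup-to-lxml mapping. A minimal sketch of that mapping (the HTML snippet and variable names below are illustrative only, not part of the patch):

    import lxml.html

    page = "<p><a href='/blog/category/category-1'>Category 1</a></p>"

    # BeautifulSoup.BeautifulSoup(page)  ->  lxml.html.fromstring(page)
    doc = lxml.html.fromstring(page)

    # soup.findAll("a")  ->  doc.findall(".//a")
    # (findall needs an explicit descendant path; a bare tag name only matches direct children)
    links = doc.findall(".//a")

    # soup.find("a", attrs={"href": "..."})  ->  doc.cssselect("a[href='...']")
    category = doc.cssselect("a[href='/blog/category/category-1']")

    print(links[0].text, bool(category))
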
4 changes: 2 additions & 2 deletions blogofile/cache.py
@@ -43,9 +43,9 @@ class HierarchicalCache(Cache):
     >>> c.sub.d['one.value.stuff'] = "whatever2"
     >>> c.sub.d.one.value.stuff
     'whatever2'
-    >>> c.sub.d.one.value.items()
+    >>> list(c.sub.d.one.value.items())
     [('stuff', 'whatever2')]
-    >>> c.sub.d.has_key("doesn't have this")
+    >>> "doesn't have this" in c.sub.d
     False
     """
     def __getattr__(self, attr):
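The two doctest changes above exist because Python 3's dict.items() returns a view object rather than a list, and dict.has_key() was removed in favor of the in operator. A quick illustration with a plain dict (not the Cache class itself):

    d = {'stuff': 'whatever2'}

    print(d.items())        # Python 3: dict_items([('stuff', 'whatever2')])
    print(list(d.items()))  # [('stuff', 'whatever2')]  -- the form the doctest expects
    print('missing' in d)   # False; d.has_key('missing') no longer exists in Python 3
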
17 changes: 16 additions & 1 deletion blogofile/site_init/blog_features/_controllers/blog/post.py
@@ -20,6 +20,8 @@
 import pytz
 import yaml
 import logging
+import lxml.html
+import lxml.html.clean

 import blogofile_bf as bf

@@ -142,7 +144,20 @@ def __parse_post_excerpting(self):
     # This is the only portion of blogofile that depends on it,
     # so commenting out for now.. to be rewritten later
     def __excerpt(self, num_words=50):
-        return "Blogofile post excerpting is broken right now, sorry."
+        """ Retrieve excerpt from article """
+        if len(self.excerpt) == 0:
+            doc = lxml.html.fromstring(self.content)
+            # Kill scripts, CSS, forms, frames etc.
+            [[tree.drop_tree() for tree in doc.findall(elem)] for elem in (
+                'style', 'noscript', 'script')]
+            clean = lxml.html.clean.clean_html(doc)
+            # Remove headers
+            [[tree.drop_tree() for tree in clean.findall(elem)] for elem in (
+                'h1', 'h2', 'h3', 'h4', 'h5', 'h6')]
+            text = clean.text_content().replace('\n', ' ').split(' ')
+
+            text = [word for word in text if word != '']
+            return ' '.join(text[:num_words]) + '...'

     # def __excerpt(self, num_words=50):
     #     if len(self.excerpt) == 0:
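The new excerpt logic amounts to: parse the post HTML, drop script/style/heading elements, run lxml's cleaner, and keep the first num_words words of what remains. A standalone sketch of that approach, using a made-up HTML fragment rather than a real post:

    import lxml.html
    import lxml.html.clean

    html = ("<h1>Title</h1><p>First paragraph of the post.</p>"
            "<script>alert('x')</script><p>Second paragraph.</p>")
    num_words = 5

    doc = lxml.html.fromstring(html)
    doc = lxml.html.clean.clean_html(doc)    # strips scripts, styles, forms, etc.
    for heading in doc.findall('.//h1'):     # drop headings so they stay out of the excerpt
        heading.drop_tree()

    words = [w for w in doc.text_content().replace('\n', ' ').split(' ') if w]
    print(' '.join(words[:num_words]) + '...')  # first num_words words of the visible text
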
15 changes: 8 additions & 7 deletions blogofile/tests/test_content.py
@@ -2,7 +2,8 @@
 import tempfile
 import shutil
 import os
-import BeautifulSoup
+import lxml.html
+import lxml.etree
 from .. import main


@@ -147,8 +148,8 @@ def testFeedLinksAreURLs(self):
         main.main("build")
         feed = open(os.path.join(self.build_path,"_site","blog","feed",
                                  "index.xml")).read()
-        soup = BeautifulSoup.BeautifulStoneSoup(feed)
-        for link in soup.findAll("link"):
+        feed_xml = lxml.etree.fromstring(feed.encode('utf-8'))
+        for link in feed_xml.findall(".//link"):
             assert(link.text.startswith("http://"))

def testCategoryLinksInPosts(self):
@@ -173,10 +174,10 @@ def testCategoryLinksInPosts(self):
         #Open up one of the permapages:
         page = open(os.path.join(self.build_path,"_site","blog","2009",
                                  "08","16","this-is-a-test-post","index.html")).read()
-        soup = BeautifulSoup.BeautifulStoneSoup(page)
-        print(soup.findAll("a"))
-        assert soup.find("a",attrs={'href':'/blog/category/category-1'})
-        assert soup.find("a",attrs={'href':'/blog/category/category-2'})
+        html = lxml.html.fromstring(page)
+        print(html.findall(".//a"))
+        assert html.cssselect("a[href='/blog/category/category-1']")
+        assert html.cssselect("a[href='/blog/category/category-2']")

     def testReStructuredFilter(self):
         """Test to make sure reStructuredTest work well"""
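One detail in testFeedLinksAreURLs worth calling out: the feed file is read as a str, and lxml.etree.fromstring() rejects unicode strings that carry an XML encoding declaration, which is why the new code encodes back to bytes first. A small sketch of that behavior (the feed snippet is invented):

    import lxml.etree

    feed = ("<?xml version='1.0' encoding='utf-8'?>"
            "<rss><channel><link>http://example.com/</link></channel></rss>")

    try:
        lxml.etree.fromstring(feed)              # str with an encoding declaration
    except ValueError as e:
        print(e)                                 # "Unicode strings with encoding declaration are not supported..."

    root = lxml.etree.fromstring(feed.encode('utf-8'))
    for link in root.findall(".//link"):
        print(link.text)                         # http://example.com/
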
4 changes: 2 additions & 2 deletions setup.py
@@ -16,14 +16,14 @@
       packages=["blogofile", "blogofile/site_init"],
       package_data = {"blogofile/site_init": ["*.zip"]},
       install_requires =['mako',
-                         'BeautifulSoup',
                          'pytz',
                          'pyyaml',
                          'textile',
                          'markdown',
                          'argparse',
                          'pygments',
-                         'docutils'],
+                         'docutils',
+                         'lxml'],
       entry_points="""
       [console_scripts]
       blogofile = blogofile.main:main