Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add importer for bookie JSON export #566

Open
wants to merge 1 commit into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 100 additions & 2 deletions bookie/lib/importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
BmarkMgr,
DBSession,
InvalidBookmark,
Hashed,
Readable,
)


Expand Down Expand Up @@ -70,6 +72,9 @@ def __new__(cls, *args, **kwargs):
if FBookmarkImporter.can_handle(args[0]):
return super(Importer, cls).__new__(FBookmarkImporter)

if BookieExportImporter.can_handle(args[0]):
return super(Importer, cls).__new__(BookieExportImporter)

return super(Importer, cls).__new__(Importer)

@staticmethod
Expand All @@ -81,7 +86,8 @@ def process(self, fulltext=None):
"""Meant to be implemented in subclasses"""
raise NotImplementedError("Please implement this in your importer")

def save_bookmark(self, url, desc, ext, tags, dt=None, is_private=False):
def save_bookmark(self, url, desc, ext, tags, dt=None, is_private=False,
readable=None, bookie_hash=None, clicks=0):
"""Save the bookmark to the db

:param url: bookmark url
Expand Down Expand Up @@ -110,11 +116,33 @@ def save_bookmark(self, url, desc, ext, tags, dt=None, is_private=False):
dt=dt,
inserted_by=IMPORTED,
is_private=is_private,
clicks=clicks,
)

if bookie_hash:
if bmark.hashed.clicks is None:
bmark.hashed.clicks = bookie_hash['clicks']
else:
bmark.hashed.clicks += bookie_hash['clicks']

# Add this hash to the list so that we can skip dupes in the
# same import set.
self.hash_list.add(check_hash)

if readable:
bmark.readable = Readable(
hash_id=bmark.hashed.hash_id,
content=readable['content'],
clean_content=readable['clean_content'],
imported=datetime.strptime(
readable['imported'],
'%Y-%m-%d %H:%M:%S'
),
content_type=readable['content_type'],
status_code=readable['status_code'],
status_message=readable['status_message']
)

return bmark

# If we don't store a bookmark then just return None back to the
Expand Down Expand Up @@ -468,7 +496,7 @@ def _is_firefox_format(json, can_handle):
Firefox json file has a variable "type" which is equal to
"text/x-moz-place-container"
"""
if json['type'] == FBookmarkImporter.MOZ_CONTAINER:
if 'type' in json and json['type'] == FBookmarkImporter.MOZ_CONTAINER:
can_handle = True

return can_handle
Expand Down Expand Up @@ -601,3 +629,73 @@ def is_good(child):
# fetch its content.
for bid in ids:
tasks.fetch_bmark_content.delay(bid)


class BookieExportImporter(Importer):
    """Importer for the JSON file produced by Bookie's own export feature."""

    @staticmethod
    def can_handle(file_io):
        """Check if this file is a Bookie json export.

        :param file_io: file-like object; it is rewound before and after
            inspection so the caller can reuse it.
        :returns: True only for parseable JSON whose top level is a dict
            containing a 'bmarks' key.

        """
        if (file_io.closed):
            file_io = open(file_io.name)
        file_io.seek(0)

        try:
            bookie_json = json.load(file_io)
        except ValueError:
            # Not JSON at all, so it cannot be a Bookie export.
            return False
        finally:
            # make sure we reset the file_io object so that we can use
            # it again
            file_io.seek(0)

        # Previously any valid JSON was accepted; require the export's
        # top-level 'bmarks' list so we do not claim other JSON files.
        return isinstance(bookie_json, dict) and 'bmarks' in bookie_json

    def process(self):
        """Process a json bookie bookmarks export and import it."""
        count = 0
        if (self.file_handle.closed):
            self.file_handle = open(self.file_handle.name)

        content = self.file_handle.read().decode("UTF-8")
        root = json.loads(content)

        # Collect the ids of bookmarks we actually stored so they can be
        # queued for fulltext indexing afterwards.
        bids = []

        for json_bmark in root['bmarks']:
            try:
                bookmark = self.save_bookmark(
                    unicode(json_bmark['hashed']['url']),
                    unicode(json_bmark['description']),
                    unicode(json_bmark['extended']),
                    unicode(json_bmark['tag_str']),
                    dt=datetime.strptime(
                        json_bmark['stored'],
                        '%Y-%m-%d %H:%M:%S'
                    ),
                    is_private=json_bmark['is_private'],
                    readable=json_bmark['readable'],
                    bookie_hash=json_bmark['hashed'],
                    clicks=json_bmark['clicks'],
                )
                count += 1
                DBSession.flush()
            except InvalidBookmark:
                bookmark = None
            if bookmark:
                bids.append(bookmark.bid)
            # Guard with `count` so a failed first bookmark (count == 0)
            # does not trigger an immediate, empty commit.
            if count and count % COMMIT_SIZE == 0:
                transaction.commit()
                # Start a new transaction for the next grouping.
                transaction.begin()

        # Commit any that are left since the last commit performed.
        transaction.commit()

        from bookie.bcelery import tasks
        # For each bookmark in this set that we saved, sign up to
        # put its content into the fulltext index.
        for bid in bids:
            tasks.fulltext_index_bookmark.delay(bid, None)
7 changes: 5 additions & 2 deletions bookie/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -545,14 +545,15 @@ def popular(limit=50, page=0, with_tags=False):

@staticmethod
def store(url, username, desc, ext, tags, dt=None, inserted_by=None,
is_private=False):
is_private=False, clicks=0):
"""Store a bookmark

:param url: bookmarked url
:param desc: the one line description
:param ext: the extended description/notes
:param dt: The original stored time of this bmark
:param fulltext: an instance of a fulltext handler
:param clicks: an initial number of clicks

"""
parsed_url = urlparse(url)
Expand All @@ -566,6 +567,7 @@ def store(url, username, desc, ext, tags, dt=None, inserted_by=None,
ext=ext,
tags=tags,
is_private=is_private,
clicks=clicks,
)

mark.inserted_by = inserted_by
Expand Down Expand Up @@ -685,7 +687,7 @@ class Bmark(Base):
uselist=False)

def __init__(self, url, username, desc=None, ext=None, tags=None,
is_private=False):
is_private=False, clicks=0):
"""Create a new bmark instance

:param url: string of the url to be added as a bookmark
Expand All @@ -707,6 +709,7 @@ def __init__(self, url, username, desc=None, ext=None, tags=None,
self.description = desc
self.extended = ext
self.is_private = is_private
self.clicks = clicks

# tags are space separated
if tags:
Expand Down
1 change: 1 addition & 0 deletions bookie/tests/test_utils/bookie_export.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"count": 2, "date": "2015-07-05 04:00:34.006299", "bmarks": [{"username": "user1", "updated": "2015-07-03 06:44:09", "extended": "", "description": "some page title 1", "bid": 156260, "readable": {"imported": "2014-07-12 04:05:01", "status_code": 1, "bid": 156260, "content": "<div id=\"readabilityBody\"><div class=\"markdown-body entry-content\" itemprop=\"mainContentOfPage\"></div></div>", "clean_content": null, "content_type": "text/html", "status_message": null, "hash_id": null}, "hashed": {"url": "https://some.fakeurl1.com", "clicks": 40, "hash_id": "7aa785af87362e"}, "stored": "2014-07-12 04:05:01", "inserted_by": "chrome_ext", "tag_str": "tag1 tag3", "clicks": 21, "is_private": false, "hash_id": "7aa785af87362e"}, {"username": "user1", "updated": "2015-05-27 15:39:17", "extended": "", "description": "another page title 2", "bid": 147996, "readable": {"imported": "2014-06-06 03:16:24", "status_code": 1, "bid": 147996, "content": "<div id=\"readabilityBody\"><div id=\"discussion_bucket\" class=\"tab-content\">\n \n\n \n\n </div>\n </div>", "clean_content": null, "content_type": "text/html", "status_message": null, "hash_id": null}, "hashed": {"url": "https://another.fakeurl2.com", "clicks": 56, "hash_id": "3d31a4c2c6c5a4"}, "stored": "2014-06-06 03:16:24", "inserted_by": "chrome_ext", "tag_str": "chromebook", "clicks": 42, "is_private": false, "hash_id": "3d31a4c2c6c5a4"}]}
109 changes: 108 additions & 1 deletion bookie/tests/test_utils/test_imports.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from bookie.lib.importer import DelXMLImporter
from bookie.lib.importer import GBookmarkImporter
from bookie.lib.importer import FBookmarkImporter
from bookie.lib.importer import BookieExportImporter

from bookie.tests import TestViewBase
from bookie.tests import empty_db
Expand All @@ -31,7 +32,7 @@
class TestImports(unittest.TestCase):

def _delicious_data_test(self):
"""Test that we find the correct set of declicious data after import"""
"""Test that we find the correct set of delicious data after import"""
# Blatant copy/paste, but I'm on a plane right now so oh well.
# Now let's do some db sanity checks.
res = Bmark.query.all()
Expand Down Expand Up @@ -214,6 +215,55 @@ def _firefox_data_test(self):
date_should_be = datetime.fromtimestamp(1394649032847102/1e6)
self.assertEqual(date_should_be, found.stored)

def _bookie_export_data_test(self):
    """Verify we find the correct Bookie export bmark data after import."""
    res = Bmark.query.all()
    self.assertEqual(
        len(res),
        2,
        "We should have 2 results, we got: " + str(len(res)))

    # Verify we can find a bookmark by url and check tags, etc
    check_url = 'https://some.fakeurl1.com'
    check_url_hashed = generate_hash(check_url)
    found = Bmark.query.filter(Bmark.hash_id == check_url_hashed).one()

    self.assertTrue(
        found.hashed.url == check_url, "The url should match our search")
    self.assertEqual(
        len(found.tags),
        2,
        "We should have gotten 2 tags, got: " + str(len(found.tags)))

    # and check we have a right tag or two
    self.assertTrue(
        'tag3' in found.tag_string(),
        'tag3 should be a valid tag in the bookmark')

    # and check the timestamp is correct
    # relative to user's timezone
    date_should_be = datetime.strptime('2014-07-12 04:05:01',
                                       '%Y-%m-%d %H:%M:%S')
    self.assertEqual(date_should_be, found.stored)

    # and check we populated clicks
    # (assertEquals is a deprecated alias; use assertEqual)
    self.assertEqual(
        found.clicks,
        21)

    # and clicks in the hashed record
    self.assertEqual(
        found.hashed.clicks,
        40)

    # check that the Readable entry is right
    readable_content_should_be =\
        '<div id="readabilityBody">'\
        '<div class="markdown-body entry-content" '\
        'itemprop="mainContentOfPage"></div>'\
        '</div>'
    self.assertEqual(readable_content_should_be, found.readable.content)


class ImporterBaseTest(TestImports):
"""Verify the base import class is working"""
Expand Down Expand Up @@ -252,6 +302,18 @@ def test_factory_gives_google(self):
isinstance(imp, GBookmarkImporter),
"Instance should be a GBookmarkImporter instance")

def test_factory_gives_bookie(self):
    """Verify that the base importer will give BookieExportImporter"""
    loc = os.path.dirname(__file__)
    bookie_file = os.path.join(loc, 'bookie_export.json')

    with open(bookie_file) as bookie_io:
        imp = Importer(bookie_io, username=u"admin")

        self.assertTrue(
            isinstance(imp, BookieExportImporter),
            "Instance should be a BookieExportImporter instance")


class ImportDeliciousTest(TestImports):
"""Test the Bookie importer for delicious"""
Expand Down Expand Up @@ -526,6 +588,51 @@ def test_nested_folder(self):
found.hashed.url == check_url, "The url should match our search")


class ImportBookieExportTest(TestImports):
    """Test the Bookie importer for Bookie JSON export"""

    def _get_file(self):
        """Return an open handle on the sample Bookie export fixture."""
        loc = os.path.dirname(__file__)
        export_file = os.path.join(loc, 'bookie_export.json')

        return open(export_file)

    def tearDown(self):
        """Regular tear down method"""
        empty_db()

    def test_is_bookie_export_file(self):
        """Verify that this is a Bookie export json file"""
        good_file = self._get_file()

        self.assertTrue(
            BookieExportImporter.can_handle(good_file),
            "BookieExportImporter should handle this file")

        good_file.close()

    def test_is_not_bookie_export_file(self):
        """And that it returns false when it should"""
        bad_file = StringIO.StringIO()
        bad_file.write('failing tests please')
        bad_file.seek(0)

        self.assertFalse(
            BookieExportImporter.can_handle(bad_file),
            "BookieExportImporter cannot handle this file")

        bad_file.close()

    def test_import_process(self):
        """Verify importer inserts the correct Bookie export bookmarks"""
        good_file = self._get_file()
        try:
            imp = Importer(good_file, username=u"admin")
            imp.process()
        finally:
            # don't leak the fixture file handle
            good_file.close()

        # now let's do some db sanity checks
        self._bookie_export_data_test()


class ImportViews(TestViewBase):
"""Test the web import"""

Expand Down