Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: Add Gmail takeout mbox import #5

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,12 @@ Your location history records latitude, longitude and timestamp for where Google

$ google-takeout-to-sqlite location-history takeout.db ~/Downloads/takeout-20190530.zip

## Email History

You can import your emails from your Gmail mbox using this command:

$ google-takeout-to-sqlite mbox takeout.db ~/Downloads/gmail.mbox

## Browsing your data with Datasette

Once you have imported Google data into a SQLite database file you can browse your data using [Datasette](https://github.com/simonw/datasette). Install Datasette like so:
Expand Down
21 changes: 21 additions & 0 deletions google_takeout_to_sqlite/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,24 @@ def my_activity(db_path, zip_path):
db = sqlite_utils.Database(db_path)
zf = zipfile.ZipFile(zip_path)
utils.save_location_history(db, zf)


@cli.command(name="mbox")
@click.argument(
    "db_path",
    type=click.Path(file_okay=True, dir_okay=False, allow_dash=False),
    required=True,
)
@click.argument(
    "mbox_path",
    # exists=True makes click reject a mistyped path up front. Without it the
    # path reaches mailbox.mbox(), whose default create=True silently creates
    # an empty file there and the import "succeeds" with zero emails.
    type=click.Path(exists=True, file_okay=True, dir_okay=False, allow_dash=False),
    required=True,
)
def my_mbox(db_path, mbox_path):
    """
    Import all emails from Gmail mbox to SQLite

    Usage: google-takeout-to-sqlite mbox mygmail.db /path/to/gmail.mbox
    """
    # The docstring above is shown by click as the command's --help text.
    db = sqlite_utils.Database(db_path)
    utils.save_emails(db, mbox_path)
119 changes: 119 additions & 0 deletions google_takeout_to_sqlite/utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
import json
import hashlib
import datetime
import email
import mailbox
import traceback
from rich.progress import track
from email.utils import parsedate_tz, mktime_tz


def save_my_activity(db, zf):
Expand Down Expand Up @@ -53,3 +58,117 @@ def id_for_location_history(row):
datetime.datetime.utcfromtimestamp(int(row["timestampMs"]) / 1000).isoformat(),
first_six,
)


def _header_text(value):
    """Return a header value as ``str``, or ``None`` when the header is absent."""
    # The mailbox module hands back str, email.header.Header or None depending
    # on the message; str() normalises the first two without touching None.
    return None if value is None else str(value)


def get_mbox(mbox_file):
    """
    Yield one dict per email found in the mbox file at ``mbox_file``.

    Each dict has the keys ``Message-Id``, ``X-GM-THRID``, ``X-Gmail-Labels``,
    ``From``, ``To``, ``Subject``, ``date`` and ``body``. Header values are
    ``str`` (or ``None`` when the header is missing), ``date`` comes from
    ``get_message_date`` and ``body`` from ``get_email_body``.

    Messages that fail to convert are reported on stdout and skipped.

    Raises:
        mailbox.NoSuchMailboxError: if ``mbox_file`` does not exist.
    """
    num_errors = 0
    print("Preparing to process emails...")
    # create=False: the stdlib default (create=True) would silently create an
    # empty mailbox file when the path is wrong.
    mbox = mailbox.mbox(mbox_file, create=False)
    try:
        print("Processing {} emails".format(len(mbox)))

        # These are all the Gmail email fields available
        # ['X-GM-THRID', 'X-Gmail-Labels', 'Delivered-To', 'Received', 'Received',
        # 'Return-Path', 'Received', 'Received-SPF', 'Authentication-Results',
        # 'Received', 'Mailing-List', 'Precedence', 'List-Post', 'List-Help',
        # 'List-Unsubscribe', 'List-Subscribe', 'Delivered-To', 'Received',
        # 'Message-ID', 'Date', 'From', 'To', 'MIME-Version', 'Content-Type',
        # 'Content-Transfer-Encoding', 'X-Nabble-From', 'X-pstn-neptune',
        # 'X-pstn-levels', 'X-pstn-settings', 'X-pstn-addresses', 'Subject']

        # The loop variable is deliberately not called ``email`` so it does
        # not shadow the imported ``email`` module.
        for msg in track(mbox):
            try:
                message = {
                    "Message-Id": _header_text(msg["Message-Id"]),
                    "X-GM-THRID": _header_text(msg["X-GM-THRID"]),
                    "X-Gmail-Labels": _header_text(msg["X-Gmail-Labels"]),
                    "From": _header_text(msg["From"]),
                    "To": _header_text(msg["To"]),
                    "Subject": _header_text(msg["Subject"]),
                    "date": get_message_date(msg.get("Date"), msg.get_from()),
                    "body": get_email_body(msg),
                }
            except (TypeError, ValueError, AttributeError, LookupError):
                # How does this project want to handle logging? For now we're
                # just printing out variables
                num_errors += 1
                print("Errors: {}".format(num_errors))
                print(traceback.format_exc())
            else:
                # Yield outside the try body so that only conversion errors
                # are swallowed, never errors raised by the consumer.
                yield message
    finally:
        # Release the underlying file handle even if iteration stops early.
        mbox.close()


def save_emails(db, mbox_file):
"""
Import Gmail mbox from google takeout
"""
db["mbox_emails"].upsert_all(
Copy link
Collaborator

@simonw simonw Mar 4, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A fix for the problem I had where my body column ended up being a BLOB rather than text would be to explicitly create the table first.

You can do that like so:

if not db["mbox_emails"].exists():
    db["mbox_emails"].create({
        "id": str,
        "X-GM-THRID": str,
        "X-Gmail-Labels": str,
        "From": str,
        "To": str,
        "Subject": str,
        "when": str,
        "body": str,
    }, pk="id")

I had to upgrade to the latest sqlite-utils for this to work because prior to sqlite-utils 2.0 the table.exists property was a boolean not a method.

(
{
"id": message["Message-Id"],
"X-GM-THRID": message["X-GM-THRID"],
"X-Gmail-Labels": message["X-Gmail-Labels"],
"From": message["From"],
"To": message["To"],
"Subject": message["Subject"],
"when": message["date"],
"body": message["body"],
}
for message in get_mbox(mbox_file)
),
pk="id",
alter=True,
)
print("Finished loading emails into {}.".format(mbox_file))
print('Enabling full text search on "body" and "Subject" fields')
db["mbox_emails"].enable_fts(["body", "Subject"])
print("Finished!")


def get_email_body(message):
    """
    Return the decoded ``text/plain`` body of ``message``.

    The payload is returned as ``bytes`` (transfer encoding undone via
    ``get_payload(decode=True)``), or ``None`` when the message contains no
    ``text/plain`` part. When several ``text/plain`` parts exist, the last
    one in depth-first order wins.
    """
    body = None
    # Message.walk() yields the message itself and then every nested part
    # depth-first, so one pass covers single-part and multipart mail alike;
    # walking each sub-part again would only repeat the same visits.
    for part in message.walk():
        if part.get_content_type() == "text/plain":
            body = part.get_payload(decode=True)
    return body


def get_message_date(get_date, get_from):
    """
    Return the message timestamp as a UTC ``YYYY-MM-DD HH:MM:SS`` string.

    ``get_date`` is the value of the ``Date`` header (str, header object or
    ``None``). When it is empty, the date is taken from the tail of
    ``get_from``, the mbox "From " separator line. An empty string is
    returned when no usable date can be parsed or converted.
    """
    if get_date:
        # str() so a header object is accepted as well as a plain string.
        mail_date = str(get_date)
    else:
        # The separator line ends with a 30-character date such as
        # "Tue Aug 05 08:00:23 +0000 2008".
        mail_date = (get_from or "").strip()[-30:]

    datetime_tuple = parsedate_tz(mail_date)
    if not datetime_tuple:
        return ""

    try:
        unix_time = mktime_tz(datetime_tuple)
        utc_moment = datetime.datetime.fromtimestamp(
            unix_time, tz=datetime.timezone.utc
        )
    except (ValueError, OverflowError, OSError):
        # Out-of-range dates are treated like unparseable ones.
        return ""

    # Drop tzinfo so the text has no "+00:00" suffix, matching the naive-UTC
    # format this function has always produced.
    return utc_moment.replace(tzinfo=None).isoformat(" ")
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def get_long_description():
[console_scripts]
google-takeout-to-sqlite=google_takeout_to_sqlite.cli:cli
""",
install_requires=["sqlite-utils~=1.11"],
install_requires=["sqlite-utils~=1.11", "rich"],
extras_require={"test": ["pytest"]},
tests_require=["google-takeout-to-sqlite[test]"],
)
147 changes: 147 additions & 0 deletions tests/mbox_contents/small.gmail.mbox
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
From 1277085061787347926@xxx Tue Aug 05 08:00:23 +0000 2008
X-GM-THRID: 1277085061787347926
X-Gmail-Labels: Unread
Delivered-To: asdfereasdf@gmail.com
Received: by 10.142.98.16 with SMTP id v16cs5204wfb;
Tue, 5 Aug 2008 01:00:23 -0700 (PDT)
Received: by 10.151.26.12 with SMTP id d12mr926013ybj.145.1217923223126;
Tue, 05 Aug 2008 01:00:23 -0700 (PDT)
Return-Path: <fw-general-return-20503-test=gmail.com@lists.zend.com>
Received: from www.zend.com (lists.zend.com [67.15.86.102])
by mx.google.com with SMTP id 6si1509903yxg.6.2008.08.05.01.00.22;
Tue, 05 Aug 2008 01:00:23 -0700 (PDT)
Received-SPF: pass (google.com: domain of fw-general-return-20503-test=gmail.com@lists.zend.com designates 67.15.86.102 as permitted sender) client-ip=67.15.86.102;
Authentication-Results: mx.google.com; spf=pass (google.com: domain of fw-general-return-20503-test=gmail.com@lists.zend.com designates 67.15.86.102 as permitted sender) smtp.mail=fw-general-return-20503-test=gmail.com@lists.zend.com
Received: (qmail 28326 invoked by uid 505); 5 Aug 2008 08:00:15 -0000
Mailing-List: contact fw-general-help@lists.zend.com; run by ezmlm
Precedence: bulk
List-Post: <mailto:fw-general@lists.zend.com>
List-Help: <mailto:fw-general-help@lists.zend.com>
List-Unsubscribe: <mailto:fw-general-unsubscribe@lists.zend.com>
List-Subscribe: <mailto:fw-general-subscribe@lists.zend.com>
Delivered-To: mailing list fw-general@lists.zend.com
Received: (qmail 28319 invoked from network); 5 Aug 2008 08:00:15 -0000
Message-ID: <18826312.post@talk.nabble.com>
Date: Tue, 5 Aug 2008 01:00:12 -0700 (PDT)
From: =?UTF-8?Q?=C5=82_Zieli=C5=84ski?= <personlksdflkj@gmail.com>
To: fw-general@lists.zend.com
MIME-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit
X-Nabble-From: personasdflkj@gmail.com
X-pstn-neptune: 0/0/0.00/0
X-pstn-levels: (S: 5.98501/99.90000 CV:99.0000 P:95.9108 M:88.1613 C:98.6951 )
X-pstn-settings: 1 (0.1500:0.1500) cv gt3 gt2 gt1 p m c
X-pstn-addresses: from <personsdflkjer@gmail.com> [638/31]
Subject: [fw-general] Zend_Form and generating fields


Unfortunately it is slow! For 10 products it takes 0.6 sec. to generate.
Is there a better (more efficient) method to build such forms via Zend_Form?

The same I noticed when tried to create a select element which contained
many options (i.e. list of countries). Without ajax (autocomplete) it takes
ages to generate and seems to be useless in this case. Shame.

I wonder if Zend_Form can be used when it comes to generate a lot of
inputs/options in select or I`m forced to create it by hand?



From 1278212428183604564@xxx Sun Aug 17 18:39:23 +0000 2008
X-GM-THRID: 1278204036336346264
X-Gmail-Labels: Unread
Delivered-To: testasdfasdf@gmail.com
Received: by 10.142.98.16 with SMTP id v16cs546946wfb;
Sun, 17 Aug 2008 11:39:24 -0700 (PDT)
Received: by 10.90.100.17 with SMTP id x17mr545483agb.48.1218998363996;
Sun, 17 Aug 2008 11:39:23 -0700 (PDT)
Return-Path: <gnumed-devel-bounces+teslkjlj=gmail.com@gnu.org>
Received: from lists.gnu.org (lists.gnu.org [199.232.76.165])
by mx.google.com with ESMTP id c44si5785715hsc.16.2008.08.17.11.39.23;
Sun, 17 Aug 2008 11:39:23 -0700 (PDT)
Received-SPF: pass (google.com: domain of gnumed-devel-bounces+asdflkjer=gmail.com@gnu.org designates 199.232.76.165 as permitted sender) client-ip=199.232.76.165;
Authentication-Results: mx.google.com; spf=pass (google.com: domain of gnumed-devel-bounces+asdflkjelrkj=gmail.com@gnu.org designates 199.232.76.165 as permitted sender) smtp.mail=gnumed-devel-bounces+asdlkjwer=gmail.com@gnu.org
Received: from localhost ([127.0.0.1]:51303 helo=lists.gnu.org)
by lists.gnu.org with esmtp (Exim 4.43)
id 1KUn9v-0005Uo-D7
for lkwelrkj@gmail.com; Sun, 17 Aug 2008 14:39:23 -0400
Received: from mailman by lists.gnu.org with tmda-scanned (Exim 4.43)
id 1KUn9s-0005Sa-Ct
for gnumed-devel@gnu.org; Sun, 17 Aug 2008 14:39:20 -0400
Received: from exim by lists.gnu.org with spam-scanned (Exim 4.43)
id 1KUn9q-0005QU-TY
for gnumed-devel@gnu.org; Sun, 17 Aug 2008 14:39:20 -0400
Received: from [199.232.76.173] (port=33439 helo=monty-python.gnu.org)
by lists.gnu.org with esmtp (Exim 4.43) id 1KUn9q-0005Q4-No
for gnumed-devel@gnu.org; Sun, 17 Aug 2008 14:39:18 -0400
Received: from mail.gmx.net ([213.165.64.20]:56165)
by monty-python.gnu.org with smtp (Exim 4.60)
(envelope-from <person.person@example.net>) id 1KUn9q-0008Gg-44
for gnumed-devel@gnu.org; Sun, 17 Aug 2008 14:39:18 -0400
Received: (qmail invoked by alias); 17 Aug 2008 18:39:16 -0000
Received: from A7ee9.a.strato-dslnet.de (EHLO merkur.person.loc)
[89.62.126.233]
by mail.gmx.net (mp066) with SMTP; 17 Aug 2008 20:39:16 +0200
X-Authenticated: #1433807
X-Provags-ID: V01U2FsdGVkX1+dsZZuQ0OdkW7jgzdpHkRRth5+XhDeDDCDx7naLk
3bct2wCsMl/clU
Received: from ncq by merkur.person.loc with local (Exim 4.69)
(envelope-from <person.person@example.net>) id 1KUn9n-0003eu-Ew
for gnumed-devel@gnu.org; Sun, 17 Aug 2008 20:39:15 +0200
Date: Sun, 17 Aug 2008 20:39:15 +0200
From: Person Person <Person.Person@example.net>
To: gnumed-devel@gnu.org
Subject: Re: [Gnumed-devel] Tree view formatting
Message-ID: <20080817183915.GM3992@merkur.person.loc>
Mail-Followup-To: gnumed-devel@gnu.org
References: <272a08710808170925y47484f3fmcfb26f8686727762@mail.gmail.com>
<20080817170549.GH3992@merkur.person.loc>
<272a08710808171034o239f6167q3f7039727802dd09@mail.gmail.com>
<20080817180622.GK3992@merkur.person.loc>
<272a08710808171109x4b6429a1g89f6e594408080db@mail.gmail.com>
MIME-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
In-Reply-To: <272a08710808171109x4b6429a1g89f6e594408080db@mail.gmail.com>
User-Agent: Mutt/1.5.18 (2008-05-17)
X-Y-GMX-Trusted: 0
X-FuHaFi: 0.6899999999999999
X-detected-kernel: by monty-python.gnu.org: Genre and OS details not
recognized.
X-BeenThere: gnumed-devel@gnu.org
X-Mailman-Version: 2.1.5
Precedence: list
List-Id: gnumed-devel.gnu.org
List-Unsubscribe: <http://lists.gnu.org/mailman/listinfo/gnumed-devel>,
<mailto:gnumed-devel-request@gnu.org?subject=unsubscribe>
List-Archive: <http://lists.gnu.org/pipermail/gnumed-devel>
List-Post: <mailto:gnumed-devel@gnu.org>
List-Help: <mailto:gnumed-devel-request@gnu.org?subject=help>
List-Subscribe: <http://lists.gnu.org/mailman/listinfo/gnumed-devel>,
<mailto:gnumed-devel-request@gnu.org?subject=subscribe>
Sender: gnumed-devel-bounces+asdlfkj=gmail.com@gnu.org
Errors-To: gnumed-devel-bounces+asdflkj=gmail.com@gnu.org

On Sun, Aug 17, 2008 at 03:09:55PM -0300, Bob Luz wrote:

> when you say you have changed it ... can I assume it will make the 0.3.0 release
yes

> or is the release READY
I hope it is "ready" so I can release it within the next few
days. I usually wait a few days to see whether any errors
show up. That's why we need you guys to test like mad.

> and all our future discussions on this list
> will from now on to be implemented on the 0.3.1 ?

Not quite yet. And, rather 0.3+.

Person


_______________________________________________
Gnumed-devel mailing list
Gnumed-devel@gnu.org
http://lists.gnu.org/mailman/listinfo/gnumed-devel
51 changes: 51 additions & 0 deletions tests/test_gmail_import.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
from google_takeout_to_sqlite.utils import save_emails
import pathlib
import sqlite_utils


def test_import_gmails():
    """Import the two-message fixture mbox and compare every stored row."""
    fixture = pathlib.Path(__file__).parent / "mbox_contents/small.gmail.mbox"
    database = sqlite_utils.Database(memory=True)
    save_emails(database, fixture)
    assert "mbox_emails" in set(database.table_names())

    # Expected rows, ordered by their "id" (Message-ID) value.
    expected_rows = [
        {
            "From": "=?UTF-8?Q?=C5=82_Zieli=C5=84ski?= <personlksdflkj@gmail.com>",
            "Subject": "[fw-general] Zend_Form and generating fields",
            "To": "fw-general@lists.zend.com",
            "X-GM-THRID": "1277085061787347926",
            "X-Gmail-Labels": "Unread",
            "body": b"\r\nUnfortunately it is slow! For 10 products it takes 0.6 sec. to"
            b" generate.\r\nIs there a better (more efficient) method to build s"
            b"uch forms via Zend_Form?\r\n\r\nThe same I noticed when tried to"
            b" create a select element which contained\r\nmany options (i.e. lis"
            b"t of countries). Without ajax (autocomplete) it takes\r\nages to g"
            b"enerate and seems to be useless in this case. Shame.\r\n\r\nI wo"
            b"nder if Zend_Form can be used when it comes to generate a lot of"
            b"\r\ninputs/options in select or I`m forced to create it by han"
            b"d?\r\n\r\n\r\n\r\n",
            "id": "<18826312.post@talk.nabble.com>",
            "when": "2008-08-05 08:00:12",
        },
        {
            "From": "Person Person <Person.Person@example.net>",
            "Subject": "Re: [Gnumed-devel] Tree view formatting",
            "To": "gnumed-devel@gnu.org",
            "X-GM-THRID": "1278204036336346264",
            "X-Gmail-Labels": "Unread",
            "body": b"On Sun, Aug 17, 2008 at 03:09:55PM -0300, Bob Luz wrote:\r\n\r\n"
            b"> when you say you have changed it ... can I assume it will make"
            b" the 0.3.0 release\r\nyes\r\n\r\n> or is the release READY\r\nI "
            b'hope it is "ready" so I can release it within the next few\r\ndays'
            b". I usually wait a few days to see whether any errors\r\nshow up. "
            b"That's why we need you guys to test like mad.\r\n\r\n> and all o"
            b"ur future discussions on this list\r\n> will from now on to be imp"
            b"lemented on the 0.3.1 ?\r\n\r\nNot quite yet. And, rather 0.3+.\r"
            b"\n\r\nPerson\r\n\r\n\r\n_________________________________________"
            b"______\r\nGnumed-devel mailing list\r\nGnumed-devel@gnu.org\r\nhtt"
            b"p://lists.gnu.org/mailman/listinfo/gnumed-devel\r\n",
            "id": "<20080817183915.GM3992@merkur.person.loc>",
            "when": "2008-08-17 18:39:15",
        },
    ]

    stored_rows = sorted(database["mbox_emails"].rows, key=lambda row: row["id"])
    assert expected_rows == stored_rows