Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Pickle dataframes for fast pandas reading. #85

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
# Downloaded files
download

# Jupyter Notebooks
.ipynb_checkpoints

# Large downloaded data files
download/

# Mac metadata
.DS_Store

Expand Down
21 changes: 17 additions & 4 deletions 1.download.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,10 @@
"outputs": [],
"source": [
"import os\n",
"import mimetypes\n",
"from urllib.request import urlretrieve\n",
"\n",
"import pandas\n",
"import requests"
]
},
Expand Down Expand Up @@ -99,28 +101,39 @@
"Downloading https://ndownloader.figshare.com/files/7311959 to `samples.tsv`\n",
"Downloading https://ndownloader.figshare.com/files/7311956 to `covariates.tsv`\n",
"Downloading https://ndownloader.figshare.com/files/7311950 to `expression-matrix.tsv.bz2`\n",
" - converting `expression-matrix.tsv.bz2` to a pickled dataframe\n",
"Downloading https://ndownloader.figshare.com/files/7311962 to `expression-genes.tsv`\n",
"Downloading https://ndownloader.figshare.com/files/7311953 to `mutation-matrix.tsv.bz2`\n",
" - converting `mutation-matrix.tsv.bz2` to a pickled dataframe\n",
"Downloading https://ndownloader.figshare.com/files/7311965 to `mutation-genes.tsv`\n"
]
}
],
"source": [
"# Download the files specified by the metadata\n",
"for file_info in response['files']:\n",
" # Download file\n",
" url = file_info['download_url']\n",
" name = file_info['name']\n",
" print('Downloading {} to `{}`'.format(url, name))\n",
" path = os.path.join('download', name)\n",
" urlretrieve(url, path)"
" urlretrieve(url, path)\n",
" \n",
" # Export compressed files to xzipped pickles\n",
" type_, encoding = mimetypes.guess_type(name)\n",
" if type_ == 'text/tab-separated-values' and encoding:\n",
" print(' - converting `{}` to a pickled dataframe'.format(name))\n",
" df = pandas.read_table(path, index_col=0)\n",
" bare_path = path.rsplit('.tsv', 1)[0]\n",
" pkl_path = bare_path + '.pkl'\n",
" df.to_pickle(pkl_path)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "cognoma-machine-learning",
"language": "python",
"name": "python3"
"name": "cognoma-machine-learning"
},
"language_info": {
"codemirror_mode": {
Expand Down
302 changes: 141 additions & 161 deletions 2.TCGA-MLexample.ipynb

Large diffs are not rendered by default.

442 changes: 211 additions & 231 deletions 3.TCGA-MLexample_Pathway.ipynb

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions execute.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,7 @@
# Exit on error
set -o errexit

# Activate the named environment before running anything below
# (presumably a conda environment called cognoma-machine-learning -- confirm)
source activate cognoma-machine-learning

# Execute every notebook in the current directory and overwrite it in place;
# a timeout of -1 disables the per-cell execution time limit
jupyter nbconvert --inplace --execute --ExecutePreprocessor.timeout=-1 *.ipynb
# Export every notebook as a script into the scripts/ directory
jupyter nbconvert --to=script --FilesWriter.build_directory=scripts *.ipynb
13 changes: 12 additions & 1 deletion scripts/1.download.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,10 @@
# In[1]:

import os
import mimetypes
from urllib.request import urlretrieve

import pandas
import requests


Expand Down Expand Up @@ -42,11 +44,20 @@

# In[6]:

# Download the files specified by the metadata
for file_info in response['files']:
    # Download file into the local `download` directory
    url = file_info['download_url']
    name = file_info['name']
    print('Downloading {} to `{}`'.format(url, name))
    path = os.path.join('download', name)
    urlretrieve(url, path)

    # Export compressed TSV files to pickled dataframes for faster reading.
    # guess_type only reports an encoding when the name carries a compression
    # suffix (e.g. `.tsv.bz2`), so plain `.tsv` files are left as they are.
    type_, encoding = mimetypes.guess_type(name)
    if type_ == 'text/tab-separated-values' and encoding:
        print(' - converting `{}` to a pickled dataframe'.format(name))
        df = pandas.read_table(path, index_col=0)
        # Drop `.tsv` and whatever compression suffix follows it
        bare_path = path.rsplit('.tsv', 1)[0]
        # NOTE: the `.pkl` file is a plain pickle, not an xz-compressed one
        pkl_path = bare_path + '.pkl'
        df.to_pickle(pkl_path)

6 changes: 4 additions & 2 deletions scripts/2.TCGA-MLexample.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,12 +46,14 @@

# In[4]:

get_ipython().run_cell_magic('time', '', "path = os.path.join('download', 'expression-matrix.tsv.bz2')\nX = pd.read_table(path, index_col=0)")
# Read `expression-matrix.pkl` from the download directory into X
# (presumably the dataframe pickled by 1.download -- confirm it has been run)
path = os.path.join('download', 'expression-matrix.pkl')
X = pd.read_pickle(path)


# In[5]:

get_ipython().run_cell_magic('time', '', "path = os.path.join('download', 'mutation-matrix.tsv.bz2')\nY = pd.read_table(path, index_col=0)")
# Read `mutation-matrix.pkl` from the download directory into Y
# (presumably the dataframe pickled by 1.download -- confirm it has been run)
path = os.path.join('download', 'mutation-matrix.pkl')
Y = pd.read_pickle(path)


# In[6]:
Expand Down
6 changes: 4 additions & 2 deletions scripts/3.TCGA-MLexample_Pathway.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,12 +95,14 @@

# In[8]:

get_ipython().run_cell_magic('time', '', "path = os.path.join('download', 'expression-matrix.tsv.bz2')\nX = pd.read_table(path, index_col=0)")
# Read `expression-matrix.pkl` from the download directory into X
# (presumably the dataframe pickled by 1.download -- confirm it has been run)
path = os.path.join('download', 'expression-matrix.pkl')
X = pd.read_pickle(path)


# In[9]:

get_ipython().run_cell_magic('time', '', "path = os.path.join('download', 'mutation-matrix.tsv.bz2')\nY = pd.read_table(path, index_col=0)")
# Read `mutation-matrix.pkl` from the download directory into Y
# (presumably the dataframe pickled by 1.download -- confirm it has been run)
path = os.path.join('download', 'mutation-matrix.pkl')
Y = pd.read_pickle(path)


# In[10]:
Expand Down