Use observations and move data loading upstream #495

Closed
wants to merge 1 commit
13 changes: 4 additions & 9 deletions examples/air/main.py
@@ -8,11 +8,10 @@
 
 import math
 import os
-import sys
 import time
 import argparse
 from functools import partial
-from subprocess import check_call
+from observations import multi_mnist
 import numpy as np
 
 import torch
@@ -106,13 +105,9 @@
     pyro.set_rng_seed(args.seed)
 
     # Load data.
-    infile = './data/multi_mnist_train_uint8.npz'
-    if not os.path.exists(infile):
-        print('Running multi_mnist.py to generate dataset at {}...'.format(infile))
-        multi_mnist_py = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'multi_mnist.py')
-        check_call([sys.executable, multi_mnist_py])
-        print('Finished running multi_mnist.py.')
-    X_np = np.load(infile)['x'].astype(np.float32)
+    inpath = './data'
+    (X_np, _), _ = multi_mnist(inpath, max_digits=2, canvas_size=50, seed=42)
+    X_np = X_np.astype(np.float32)
     X_np /= 255.0
     X = Variable(torch.from_numpy(X_np))
     X_size = X.size(0)
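
Aside: a minimal standalone sketch of the new loading path, mirroring the call in main.py above. The ((images, labels), (images, labels)) return convention and the caching of the generated canvases under the given path are assumptions read off this diff, not documented guarantees of the observations package.

import numpy as np
from observations import multi_mnist

# Load (or generate and cache) the multi-digit MNIST canvases under ./data.
(X_np, _), _ = multi_mnist('./data', max_digits=2, canvas_size=50, seed=42)
X_np = X_np.astype(np.float32) / 255.0  # scale uint8 pixels into [0, 1]
print(X_np.shape)  # inspect the canvas array (exact shape depends on the loader)
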
86 changes: 0 additions & 86 deletions examples/air/multi_mnist.py

This file was deleted.

4 changes: 2 additions & 2 deletions examples/dmm/dmm.py
@@ -24,7 +24,7 @@
 from pyro.distributions.transformed_distribution import TransformedDistribution
 import six.moves.cPickle as pickle
 import polyphonic_data_loader as poly
-from os.path import join, dirname, exists
+from os.path import exists
 import argparse
 import time
 from util import get_logger
@@ -258,7 +258,7 @@ def main(args):
     log = get_logger(args.log)
     log(args)
 
-    jsb_file_loc = join(dirname(__file__), "jsb_processed.pkl")
+    jsb_file_loc = "./data/jsb_processed.pkl"
     # ingest training/validation/test data from disk
     data = pickle.load(open(jsb_file_loc, "rb"))
     training_seq_lengths = data['train']['sequence_lengths']
30 changes: 8 additions & 22 deletions examples/dmm/polyphonic_data_loader.py
@@ -17,34 +17,23 @@
 import torch.nn as nn
 from torch.autograd import Variable
 import numpy as np
-from os.path import join, dirname, exists, abspath
-from six.moves.urllib.request import urlretrieve
+from observations import jsb_chorales
+from os.path import join, exists
 import six.moves.cPickle as pickle
 from pyro.util import ng_zeros
 
 
-# this function downloads the raw data if it hasn't been already
-def download_if_absent(saveas, url):
-
-    if not exists(saveas):
-        print("Couldn't find polyphonic music data at {}".format(saveas))
-        print("downloading polyphonic music data from %s..." % url)
-        urlretrieve(url, saveas)
-
-
 # this function processes the raw data; in particular it unsparsifies it
-def process_data(output="jsb_processed.pkl", rawdata="jsb_raw.pkl",
-                 T_max=160, min_note=21, note_range=88):
-
+def process_data(base_path, filename, T_max=160, min_note=21, note_range=88):
+    output = join(base_path, filename)
     if exists(output):
         return
 
     print("processing raw polyphonic music data...")
-    data = pickle.load(open(rawdata, "rb"))
+    data = jsb_chorales(base_path)
     processed_dataset = {}
-    for split in ['train', 'valid', 'test']:
+    for split, data_split in zip(['train', 'test', 'valid'], data):
         processed_dataset[split] = {}
-        data_split = data[split]
         n_seqs = len(data_split)
         processed_dataset[split]['sequence_lengths'] = np.zeros((n_seqs), dtype=np.int32)
         processed_dataset[split]['sequences'] = np.zeros((n_seqs, T_max, note_range))
@@ -61,11 +50,8 @@ def process_data(output="jsb_processed.pkl", rawdata="jsb_raw.pkl",
 
 
 # this logic will be initiated upon import
-base_loc = dirname(abspath(__file__))
-raw_file = join(base_loc, "jsb_raw.pkl")
-out_file = join(base_loc, "jsb_processed.pkl")
-download_if_absent(raw_file, "http://www-etud.iro.umontreal.ca/~boulanni/JSB%20Chorales.pickle")
-process_data(output=out_file, rawdata=raw_file)
+base_path = './data'
+process_data(base_path, "jsb_processed.pkl")
 
 
 # this function takes a numpy mini-batch and reverses each sequence
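
Aside: the zip(['train', 'test', 'valid'], data) above relies on jsb_chorales returning the splits in (train, test, valid) order. A minimal sketch of reading the processed pickle back, with the key layout and array shapes taken from process_data and dmm.py in this diff; the relative path assumes running from examples/dmm:

import six.moves.cPickle as pickle

# Load the dict written by process_data above.
with open('./data/jsb_processed.pkl', 'rb') as f:
    data = pickle.load(f)

train_lens = data['train']['sequence_lengths']  # shape (n_seqs,), int32
train_seqs = data['train']['sequences']         # shape (n_seqs, T_max=160, 88)
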
1 change: 1 addition & 0 deletions setup.py
@@ -16,6 +16,7 @@
         'cloudpickle>=0.3.1',
         'graphviz>=0.8',
         'networkx>=2.0.0',
+        'observations>=0.1.4',
         'torch',
         'six>=1.10.0',
     ],
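
Note: existing development installs won't pick up the new dependency automatically; re-run pip install -e . from the checkout (or install observations directly with pip) before running the updated examples.
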
8 changes: 6 additions & 2 deletions tutorial/source/air.ipynb
@@ -26,6 +26,7 @@
    "source": [
     "%pylab inline\n",
     "from collections import namedtuple\n",
+    "from observations import multi_mnist\n",
     "import pyro\n",
     "import pyro.optim as optim\n",
     "from pyro.infer import SVI\n",
@@ -55,8 +56,11 @@
    },
    "outputs": [],
    "source": [
-    "fn = '../../examples/air/data/multi_mnist_train_uint8.npz'\n",
-    "mnist = Variable(torch.from_numpy(np.load(fn)['x'].astype(np.float32) / 255.))\n",
+    "inpath = '../../examples/air/data'\n",
+    "(X_np, _), _ = multi_mnist(inpath, max_digits=2, canvas_size=50, seed=42)\n",
+    "X_np = X_np.astype(np.float32)\n",
+    "X_np /= 255.0\n",
+    "X = Variable(torch.from_numpy(X_np))\n",
     "def show_images(imgs):\n",
     "    figure(figsize=(12,4))\n",
     "    for i, img in enumerate(imgs):\n",
2 changes: 1 addition & 1 deletion tutorial/source/dmm.ipynb
@@ -618,7 +618,7 @@
    },
    "outputs": [],
    "source": [
-    "jsb_file_loc = join(dirname(__file__), \"jsb_processed.pkl\")\n",
+    "jsb_file_loc = \"./data/jsb_processed.pkl\"\n",
     "data = pickle.load(open(jsb_file_loc, \"rb\"))\n",
     "training_seq_lengths = data['train']['sequence_lengths']\n",
     "training_data_sequences = data['train']['sequences']\n",