diff --git a/docs/source/user_guide/fotw/fotw-001-images.ipynb b/docs/source/user_guide/fotw/fotw-001-images.ipynb index 924321f5d7..80fda030b8 100644 --- a/docs/source/user_guide/fotw/fotw-001-images.ipynb +++ b/docs/source/user_guide/fotw/fotw-001-images.ipynb @@ -12,6 +12,43 @@ "This tutorial will introduce you to the canonical ways of **working with images in Daft**." ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "0f6c903e", + "metadata": { + "tags": [ + "parameters" + ] + }, + "outputs": [], + "source": [ + "CI = False" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a2f4fa17", + "metadata": {}, + "outputs": [], + "source": [ + "# Skip this notebook execution in CI because it uses torch and private buckets\n", + "if CI:\n", + " import sys\n", + " sys.exit()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "62c617e9", + "metadata": {}, + "outputs": [], + "source": [ + "#!pip install -U torch torchvision" + ] + }, { "cell_type": "code", "execution_count": 1, @@ -51,20 +88,23 @@ "## 1. Loading Image Paths\n", "Let's say we have some data about dogs and their owners, including images of the dogs, and we want to classify the breeds of these dogs.\n", "\n", - "The tabular data about owners' and dogs names can all be stored in a DataFrame, along with the paths to the images (either local or remote URLs).\n", - "\n", - "### Storing Images Locally\n", - "To work with images stored at local paths, use the `from_glob_path` method:" + "The tabular data about owners' and dogs' names can all be stored in a DataFrame, along with the paths to the images (either local or remote URLs)." 
] }, { "cell_type": "code", - "execution_count": 3, - "id": "a74e6280-1dab-42b5-a4e9-7da9e6c1d6bf", + "execution_count": 4, + "id": "5b6d44d6-fe28-44dc-b03c-5c3f3c0ac684", "metadata": {}, "outputs": [], "source": [ - "df = daft.from_glob_path(\"path/to/images.jpg\") #substitute with a path to your local img directory" + "from daft.io import IOConfig, S3Config\n", + "\n", + "io_config = IOConfig(\n", + " s3=S3Config(\n", + " region_name=\"eu-north-1\",\n", + " )\n", + ")" ] }, { @@ -72,6 +112,10 @@ "id": "5b5b9a10-5831-4ad5-9b42-a8fe3faabcc0", "metadata": {}, "source": [ + "### Storing Images in Cloud Object Store\n", + "\n", + "You can use `from_glob_path` to load images from remote object stores, like an S3 bucket.\n", + "\n", "This method creates a DataFrame from a collection of file paths. \n", "\n", "`from_glob_path` supports wildcards:\n", @@ -80,24 +124,7 @@ "- `[…]` matches any single character in the brackets\n", "- `**` recursively matches any number of layers of directories\n", "\n", - "### Storing Images in Cloud Object Store\n", - "You can also use `from_glob_path` to load images from paths to remote object stores, like an S3 bucket:" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "5b6d44d6-fe28-44dc-b03c-5c3f3c0ac684", - "metadata": {}, - "outputs": [], - "source": [ - "from daft.io import IOConfig, S3Config\n", - "\n", - "io_config = IOConfig(\n", - " s3=S3Config(\n", - " region_name=\"eu-north-1\",\n", - " )\n", - ")" + "It supports reading from local filesystems, S3, and other cloud-based storage." 
] }, { @@ -149,7 +176,7 @@ ], "source": [ "df = daft.from_glob_path(\n", - " \"s3://avriiil/images-dogs/*.jpg\", #substitute with a path to your own private bucket\n", + " \"s3://avriiil/images-dogs/*.jpg\", # substitute with a path to your own private bucket\n", " io_config=io_config\n", ")\n", "df.show()" @@ -169,7 +196,8 @@ "metadata": {}, "source": [ "### Storing Images at URLs\n", - "Your images can also be stored at a stable URL.\n", + "\n", + "Your images can also be stored at a stable HTTP URL. Commonly, this is a CDN such as Flickr's.\n", "\n", "In that case, we can create a Daft DataFrame containing the data as follows:" ] @@ -234,9 +262,11 @@ " ],\n", " \"dog_name\": [\"Ernie\", \"Jackie\", \"Wolfie\", \"Shaggie\", \"Zadie\"],\n", " \"urls\": [\n", - "\"https://live.staticflickr.com/65535/53671838774_03ba68d203_o.jpg\",\n", - "\"https://live.staticflickr.com/65535/53671700073_2c9441422e_o.jpg\", \"https://live.staticflickr.com/65535/53670606332_1ea5f2ce68_o.jpg\",\n", - "\"https://live.staticflickr.com/65535/53671838039_b97411a441_o.jpg\", \"https://live.staticflickr.com/65535/53671698613_0230f8af3c_o.jpg\",\n", + " \"https://live.staticflickr.com/65535/53671838774_03ba68d203_o.jpg\",\n", + " \"https://live.staticflickr.com/65535/53671700073_2c9441422e_o.jpg\",\n", + " \"https://live.staticflickr.com/65535/53670606332_1ea5f2ce68_o.jpg\",\n", + " \"https://live.staticflickr.com/65535/53671838039_b97411a441_o.jpg\",\n", + " \"https://live.staticflickr.com/65535/53671698613_0230f8af3c_o.jpg\",\n", " ],\n", " }\n", ")\n", @@ -505,18 +535,6 @@ "First, make sure to install and import some extra dependencies:" ] }, - { - "cell_type": "code", - "execution_count": 10, - "id": "339685ab-feca-4d00-8fb9-6cc7204d486b", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "#!pip install -U validators matplotlib torch torchvision" - ] - }, { "cell_type": "code", "execution_count": 11, @@ -526,13 +544,9 @@ "source": [ "# import 
additional libraries, these are necessary for PyTorch\n", "import torch\n", - "import matplotlib.pyplot as plt\n", "import numpy as np\n", - "import warnings\n", "from PIL import Image\n", - "from daft import udf, DataType\n", - "warnings.filterwarnings('ignore')\n", - "%matplotlib inline" + "from daft import udf, DataType" ] }, { @@ -601,7 +615,7 @@ "metadata": {}, "outputs": [], "source": [ - "from torchvision import models, transforms\n", + "from torchvision import transforms\n", "\n", "def transform_image(image):\n", " # img = Image.fromarray(image)\n",