
Add filer.Filer to read notebooks from WSFS without omitting their extension #1457

Merged: 30 commits merged on May 30, 2024
Changes from 11 commits
Commits
0229dd3
wip
shreyas-goenka May 27, 2024
add2dd1
do not rely on fuse mount
shreyas-goenka May 27, 2024
fdacdcc
cleanup some todos and add test for dup name error
shreyas-goenka May 28, 2024
6ab53d5
more modular extension business
shreyas-goenka May 28, 2024
49a0b49
rename file
shreyas-goenka May 28, 2024
42e1bb5
add test for language check
shreyas-goenka May 28, 2024
b55c00e
better main documentation
shreyas-goenka May 28, 2024
f80d338
test for directories + notebook clash
shreyas-goenka May 28, 2024
c75b5e2
clear todos
shreyas-goenka May 28, 2024
a38295c
-
shreyas-goenka May 28, 2024
b514498
better error messagE
shreyas-goenka May 28, 2024
0fb56da
Merge remote-tracking branch 'origin' into wsfs-filer
shreyas-goenka May 28, 2024
deb2107
remove unnecessary sort
shreyas-goenka May 29, 2024
747ec5b
make multiline
shreyas-goenka May 29, 2024
c9a72c5
address more comments
shreyas-goenka May 29, 2024
8db2bd7
-
shreyas-goenka May 29, 2024
1324bba
address comments and fix the ipynb information loss:
shreyas-goenka May 29, 2024
4fec905
do not use error directive
shreyas-goenka May 29, 2024
4fe597f
final cleanup
shreyas-goenka May 29, 2024
628a592
Break out function for stat call
pietern May 30, 2024
8bd6449
Return error if we expect a notebook and it isn't; never return a nil…
pietern May 30, 2024
72538cf
Capitalize
pietern May 30, 2024
0b5d9f5
Move test assertions around; capitalize
pietern May 30, 2024
45786de
Download Jupyter notebook as IPYNB
pietern May 30, 2024
6ca5bce
Rename to be similar to workspace files client
pietern May 30, 2024
5af3855
Fix Jupyter contents check
pietern May 30, 2024
d219f13
Address comments
pietern May 30, 2024
dbeee88
Bubble up errors from stat calls
pietern May 30, 2024
55eaa2d
filer -> wsfs
pietern May 30, 2024
55662ce
Fix comment
pietern May 30, 2024
289 changes: 289 additions & 0 deletions internal/filer_test.go
@@ -4,9 +4,12 @@ import (
"bytes"
"context"
"errors"
"fmt"
"io"
"io/fs"
"path"
"regexp"
"slices"
"strings"
"testing"

@@ -37,6 +40,11 @@ func (f filerTest) assertContents(ctx context.Context, name string, contents str
assert.Equal(f, contents, body.String())
}

func (f filerTest) assertNotExists(ctx context.Context, name string) {
_, err := f.Stat(ctx, name)
assert.ErrorIs(f, err, fs.ErrNotExist)
}

func commonFilerRecursiveDeleteTest(t *testing.T, ctx context.Context, f filer.Filer) {
var err error

@@ -94,6 +102,7 @@ func TestAccFilerRecursiveDelete(t *testing.T) {
{"workspace files", setupWsfsFiler},
{"dbfs", setupDbfsFiler},
{"files", setupUcVolumesFiler},
{"workspace fuse", setupWsfsFuseFiler},
} {
tc := testCase

@@ -204,6 +213,7 @@ func TestAccFilerReadWrite(t *testing.T) {
{"workspace files", setupWsfsFiler},
{"dbfs", setupDbfsFiler},
{"files", setupUcVolumesFiler},
{"workspace fuse", setupWsfsFuseFiler},
} {
tc := testCase

@@ -312,6 +322,7 @@ func TestAccFilerReadDir(t *testing.T) {
{"workspace files", setupWsfsFiler},
{"dbfs", setupDbfsFiler},
{"files", setupUcVolumesFiler},
{"workspace fuse", setupWsfsFuseFiler},
} {
tc := testCase

@@ -462,3 +473,281 @@ func TestAccFilerWorkspaceNotebookWithOverwriteFlag(t *testing.T) {
filerTest{t, f}.assertContents(ctx, "scalaNb", "// Databricks notebook source\n println(\"second upload\"))")
filerTest{t, f}.assertContents(ctx, "jupyterNb", "# Databricks notebook source\nprint(\"Jupyter Notebook Version 2\")")
}

func TestAccFilerWorkspaceFuseReadDir(t *testing.T) {
files := []struct {
name string
content string
}{
{"pyNb.py", "# Databricks notebook source\nprint('first upload'))"},
{"rNb.r", "# Databricks notebook source\nprint('first upload'))"},
{"sqlNb.sql", "-- Databricks notebook source\n SELECT \"first upload\""},
{"scalaNb.scala", "// Databricks notebook source\n println(\"first upload\"))"},
{"jupyterNb.ipynb", jupyterNotebookContent1},
{"jupyterNb2.ipynb", jupyterNotebookContent2},
{"dir1/dir2/dir3/file.txt", "file content"},
{"foo.py", "print('foo')"},
{"foo.r", "print('foo')"},
{"foo.sql", "SELECT 'foo'"},
{"foo.scala", "println('foo')"},
}

ctx := context.Background()
wf, _ := setupWsfsFuseFiler(t)

for _, f := range files {
err := wf.Write(ctx, f.name, strings.NewReader(f.content), filer.CreateParentDirectories)
require.NoError(t, err)
}

// Read entries
entries, err := wf.ReadDir(ctx, ".")
require.NoError(t, err)
assert.Len(t, entries, len(files))
names := []string{}
for _, e := range entries {
names = append(names, e.Name())
}
slices.Sort(names)
assert.Equal(t, []string{"dir1", "foo.py", "foo.r", "foo.scala", "foo.sql",
"jupyterNb.py", "jupyterNb2.py", "pyNb.py", "rNb.r", "scalaNb.scala", "sqlNb.sql"}, names)
}

func TestAccFilerWorkspaceFuseRead(t *testing.T) {
files := []struct {
name string
content string
}{
{"foo.py", "# Databricks notebook source\nprint('first upload'))"},
{"bar.py", "print('foo')"},
{"pretender", "not a notebook"},
{"dir/file.txt", "file content"},
{"scala-notebook.scala", "// Databricks notebook source\nprintln('first upload')"},
}

ctx := context.Background()
wf, _ := setupWsfsFuseFiler(t)

for _, f := range files {
err := wf.Write(ctx, f.name, strings.NewReader(f.content), filer.CreateParentDirectories)
require.NoError(t, err)
}

// Read contents
filerTest{t, wf}.assertContents(ctx, "foo.py", "# Databricks notebook source\nprint('first upload'))")
filerTest{t, wf}.assertContents(ctx, "bar.py", "print('foo')")
filerTest{t, wf}.assertContents(ctx, "dir/file.txt", "file content")
filerTest{t, wf}.assertContents(ctx, "scala-notebook.scala", "// Databricks notebook source\nprintln('first upload')")
filerTest{t, wf}.assertContents(ctx, "pretender", "not a notebook")

// Read non-existent file
_, err := wf.Read(ctx, "non-existent.py")
assert.ErrorIs(t, err, fs.ErrNotExist)

// Ensure we do not read a file as a notebook
_, err = wf.Read(ctx, "pretender.py")
assert.ErrorIs(t, err, fs.ErrNotExist)

_, err = wf.Read(ctx, "pretender.ipynb")
assert.ErrorIs(t, err, fs.ErrNotExist)

// Read directory
_, err = wf.Read(ctx, "dir")
assert.ErrorIs(t, err, fs.ErrInvalid)

// Ensure we do not read a scala notebook as a python notebook
_, err = wf.Read(ctx, "scala-notebook.py")
assert.ErrorIs(t, err, fs.ErrNotExist)
}

func TestAccFilerWorkspaceFuseDelete(t *testing.T) {
files := []struct {
name string
content string
}{
{"foo.py", "# Databricks notebook source\nprint('first upload'))"},
{"bar.py", "print('foo')"},
{"pretender", "not a notebook"},
{"dir/file.txt", "file content"},
{"scala-notebook.scala", "// Databricks notebook source\nprintln('first upload')"},
}

ctx := context.Background()
wf, _ := setupWsfsFuseFiler(t)

for _, f := range files {
err := wf.Write(ctx, f.name, strings.NewReader(f.content), filer.CreateParentDirectories)
require.NoError(t, err)
}

// Read contents of test fixtures as a sanity check
filerTest{t, wf}.assertContents(ctx, "foo.py", "# Databricks notebook source\nprint('first upload'))")
filerTest{t, wf}.assertContents(ctx, "bar.py", "print('foo')")
filerTest{t, wf}.assertContents(ctx, "dir/file.txt", "file content")
filerTest{t, wf}.assertContents(ctx, "scala-notebook.scala", "// Databricks notebook source\nprintln('first upload')")
filerTest{t, wf}.assertContents(ctx, "pretender", "not a notebook")

// Delete notebook
err := wf.Delete(ctx, "foo.py")
require.NoError(t, err)
filerTest{t, wf}.assertNotExists(ctx, "foo.py")

// Delete file
err = wf.Delete(ctx, "bar.py")
require.NoError(t, err)
filerTest{t, wf}.assertNotExists(ctx, "bar.py")

// Delete non-existent file
err = wf.Delete(ctx, "non-existent.py")
assert.ErrorIs(t, err, fs.ErrNotExist)

// Ensure we do not delete a file as a notebook
err = wf.Delete(ctx, "pretender.py")
assert.ErrorIs(t, err, fs.ErrNotExist)

// Ensure we do not delete a scala notebook as a python notebook
_, err = wf.Read(ctx, "scala-notebook.py")
assert.ErrorIs(t, err, fs.ErrNotExist)

// Delete directory
err = wf.Delete(ctx, "dir")
assert.ErrorIs(t, err, fs.ErrInvalid)

// Delete directory recursively
err = wf.Delete(ctx, "dir", filer.DeleteRecursively)
require.NoError(t, err)
filerTest{t, wf}.assertNotExists(ctx, "dir")
}

func TestAccFilerWorkspaceFuseStat(t *testing.T) {
files := []struct {
name string
content string
}{
{"foo.py", "# Databricks notebook source\nprint('first upload'))"},
{"bar.py", "print('foo')"},
{"dir/file.txt", "file content"},
{"pretender", "not a notebook"},
{"scala-notebook.scala", "// Databricks notebook source\nprintln('first upload')"},
}

ctx := context.Background()
wf, _ := setupWsfsFuseFiler(t)

for _, f := range files {
err := wf.Write(ctx, f.name, strings.NewReader(f.content), filer.CreateParentDirectories)
require.NoError(t, err)
}

// Read contents of test fixtures as a sanity check
filerTest{t, wf}.assertContents(ctx, "foo.py", "# Databricks notebook source\nprint('first upload'))")
filerTest{t, wf}.assertContents(ctx, "bar.py", "print('foo')")
filerTest{t, wf}.assertContents(ctx, "dir/file.txt", "file content")
filerTest{t, wf}.assertContents(ctx, "scala-notebook.scala", "// Databricks notebook source\nprintln('first upload')")
filerTest{t, wf}.assertContents(ctx, "pretender", "not a notebook")

// Stat on a notebook
info, err := wf.Stat(ctx, "foo.py")
require.NoError(t, err)
assert.Equal(t, "foo.py", info.Name())
assert.False(t, info.IsDir())

// Stat on a file
info, err = wf.Stat(ctx, "bar.py")
require.NoError(t, err)
assert.Equal(t, "bar.py", info.Name())
assert.False(t, info.IsDir())

// Stat on a directory
info, err = wf.Stat(ctx, "dir")
require.NoError(t, err)
assert.Equal(t, "dir", info.Name())
assert.True(t, info.IsDir())

// Stat on a non-existent file
_, err = wf.Stat(ctx, "non-existent.py")
assert.ErrorIs(t, err, fs.ErrNotExist)

// Ensure we do not stat a file as a notebook
_, err = wf.Stat(ctx, "pretender.py")
assert.ErrorIs(t, err, fs.ErrNotExist)

// Ensure we do not stat a scala notebook as a python notebook
_, err = wf.Stat(ctx, "scala-notebook.py")
assert.ErrorIs(t, err, fs.ErrNotExist)

_, err = wf.Stat(ctx, "pretender.ipynb")
assert.ErrorIs(t, err, fs.ErrNotExist)
}

func TestAccFilerWorkspaceFuseErrorsOnDupName(t *testing.T) {
tcases := []struct {
files []struct{ name, content string }
name string
}{
{
name: "python",
files: []struct{ name, content string }{
{"foo.py", "print('foo')"},
{"foo.py", "# Databricks notebook source\nprint('foo')"},
},
},
{
name: "r",
files: []struct{ name, content string }{
{"foo.r", "print('foo')"},
{"foo.r", "# Databricks notebook source\nprint('foo')"},
},
},
{
name: "sql",
files: []struct{ name, content string }{
{"foo.sql", "SELECT 'foo'"},
{"foo.sql", "-- Databricks notebook source\nSELECT 'foo'"},
},
},
{
name: "scala",
files: []struct{ name, content string }{
{"foo.scala", "println('foo')"},
{"foo.scala", "// Databricks notebook source\nprintln('foo')"},
},
},
{
name: "jupyter",
files: []struct{ name, content string }{
{"foo.py", "print('foo')"},
{"foo.ipynb", jupyterNotebookContent1},
},
},
}

for _, tc := range tcases {
t.Run(tc.name, func(t *testing.T) {
ctx := context.Background()
wf, tmpDir := setupWsfsFuseFiler(t)

for _, f := range tc.files {
err := wf.Write(ctx, f.name, strings.NewReader(f.content), filer.CreateParentDirectories)
require.NoError(t, err)
}

_, err := wf.ReadDir(ctx, ".")
assert.ErrorAs(t, err, &filer.DuplicatePathError{})
assert.ErrorContains(t, err, fmt.Sprintf("cannot read files from the workspace file system by emulating FUSE. Duplicate paths encountered. Both NOTEBOOK at %s and FILE at %s resolve to the same name %s", path.Join(tmpDir, "foo"), path.Join(tmpDir, tc.files[0].name), tc.files[0].name))
})
}
}

func TestAccWorkspaceFuseDirectoriesAreNotNotebooks(t *testing.T) {
ctx := context.Background()
wf, _ := setupWsfsFuseFiler(t)

// Create a directory with an extension
err := wf.Mkdir(ctx, "foo")
require.NoError(t, err)

// Reading foo.py should fail. foo is a directory, not a notebook.
_, err = wf.Read(ctx, "foo.py")
assert.ErrorIs(t, err, fs.ErrNotExist)
}
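
The duplicate-name test above asserts a filer.DuplicatePathError via assert.ErrorAs. Below is a short caller-side sketch of handling that error, assuming the error type is usable with errors.As as the test implies and that ReadDir returns []fs.DirEntry; the helper name is hypothetical and the snippet additionally requires the context, errors, fmt, and io/fs imports:

// listOrFail is a hypothetical helper showing how a caller could surface the
// duplicate-path condition exercised in TestAccFilerWorkspaceFuseErrorsOnDupName.
func listOrFail(ctx context.Context, wf filer.Filer) ([]fs.DirEntry, error) {
	entries, err := wf.ReadDir(ctx, ".")
	var dupErr filer.DuplicatePathError
	if errors.As(err, &dupErr) {
		// A notebook and a regular file collapse to the same name once the
		// notebook's language extension is re-added; report it explicitly.
		return nil, fmt.Errorf("ambiguous workspace listing: %w", err)
	}
	return entries, err
}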
11 changes: 11 additions & 0 deletions internal/helpers.go
@@ -559,6 +559,17 @@ func setupWsfsFiler(t *testing.T) (filer.Filer, string) {
return f, tmpdir
}

func setupWsfsFuseFiler(t *testing.T) (filer.Filer, string) {
t.Log(GetEnvOrSkipTest(t, "CLOUD_ENV"))

w := databricks.Must(databricks.NewWorkspaceClient())
tmpdir := TemporaryWorkspaceDir(t, w)
f, err := filer.NewWorkspaceFuseClient(w, tmpdir)
require.NoError(t, err)

return f, tmpdir
}

func setupDbfsFiler(t *testing.T) (filer.Filer, string) {
t.Log(GetEnvOrSkipTest(t, "CLOUD_ENV"))
