From a03195af7ecc10a855bcf3730de08bd5c26cfec2 Mon Sep 17 00:00:00 2001 From: Leon Hudak <33522493+leohhhn@users.noreply.github.com> Date: Thu, 1 Feb 2024 12:01:26 +0100 Subject: [PATCH] feat: add single file extraction support (#11) * wip fixing tests * add single file support, add tests * var block, remove typo * update err msg * remove newline * simplify if --- extractor/main.go | 36 ++++++++---- extractor/main_test.go | 129 +++++++++++++++++++++++++++++++---------- 2 files changed, 121 insertions(+), 44 deletions(-) diff --git a/extractor/main.go b/extractor/main.go index 9be6c03b..3ae13805 100644 --- a/extractor/main.go +++ b/extractor/main.go @@ -32,9 +32,9 @@ var ( // Define extractor config type extractorCfg struct { - fileType string - sourceDir string - outputDir string + fileType string + sourcePath string + outputDir string } func main() { @@ -74,10 +74,10 @@ func (c *extractorCfg) registerFlags(fs *flag.FlagSet) { ) fs.StringVar( - &c.sourceDir, - "source-dir", - ".", - "the root folder containing transaction data", + &c.sourcePath, + "source-path", + "", + "the source file or folder containing transaction data", ) fs.StringVar( @@ -96,7 +96,7 @@ func execExtract(ctx context.Context, cfg *extractorCfg) error { } // Check the source dir is valid - if cfg.sourceDir == "" { + if cfg.sourcePath == "" { return errInvalidSourceDir } @@ -105,10 +105,22 @@ func execExtract(ctx context.Context, cfg *extractorCfg) error { return errInvalidOutputDir } - // Find the files that need to be analyzed - sourceFiles, findErr := findFilePaths(cfg.sourceDir, cfg.fileType) - if findErr != nil { - return fmt.Errorf("unable to find file paths, %w", findErr) + // Check if source is valid + source, err := os.Stat(cfg.sourcePath) + if err != nil { + return fmt.Errorf("unable to stat source path, %w", err) + } + + var sourceFiles []string + sourceFiles = append(sourceFiles, cfg.sourcePath) + + // If source is dir, walk it and add to sourceFiles + if source.IsDir() { + var findErr error + sourceFiles, findErr = findFilePaths(cfg.sourcePath, cfg.fileType) + if findErr != nil { + return fmt.Errorf("unable to find file paths, %w", findErr) + } } if len(sourceFiles) == 0 { diff --git a/extractor/main_test.go b/extractor/main_test.go index c0f157d6..635fc7fc 100644 --- a/extractor/main_test.go +++ b/extractor/main_test.go @@ -23,14 +23,7 @@ import ( "time" ) -const ( - numSourceFiles = 20 - numTx = 100 - numMsg = 200 - msgPerTx = numMsg / numTx - txPerSourceFile = numTx / numSourceFiles - sourceFileType = ".log" -) +const sourceFileType = ".jsonl" func TestExtractor_Errors(t *testing.T) { testTable := []struct { @@ -41,36 +34,36 @@ func TestExtractor_Errors(t *testing.T) { { "no source files", &extractorCfg{ - fileType: ".log", - sourceDir: "./", - outputDir: ".", + fileType: ".log", + sourcePath: "./", + outputDir: ".", }, errNoSourceFilesFound, }, { "invalid filetype", &extractorCfg{ - fileType: "", - sourceDir: ".", - outputDir: ".", + fileType: "", + sourcePath: ".", + outputDir: ".", }, errInvalidFileType, }, { "invalid source dir", &extractorCfg{ - fileType: ".log", - sourceDir: "", - outputDir: ".", + fileType: ".log", + sourcePath: "", + outputDir: ".", }, errInvalidSourceDir, }, { "invalid output dir", &extractorCfg{ - fileType: ".log", - sourceDir: ".", - outputDir: "", + fileType: ".log", + sourcePath: ".", + outputDir: "", }, errInvalidOutputDir, }, @@ -91,29 +84,97 @@ func TestExtractor_Errors(t *testing.T) { } } -func TestValidFlow(t *testing.T) { +func TestValidFlow_Dir(t *testing.T) { + t.Parallel() + + // Generate temporary output dir + outputDir, err := os.MkdirTemp(".", "outputDir") + require.NoError(t, err) + t.Cleanup(removeDir(t, outputDir)) + + // Generate temporary source dir + sourceDir, err := os.MkdirTemp(".", "sourceDir") + require.NoError(t, err) + t.Cleanup(removeDir(t, sourceDir)) + + // Set correct config + var cfg = &extractorCfg{ + fileType: sourceFileType, + sourcePath: sourceDir, + outputDir: outputDir, + } + + // Generate mock messages & mock files + mockStdMsg, mockAddPkgMsg := generateMockMsgs(t) + _ = generateSourceFiles(t, sourceDir, mockStdMsg, 20) + + // Perform extraction + ctx, cancelFn := context.WithTimeout(context.Background(), time.Second*5) + defer cancelFn() + + require.NoError(t, execExtract(ctx, cfg)) + + for _, msg := range mockAddPkgMsg { + basePath := filepath.Join(outputDir, strings.TrimLeft(msg.Package.Path, "gno.land/")) + + // Get metadata path & open metadata file + metadataPath := filepath.Join(basePath, packageMetadataFile) + file, err := os.Open(metadataPath) + require.NoError(t, err) + + // Read Metadata + reader := bufio.NewReader(file) + retrievedMetadata, _, err := reader.ReadLine() + require.NoError(t, err) + + // Compare metadata + expectedMetadata, err := json.Marshal(metadataFromMsg(msg)) + assert.Equal(t, expectedMetadata, retrievedMetadata) + + // Close metadata file + require.NoError(t, file.Close()) + + // Check package file content + for _, f := range msg.Package.Files { + filePath := filepath.Join(basePath, f.Name) + + // Open file + file, err := os.Open(filePath) + require.NoError(t, err) + + // Read file body + reader := bufio.NewReader(file) + retrievedFileBody, _, err := reader.ReadLine() + + // Compare file bodies + assert.Equal(t, f.Body, string(retrievedFileBody)) + } + } +} + +func TestValidFlow_File(t *testing.T) { t.Parallel() // Generate temporary output dir - outputDir, err := os.MkdirTemp(".", "output") + outputDir, err := os.MkdirTemp(".", "outputDir") require.NoError(t, err) t.Cleanup(removeDir(t, outputDir)) // Generate temporary source dir - sourceDir, err := os.MkdirTemp(".", "source") + sourceDir, err := os.MkdirTemp(".", "sourceDir") require.NoError(t, err) t.Cleanup(removeDir(t, sourceDir)) // Set correct config var cfg = &extractorCfg{ - fileType: sourceFileType, - sourceDir: sourceDir, - outputDir: outputDir, + fileType: sourceFileType, + sourcePath: sourceDir, + outputDir: outputDir, } // Generate mock messages & mock files mockStdMsg, mockAddPkgMsg := generateMockMsgs(t) - _ = generateSourceFiles(t, sourceDir, mockStdMsg) + _ = generateSourceFiles(t, sourceDir, mockStdMsg, 1) // Perform extraction ctx, cancelFn := context.WithTimeout(context.Background(), time.Second*5) @@ -166,6 +227,7 @@ func TestFindFilePaths(t *testing.T) { require.NoError(t, err) t.Cleanup(removeDir(t, tempDir)) + numSourceFiles := 20 testFiles := make([]string, numSourceFiles) for i := 0; i < numSourceFiles; i++ { @@ -181,7 +243,7 @@ func TestFindFilePaths(t *testing.T) { require.NoError(t, err) } - results, err := findFilePaths(tempDir, ".log") + results, err := findFilePaths(tempDir, sourceFileType) require.NoError(t, err) expectedResults := make([]string, 0, len(testFiles)) @@ -215,7 +277,7 @@ func TestExtractAddMessages(t *testing.T) { t.Cleanup(removeDir(t, tempDir)) mockMsgs, mockMsgsAddPackage := generateMockMsgs(t) - sourceFiles := generateSourceFiles(t, tempDir, mockMsgs) + sourceFiles := generateSourceFiles(t, tempDir, mockMsgs, 20) var results []vm.MsgAddPackage for _, sf := range sourceFiles { @@ -306,12 +368,14 @@ func TestWritePackageFiles(t *testing.T) { } // Helpers -func generateSourceFiles(t *testing.T, dir string, mockMsgs []std.Msg) []string { +func generateSourceFiles(t *testing.T, dir string, mockMsgs []std.Msg, numSourceFiles int) []string { t.Helper() var ( - mockTx = make([]std.Tx, numTx) - testFiles = make([]string, numSourceFiles) + txPerSourceFile = 5 + mockTx = make([]std.Tx, txPerSourceFile*numSourceFiles) + testFiles = make([]string, numSourceFiles) + msgPerTx = len(mockMsgs) / len(mockTx) ) // Generate transactions to wrap messages @@ -368,6 +432,7 @@ func generateMockMsgs(t *testing.T) ([]std.Msg, []vm.MsgAddPackage) { var ret []std.Msg var addPkgRet []vm.MsgAddPackage + numMsg := 100 pkgID := 0