-
Notifications
You must be signed in to change notification settings - Fork 102
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[UDD-35] Concurrent Document Extraction Example #255
Merged
sampila
merged 5 commits into
unidoc:master
from
kellemNegasi:udd-35-concurrent-extraction-example
Jun 11, 2024
Merged
Changes from 2 commits
Commits
Show all changes
5 commits
Select commit
Hold shift + click to select a range
2396e39
added concurrent document extraction example
kellemNegasi 87cca20
made minor updates and added comments
kellemNegasi a2acb44
made minor change
kellemNegasi 810c382
added README file
kellemNegasi 07577e8
updated buffer length
kellemNegasi File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,143 @@ | ||
/* | ||
* This example demonstrates how to extract content from multiple documents concurrently | ||
* with each document extraction running in its own goroutine. | ||
* N.B. currently concurrency is supported on a document level which means we can only extract one document per goroutine. | ||
* | ||
* Run as: go run concurrent_extraction.go <input1.pdf> <input2.pdf> <input3.pdf>... <output_dir> | ||
*/ | ||
|
||
package main | ||
|
||
import ( | ||
"fmt" | ||
"io/fs" | ||
"os" | ||
"path/filepath" | ||
"strings" | ||
"time" | ||
|
||
"github.com/unidoc/unipdf/v3/common/license" | ||
"github.com/unidoc/unipdf/v3/extractor" | ||
"github.com/unidoc/unipdf/v3/model" | ||
) | ||
|
||
func init() { | ||
// Make sure to load your metered License API key prior to using the library. | ||
// If you need a key, you can sign up and create a free one at https://cloud.unidoc.io | ||
err := license.SetMeteredKey(os.Getenv(`UNIDOC_LICENSE_API_KEY`)) | ||
if err != nil { | ||
panic(err) | ||
} | ||
} | ||
|
||
func main() { | ||
if len(os.Args) < 3 { | ||
fmt.Printf("Usage: go run concurrent_extraction.go input1.pdf input2.pdf input3.pdf... output_dir\n") | ||
os.Exit(1) | ||
} | ||
inputDocuments := []string{} | ||
args := os.Args | ||
var outputDir string | ||
for i := 1; i < len(args); i++ { | ||
if i == len(args)-1 { | ||
outputDir = args[i] | ||
} else { | ||
inputDocuments = append(inputDocuments, args[i]) | ||
} | ||
} | ||
|
||
start := time.Now() | ||
runConcurrent(inputDocuments, outputDir) | ||
duration := time.Since(start) | ||
fmt.Println("time taken for concurrent extraction", duration) | ||
} | ||
|
||
// runConcurrent takes the list of input documents and destination output directory and runs the extraction concurrently. | ||
func runConcurrent(documents []string, outputDir string) { | ||
res := make(chan map[string]string, 3) | ||
|
||
err := concurrentExtraction(documents, res) | ||
if err != nil { | ||
fmt.Printf("Error. extraction failed %v.\n", err) | ||
} | ||
outputPath := outputDir | ||
if _, err := os.Stat(outputPath); err != nil { | ||
if os.IsNotExist(err) { | ||
err := os.Mkdir(outputPath, fs.ModePerm) | ||
if err != nil { | ||
fmt.Printf("Error. failed to create directory %s\n", outputPath) | ||
} | ||
} | ||
} | ||
|
||
for i := 0; i < len(documents); i++ { | ||
result := <-res | ||
for path, content := range result { | ||
basename := filepath.Base(path) | ||
fileName := strings.TrimSuffix(basename, filepath.Ext(basename)) + ".txt" | ||
filePath := filepath.Join(outputPath, fileName) | ||
file, err := os.Create(filePath) | ||
if err != nil { | ||
fmt.Printf("Error. failed to create file. %v\n", err) | ||
} | ||
_, err = file.WriteString(content) | ||
if err != nil { | ||
fmt.Printf("Error. failed to write content. %v\n", err) | ||
} | ||
} | ||
} | ||
|
||
} | ||
|
||
// concurrentExtraction launches a go routine for each document in `documents` and writes the result of extraction to | ||
// the channel `res`. | ||
func concurrentExtraction(documents []string, res chan map[string]string) error { | ||
|
||
3ace marked this conversation as resolved.
Show resolved
Hide resolved
|
||
for _, docPath := range documents { | ||
filePath := docPath | ||
go func(path string, res chan map[string]string) { | ||
result, err := extractSingleDoc(path) | ||
if err != nil { | ||
fmt.Printf("Error. Failed to extract file %v. %v\n", filePath, err) | ||
} | ||
temp := map[string]string{ | ||
filePath: result, | ||
} | ||
res <- temp | ||
}(filePath, res) | ||
} | ||
return nil | ||
3ace marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
|
||
// extractSingleDoc takes a single file specified by the `filePath` and returns the extracted text. | ||
func extractSingleDoc(filePath string) (string, error) { | ||
pdfReader, _, err := model.NewPdfReaderFromFile(filePath, nil) | ||
if err != nil { | ||
return "", fmt.Errorf("failed to create pdf reader: %w", err) | ||
} | ||
pages, err := pdfReader.GetNumPages() | ||
if err != nil { | ||
return "", err | ||
} | ||
var result string | ||
for i := 0; i < pages; i++ { | ||
pageNum := i + 1 | ||
page, err := pdfReader.GetPage(pageNum) | ||
if err != nil { | ||
return "", fmt.Errorf("failed to get page %d: %w", pageNum, err) | ||
} | ||
|
||
ex, err := extractor.New(page) | ||
if err != nil { | ||
return "", fmt.Errorf("failed to create extractor: %w", err) | ||
} | ||
|
||
text, err := ex.ExtractText() | ||
if err != nil { | ||
return "", fmt.Errorf("failed to extract text: %w", err) | ||
} | ||
result += text | ||
} | ||
|
||
return result, nil | ||
} |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Shouldn't this be the same as the document length? like
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not necessarily. If we are reading as soon as the goroutines are writing to the channel, a buffer length of 1 is still enough, but I made it 3 just to be safe. I think maybe I should increase that in case the reading side is very slow.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@sampila Updated. Can you check again?