diff --git a/document-ai/.eslintrc.yml b/document-ai/.eslintrc.yml new file mode 100644 index 0000000000..98634adbef --- /dev/null +++ b/document-ai/.eslintrc.yml @@ -0,0 +1,4 @@ +--- +rules: + no-console: off + node/no-unsupported-features/node-builtins: off diff --git a/document-ai/batch-process-document.js b/document-ai/batch-process-document.js new file mode 100644 index 0000000000..bb9d5ad3ed --- /dev/null +++ b/document-ai/batch-process-document.js @@ -0,0 +1,149 @@ +/** + * Copyright 2020 Google LLC + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +'use strict'; + +const uuid = require('uuid'); + +async function main( + projectId = 'YOUR_PROJECT_ID', + location = 'YOUR_PROJECT_LOCATION', + processorId = 'YOUR_PROCESSOR_ID', // Create this in the Cloud Console + gcsInputUri = 'gs://cloud-samples-data/documentai/invoice.pdf', + gcsOutputUri = 'output-bucket', + gcsOutputUriPrefix = uuid.v4() +) { + // [START documentai_batch_process_document] + /** + * TODO(developer): Uncomment these variables before running the sample. 
+ */ + // const projectId = 'YOUR_PROJECT_ID'; + // const location = 'YOUR_PROJECT_LOCATION'; // Format is 'us' or 'eu' + // const processorId = 'YOUR_PROCESSOR_ID'; + // const gcsInputUri = 'YOUR_SOURCE_PDF'; + // const gcsOutputUri = 'YOUR_STORAGE_BUCKET'; + // const gcsOutputUriPrefix = 'YOUR_STORAGE_PREFIX'; + + // Imports the Google Cloud client library + const {DocumentProcessorServiceClient} = + require('@google-cloud/documentai').v1; + const {Storage} = require('@google-cloud/storage'); + + // Instantiates Document AI, Storage clients + const client = new DocumentProcessorServiceClient(); + const storage = new Storage(); + + const {default: PQueue} = require('p-queue'); + + async function batchProcessDocument() { + const name = `projects/${projectId}/locations/${location}/processors/${processorId}`; + + // Configure the batch process request. + const request = { + name, + inputDocuments: { + gcsDocuments: { + documents: [ + { + gcsUri: gcsInputUri, + mimeType: 'application/pdf', + }, + ], + }, + }, + documentOutputConfig: { + gcsOutputConfig: { + gcsUri: `${gcsOutputUri}/${gcsOutputUriPrefix}/`, + }, + }, + }; + + // Batch process document using a long-running operation. + // You can wait for now, or get results later. + // Note: first request to the service takes longer than subsequent + // requests. + const [operation] = await client.batchProcessDocuments(request); + + // Wait for operation to complete. + await operation.promise(); + console.log('Document processing complete.'); + + // Query Storage bucket for the results file(s). + const query = { + prefix: gcsOutputUriPrefix, + }; + + console.log('Fetching results ...'); + + // List all of the files in the Storage bucket + const [files] = await storage.bucket(gcsOutputUri).getFiles(query); + + // Add all asynchronous downloads to queue for execution. 
+ const queue = new PQueue({concurrency: 15}); + const tasks = files.map((fileInfo, index) => async () => { + // Get the file as a buffer + const [file] = await fileInfo.download(); + + console.log(`Fetched file #${index + 1}:`); + + // The results stored in the output Storage location + // are formatted as a document object. + const document = JSON.parse(file.toString()); + const {text} = document; + + // Extract shards from the text field + const getText = textAnchor => { + if (!textAnchor.textSegments || textAnchor.textSegments.length === 0) { + return ''; + } + + // First shard in document doesn't have startIndex property + const startIndex = textAnchor.textSegments[0].startIndex || 0; + const endIndex = textAnchor.textSegments[0].endIndex; + + return text.substring(startIndex, endIndex); + }; + + // Read the text recognition output from the processor + console.log('The document contains the following paragraphs:'); + + const [page1] = document.pages; + const {paragraphs} = page1; + for (const paragraph of paragraphs) { + const paragraphText = getText(paragraph.layout.textAnchor); + console.log(`Paragraph text:\n${paragraphText}`); + } + + // Form parsing provides additional output about + // form-formatted PDFs. You must create a form + // processor in the Cloud Console to see full field details. 
+ console.log('\nThe following form key/value pairs were detected:'); + + const {formFields} = page1; + for (const field of formFields) { + const fieldName = getText(field.fieldName.textAnchor); + const fieldValue = getText(field.fieldValue.textAnchor); + + console.log('Extracted key value pair:'); + console.log(`\t(${fieldName}, ${fieldValue})`); + } + }); + await queue.addAll(tasks); + } + // [END documentai_batch_process_document] + + batchProcessDocument(); +} +main(...process.argv.slice(2)); diff --git a/document-ai/package.json b/document-ai/package.json new file mode 100644 index 0000000000..5f5b7a6ae3 --- /dev/null +++ b/document-ai/package.json @@ -0,0 +1,25 @@ +{ + "name": "nodejs-document-ai-samples", + "private": true, + "license": "Apache-2.0", + "author": "Google LLC", + "engines": { + "node": ">=12.0.0" + }, + "files": [ + "*.js" + ], + "scripts": { + "test": "mocha test/*.js --timeout 600000" + }, + "dependencies": { + "@google-cloud/documentai": "^6.1.0", + "@google-cloud/storage": "^6.0.0", + "p-queue": "^6.6.2", + "uuid": "^9.0.0" + }, + "devDependencies": { + "chai": "^4.2.0", + "mocha": "^8.0.0" + } +} diff --git a/document-ai/process-document-form.js b/document-ai/process-document-form.js new file mode 100644 index 0000000000..bf0459d93e --- /dev/null +++ b/document-ai/process-document-form.js @@ -0,0 +1,132 @@ +/** + * Copyright 2021, Google, Inc. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +'use strict'; + +async function main(projectId, location, processorId, filePath) { + // [START documentai_process_form_document] + /** + * TODO(developer): Uncomment these variables before running the sample. + */ + // const projectId = 'YOUR_PROJECT_ID'; + // const location = 'YOUR_PROJECT_LOCATION'; // Format is 'us' or 'eu' + // const processorId = 'YOUR_PROCESSOR_ID'; // Create processor in Cloud Console + // const filePath = '/path/to/local/pdf'; + + const {DocumentProcessorServiceClient} = + require('@google-cloud/documentai').v1beta3; + + // Instantiates a client + const client = new DocumentProcessorServiceClient(); + + async function processDocument() { + // The full resource name of the processor, e.g.: + // projects/project-id/locations/location/processor/processor-id + // You must create new processors in the Cloud Console first + const name = `projects/${projectId}/locations/${location}/processors/${processorId}`; + + // Read the file into memory. + const fs = require('fs').promises; + const imageFile = await fs.readFile(filePath); + + // Convert the image data to a Buffer and base64 encode it. + const encodedImage = Buffer.from(imageFile).toString('base64'); + + const request = { + name, + rawDocument: { + content: encodedImage, + mimeType: 'application/pdf', + }, + }; + + // Recognizes text entities in the PDF document + const [result] = await client.processDocument(request); + + console.log('Document processing complete.'); + + // Read the table and form fields output from the processor + // The form processor also contains OCR data. For more information + // on how to parse OCR data please see the OCR sample. 
+ // For a full list of Document object attributes, + // please reference this page: https://googleapis.dev/nodejs/documentai/latest/index.html + const {document} = result; + const {text} = document; + console.log(`Full document text: ${JSON.stringify(text)}`); + console.log(`There are ${document.pages.length} page(s) in this document.`); + + for (const page of document.pages) { + console.log(`\n\n**** Page ${page.pageNumber} ****`); + + console.log(`Found ${page.tables.length} table(s):`); + for (const table of page.tables) { + const numCollumns = table.headerRows[0].cells.length; + const numRows = table.bodyRows.length; + console.log(`Table with ${numCollumns} columns and ${numRows} rows:`); + printTableInfo(table, text); + } + console.log(`Found ${page.formFields.length} form field(s):`); + for (const field of page.formFields) { + const fieldName = getText(field.fieldName.textAnchor, text); + const fieldValue = getText(field.fieldValue.textAnchor, text); + console.log( + `\t* ${JSON.stringify(fieldName)}: ${JSON.stringify(fieldValue)}` + ); + } + } + } + + const printTableInfo = (table, text) => { + // Print header row + let headerRowText = ''; + for (const headerCell of table.headerRows[0].cells) { + const headerCellText = getText(headerCell.layout.textAnchor, text); + headerRowText += `${JSON.stringify(headerCellText.trim())} | `; + } + console.log( + `Collumns: ${headerRowText.substring(0, headerRowText.length - 3)}` + ); + // Print first body row + let bodyRowText = ''; + for (const bodyCell of table.bodyRows[0].cells) { + const bodyCellText = getText(bodyCell.layout.textAnchor, text); + bodyRowText += `${JSON.stringify(bodyCellText.trim())} | `; + } + console.log( + `First row data: ${bodyRowText.substring(0, bodyRowText.length - 3)}` + ); + }; + + // Extract shards from the text field + const getText = (textAnchor, text) => { + if (!textAnchor.textSegments || textAnchor.textSegments.length === 0) { + return ''; + } + + // First shard in document doesn't 
have startIndex property + const startIndex = textAnchor.textSegments[0].startIndex || 0; + const endIndex = textAnchor.textSegments[0].endIndex; + + return text.substring(startIndex, endIndex); + }; + + // [END documentai_process_form_document] + await processDocument(); +} + +main(...process.argv.slice(2)).catch(err => { + console.error(err); + process.exitCode = 1; +}); diff --git a/document-ai/process-document-ocr.js b/document-ai/process-document-ocr.js new file mode 100644 index 0000000000..5aadeeab92 --- /dev/null +++ b/document-ai/process-document-ocr.js @@ -0,0 +1,166 @@ +/** + * Copyright 2021, Google, Inc. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +'use strict'; + +async function main(projectId, location, processorId, filePath) { + // [START documentai_process_ocr_document] + /** + * TODO(developer): Uncomment these variables before running the sample. 
+ */ + // const projectId = 'YOUR_PROJECT_ID'; + // const location = 'YOUR_PROJECT_LOCATION'; // Format is 'us' or 'eu' + // const processorId = 'YOUR_PROCESSOR_ID'; // Create processor in Cloud Console + // const filePath = '/path/to/local/pdf'; + + const {DocumentProcessorServiceClient} = + require('@google-cloud/documentai').v1beta3; + + // Instantiates a client + const client = new DocumentProcessorServiceClient(); + + async function processDocument() { + // The full resource name of the processor, e.g.: + // projects/project-id/locations/location/processor/processor-id + // You must create new processors in the Cloud Console first + const name = `projects/${projectId}/locations/${location}/processors/${processorId}`; + + // Read the file into memory. + const fs = require('fs').promises; + const imageFile = await fs.readFile(filePath); + + // Convert the image data to a Buffer and base64 encode it. + const encodedImage = Buffer.from(imageFile).toString('base64'); + + const request = { + name, + rawDocument: { + content: encodedImage, + mimeType: 'application/pdf', + }, + }; + + // Recognizes text entities in the PDF document + const [result] = await client.processDocument(request); + + console.log('Document processing complete.'); + + // Read the text recognition output from the processor + // For a full list of Document object attributes, + // please reference this page: https://googleapis.dev/nodejs/documentai/latest/index.html + const {document} = result; + const {text} = document; + + // Read the text recognition output from the processor + console.log(`Full document text: ${JSON.stringify(text)}`); + console.log(`There are ${document.pages.length} page(s) in this document.`); + for (const page of document.pages) { + console.log(`Page ${page.pageNumber}`); + printPageDimensions(page.dimension); + printDetectedLanguages(page.detectedLanguages); + printParagraphs(page.paragraphs, text); + printBlocks(page.blocks, text); + printLines(page.lines, text); + 
printTokens(page.tokens, text); + } + } + + const printPageDimensions = dimension => { + console.log(` Width: ${dimension.width}`); + console.log(` Height: ${dimension.height}`); + }; + + const printDetectedLanguages = detectedLanguages => { + console.log(' Detected languages:'); + for (const lang of detectedLanguages) { + const code = lang.languageCode; + const confPercent = lang.confidence * 100; + console.log(` ${code} (${confPercent.toFixed(2)}% confidence)`); + } + }; + + const printParagraphs = (paragraphs, text) => { + console.log(` ${paragraphs.length} paragraphs detected:`); + const firstParagraphText = getText(paragraphs[0].layout.textAnchor, text); + console.log( + ` First paragraph text: ${JSON.stringify(firstParagraphText)}` + ); + const lastParagraphText = getText( + paragraphs[paragraphs.length - 1].layout.textAnchor, + text + ); + console.log( + ` Last paragraph text: ${JSON.stringify(lastParagraphText)}` + ); + }; + + const printBlocks = (blocks, text) => { + console.log(` ${blocks.length} blocks detected:`); + const firstBlockText = getText(blocks[0].layout.textAnchor, text); + console.log(` First block text: ${JSON.stringify(firstBlockText)}`); + const lastBlockText = getText( + blocks[blocks.length - 1].layout.textAnchor, + text + ); + console.log(` Last block text: ${JSON.stringify(lastBlockText)}`); + }; + + const printLines = (lines, text) => { + console.log(` ${lines.length} lines detected:`); + const firstLineText = getText(lines[0].layout.textAnchor, text); + console.log(` First line text: ${JSON.stringify(firstLineText)}`); + const lastLineText = getText( + lines[lines.length - 1].layout.textAnchor, + text + ); + console.log(` Last line text: ${JSON.stringify(lastLineText)}`); + }; + + const printTokens = (tokens, text) => { + console.log(` ${tokens.length} tokens detected:`); + const firstTokenText = getText(tokens[0].layout.textAnchor, text); + console.log(` First token text: ${JSON.stringify(firstTokenText)}`); + const 
firstTokenBreakType = tokens[0].detectedBreak.type; + console.log(` First token break type: ${firstTokenBreakType}`); + const lastTokenText = getText( + tokens[tokens.length - 1].layout.textAnchor, + text + ); + console.log(` Last token text: ${JSON.stringify(lastTokenText)}`); + const lastTokenBreakType = tokens[tokens.length - 1].detectedBreak.type; + console.log(` Last token break type: ${lastTokenBreakType}`); + }; + + // Extract shards from the text field + const getText = (textAnchor, text) => { + if (!textAnchor.textSegments || textAnchor.textSegments.length === 0) { + return ''; + } + + // First shard in document doesn't have startIndex property + const startIndex = textAnchor.textSegments[0].startIndex || 0; + const endIndex = textAnchor.textSegments[0].endIndex; + + return text.substring(startIndex, endIndex); + }; + + // [END documentai_process_ocr_document] + await processDocument(); +} + +main(...process.argv.slice(2)).catch(err => { + console.error(err); + process.exitCode = 1; +}); diff --git a/document-ai/process-document-quality.js b/document-ai/process-document-quality.js new file mode 100644 index 0000000000..80d3a291fa --- /dev/null +++ b/document-ai/process-document-quality.js @@ -0,0 +1,87 @@ +/** + * Copyright 2021, Google, Inc. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +'use strict'; + +async function main(projectId, location, processorId, filePath) { + // [START documentai_process_quality_document] + /** + * TODO(developer): Uncomment these variables before running the sample. + */ + // const projectId = 'YOUR_PROJECT_ID'; + // const location = 'YOUR_PROJECT_LOCATION'; // Format is 'us' or 'eu' + // const processorId = 'YOUR_PROCESSOR_ID'; // Create processor in Cloud Console + // const filePath = '/path/to/local/pdf'; + + const {DocumentProcessorServiceClient} = + require('@google-cloud/documentai').v1beta3; + + // Instantiates a client + const client = new DocumentProcessorServiceClient(); + + async function processDocument() { + // The full resource name of the processor, e.g.: + // projects/project-id/locations/location/processor/processor-id + // You must create new processors in the Cloud Console first + const name = `projects/${projectId}/locations/${location}/processors/${processorId}`; + + // Read the file into memory. + const fs = require('fs').promises; + const imageFile = await fs.readFile(filePath); + + // Convert the image data to a Buffer and base64 encode it. + const encodedImage = Buffer.from(imageFile).toString('base64'); + + const request = { + name, + rawDocument: { + content: encodedImage, + mimeType: 'application/pdf', + }, + }; + + // Recognizes text entities in the PDF document + const [result] = await client.processDocument(request); + + console.log('Document processing complete.'); + + // Read the quality-specific information from the output from the + // Intelligent Document Quality Processor: + // https://cloud.google.com/document-ai/docs/processors-list#processor_doc-quality-processor + // OCR and other data is also present in the quality processor's response. + // Please see the OCR and other samples for how to parse other data in the + // response. 
+ const {document} = result; + for (const entity of document.entities) { + const entityConf = entity.confidence * 100; + const pageNum = parseInt(entity.pageAnchor.pageRefs.page) + 1 || 1; + console.log( + `Page ${pageNum} has a quality score of ${entityConf.toFixed(2)}%:` + ); + for (const prop of entity.properties) { + const propConf = prop.confidence * 100; + console.log(`\t* ${prop.type} score of ${propConf.toFixed(2)}%`); + } + } + } + + // [END documentai_process_quality_document] + await processDocument(); +} + +main(...process.argv.slice(2)).catch(err => { + console.error(err); + process.exitCode = 1; +}); diff --git a/document-ai/process-document-specialized.js b/document-ai/process-document-specialized.js new file mode 100644 index 0000000000..ceed144893 --- /dev/null +++ b/document-ai/process-document-specialized.js @@ -0,0 +1,95 @@ +/** + * Copyright 2021, Google, Inc. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +'use strict'; + +async function main(projectId, location, processorId, filePath) { + // [START documentai_process_specialized_document] + /** + * TODO(developer): Uncomment these variables before running the sample. 
+ */ + // const projectId = 'YOUR_PROJECT_ID'; + // const location = 'YOUR_PROJECT_LOCATION'; // Format is 'us' or 'eu' + // const processorId = 'YOUR_PROCESSOR_ID'; // Create processor in Cloud Console + // const filePath = '/path/to/local/pdf'; + + const {DocumentProcessorServiceClient} = + require('@google-cloud/documentai').v1beta3; + + // Instantiates a client + const client = new DocumentProcessorServiceClient(); + + async function processDocument() { + // The full resource name of the processor, e.g.: + // projects/project-id/locations/location/processor/processor-id + // You must create new processors in the Cloud Console first + const name = `projects/${projectId}/locations/${location}/processors/${processorId}`; + + // Read the file into memory. + const fs = require('fs').promises; + const imageFile = await fs.readFile(filePath); + + // Convert the image data to a Buffer and base64 encode it. + const encodedImage = Buffer.from(imageFile).toString('base64'); + + const request = { + name, + rawDocument: { + content: encodedImage, + mimeType: 'application/pdf', + }, + }; + + // Recognizes text entities in the PDF document + const [result] = await client.processDocument(request); + + console.log('Document processing complete.'); + + // Read fields specificly from the specalized US drivers license processor: + // https://cloud.google.com/document-ai/docs/processors-list#processor_us-driver-license-parser + // retriving data from other specalized processors follow a similar pattern. + // For a complete list of processors see: + // https://cloud.google.com/document-ai/docs/processors-list + // + // OCR and other data is also present in the quality processor's response. + // Please see the OCR and other samples for how to parse other data in the + // response. + const {document} = result; + for (const entity of document.entities) { + // Fields detected. 
For a full list of fields for each processor see + // the processor documentation: + // https://cloud.google.com/document-ai/docs/processors-list + const key = entity.type; + // some other value formats in addition to text are availible + // e.g. dates: `entity.normalizedValue.dateValue.year` + const textValue = + entity.textAnchor !== null ? entity.textAnchor.content : ''; + const conf = entity.confidence * 100; + console.log( + `* ${JSON.stringify(key)}: ${JSON.stringify(textValue)}(${conf.toFixed( + 2 + )}% confident)` + ); + } + } + + // [END documentai_process_specialized_document] + await processDocument(); +} + +main(...process.argv.slice(2)).catch(err => { + console.error(err); + process.exitCode = 1; +}); diff --git a/document-ai/process-document-splitter.js b/document-ai/process-document-splitter.js new file mode 100644 index 0000000000..4d23c9be06 --- /dev/null +++ b/document-ai/process-document-splitter.js @@ -0,0 +1,107 @@ +/** + * Copyright 2021, Google, Inc. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +'use strict'; + +async function main(projectId, location, processorId, filePath) { + // [START documentai_process_splitter_document] + /** + * TODO(developer): Uncomment these variables before running the sample. 
+ */ + // const projectId = 'YOUR_PROJECT_ID'; + // const location = 'YOUR_PROJECT_LOCATION'; // Format is 'us' or 'eu' + // const processorId = 'YOUR_PROCESSOR_ID'; // Create processor in Cloud Console + // const filePath = '/path/to/local/pdf'; + + const {DocumentProcessorServiceClient} = + require('@google-cloud/documentai').v1beta3; + + // Instantiates a client + const client = new DocumentProcessorServiceClient(); + + async function processDocument() { + // The full resource name of the processor, e.g.: + // projects/project-id/locations/location/processor/processor-id + // You must create new processors in the Cloud Console first + const name = `projects/${projectId}/locations/${location}/processors/${processorId}`; + + // Read the file into memory. + const fs = require('fs').promises; + const imageFile = await fs.readFile(filePath); + + // Convert the image data to a Buffer and base64 encode it. + const encodedImage = Buffer.from(imageFile).toString('base64'); + + const request = { + name, + rawDocument: { + content: encodedImage, + mimeType: 'application/pdf', + }, + }; + + // Recognizes text entities in the PDF document + const [result] = await client.processDocument(request); + + console.log('Document processing complete.'); + + // Read fields specificly from the specalized US drivers license processor: + // https://cloud.google.com/document-ai/docs/processors-list#processor_us-driver-license-parser + // retriving data from other specalized processors follow a similar pattern. + // For a complete list of processors see: + // https://cloud.google.com/document-ai/docs/processors-list + // + // OCR and other data is also present in the quality processor's response. + // Please see the OCR and other samples for how to parse other data in the + // response. 
+ const {document} = result; + console.log(`Found ${document.entities.length} subdocuments:`); + for (const entity of document.entities) { + const conf = entity.confidence * 100; + const pagesRange = pageRefsToRange(entity.pageAnchor.pageRefs); + if (entity.type !== '') { + console.log( + `${conf.toFixed(2)}% confident that ${pagesRange} a "${ + entity.type + }" subdocument.` + ); + } else { + console.log( + `${conf.toFixed(2)}% confident that ${pagesRange} a subdocument.` + ); + } + } + } + + // Converts a page ref to a string describing the page or page range. + const pageRefsToRange = pageRefs => { + if (pageRefs.length === 1) { + const num = parseInt(pageRefs[0].page) + 1 || 1; + return `page ${num} is`; + } else { + const start = parseInt(pageRefs[0].page) + 1 || 1; + const end = parseInt(pageRefs[1].page) + 1; + return `pages ${start} to ${end} are`; + } + }; + + // [END documentai_process_splitter_document] + await processDocument(); +} + +main(...process.argv.slice(2)).catch(err => { + console.error(err); + process.exitCode = 1; +}); diff --git a/document-ai/process-document.js b/document-ai/process-document.js new file mode 100644 index 0000000000..f099b26a20 --- /dev/null +++ b/document-ai/process-document.js @@ -0,0 +1,106 @@ +/** + * Copyright 2020, Google, Inc. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +'use strict'; + +async function main(projectId, location, processorId, filePath) { + // [START documentai_process_document] + /** + * TODO(developer): Uncomment these variables before running the sample. + */ + // const projectId = 'YOUR_PROJECT_ID'; + // const location = 'YOUR_PROJECT_LOCATION'; // Format is 'us' or 'eu' + // const processorId = 'YOUR_PROCESSOR_ID'; // Create processor in Cloud Console + // const filePath = '/path/to/local/pdf'; + + const {DocumentProcessorServiceClient} = + require('@google-cloud/documentai').v1; + + // Instantiates a client + const client = new DocumentProcessorServiceClient(); + + async function processDocument() { + // The full resource name of the processor, e.g.: + // projects/project-id/locations/location/processor/processor-id + // You must create new processors in the Cloud Console first + const name = `projects/${projectId}/locations/${location}/processors/${processorId}`; + + // Read the file into memory. + const fs = require('fs').promises; + const imageFile = await fs.readFile(filePath); + + // Convert the image data to a Buffer and base64 encode it. 
+ const encodedImage = Buffer.from(imageFile).toString('base64'); + + const request = { + name, + rawDocument: { + content: encodedImage, + mimeType: 'application/pdf', + }, + }; + + // Recognizes text entities in the PDF document + const [result] = await client.processDocument(request); + const {document} = result; + + // Get all of the document text as one big string + const {text} = document; + + // Extract shards from the text field + const getText = textAnchor => { + if (!textAnchor.textSegments || textAnchor.textSegments.length === 0) { + return ''; + } + + // First shard in document doesn't have startIndex property + const startIndex = textAnchor.textSegments[0].startIndex || 0; + const endIndex = textAnchor.textSegments[0].endIndex; + + return text.substring(startIndex, endIndex); + }; + + // Read the text recognition output from the processor + console.log('The document contains the following paragraphs:'); + const [page1] = document.pages; + const {paragraphs} = page1; + + for (const paragraph of paragraphs) { + const paragraphText = getText(paragraph.layout.textAnchor); + console.log(`Paragraph text:\n${paragraphText}`); + } + + // Form parsing provides additional output about + // form-formatted PDFs. You must create a form + // processor in the Cloud Console to see full field details. 
/**
 * Copyright 2020, Google, Inc.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

'use strict';

/**
 * Sends a local PDF to a Document AI processor and prints the text of every
 * paragraph the processor returned.
 *
 * @param {string} projectId Google Cloud project ID.
 * @param {string} location Processor location, e.g. 'us' or 'eu'.
 * @param {string} processorId ID of a processor created in the Cloud Console.
 * @param {string} filePath Path to the local PDF to process.
 * @returns {Promise<void>} Resolves once all paragraphs have been printed.
 */
async function main(projectId, location, processorId, filePath) {
  // [START documentai_quickstart]
  /**
   * TODO(developer): Uncomment these variables before running the sample.
   */
  // const projectId = 'YOUR_PROJECT_ID';
  // const location = 'YOUR_PROJECT_LOCATION'; // Format is 'us' or 'eu'
  // const processorId = 'YOUR_PROCESSOR_ID'; // Create processor in Cloud Console
  // const filePath = '/path/to/local/pdf';

  const {DocumentProcessorServiceClient} =
    require('@google-cloud/documentai').v1;

  // Instantiates a client
  // apiEndpoint regions available: eu-documentai.googleapis.com, us-documentai.googleapis.com (Required if using eu based processor)
  // const client = new DocumentProcessorServiceClient({apiEndpoint: 'eu-documentai.googleapis.com'});
  const client = new DocumentProcessorServiceClient();

  async function quickstart() {
    // The full resource name of the processor, e.g.:
    // projects/project-id/locations/location/processors/processor-id
    // You must create new processors in the Cloud Console first
    const name = `projects/${projectId}/locations/${location}/processors/${processorId}`;

    // Read the file into memory.
    const fs = require('fs').promises;
    const imageFile = await fs.readFile(filePath);

    // readFile() already resolves to a Buffer, so base64 encode it directly.
    const encodedImage = imageFile.toString('base64');

    const request = {
      name,
      rawDocument: {
        content: encodedImage,
        mimeType: 'application/pdf',
      },
    };

    // Recognizes text entities in the PDF document
    const [result] = await client.processDocument(request);
    const {document} = result;

    // Get all of the document text as one big string
    const {text} = document;

    // Extract the text referenced by an anchor. An anchor may point at
    // several segments of the document text, so every segment is
    // concatenated rather than only the first one.
    const getText = textAnchor => {
      if (
        !textAnchor ||
        !textAnchor.textSegments ||
        textAnchor.textSegments.length === 0
      ) {
        return '';
      }

      return textAnchor.textSegments
        .map(segment => {
          // First shard in document doesn't have startIndex property
          const startIndex = segment.startIndex || 0;
          const endIndex = segment.endIndex;
          return text.substring(startIndex, endIndex);
        })
        .join('');
    };

    // Read the text recognition output from the processor.
    // Walk every page so multi-page documents are fully reported and an
    // empty result does not throw.
    console.log('The document contains the following paragraphs:');
    for (const page of document.pages || []) {
      for (const paragraph of page.paragraphs || []) {
        const paragraphText = getText(paragraph.layout.textAnchor);
        console.log(`Paragraph text:\n${paragraphText}`);
      }
    }
  }
  // [END documentai_quickstart]
  await quickstart();
}

// Report failures through the exit code instead of an unhandled rejection.
main(...process.argv.slice(2)).catch(err => {
  console.error(err);
  process.exitCode = 1;
});
--git a/document-ai/resources/multi_document.pdf b/document-ai/resources/multi_document.pdf new file mode 100644 index 0000000000..7ea62eb8f7 Binary files /dev/null and b/document-ai/resources/multi_document.pdf differ diff --git a/document-ai/resources/us_driver_license.pdf b/document-ai/resources/us_driver_license.pdf new file mode 100644 index 0000000000..f8f62d902e Binary files /dev/null and b/document-ai/resources/us_driver_license.pdf differ diff --git a/document-ai/test/.eslintrc.yml b/document-ai/test/.eslintrc.yml new file mode 100644 index 0000000000..29af919cd8 --- /dev/null +++ b/document-ai/test/.eslintrc.yml @@ -0,0 +1,5 @@ +--- +env: + mocha: true +rules: + node/no-extraneous-require: off diff --git a/document-ai/test/batch-process-document.test.js b/document-ai/test/batch-process-document.test.js new file mode 100644 index 0000000000..4e58fdb776 --- /dev/null +++ b/document-ai/test/batch-process-document.test.js @@ -0,0 +1,61 @@ +// Copyright 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
'use strict';

const path = require('path');
const {Storage} = require('@google-cloud/storage');
const {DocumentProcessorServiceClient} =
  require('@google-cloud/documentai').v1beta3;
const client = new DocumentProcessorServiceClient({
  apiEndpoint: 'us-documentai.googleapis.com',
});

const cp = require('child_process');
const assert = require('assert');
const {describe, it, before, after} = require('mocha');
const uuid = require('uuid');

// Run the sample from the directory that contains it so the test does not
// depend on where mocha was started.
const cwd = path.join(__dirname, '..');
const execSync = cmd => cp.execSync(cmd, {encoding: 'utf-8', cwd});

const storage = new Storage();
// A unique bucket per run keeps concurrent test runs from colliding.
const bucketName = `nodejs-docs-samples-test-${uuid.v4()}`;
const cmd = 'node batch-process-document.js';

const testProcessDocument = {
  projectId: '',
  location: 'us',
  processorId: '8f1123c1b125e0b7',
  gcsInputUri: 'gs://cloud-samples-data/documentai/invoice.pdf',
  gcsOutputUriPrefix: uuid.v4(),
};

// System test: runs the batch sample against a temporary output bucket and
// checks that form key/value pairs are printed.
describe('Document AI batch parse form', () => {
  before(async () => {
    testProcessDocument.projectId = await client.getProjectId();
    await storage.createBucket(bucketName);
  });

  after(async () => {
    // Remove the results first; the bucket is deleted only after its files.
    const bucket = storage.bucket(bucketName);
    await bucket.deleteFiles({force: true});
    await bucket.delete();
  });

  it('should parse the GCS invoice example as a form', () => {
    const output = execSync(
      `${cmd} ${testProcessDocument.projectId} ${testProcessDocument.location} ${testProcessDocument.processorId} ${testProcessDocument.gcsInputUri} gs://${bucketName} ${testProcessDocument.gcsOutputUriPrefix}`
    );
    assert.ok(
      output.includes('Extracted'),
      'output should include "Extracted"'
    );
  });
});
'use strict';

const path = require('path');
const assert = require('assert');
const cp = require('child_process');

const {DocumentProcessorServiceClient} =
  require('@google-cloud/documentai').v1beta3;
const client = new DocumentProcessorServiceClient({
  apiEndpoint: 'us-documentai.googleapis.com',
});

// Forward caller options (such as cwd) to child_process; the previous
// one-argument wrapper silently dropped them.
const execSync = (cmd, options = {}) =>
  cp.execSync(cmd, {encoding: 'utf-8', ...options});

const cwd = path.join(__dirname, '..');
const LOCATION = 'us';
const PROCESSOR_ID = '8f1123c1b125e0b7';

const fileName = 'invoice.pdf';
const filePath = path.resolve(path.join(__dirname, `../resources/${fileName}`));

// System test: runs the form-parsing sample on the bundled invoice and
// checks the reported pages, table, form fields and a known amount.
describe('Process form document', () => {
  let projectId;
  before(async () => {
    projectId = await client.getProjectId();
  });
  it('should run document (process form) (v1)', () => {
    // The path is quoted so a checkout directory containing spaces works.
    const stdout = execSync(
      `node ./process-document-form.js ${projectId} ${LOCATION} ${PROCESSOR_ID} "${filePath}"`,
      {
        cwd,
      }
    );
    const expectedFragments = [
      'There are 1 page(s) in this document.',
      'Table with 4 columns and 6 rows',
      'Found 13 form field(s)',
      '$2140.00',
    ];
    for (const expected of expectedFragments) {
      assert.ok(
        stdout.includes(expected),
        `output should include "${expected}"`
      );
    }
  });
});
'use strict';

const path = require('path');
const assert = require('assert');
const cp = require('child_process');

const {DocumentProcessorServiceClient} =
  require('@google-cloud/documentai').v1beta3;
const client = new DocumentProcessorServiceClient({
  apiEndpoint: 'us-documentai.googleapis.com',
});

// Forward caller options (such as cwd) to child_process; the previous
// one-argument wrapper silently dropped them.
const execSync = (cmd, options = {}) =>
  cp.execSync(cmd, {encoding: 'utf-8', ...options});

const cwd = path.join(__dirname, '..');
const LOCATION = 'us';
const PROCESSOR_ID = 'd9f262d374c21325';

const fileName = 'handwritten_form.pdf';
const filePath = path.resolve(path.join(__dirname, `../resources/${fileName}`));

// System test: runs the OCR sample on the bundled handwritten form and
// checks for page, language and text output.
describe('Process OCR document', () => {
  let projectId;
  before(async () => {
    projectId = await client.getProjectId();
  });
  it('should run document (process ocr) (v1)', () => {
    // The path is quoted so a checkout directory containing spaces works.
    const stdout = execSync(
      `node ./process-document-ocr.js ${projectId} ${LOCATION} ${PROCESSOR_ID} "${filePath}"`,
      {
        cwd,
      }
    );
    // NOTE(review): 'en' is a two-letter substring that matches almost any
    // output; consider asserting on the full language line the sample prints.
    const expectedFragments = ['Page 1', 'en', 'FakeDoc'];
    for (const expected of expectedFragments) {
      assert.ok(
        stdout.includes(expected),
        `output should include "${expected}"`
      );
    }
  });
});
'use strict';

const path = require('path');
const assert = require('assert');
const cp = require('child_process');

const {DocumentProcessorServiceClient} =
  require('@google-cloud/documentai').v1beta3;
const client = new DocumentProcessorServiceClient({
  apiEndpoint: 'us-documentai.googleapis.com',
});

// Forward caller options (such as cwd) to child_process; the previous
// one-argument wrapper silently dropped them.
const execSync = (cmd, options = {}) =>
  cp.execSync(cmd, {encoding: 'utf-8', ...options});

const cwd = path.join(__dirname, '..');
const LOCATION = 'us';
const PROCESSOR_ID = 'face29cdcce978c0';

const fileName = 'document_quality_poor.pdf';
const filePath = path.resolve(path.join(__dirname, `../resources/${fileName}`));

// System test: runs the document-quality sample on a deliberately poor scan
// and checks that quality and defect scores are printed.
describe('Process quality document', () => {
  let projectId;
  before(async () => {
    projectId = await client.getProjectId();
  });
  it('should run document (process quality) (v1beta3)', () => {
    // The path is quoted so a checkout directory containing spaces works.
    const stdout = execSync(
      `node ./process-document-quality.js ${projectId} ${LOCATION} ${PROCESSOR_ID} "${filePath}"`,
      {
        cwd,
      }
    );
    const expectedFragments = [
      'Page 1 has a quality score of',
      'defect_blurry score of 9',
      'defect_noisy',
    ];
    for (const expected of expectedFragments) {
      assert.ok(
        stdout.includes(expected),
        `output should include "${expected}"`
      );
    }
  });
});
/**
 * Copyright 2021, Google, Inc.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

'use strict';

const path = require('path');
const assert = require('assert');
const cp = require('child_process');

const {DocumentProcessorServiceClient} =
  require('@google-cloud/documentai').v1beta3;
const client = new DocumentProcessorServiceClient({
  apiEndpoint: 'us-documentai.googleapis.com',
});

// Forward caller options (such as cwd) to child_process; the previous
// one-argument wrapper silently dropped them.
const execSync = (cmd, options = {}) =>
  cp.execSync(cmd, {encoding: 'utf-8', ...options});

const cwd = path.join(__dirname, '..');
const LOCATION = 'us';
const PROCESSOR_ID = 'bb340a5e47a6c1e';

const fileName = 'us_driver_license.pdf';
const filePath = path.resolve(path.join(__dirname, `../resources/${fileName}`));

// System test: runs the specialized-processor sample on the bundled driver
// license and checks for the extracted document id.
describe('Process specialized document', () => {
  let projectId;
  before(async () => {
    projectId = await client.getProjectId();
  });
  it('should run document (specialized) (v1beta3)', () => {
    // The path is quoted so a checkout directory containing spaces works.
    const stdout = execSync(
      `node ./process-document-specialized.js ${projectId} ${LOCATION} ${PROCESSOR_ID} "${filePath}"`,
      {
        cwd,
      }
    );
    const expectedFragments = ['Document Id', '97551579'];
    for (const expected of expectedFragments) {
      assert.ok(
        stdout.includes(expected),
        `output should include "${expected}"`
      );
    }
  });
});
/**
 * Copyright 2021, Google, Inc.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

'use strict';

const path = require('path');
const assert = require('assert');
const cp = require('child_process');

const {DocumentProcessorServiceClient} =
  require('@google-cloud/documentai').v1beta3;
const client = new DocumentProcessorServiceClient({
  apiEndpoint: 'us-documentai.googleapis.com',
});

// Forward caller options (such as cwd) to child_process; the previous
// one-argument wrapper silently dropped them.
const execSync = (cmd, options = {}) =>
  cp.execSync(cmd, {encoding: 'utf-8', ...options});

const cwd = path.join(__dirname, '..');
const LOCATION = 'us';
const PROCESSOR_ID = '8f447646e4ec6fa2';

const fileName = 'multi_document.pdf';
const filePath = path.resolve(path.join(__dirname, `../resources/${fileName}`));

// System test: runs the splitter sample on the bundled multi-document PDF
// and checks the reported subdocument boundaries.
describe('Process splitter document', () => {
  let projectId;
  before(async () => {
    projectId = await client.getProjectId();
  });
  it('should run document (splitter) (v1beta3)', () => {
    // The path is quoted so a checkout directory containing spaces works.
    const stdout = execSync(
      `node ./process-document-splitter.js ${projectId} ${LOCATION} ${PROCESSOR_ID} "${filePath}"`,
      {
        cwd,
      }
    );
    const expectedFragments = [
      'Found 8 subdocuments',
      'confident that pages 1 to 2 are a subdocument',
      'confident that page 10 is a subdocument',
    ];
    for (const expected of expectedFragments) {
      assert.ok(
        stdout.includes(expected),
        `output should include "${expected}"`
      );
    }
  });
});
/**
 * Copyright 2019, Google, Inc.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

'use strict';

const path = require('path');
const assert = require('assert');
const cp = require('child_process');

const {DocumentProcessorServiceClient} =
  require('@google-cloud/documentai').v1beta3;
const client = new DocumentProcessorServiceClient({
  apiEndpoint: 'us-documentai.googleapis.com',
});

// Forward caller options (such as cwd) to child_process; the previous
// one-argument wrapper silently dropped them.
const execSync = (cmd, options = {}) =>
  cp.execSync(cmd, {encoding: 'utf-8', ...options});

const cwd = path.join(__dirname, '..');
const LOCATION = 'us';
const PROCESSOR_ID = '8f1123c1b125e0b7';

const fileName = 'invoice.pdf';
const filePath = path.resolve(path.join(__dirname, `../resources/${fileName}`));

// System test: runs the process-document sample on the bundled invoice and
// checks that paragraphs and key/value pairs are printed.
describe('Process document', () => {
  let projectId;
  before(async () => {
    projectId = await client.getProjectId();
  });
  it('should run document (process invoice) (v1)', () => {
    // The path is quoted so a checkout directory containing spaces works.
    const stdout = execSync(
      `node ./process-document.js ${projectId} ${LOCATION} ${PROCESSOR_ID} "${filePath}"`,
      {
        cwd,
      }
    );
    const expectedFragments = ['Paragraph', 'Extracted'];
    for (const expected of expectedFragments) {
      assert.ok(
        stdout.includes(expected),
        `output should include "${expected}"`
      );
    }
  });
});
/**
 * Copyright 2019, Google, Inc.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

'use strict';

const path = require('path');
const assert = require('assert');
const cp = require('child_process');

const {DocumentProcessorServiceClient} =
  require('@google-cloud/documentai').v1beta3;
const client = new DocumentProcessorServiceClient({
  apiEndpoint: 'us-documentai.googleapis.com',
});

// Forward caller options (such as cwd) to child_process; the previous
// one-argument wrapper silently dropped them.
const execSync = (cmd, options = {}) =>
  cp.execSync(cmd, {encoding: 'utf-8', ...options});

const cwd = path.join(__dirname, '..');
const LOCATION = 'us';
const PROCESSOR_ID = '8f1123c1b125e0b7';

const fileName = 'invoice.pdf';
const filePath = path.resolve(path.join(__dirname, `../resources/${fileName}`));

// System test: runs the quickstart sample on the bundled invoice and checks
// that paragraph text is printed.
describe('Quickstart', () => {
  let projectId;
  before(async () => {
    projectId = await client.getProjectId();
  });

  it('should run quickstart', () => {
    // The path is quoted so a checkout directory containing spaces works.
    const stdout = execSync(
      `node ./quickstart.js ${projectId} ${LOCATION} ${PROCESSOR_ID} "${filePath}"`,
      {
        cwd,
      }
    );
    assert.ok(
      stdout.includes('Paragraph'),
      'output should include "Paragraph"'
    );
  });
});