From 7d64ced540decbd1c02a20d7bc23d4d34b2dde20 Mon Sep 17 00:00:00 2001
From: Eric Schmidt
Date: Fri, 14 Feb 2020 12:09:20 -0800
Subject: [PATCH] feat(samples): adds Document AI parse table, forms samples (#8)

Adds parseForm.js and parseTable.js together with their system tests.
The samples read their results back from Cloud Storage and generate a
default output prefix with uuid, so both @google-cloud/storage and uuid
are declared as dependencies.

---
 document-ai/package.json            |   6 +-
 document-ai/parseForm.js            | 135 +++++++++++++++++++++++++++
 document-ai/parseTable.js           | 140 ++++++++++++++++++++++++++++
 document-ai/test/parseForm.test.js  |  51 ++++++++++
 document-ai/test/parseTable.test.js |  51 ++++++++++
 5 files changed, 381 insertions(+), 2 deletions(-)
 create mode 100644 document-ai/parseForm.js
 create mode 100644 document-ai/parseTable.js
 create mode 100644 document-ai/test/parseForm.test.js
 create mode 100644 document-ai/test/parseTable.test.js

diff --git a/document-ai/package.json b/document-ai/package.json
index 157f22a9f6..7435dab506 100644
--- a/document-ai/package.json
+++ b/document-ai/package.json
@@ -10,13 +10,15 @@
     "*.js"
   ],
   "scripts": {
-    "test": "mocha test/*.js"
+    "test": "mocha test/*.js --timeout 600000"
   },
   "dependencies": {
-    "@google-cloud/documentai": "^0.0.1"
+    "@google-cloud/documentai": "^0.0.1",
+    "@google-cloud/storage": "^4.2.0",
+    "uuid": "^3.4.0"
   },
   "devDependencies": {
     "chai": "^4.2.0",
     "mocha": "^6.2.0"
   }
 }
diff --git a/document-ai/parseForm.js b/document-ai/parseForm.js
new file mode 100644
index 0000000000..81591a76c6
--- /dev/null
+++ b/document-ai/parseForm.js
@@ -0,0 +1,135 @@
+/**
+ * Copyright 2020 Google LLC
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+'use strict';
+
+const uuid = require('uuid');
+
+/**
+ * Extracts form key/value pairs from a PDF in Cloud Storage and prints them.
+ * `gcsOutputUri` is the output bucket as a `gs://` URI ending in a slash
+ * (e.g. `gs://my-bucket/`): `gcsOutputUriPrefix` is appended to it verbatim
+ * to form the destination, and is then used to list the result files.
+ */
+async function main(
+  projectId = 'YOUR_PROJECT_ID',
+  gcsOutputUri = 'output-bucket',
+  gcsOutputUriPrefix = uuid.v4(),
+  gcsInputUri = 'gs://cloud-samples-data/documentai/invoice.pdf'
+) {
+  // [START document_parse_form]
+  // TODO(developer): Uncomment these variables before running the sample.
+  // const projectId = 'YOUR_PROJECT_ID';
+  // const gcsOutputUri = 'gs://YOUR_STORAGE_BUCKET/';
+  // const gcsOutputUriPrefix = 'YOUR_STORAGE_PREFIX';
+
+  // Imports the Google Cloud client libraries
+  const {
+    DocumentUnderstandingServiceClient,
+  } = require('@google-cloud/documentai');
+  const {Storage} = require('@google-cloud/storage');
+
+  const client = new DocumentUnderstandingServiceClient();
+  const storage = new Storage();
+
+  async function parseFormGCS(inputUri, outputUri, outputUriPrefix) {
+    // Configure the batch process request.
+    const request = {
+      inputConfig: {
+        gcsSource: {uri: inputUri},
+        mimeType: 'application/pdf',
+      },
+      outputConfig: {
+        gcsDestination: {uri: `${outputUri}${outputUriPrefix}`},
+        pagesPerShard: 1,
+      },
+      formExtractionParams: {
+        enabled: true,
+        keyValuePairHints: [
+          {key: 'Phone', valueTypes: ['PHONE_NUMBER']},
+          {key: 'Contact', valueTypes: ['EMAIL', 'NAME']},
+        ],
+      },
+    };
+
+    // Configure the request for batch process
+    const requests = {parent: `projects/${projectId}`, requests: [request]};
+
+    // Batch process document using a long-running operation.
+    // You can wait for now, or get results later.
+    const [operation] = await client.batchProcessDocuments(requests);
+
+    // Wait for operation to complete.
+    await operation.promise();
+
+    console.log('Document processing complete.');
+
+    // Query Storage bucket for the results file(s).
+    const query = {prefix: outputUriPrefix};
+
+    console.log('Fetching results ...');
+
+    // List all of the files in the Storage bucket
+    const [files] = await storage.bucket(outputUri).getFiles(query);
+
+    // Use for...of, not forEach(async ...): forEach never awaits its callback,
+    // so download/parse failures would be lost instead of failing the sample.
+    for (const [index, fileInfo] of files.entries()) {
+      // Get the file as a buffer
+      const [file] = await fileInfo.download();
+
+      console.log(`Fetched file #${index + 1}:`);
+
+      // Read the results
+      const results = JSON.parse(file.toString());
+
+      // Get all of the document text as one big string.
+      const text = results.text || '';
+
+      // Utility to extract the text a layout's anchor points at. A layout
+      // without text segments yields an empty value instead of throwing.
+      const getText = layout => {
+        const textAnchor = (layout && layout.textAnchor) || {};
+        const [segment] = textAnchor.textSegments || [];
+        if (!segment) {
+          return '\t';
+        }
+        // The first segment in a document has no startIndex property.
+        const startIndex = segment.startIndex || 0;
+
+        return `\t${text.substring(startIndex, segment.endIndex)}`;
+      };
+
+      // Process the output; tolerate a result file without pages or fields.
+      const [page1 = {}] = results.pages || [];
+      const formFields = page1.formFields || [];
+
+      formFields.forEach(field => {
+        const fieldName = getText(field.fieldName);
+        const fieldValue = getText(field.fieldValue);
+
+        console.log('Extracted key value pair:');
+        console.log(`\t(${fieldName}, ${fieldValue})`);
+      });
+    }
+  }
+  // [END document_parse_form]
+
+  await parseFormGCS(gcsInputUri, gcsOutputUri, gcsOutputUriPrefix);
+}
+main(...process.argv.slice(2)).catch(err => {
+  console.error(err);
+  process.exitCode = 1;
+});
diff --git a/document-ai/parseTable.js b/document-ai/parseTable.js
new file mode 100644
index 0000000000..53d4839e84
--- /dev/null
+++ b/document-ai/parseTable.js
@@ -0,0 +1,140 @@
+/**
+ * Copyright 2020 Google LLC
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+'use strict';
+
+const uuid = require('uuid');
+
+/**
+ * Extracts tables from a PDF in Cloud Storage and prints the first header row.
+ * `gcsOutputUri` is the output bucket as a `gs://` URI ending in a slash
+ * (e.g. `gs://my-bucket/`): `gcsOutputUriPrefix` is appended to it verbatim
+ * to form the destination, and is then used to list the result files.
+ */
+async function main(
+  projectId = 'YOUR_PROJECT_ID',
+  gcsOutputUri = 'output-bucket',
+  gcsOutputUriPrefix = uuid.v4(),
+  gcsInputUri = 'gs://cloud-samples-data/documentai/invoice.pdf'
+) {
+  // [START document_parse_table]
+  // TODO(developer): Uncomment these variables before running the sample.
+  // const projectId = 'YOUR_PROJECT_ID';
+  // const gcsOutputUri = 'gs://YOUR_STORAGE_BUCKET/';
+  // const gcsOutputUriPrefix = 'YOUR_STORAGE_PREFIX';
+
+  // Imports the Google Cloud client libraries
+  const {
+    DocumentUnderstandingServiceClient,
+  } = require('@google-cloud/documentai');
+  const {Storage} = require('@google-cloud/storage');
+
+  const client = new DocumentUnderstandingServiceClient();
+  const storage = new Storage();
+
+  async function parseTableGCS(inputUri, outputUri, outputUriPrefix) {
+    // Configure the batch process request.
+    const request = {
+      inputConfig: {
+        gcsSource: {uri: inputUri},
+        mimeType: 'application/pdf',
+      },
+      outputConfig: {
+        gcsDestination: {uri: `${outputUri}${outputUriPrefix}`},
+        pagesPerShard: 1,
+      },
+      tableExtractionParams: {
+        enabled: true,
+        tableBoundHints: [
+          {
+            boundingBox: {
+              normalizedVertices: [
+                {x: 0, y: 0},
+                {x: 1, y: 0},
+                {x: 1, y: 1},
+                {x: 0, y: 1},
+              ],
+            },
+          },
+        ],
+      },
+    };
+
+    // Configure the request for batch process
+    const requests = {parent: `projects/${projectId}`, requests: [request]};
+
+    // Batch process document using a long-running operation.
+    // You can wait for now, or get results later.
+    const [operation] = await client.batchProcessDocuments(requests);
+
+    // Wait for operation to complete.
+    await operation.promise();
+
+    console.log('Document processing complete.');
+
+    // Query Storage bucket for the results file(s).
+    const query = {prefix: outputUriPrefix};
+
+    console.log('Fetching results ...');
+
+    // List all of the files in the Storage bucket
+    const [files] = await storage.bucket(outputUri).getFiles(query);
+
+    // Use for...of, not forEach(async ...): forEach never awaits its callback,
+    // so download/parse failures would be lost instead of failing the sample.
+    for (const [index, fileInfo] of files.entries()) {
+      // Get the file as a buffer
+      const [file] = await fileInfo.download();
+
+      console.log(`Fetched file #${index + 1}:`);
+
+      // Read the results
+      const results = JSON.parse(file.toString());
+
+      // Get all of the document text as one big string
+      const text = results.text || '';
+
+      // Get the first table in the document; skip result files without one.
+      const [page1 = {}] = results.pages || [];
+      const [table] = page1.tables || [];
+      if (!table) {
+        continue;
+      }
+      const [headerRow = {}] = table.headerRows || [];
+      const [{languageCode} = {}] = page1.detectedLanguages || [];
+
+      console.log('Results from first table processed:');
+      console.log(`First detected language: ${languageCode || 'unknown'}`);
+
+      console.log('Header row:');
+      (headerRow.cells || []).forEach(tableCell => {
+        const [segment] = tableCell.layout.textAnchor.textSegments || [];
+        if (segment) {
+          // Extract shards from the text field
+          // First shard in document doesn't have startIndex property
+          const startIndex = segment.startIndex || 0;
+          console.log(`\t${text.substring(startIndex, segment.endIndex)}`);
+        }
+      });
+    }
+  }
+  // [END document_parse_table]
+
+  await parseTableGCS(gcsInputUri, gcsOutputUri, gcsOutputUriPrefix);
+}
+main(...process.argv.slice(2)).catch(err => {
+  console.error(err);
+  process.exitCode = 1;
+});
diff --git a/document-ai/test/parseForm.test.js b/document-ai/test/parseForm.test.js
new file mode 100644
index 0000000000..fd07c27dc8
--- /dev/null
+++ b/document-ai/test/parseForm.test.js
@@ -0,0 +1,51 @@
+// Copyright 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+'use strict';
+
+const {Storage} = require('@google-cloud/storage');
+const cp = require('child_process');
+const {assert} = require('chai');
+const {describe, it, before, after} = require('mocha');
+const uuid = require('uuid');
+
+const execSync = cmd => cp.execSync(cmd, {encoding: 'utf-8'});
+
+const storage = new Storage();
+const bucketName = `nodejs-docs-samples-test-${uuid.v4()}`;
+const cmd = `node parseForm.js`;
+
+// Positional arguments forwarded to the sample. The prefix is passed
+// explicitly so the test, not the sample's default, picks the output path.
+const projectId = process.env.GCLOUD_PROJECT;
+const gcsOutputUriPrefix = uuid.v4();
+
+describe(`Document AI parse form`, () => {
+  before(async () => {
+    await storage.createBucket(bucketName);
+  });
+
+  after(async () => {
+    const bucket = storage.bucket(bucketName);
+    await bucket.deleteFiles({force: true});
+    await bucket.delete();
+  });
+
+  it(`should parse the GCS invoice example as a form`, async () => {
+    const output = execSync(
+      `${cmd} ${projectId} gs://${bucketName}/ ${gcsOutputUriPrefix}`
+    );
+    assert.match(output, /Extracted key value pair:/);
+  });
+});
diff --git a/document-ai/test/parseTable.test.js b/document-ai/test/parseTable.test.js
new file mode 100644
index 0000000000..2ff0a6a466
--- /dev/null
+++ b/document-ai/test/parseTable.test.js
@@ -0,0 +1,51 @@
+// Copyright 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+'use strict';
+
+const {Storage} = require('@google-cloud/storage');
+const cp = require('child_process');
+const {assert} = require('chai');
+const {describe, it, before, after} = require('mocha');
+const uuid = require('uuid');
+
+const execSync = cmd => cp.execSync(cmd, {encoding: 'utf-8'});
+
+const storage = new Storage();
+const bucketName = `nodejs-docs-samples-test-${uuid.v4()}`;
+const cmd = `node parseTable.js`;
+
+// Positional arguments forwarded to the sample. The prefix is passed
+// explicitly so the test, not the sample's default, picks the output path.
+const projectId = process.env.GCLOUD_PROJECT;
+const gcsOutputUriPrefix = uuid.v4();
+
+describe(`Document AI parse table`, () => {
+  before(async () => {
+    await storage.createBucket(bucketName);
+  });
+
+  after(async () => {
+    const bucket = storage.bucket(bucketName);
+    await bucket.deleteFiles({force: true});
+    await bucket.delete();
+  });
+
+  it(`should parse the GCS invoice example as a table`, async () => {
+    const output = execSync(
+      `${cmd} ${projectId} gs://${bucketName}/ ${gcsOutputUriPrefix}`
+    );
+    assert.match(output, /First detected language: en/);
+  });
+});