Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature: Azure PDF scan #282

Merged
merged 14 commits into from
Aug 26, 2021
16 changes: 16 additions & 0 deletions includes/Classifai/Helpers.php
Original file line number Diff line number Diff line change
Expand Up @@ -509,3 +509,19 @@ function get_modified_image_source_url( $post_id ) {
*/
return apply_filters( 'classifai_generate_image_alt_tags_source_url', null, $post_id );
}

/**
 * Check if attachment is PDF document.
 *
 * The attachment's MIME type is looked up in the list returned by
 * wp_get_mime_types(); the attachment counts as a PDF when any extension
 * group registered for that MIME type contains the `pdf` extension.
 *
 * @param int|\WP_Post $post Post ID or post object for the attachment being viewed.
 * @return bool True when the attachment's MIME type maps to the `pdf` extension, false otherwise.
 */
function attachment_is_pdf( $post ) {
	$mime_type = get_post_mime_type( $post );

	// No usable MIME type means there is nothing to match against.
	if ( ! is_string( $mime_type ) || '' === $mime_type ) {
		return false;
	}

	// Inspect every extension group mapped to this MIME type, not only the
	// first one: array_search() would stop at the first match and hand
	// `false` to explode() when there is no match at all.
	foreach ( array_keys( wp_get_mime_types(), $mime_type, true ) as $extensions ) {
		// Extension groups are pipe-separated, hence the explode on '|'.
		if ( in_array( 'pdf', explode( '|', (string) $extensions ), true ) ) {
			return true;
		}
	}

	return false;
}
143 changes: 134 additions & 9 deletions includes/Classifai/Providers/Azure/ComputerVision.php
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
use function Classifai\computer_vision_max_filesize;
use function Classifai\get_largest_acceptable_image_url;
use function Classifai\get_modified_image_source_url;
use function Classifai\attachment_is_pdf;

class ComputerVision extends Provider {

Expand Down Expand Up @@ -55,6 +56,7 @@ private function get_default_settings() {
'enable_image_tagging' => true,
'enable_smart_cropping' => false,
'enable_ocr' => false,
'enable_read_pdf' => false,
'caption_threshold' => 75,
'tag_threshold' => 70,
'image_tag_taxonomy' => 'classifai-image-tags',
Expand Down Expand Up @@ -97,6 +99,12 @@ public function register() {
add_filter( 'the_content', [ $this, 'add_ocr_aria_describedby' ] );
add_filter( 'rest_api_init', [ $this, 'add_ocr_data_to_api_response' ] );
}

$enable_read_pdf = isset( $settings['enable_read_pdf'] ) && '1' === $settings['enable_read_pdf'];
if ( $enable_read_pdf ) {
add_action( 'add_attachment', [ $this, 'read_pdf' ] );
add_action( 'classifai_retry_get_read_result', [ $this, 'do_read_cron' ], 10, 2 );
}
}

/**
Expand Down Expand Up @@ -192,16 +200,33 @@ public function add_ocr_aria_describedby( $content ) {

/**
* Adds a meta box for rescanning options if the settings are configured
*
* @param \WP_Post $post The post object.
*/
public function setup_attachment_meta_box() {
add_meta_box(
'attachment_meta_box',
__( 'ClassifAI Image Processing', 'classifai' ),
[ $this, 'attachment_data_meta_box' ],
'attachment',
'side',
'high'
);
public function setup_attachment_meta_box( $post ) {
	// Raw option value; it is validated below before any key is read.
	$options = get_option( 'classifai_computer_vision' );

	// Image attachments get the image processing box.
	if ( wp_attachment_is_image( $post ) ) {
		add_meta_box(
			'attachment_meta_box',
			__( 'ClassifAI Image Processing', 'classifai' ),
			[ $this, 'attachment_data_meta_box' ],
			'attachment',
			'side',
			'high'
		);
	}

	// Everything past this point only concerns PDF attachments.
	if ( ! attachment_is_pdf( $post ) ) {
		return;
	}

	// The PDF box is only offered while the "read PDF" setting is switched on.
	$pdf_scan_enabled = $options && isset( $options['enable_read_pdf'] ) && '1' === $options['enable_read_pdf'];

	if ( $pdf_scan_enabled ) {
		// Registered under the same box ID as the image box above.
		add_meta_box(
			'attachment_meta_box',
			__( 'ClassifAI PDF Processing', 'classifai' ),
			[ $this, 'attachment_pdf_data_meta_box' ],
			'attachment',
			'side',
			'high'
		);
	}
}

/**
Expand Down Expand Up @@ -249,6 +274,34 @@ public function attachment_data_meta_box( $post ) {
<?php
}

/**
 * Render the PDF scan checkbox shown in the attachment meta box.
 *
 * The label reads "Scan PDF content" while the attachment has no content and
 * "Rescan PDF content" once it has some. The `_classifai_azure_read_status`
 * post meta decides whether the checkbox is passed to disabled() and whether
 * the "In progress!" note is printed.
 *
 * @param \WP_Post $post The post object.
 */
public function attachment_pdf_data_meta_box( $post ) {
	// Pick the label from whether the attachment already has content.
	if ( empty( get_the_content( null, false, $post ) ) ) {
		$read = __( 'Scan PDF content', 'classifai' );
	} else {
		$read = __( 'Rescan PDF content', 'classifai' );
	}

	// A scan counts as running only when the stored status says exactly 'running'.
	$status  = get_post_meta( $post->ID, '_classifai_azure_read_status', true );
	$running = ! empty( $status['status'] ) && 'running' === $status['status'];
?>
<div class="misc-publishing-actions">
<div class="misc-pub-section">
<label for="rescan-pdf">
<input type="checkbox" value="yes" id="rescan-pdf" name="rescan-pdf" <?php disabled( $running ); ?>/>
<?php echo esc_html( $read ); ?>
<?php if ( $running ) : ?>
<?php echo ' - ' . esc_html__( 'In progress!', 'classifai' ); ?>
<?php endif; ?>
</label>
</div>
</div>
<?php
}

/**
*
* @param int $attachment_id Post id for the attachment
Expand Down Expand Up @@ -301,6 +354,11 @@ public function maybe_rescan_image( $attachment_id ) {
if ( filter_input( INPUT_POST, 'rescan-ocr', FILTER_SANITIZE_STRING ) ) {
$this->ocr_processing( wp_get_attachment_metadata( $attachment_id ), $attachment_id, true );
}

if ( filter_input( INPUT_POST, 'rescan-pdf' ) ) {
$this->read_pdf( $attachment_id );
}

}

/**
Expand Down Expand Up @@ -568,6 +626,52 @@ protected function generate_alt_tags( $captions, $attachment_id ) {
return $rtn;
}

/**
 * Start reading a PDF attachment through the Read service wrapper.
 *
 * Bails out early when the settings cannot be loaded, when the PDF feature
 * is switched off, or when WordPress does not have direct file system access.
 *
 * @param int $attachment_id Attachment ID.
 * @return mixed WP_Error on invalid settings or file system access, false when
 *               the feature is disabled, otherwise whatever Read::read_document() returns.
 */
public function read_pdf( $attachment_id ) {
	$options = $this->get_settings();

	if ( ! is_array( $options ) ) {
		return new WP_Error( 'invalid_settings', 'Can not retrieve the plugin settings.' );
	}

	// Nothing to do unless the "read PDF" checkbox is stored as '1'.
	if ( ! isset( $options['enable_read_pdf'] ) || '1' !== $options['enable_read_pdf'] ) {
		return false;
	}

	// The file system helpers live in an admin include that may not be loaded yet.
	if ( ! function_exists( 'get_filesystem_method' ) ) {
		require_once ABSPATH . 'wp-admin/includes/file.php';
	}

	// Direct file system access is required for the current implementation of this feature.
	// WP_Filesystem() is only attempted once the method is known to be 'direct'.
	$has_direct_access = 'direct' === get_filesystem_method() && WP_Filesystem();

	if ( ! $has_direct_access ) {
		return new WP_Error( 'invalid_access_type', 'Invalid access type! Direct file system access is required.' );
	}

	$reader = new Read( $options, intval( $attachment_id ) );

	return $reader->read_document();
}

/**
 * Cron callback that checks on a previously started read operation.
 *
 * Builds a Read instance for the attachment from the current settings and
 * asks it to check the result at the given operation URL.
 *
 * @param string $operation_url Operation URL for checking the read status.
 * @param int    $attachment_id Attachment ID.
 */
public function do_read_cron( $operation_url, $attachment_id ) {
	$reader = new Read( $this->get_settings(), intval( $attachment_id ) );

	$reader->check_read_result( $operation_url );
}

/**
* Generate the image tags for the image being uploaded.
*
Expand Down Expand Up @@ -761,6 +865,22 @@ public function setup_fields_sections() {
),
]
);
add_settings_field(
'enable-read-pdf',
esc_html__( 'Enable Scanning PDF', 'classifai' ),
[ $this, 'render_input' ],
$this->get_option_name(),
$this->get_option_name(),
[
'label_for' => 'enable_read_pdf',
'input_type' => 'checkbox',
'default_value' => $default_settings['enable_read_pdf'],
'description' => __(
'Extract visible text from multi-pages PDF documents. Store the result as the attachment description.',
'classifai'
),
]
);
}

/**
Expand Down Expand Up @@ -795,6 +915,7 @@ public function sanitize_settings( $settings ) {
'enable_image_tagging',
'enable_smart_cropping',
'enable_ocr',
'enable_read_pdf',
];

foreach ( $checkbox_settings as $checkbox_setting ) {
Expand Down Expand Up @@ -963,6 +1084,10 @@ public function rest_endpoint_callback( $post_id, $route_to_call ) {
return $this->ocr_processing( $metadata, $post_id, true );
}

if ( 'read-pdf' === $route_to_call ) {
return $this->read_pdf( $post_id );
}

// Allow rescanning image that are not stored in local storage.
$image_url = get_modified_image_source_url( $post_id );

Expand Down
Loading