Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add hOCR option to Text Extraction Media Attachment action and IIIF Manifest #897

Merged
merged 11 commits into from
Oct 21, 2022
34 changes: 30 additions & 4 deletions modules/islandora_iiif/src/Plugin/views/style/IIIFManifest.php
Original file line number Diff line number Diff line change
Expand Up @@ -189,21 +189,27 @@ public function render() {
*/
protected function getTileSourceFromRow(ResultRow $row, $iiif_address, $iiif_base_id) {
$canvases = [];
foreach ($this->options['iiif_tile_field'] as $iiif_tile_field) {
foreach (array_filter(array_values($this->options['iiif_tile_field'])) as $iiif_tile_field) {
$viewsField = $this->view->field[$iiif_tile_field];
$iiif_ocr_file_field = !empty($this->options['iiif_ocr_file_field']) ? array_filter(array_values($this->options['iiif_ocr_file_field'])) : [];
$ocrField = count($iiif_ocr_file_field) > 0 ? $this->view->field[$iiif_ocr_file_field[0]] : NULL;
$entity = $viewsField->getEntity($row);

if (isset($entity->{$viewsField->definition['field_name']})) {

/** @var \Drupal\Core\Field\FieldItemListInterface $images */
$images = $entity->{$viewsField->definition['field_name']};
foreach ($images as $image) {
foreach ($images as $i => $image) {
if (!$image->entity->access('view')) {
// If the user does not have permission to view the file, skip it.
continue;
}

$ocrs = $entity->{$ocrField->definition['field_name']};

// Create the IIIF URL for this file
// Visiting $iiif_url will resolve to the info.json for the image.
$ocr = isset($ocrs[$i]) ? $ocrs[$i] : FALSE;
$file_url = $image->entity->createFileUrl(FALSE);
$mime_type = $image->entity->getMimeType();
$iiif_url = rtrim($iiif_address, '/') . '/' . urlencode($file_url);
Expand Down Expand Up @@ -241,8 +247,7 @@ protected function getTileSourceFromRow(ResultRow $row, $iiif_address, $iiif_bas
}
}
}

$canvases[] = [
$tmp_canvas = [
// @see https://iiif.io/api/presentation/2.1/#canvas
'@id' => $canvas_id,
'@type' => 'sc:Canvas',
Expand Down Expand Up @@ -271,6 +276,17 @@ protected function getTileSourceFromRow(ResultRow $row, $iiif_address, $iiif_bas
],
],
];

if (isset($ocr) && $ocr != FALSE) {
$tmp_canvas['seeAlso'] = [
'@id' => $ocr->entity->createFileUrl(FALSE),
'format' => 'text/vnd.hocr+html',
'profile' => 'http://kba.cloud/hocr-spec',
'label' => 'hOCR embedded text',
];
}

$canvases[] = $tmp_canvas;
}
}
}
Expand Down Expand Up @@ -313,6 +329,7 @@ protected function defineOptions() {
$options = parent::defineOptions();

$options['iiif_tile_field'] = ['default' => ''];
$options['iiif_ocr_file_field'] = ['default' => ''];

return $options;
}
Expand Down Expand Up @@ -368,6 +385,15 @@ public function buildOptionsForm(&$form, FormStateInterface $form_state) {
// otherwise could lock up the form when setting up a View.
'#required' => count($field_options) > 0,
];

$form['iiif_ocr_file_field'] = [
'#title' => $this->t('Structured OCR data file field'),
'#type' => 'checkboxes',
'#default_value' => $this->options['iiif_ocr_file_field'],
'#description' => $this->t('The source of structured OCR text for each entity.'),
'#options' => $field_options,
'#required' => FALSE,
];
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
use Drupal\islandora\Plugin\Action\AbstractGenerateDerivativeMediaFile;

/**
* Emits a Node for generating fits derivatives event.
* Generates OCR derivatives event.
*
* @Action(
* id = "generate_extracted_text_file",
Expand All @@ -29,6 +29,7 @@ public function defaultConfiguration() {
$config['destination_media_type'] = 'file';
$config['scheme'] = $this->config->get('default_scheme');
$config['destination_text_field_name'] = '';
$config['text_format'] = 'plain_text';
return $config;
}

Expand All @@ -38,7 +39,7 @@ public function defaultConfiguration() {
public function buildConfigurationForm(array $form, FormStateInterface $form_state) {
$map = $this->entityFieldManager->getFieldMapByFieldType('text_long');
$file_fields = $map['media'];
$field_options = array_combine(array_keys($file_fields), array_keys($file_fields));
$field_options = ['none' => $this->t('None')] + array_combine(array_keys($file_fields), array_keys($file_fields));
$form = parent::buildConfigurationForm($form, $form_state);
$form['mimetype']['#description'] = $this->t('Mimetype to convert to (e.g. application/xml, etc...)');
$form['mimetype']['#value'] = 'text/plain';
Expand All @@ -48,13 +49,23 @@ public function buildConfigurationForm(array $form, FormStateInterface $form_sta
$last = array_slice($form, count($form) - $position + 1);

$middle['destination_text_field_name'] = [
'#required' => TRUE,
'#required' => FALSE,
'#type' => 'select',
'#options' => $field_options,
'#title' => $this->t('Destination Text field Name'),
'#default_value' => $this->configuration['destination_text_field_name'],
'#description' => $this->t('Text field on Media Type to hold extracted text.'),
];
$middle['text_format'] = [
'#type' => 'select',
'#title' => $this->t('Format'),
'#options' => [
'plain_text' => $this->t('Plain text'),
'hocr' => $this->t('hOCR text with positional data'),
],
'#default_value' => $this->configuration['text_format'],
'#description' => $this->t("The type of text to be returned."),
];
$form = array_merge($first, $middle, $last);

unset($form['args']);
Expand All @@ -81,17 +92,29 @@ public function validateConfigurationForm(array &$form, FormStateInterface $form
public function submitConfigurationForm(array &$form, FormStateInterface $form_state) {
  // Run the parent's submit handling first, then record the two values this
  // form adds: the destination text field and the requested text format.
  parent::submitConfigurationForm($form, $form_state);
  $this->configuration['destination_text_field_name'] = $form_state->getValue('destination_text_field_name');
  $text_format = $form_state->getValue('text_format');
  $this->configuration['text_format'] = $text_format;

  // The 'args' configuration value is derived from the selected format
  // instead of being read from a submitted form value.
  switch ($text_format) {
    case 'hocr':
      $this->configuration['args'] = '-c tessedit_create_hocr=1 -c hocr_font_info=0';
      break;

    case 'plain_text':
      // Must write to $this: the previous "$his" was an undefined variable,
      // so switching back to plain text never cleared stored hOCR args.
      $this->configuration['args'] = '';
      break;
  }
}

/**
* Override this to return arbitrary data as an array to be json encoded.
*/
protected function generateData(EntityInterface $entity) {

$data = parent::generateData($entity);
$route_params = [
'media' => $entity->id(),
'destination_field' => $this->configuration['destination_field_name'],
'destination_text_field' => $this->configuration['destination_text_field_name'],
'text_format' => $this->configuration['text_format'],
];
$data['destination_uri'] = Url::fromRoute('islandora_text_extraction.attach_file_to_media', $route_params)
->setAbsolute()
Expand Down