Skip to content

Commit

Permalink
Merge pull request #104 from Mocretion/multimodal
Browse files Browse the repository at this point in the history
Multimodal
  • Loading branch information
abrami authored Oct 8, 2024
2 parents 2c13d2d + 2508d84 commit 62f6610
Show file tree
Hide file tree
Showing 400 changed files with 62,523 additions and 192 deletions.
22 changes: 0 additions & 22 deletions .github/workflows/javadoc.yml

This file was deleted.

19 changes: 19 additions & 0 deletions cite.bib
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
@inproceedings{Leonhardt:et:al:2023,
title = "Unlocking the Heterogeneous Landscape of Big Data {NLP} with {DUUI}",
author = "Leonhardt, Alexander and
Abrami, Giuseppe and
Baumartz, Daniel and
Mehler, Alexander",
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2023",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-emnlp.29",
doi = "10.18653/v1/2023.findings-emnlp.29",
pages = "385--399",
abstract = "Automatic analysis of large corpora is a complex task, especially in terms of time efficiency. This complexity is increased by the fact that flexible, extensible text analysis requires the continuous integration of ever new tools. Since there are no adequate frameworks for these purposes in the field of NLP, and especially in the context of UIMA, that are not outdated or unusable for security reasons, we present a new approach to address the latter task: Docker Unified UIMA Interface (DUUI), a scalable, flexible, lightweight, and feature-rich framework for automatic distributed analysis of text corpora that leverages Big Data experience and virtualization with Docker. We evaluate DUUI{'}s communication approach against a state-of-the-art approach and demonstrate its outstanding behavior in terms of time efficiency, enabling the analysis of big text data.",
}
3 changes: 2 additions & 1 deletion docs/Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@ source "https://rubygems.org"

gemspec

gem install kramdown-parser-gfm
gem 'kramdown-parser-gfm', '~> 1.1'
gem 'github-pages'

group :jekyll_plugins do
gem 'jekyll-commonmark-ghpages'
Expand Down
8 changes: 6 additions & 2 deletions docs/_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,15 @@ github:
owner_url: https://github.com/texttechnologylab
owner_name: Texttechnology Lab

#Others
markdown: CommonMarkGhPages
highlighter: rouge
markdown: kramdown
kramdown:
input: GFM

theme: jekyll-theme-architect

mermaid: true

# Exclude from processing.
# The following items will not be processed, by default. Create a custom list
# to override the default setting.
Expand Down
5 changes: 4 additions & 1 deletion docs/_includes/head-custom.html
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,8 @@

<!-- You can set your favicon here -->
<!-- link rel="shortcut icon" type="image/x-icon" href="{{ '/favicon.ico' | relative_url }}" -->

<!-- script src="https://cdnjs.cloudflare.com/ajax/libs/mermaid/8.0.0/mermaid.min.js"></script-->
<!-- <script src="https://rawcdn.githack.com/oscarmorrison/md-page/232e97938de9f4d79f4110f6cfd637e186b63317/md-page.js"></script> -->
<!-- <noscript> -->
<script src="https://kit.fontawesome.com/a01f8fee95.js" crossorigin="anonymous"></script>
<!-- end custom head snippets -->
12 changes: 10 additions & 2 deletions docs/_layouts/default.html
Original file line number Diff line number Diff line change
Expand Up @@ -34,15 +34,23 @@ <h2>{{ site.description | default: site.github.project_tagline }}</h2>
</section>

<aside id="sidebar">
<img src="assets/images/DUUI_Logo.png" width="100%">
<img src="https://texttechnologylab.github.io/DockerUnifiedUIMAInterface/assets/images/DUUI_Logo.png" width="100%">
{% if site.github.is_project_page %}
<a href="{{ site.github.repository_url }}" target="_blank" class="button"><small>View project on</small> GitHub</a>
{% endif %}
{% if site.github.is_project_page %}
<a href="{{ site.github.owner_url }}" target="_blank" class="button"><small style="margin-top: -12px;">Powered by</small>
<img src="assets/images/ttlab.png" width="80%;"></a>
<img src="https://texttechnologylab.github.io/DockerUnifiedUIMAInterface/assets/images/ttlab.png" width="80%"></a>
<!-- <p class="repo-owner"><a href="{{ site.github.repository_url }}">{{ site.github.repository_name }}</a> is maintained by <a href="{{ site.github.owner_url }}">{{ site.github.owner_name }}</a>.</p>-->
{% endif %}
<a href="https://github.com/texttechnologylab/DockerUnifiedUIMAInterface/blob/main/cite.bib" target="_blank" class="emptyButton">
<i class="fa-solid fa-quote-left"></i>
Cite
</a>
<a href="https://www.texttechnologylab.org/legal-notice/" target="_blank" class="emptyButton">
<i class="fa-solid fa-section"></i>
Imprint
</a>

<p>This page was generated by <a href="https://pages.github.com">GitHub Pages</a>.</p>
</aside>
Expand Down
39 changes: 39 additions & 0 deletions docs/assets/css/style.scss
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,42 @@ header h2{
font-size:2vw;
}

.highlight{
background-color:#fff!important;
}


a.emptyButton {
width: 134px;
height: 58px;
padding-top: 22px;
padding-left: 68px;
font-family: 'Architects Daughter', 'Helvetica Neue', Helvetica, Arial, serif;
font-size: 23px;
line-height: 1.2;
color: #fff;
}
a.emptyButton small {
display: block;
font-size: 11px;
}
header a.emptyButton {
position: absolute;
top: 0;
right: 0;
background: transparent url(../images/empty_button.png) 0 0 no-repeat;
}
aside a.emptyButton {
display: block;
width: 138px;
padding-left: 64px;
margin-bottom: 20px;
font-size: 21px;
background: transparent url(../images/empty_button.png) 0 0 no-repeat;
}

.emptyButton i {
margin-left: -1.2em;
font-size: 2vw;
padding-right: 10pt;
}
Binary file added docs/assets/images/empty_button.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
95 changes: 79 additions & 16 deletions docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,25 @@ layout: default
---
Automatic analysis of large text corpora is a complex task. This complexity particularly concerns the question of time efficiency. Furthermore, efficient, flexible, and extensible text analysis requires the continuous integration of ever new text analysis tools. Since there are currently, in the area of NLP and especially in the application context of UIMA, only very few to no adequate frameworks for these purposes that are not outdated or unusable for security reasons, this work will present a new approach to fill this gap. To this end, we present Docker Unified UIMA Interface (DUUI), a scalable, flexible, lightweight, and feature-rich framework for automated and distributed analysis of text corpora that leverages experience in Big Data analytics and virtualization with Docker.


[![](https://jitpack.io/v/texttechnologylab/DockerUnifiedUIMAInterface.svg)](https://jitpack.io/#texttechnologylab/DockerUnifiedUIMAInterface)
![GitHub License](https://img.shields.io/github/license/Texttechnologylab/DockerUnifiedUIMAInterface)
![GitHub release (with filter)](https://img.shields.io/github/v/release/Texttechnologylab/DockerUnifiedUIMAInterface)
[![Conference](http://img.shields.io/badge/conference-FindingsEMNLP--2023-4b44ce.svg)](https://2023.emnlp.org/)
[![Paper](http://img.shields.io/badge/paper-FindingsEMNLP--2023-fb44ce.svg)](https://aclanthology.org/2023.findings-emnlp.29)
![GitHub last commit](https://img.shields.io/github/last-commit/texttechnologylab/DockerUnifiedUIMAInterface)


[![Conference](https://img.shields.io/badge/conference-FindingsEMNLP--2023-4b44ce.svg)](https://2023.emnlp.org/)
[![Paper](https://img.shields.io/badge/paper-FindingsEMNLP--2023-fb44ce.svg)](https://aclanthology.org/2023.findings-emnlp.29)

## Features
# TLDR
* [Javadoc](javadoc)
* [Tutorials](tutorial/Tutorial)
- [Simple Sentiment](tutorial/Sentiment)
- [Advanced Hate Check](tutorial/HateCheck)
- [Complex Fact Check](tutorial/FactChecking)
* [Kubernetes-Setup](instructions/Kubernetes)

# Features
Using DUUI, NLP preprocessing on texts can be performed using the following features:
* Horizontal and vertical scaling
* Capturing heterogeneous annotation landscapes
Expand All @@ -18,43 +30,44 @@ Using DUUI, NLP preprocessing on texts can be performed using the following feat
* Monitoring and error-reporting
* Lightweight usability

## Functions
# Functions
DUUI has different components which are distinguished into Drivers and Components.

### Components
## Components
Components represent the actual analysis methods for recognizing (among others) tokens, named entities, POS and other ingredients of the NLP. All components must be analysis methods in the definition of UIMA. Of course, existing analysis methods based on Java can also be used directly (e.g. dkpro).

Independently of this, Components can also be implemented in alternative programming languages; as long as the interface of DUUI is used, they can be targeted and used.

### Driver
## Driver
DUUI has a variety of drivers that enable communication as well as the execution of Components in different runtime environments.

#### UIMADriver
### UIMADriver
The UIMADriver runs a UIMA Analysis Engine (AE) on the local machine (using local memory and processor) in the same process within the JRE and allows scaling on that machine by replicating the underlying Analysis Engine. This enables the use of all previous analysis methods based on UIMA AE without further adjustments.

#### DockerDriver
### DockerDriver
The DUUI core driver runs Components on the local Docker daemon and enables machine-specific resource management. This requires that the AEs are available as Docker images according to DUUI to run as Docker containers. It is not relevant whether the Docker image is stored locally or in a remote registry, since the Docker container is built on startup. This makes it very easy to test new AEs (as local containers) before being released. The distinction between local and remote Docker images is achieved by the URI of the Docker image used.

#### RemoteDriver
### RemoteDriver
AEs that are not available as containers and whose models can or should not be shared can still be used if they are available via REST. Since DUUI communicates via REST, remote endpoints can be used for pre-processing. In general, AEs implemented based on DUUI can be accessed and used via REST, but the scaling is limited regarding request and processing capabilities of the hosting system. In addition, Components addressed via the RemoteDriver can be used as services. This has advantages for AEs that need to hold large models in memory and thus require a long startup time. To avoid continuous reloading, it may be necessary to start a service once or twice in a dedicated mode and then use a RemoteDriver to access it. To use services, their URL must be specified to enable horizontal scaling.

#### SwarmDriver
### SwarmDriver
The SwarmDriver complements the DockerDriver; it uses the same functionalities, but its AEs are used as Docker images distributed within the Docker Swarm network. A swarm consists of n nodes and is controlled by a leader node within the Docker framework. However, if an application using DUUI is executed on a Docker leader node, the individual AEs can be executed on multiple swarm nodes.

#### KubernetesDriver
### KubernetesDriver
The KubernetesDriver works similarly to the SwarmDriver, but Kubernetes is used as the runtime environment instead of Docker Swarm.

## Requirements
# Requirements
![Java](https://img.shields.io/badge/Java-17-blue)
![Docker](https://img.shields.io/badge/Docker-22.10-green)

## UIMA-Components
# UIMA-Components
A list of existing DUUI components as Docker images can be found [here](https://github.com/texttechnologylab/duui-uima).

> [!NOTE]
> <img src="https://raw.githubusercontent.com/FortAwesome/Font-Awesome/6.x/svgs/solid/circle-info.svg" width="15" height="15"> **Note**
>
> Instructions for creating your own DUUI components and detailed explanations can be found under [Tutorials](tutorial/Tutorial).
## Using
# Using
There are basically two ways to use DUUI for preprocessing texts:
* Clone the GitHub project.
* Include the GitHub project using JitPack via maven (Recommended).
Expand All @@ -69,7 +82,7 @@ Add the following to your pom file:
</repository>
</repositories>
```
After that DUUI can be integrated as a dependency:
After that DUUI can be integrated as a dependency using [![](https://jitpack.io/v/texttechnologylab/DockerUnifiedUIMAInterface.svg)](https://jitpack.io/#texttechnologylab/DockerUnifiedUIMAInterface)

```xml
<dependency>
Expand All @@ -79,6 +92,56 @@ After that DUUI can be integrated as a dependency:
</dependency>
```

## Use with Java

```java
int iWorkers = 2; // define the number of workers

JCas jc = JCasFactory.createJCas(); // An empty CAS document is defined.

// load content into jc ...

// Defining LUA-Context for communication
DUUILuaContext ctx = LuaConsts.getJSON();

// Defining a storage backend based on SQlite.
DUUISqliteStorageBackend sqlite = new DUUISqliteStorageBackend("loggingSQlite.db")
.withConnectionPoolSize(iWorkers);

// The composer is defined and initialized with a standard Lua context as well as with a storage backend.
DUUIComposer composer = new DUUIComposer().withLuaContext(ctx)
.withScale(iWorkers).withStorageBackend(sqlite);

// Instantiate drivers with options (example)
DUUIDockerDriver docker_driver = new DUUIDockerDriver()
.withTimeout(10000);
DUUIRemoteDriver remote_driver = new DUUIRemoteDriver(10000);
DUUIUIMADriver uima_driver = new DUUIUIMADriver().withDebug(true);
DUUISwarmDriver swarm_driver = new DUUISwarmDriver();

// A driver must be added before components can be added for it in the composer. After that the composer is able to use the individual drivers.
composer.addDriver(docker_driver, remote_driver, uima_driver, swarm_driver);

// A new component for the composer is added
composer.add(new DUUIDockerDriver.
Component("docker.texttechnologylab.org/gnfinder:latest")
.withScale(iWorkers)
// The image is reloaded and fetched, regardless of whether it already exists locally (optional)
.withImageFetching());

// Adding a UIMA annotator for writing the result of the pipeline as XMI files.
composer.add(new DUUIUIMADriver.Component(
createEngineDescription(XmiWriter.class,
XmiWriter.PARAM_TARGET_LOCATION, sOutputPath
)).withScale(iWorkers));

// The document is processed through the pipeline. In addition, files of entire repositories can be processed.
composer.run(jc);
```

> <img src="https://raw.githubusercontent.com/FortAwesome/Font-Awesome/6.x/svgs/solid/circle-info.svg" width="15" height="15"> **Note**
>
> Further examples can be found at the [tutorials](tutorial/Tutorial).

# Cite
Expand Down
Loading

0 comments on commit 62f6610

Please sign in to comment.