diff --git a/.clang-format b/.clang-format
new file mode 100644
index 000000000..b7f746ef1
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,5 @@
+BasedOnStyle: Google
+
+# Maximum line length 80 is too low even for 1080p monitor. @XapaJIaMnu
+# personally would like 120.
+ColumnLimit: 120
diff --git a/.clang-format-ignore b/.clang-format-ignore
new file mode 100644
index 000000000..50795cacb
--- /dev/null
+++ b/.clang-format-ignore
@@ -0,0 +1,4 @@
+3rd_party
+wasm/test_page
+src/translator/aligned.h
+src/translator/pcqueue.h
diff --git a/.clang-tidy b/.clang-tidy
new file mode 100644
index 000000000..bdbadb624
--- /dev/null
+++ b/.clang-tidy
@@ -0,0 +1,32 @@
+Checks: >
+ -*,
+ bugprone-*,
+ concurrency-*,
+ google-*,
+ portability-*,
+ performance-*,
+ clang-analyzer-*,
+ readability-*,
+ -readability-implicit-bool-conversion,
+ -readability-isolate-declaration,
+ -readability-uppercase-literal-suffix,
+ misc-*,
+ -misc-noexcept*,
+ modernize-*,
+ -modernize-deprecated-headers,
+ -modernize-use-nodiscard,
+ -modernize-raw-string-literal,
+ -modernize-return-braced-init-list,
+ -modernize-use-equals-delete,
+ -modernize-use-trailing-return-type,
+
+
+
+CheckOptions:
+ - { key: readability-identifier-naming.ClassCase, value: CamelCase }
+ - { key: readability-identifier-naming.ClassMethodCase, value: camelBack }
+ - { key: readability-identifier-naming.VariableCase, value: camelBack }
+ - { key: readability-identifier-naming.FunctionCase, value: camelBack }
+ - { key: readability-identifier-naming.PrivateMemberSuffix, value: _ }
+ - { key: readability-identifier-naming.ParameterCase, value: camelBack }
+
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 26f6f4418..fa4f321cf 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -1,5 +1,31 @@
+# Firefox Translations review group
+.dockerignore @mozilla/firefox-translations
+.github @mozilla/firefox-translations
+.gitignore @mozilla/firefox-translations
+.gitmodules @mozilla/firefox-translations
+docker @mozilla/firefox-translations
+docs @mozilla/firefox-translations
+utils @mozilla/firefox-translations
+CODE_OF_CONDUCT.md @mozilla/firefox-translations
+LICENSE @mozilla/firefox-translations
+poetry.lock @mozilla/firefox-translations
+pyproject.toml @mozilla/firefox-translations
+README.md @mozilla/firefox-translations
+Taskfile.yml @mozilla/firefox-translations
+
+# Translations Training review group
+configs @mozilla/translations-training
+pipeline @mozilla/translations-training
+snakemake @mozilla/translations-training
+tests @mozilla/translations-training
+tracking @mozilla/translations-training
+
+# Translations Inference review group
+inference-engine @mozilla/translations-inference
+
# Taskcluster pipeline related files. Changes to these ought to be reviewed by
# RelEng to watch for security issues and best practices. These should also
# be reviewed by people familiar with the pipeline itself.
-.taskcluster.yml @mozilla/releng
-taskcluster @mozilla/releng
+.taskcluster.yml @mozilla/releng @mozilla/translations-training
+taskcluster @mozilla/releng @mozilla/translations-training
+
diff --git a/.gitmodules b/.gitmodules
index f1813a444..f88221eb5 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -16,3 +16,29 @@
[submodule "3rd_party/preprocess"]
path = 3rd_party/preprocess
url = https://github.com/kpu/preprocess.git
+[submodule "inference/3rd_party/ssplit-cpp"]
+ path = inference/3rd_party/ssplit-cpp
+ url = https://github.com/browsermt/ssplit-cpp
+# This is the same dependency and repository as `3rd_party/browsermt-marian-dev` below.
+#
+# When forking `inference-engine` into this project, I made an earnest attempt to utilize the preexisting
+# `3rd_party/browsermt-marian-dev` submodule within `inference-engine`. Unfortunately, I ran into several roadblocks:
+#
+# 1) I cannot directly add `3rd_party/browsermt-marian-dev` as a cmake subdirectory because cmake is aware that
+# this path is not a subdirectory of the `inference-engine` project root.
+#
+# 2) Symbolic links do not appear to work for git submodule directories the way that they do for regular directories.
+# Even if the symbolic link had linked correctly, it may have still failed due to the considerations of 1).
+#
+# 3) I tried using cmake to copy the files from `3rd_party/browsermt-marian-dev` into `inference-engine/3rd_party/browsermt-marian-dev`
+# at build time, which would ensure that there is no duplicate reference to the URL in this file, however the upstream dependency itself
+# has hard-coded expectations that the `.git` directory is only one level up, which appears to work correctly for the way git submodules are
+# configured, but does not work if the files are copied over to a regular directory deeper in the repository's directory tree.
+#
+# It may be possible to remove `3rd_party/browsermt-marian-dev` to instead use `inference-engine/3rd-party/browsermt-marian-dev` everywhere
+# within this repository, but I will leave that for a future commit if there is a need to do so.
+#
+# TODO(#869)
+[submodule "inference/3rd_party/browsermt-marian-dev"]
+ path = inference/3rd_party/browsermt-marian-dev
+ url = https://github.com/browsermt/marian-dev
diff --git a/Taskfile.yml b/Taskfile.yml
index c767924e1..745c5f05c 100644
--- a/Taskfile.yml
+++ b/Taskfile.yml
@@ -75,6 +75,30 @@ tasks:
cmds:
- poetry run opuscleaner-server serve --host=0.0.0.0 --port=8000
+ inference-clean:
+ desc: Clean build artifacts from the inference directory.
+ cmds:
+ - >-
+ task docker-run -- ./inference/scripts/clean.sh
+
+ inference-build:
+ desc: Build inference engine.
+ cmds:
+ - >-
+ task docker-run -- ./inference/scripts/build-local.sh
+
+ inference-test:
+ desc: Run inference tests.
+ cmds:
+ - >-
+ task docker-run -- ./inference/scripts/unit-tests.sh
+
+ inference-build-wasm:
+ desc: Build inference engine WASM.
+ cmds:
+ - >-
+ task docker-run -- ./inference/scripts/build-wasm.sh
+
lint-black:
desc: Checks the styling of the Python code with Black.
deps: [poetry-install-black]
diff --git a/inference/.gitignore b/inference/.gitignore
new file mode 100644
index 000000000..78202d979
--- /dev/null
+++ b/inference/.gitignore
@@ -0,0 +1,30 @@
+# vim temporary files
+*.swp
+*.swo
+
+# CMake
+CMakeLists.txt.user
+CMakeCache.txt
+CMakeFiles
+CMakeScripts
+Testing
+Makefile
+cmake_install.cmake
+install_manifest.txt
+compile_commands.json
+CTestTestfile.cmake
+_deps
+
+
+wasm/test_page/node_modules
+/build
+/build-local
+/build-native
+/build-wasm
+/emsdk
+models
+wasm/module/worker/bergamot-translator-worker.*
+wasm/module/browsermt-bergamot-translator-*.tgz
+
+# VSCode
+.vscode
diff --git a/inference/3rd_party/CMakeLists.txt b/inference/3rd_party/CMakeLists.txt
new file mode 100644
index 000000000..62ba02722
--- /dev/null
+++ b/inference/3rd_party/CMakeLists.txt
@@ -0,0 +1,32 @@
+# browsermt-marian-dev is tested elsewhere in both paths, turning off here.
+set(COMPILE_TESTS OFF)
+add_subdirectory(browsermt-marian-dev EXCLUDE_FROM_ALL)
+
+if(COMPILE_WASM)
+ # This is a bad way of adding compilation flags. Will be improved soon.
+ add_compile_options(${WASM_COMPILE_FLAGS})
+ add_link_options(${WASM_LINK_FLAGS})
+endif(COMPILE_WASM)
+
+add_subdirectory(ssplit-cpp EXCLUDE_FROM_ALL)
+
+# Add include directories for 3rd party targets to be able to use it anywhere in the
+# project without explicitly specifying their include directories. Once they
+# fix this problem, it can be removed.
+get_property(INCDIRS DIRECTORY browsermt-marian-dev/src PROPERTY INCLUDE_DIRECTORIES)
+target_include_directories(marian PUBLIC ${INCDIRS})
+
+get_property(INCLUDE_DIRECTORIES DIRECTORY ssplit-cpp/src PROPERTY INCLUDE_DIRECTORIES)
+target_include_directories(ssplit PUBLIC ${INCLUDE_DIRECTORIES})
+
+get_property(COMPILE_DEFINITIONS DIRECTORY browsermt-marian-dev PROPERTY COMPILE_DEFINITIONS)
+target_compile_definitions(marian PUBLIC ${COMPILE_DEFINITIONS})
+
+get_property(COMPILE_OPTIONS DIRECTORY browsermt-marian-dev PROPERTY COMPILE_OPTIONS)
+target_compile_options(marian PUBLIC ${COMPILE_OPTIONS})
+
+# Compilation flags
+get_directory_property(CMAKE_C_FLAGS DIRECTORY browsermt-marian-dev DEFINITION CMAKE_C_FLAGS)
+get_directory_property(CMAKE_CXX_FLAGS DIRECTORY browsermt-marian-dev DEFINITION CMAKE_CXX_FLAGS)
+set(CMAKE_C_FLAGS ${CMAKE_C_FLAGS} PARENT_SCOPE)
+set(CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} PARENT_SCOPE)
diff --git a/inference/3rd_party/browsermt-marian-dev b/inference/3rd_party/browsermt-marian-dev
new file mode 160000
index 000000000..2781d735d
--- /dev/null
+++ b/inference/3rd_party/browsermt-marian-dev
@@ -0,0 +1 @@
+Subproject commit 2781d735d4a10dca876d61be587afdab2726293c
diff --git a/inference/3rd_party/ssplit-cpp b/inference/3rd_party/ssplit-cpp
new file mode 160000
index 000000000..a311f9865
--- /dev/null
+++ b/inference/3rd_party/ssplit-cpp
@@ -0,0 +1 @@
+Subproject commit a311f9865ade34db1e8e080e6cc146f55dafb067
diff --git a/inference/BERGAMOT_VERSION b/inference/BERGAMOT_VERSION
new file mode 100644
index 000000000..a423f7f06
--- /dev/null
+++ b/inference/BERGAMOT_VERSION
@@ -0,0 +1 @@
+v0.4.5
diff --git a/inference/CMakeLists.txt b/inference/CMakeLists.txt
new file mode 100644
index 000000000..febff3e6e
--- /dev/null
+++ b/inference/CMakeLists.txt
@@ -0,0 +1,188 @@
+cmake_minimum_required(VERSION 3.5.1)
+set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
+
+if (POLICY CMP0074)
+ cmake_policy(SET CMP0074 NEW) # CMake 3.12
+endif ()
+
+if (POLICY CMP0077)
+ cmake_policy(SET CMP0077 NEW)
+endif()
+
+project(bergamot_translator CXX C)
+
+# Retrieve the parent-directory path of PROJECT_SOURCE_DIR and assign that to REPOSITORY_ROOT_DIR.
+cmake_path(GET PROJECT_SOURCE_DIR PARENT_PATH REPOSITORY_ROOT_DIR)
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+# Generate a compile_commands.json in the build directory. The compile commands allow
+# code editors to understand the build process and provide static analysis of the code.
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+
+# Note that with CMake MSVC build, the option CMAKE_BUILD_TYPE is automatically derived from the key
+# 'configurationType' in CMakeSettings.json configurations
+if(NOT CMAKE_BUILD_TYPE)
+ message(WARNING "CMAKE_BUILD_TYPE not set; setting to Release")
+ set(CMAKE_BUILD_TYPE "Release")
+endif()
+
+if(NOT COMPILE_WASM)
+ # Setting BUILD_ARCH to native invokes CPU intrinsic detection logic below.
+ # Prevent invoking that logic for WASM builds.
+ set(BUILD_ARCH native CACHE STRING "Compile for this CPU architecture.")
+
+ # Unfortunately MSVC supports a limited subset of BUILD_ARCH flags. Instead try to guess
+ # what architecture we can compile to reading BUILD_ARCH and mapping it to MSVC values
+ # references: https://clang.llvm.org/docs/UsersManual.html https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html https://gcc.gnu.org/onlinedocs/gcc-4.8.5/gcc/i386-and-x86-64-Options.html
+ # https://docs.microsoft.com/en-us/cpp/build/reference/arch-x86?redirectedfrom=MSDN&view=vs-2019&view=msvc-170 https://devblogs.microsoft.com/oldnewthing/20201026-00/?p=104397
+ # This is by no means an exhaustive list but should match the most common flags Linux programmers expect to parse to MSVC
+ if(MSVC)
+ if(BUILD_ARCH STREQUAL "native") # avx2 is good default for native. Very few desktop systems support avx512
+ set(MSVC_BUILD_ARCH "/arch:AVX2")
+ elseif(BUILD_ARCH STREQUAL "skylake-avx512" OR BUILD_ARCH STREQUAL "cannonlake" OR BUILD_ARCH STREQUAL "x86-64-v4" OR BUILD_ARCH STREQUAL "tigerlake" OR BUILD_ARCH STREQUAL "cooperlake" OR BUILD_ARCH STREQUAL "cascadelake")
+ set(MSVC_BUILD_ARCH "/arch:AVX512")
+ elseif(BUILD_ARCH STREQUAL "core-avx2" OR BUILD_ARCH STREQUAL "haswell" OR BUILD_ARCH STREQUAL "x86-64-v3" OR BUILD_ARCH STREQUAL "broadwell" OR BUILD_ARCH STREQUAL "skylake")
+ set(MSVC_BUILD_ARCH "/arch:AVX2")
+ elseif(BUILD_ARCH STREQUAL "sandybridge" OR BUILD_ARCH STREQUAL "corei7-avx" OR BUILD_ARCH STREQUAL "core-avx-i" OR BUILD_ARCH STREQUAL "ivybridge")
+ set(MSVC_BUILD_ARCH "/arch:AVX")
+ elseif(BUILD_ARCH STREQUAL "nehalem" OR BUILD_ARCH STREQUAL "westmere" OR BUILD_ARCH STREQUAL "x86-64-v2" OR BUILD_ARCH STREQUAL "corei7" OR BUILD_ARCH STREQUAL "core2")
+ set(MSVC_BUILD_ARCH "/arch:SSE2") # This is MSVC default. We won't go down to SSE because we don't support that hardware at all with intgemm. Marian recommends to only go down to SSE4.1 at most
+ else()
+ message(WARNING "Unknown BUILD_ARCH ${BUILD_ARCH} provided. Default to SSE2 for Windows build")
+ set(MSVC_BUILD_ARCH "/arch:SSE2")
+ endif()
+ endif(MSVC)
+endif()
+
+#MSVC can't seem to pick up correct flags otherwise:
+if(MSVC)
+ add_definitions(-DUSE_SSE2=1) # Supposed to fix something in the sse_mathfun.h but not sure it does
+  set(INTRINSICS ${MSVC_BUILD_ARCH}) # ARCH we're targeting on win32. @TODO variable
+
+ set(CMAKE_CXX_FLAGS "/EHsc /DWIN32 /D_WINDOWS /DUNICODE /D_UNICODE /D_CRT_NONSTDC_NO_WARNINGS /D_CRT_SECURE_NO_WARNINGS /bigobj")
+ set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} /MT /O2 ${INTRINSICS} /MP /GL /DNDEBUG")
+ set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS} /MTd /Od /Ob0 ${INTRINSICS} /RTC1 /Zi /D_DEBUG")
+
+ # ignores warning LNK4049: locally defined symbol free imported - this comes from zlib
+ set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /DEBUG /LTCG:incremental /INCREMENTAL:NO /ignore:4049")
+ set(CMAKE_EXE_LINKER_FLAGS_RELEASE "${CMAKE_EXE_LINKER_FLAGS} /NODEFAULTLIB:MSVCRT")
+ set(CMAKE_EXE_LINKER_FLAGS_DEBUG "${CMAKE_EXE_LINKER_FLAGS} /NODEFAULTLIB:MSVCRTD")
+ set(CMAKE_STATIC_LINKER_FLAGS "${CMAKE_STATIC_LINKER_FLAGS} /LTCG:incremental")
+endif(MSVC)
+
+include(CMakeDependentOption)
+
+# Project specific cmake options
+option(COMPILE_WASM "Compile for WASM" OFF)
+cmake_dependent_option(USE_WASM_COMPATIBLE_SOURCE "Use wasm compatible sources" OFF "NOT COMPILE_WASM" ON)
+
+# WASM disables a million libraries, which also includes the unit test-library.
+cmake_dependent_option(COMPILE_UNIT_TESTS "Compile unit tests" OFF "USE_WASM_COMPATIBLE_SOURCE" ON)
+option(COMPILE_TESTS "Compile bergamot-tests" OFF)
+cmake_dependent_option(ENABLE_CACHE_STATS "Enable stats on cache" ON "COMPILE_TESTS" OFF)
+
+
+# Set 3rd party submodule specific cmake options for this project
+SET(COMPILE_CUDA OFF CACHE BOOL "Compile GPU version")
+SET(USE_SENTENCEPIECE ON CACHE BOOL "Download and compile SentencePiece")
+SET(USE_STATIC_LIBS ON CACHE BOOL "Link statically against non-system libs")
+SET(SSPLIT_COMPILE_LIBRARY_ONLY ON CACHE BOOL "Do not compile ssplit tests")
+if (USE_WASM_COMPATIBLE_SOURCE)
+ SET(COMPILE_LIBRARY_ONLY ON CACHE BOOL "Build only the Marian library and exclude all executables.")
+ SET(USE_MKL OFF CACHE BOOL "Compile with MKL support")
+ # # Setting the ssplit-cpp submodule specific cmake options for wasm
+ SET(SSPLIT_USE_INTERNAL_PCRE2 ON CACHE BOOL "Use internal PCRE2 instead of system PCRE2")
+endif()
+
+# Documentation: https://cliutils.gitlab.io/modern-cmake/chapters/projects/submodule.html
+# Ensures the submodules are set correctly during a build.
+find_package(Git QUIET)
+if(GIT_FOUND AND EXISTS "${REPOSITORY_ROOT_DIR}/.git")
+# Update submodules as needed
+ option(GIT_SUBMODULE "Check submodules during build" ON)
+ if(GIT_SUBMODULE)
+ message(STATUS "Submodule update")
+ execute_process(COMMAND ${GIT_EXECUTABLE} submodule update --init --recursive
+ WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+ RESULT_VARIABLE GIT_SUBMOD_RESULT)
+ if(NOT GIT_SUBMOD_RESULT EQUAL "0")
+ message(FATAL_ERROR "git submodule update --init failed with ${GIT_SUBMOD_RESULT}, please checkout submodules")
+ endif()
+ endif()
+endif()
+
+# Project versioning
+include(GetVersionFromFile)
+message(STATUS "Project name: ${PROJECT_NAME}")
+message(STATUS "Project version: ${PROJECT_VERSION_STRING_FULL}")
+
+if(COMPILE_WASM)
+ # See https://github.com/emscripten-core/emscripten/blob/main/src/settings.js
+ list(APPEND WASM_COMPILE_FLAGS
+ -O3
+ # Preserve whitespaces in JS even for release builds; this doesn't increase wasm binary size
    $<$<CONFIG:Release>:-g1>
+ # Relevant Debug info only for release with debug builds as this increases wasm binary size
    $<$<CONFIG:RelWithDebInfo>:-g2>
+ -fPIC
+ -mssse3
+ -msimd128
+ # -fno-exceptions # Can't do that because spdlog uses exceptions
+ -sDISABLE_EXCEPTION_CATCHING=1
+ -sSTRICT=1
+ )
+ list(APPEND WASM_LINK_FLAGS
+ -O3
+ # Preserve whitespaces in JS even for release builds; this doesn't increase wasm binary size
    $<$<CONFIG:Release>:-g1>
+ # Relevant Debug info only for release with debug builds as this increases wasm binary size
    $<$<CONFIG:RelWithDebInfo>:-g2>
+ -lembind
+ # Save some code, and some speed
+ -sASSERTIONS=0
+ -sDISABLE_EXCEPTION_CATCHING=1
+ # the intgemm functions we call will be undefined since these are linked at
+ # runtime by our own javascript.
+ -sLLD_REPORT_UNDEFINED
+ -sERROR_ON_UNDEFINED_SYMBOLS=0
+ # Cause we can!
+ -sSTRICT=1
+ # You know we need it
+ -sALLOW_MEMORY_GROWTH=1
+ -sENVIRONMENT=web,worker
+ # No need to call main(), there's nothing there.
+ -sINVOKE_RUN=0
+ # No need for filesystem code in the generated Javascript
+ -sFILESYSTEM=0
+ # If you turn this on, it will mangle names which makes the dynamic linking hard.
+ -sDECLARE_ASM_MODULE_EXPORTS=0
+ # Export all of the intgemm functions in case we need to fall back to using the embedded intgemm
+ -sEXPORTED_FUNCTIONS=[_int8PrepareAFallback,_int8PrepareBFallback,_int8PrepareBFromTransposedFallback,_int8PrepareBFromQuantizedTransposedFallback,_int8PrepareBiasFallback,_int8MultiplyAndAddBiasFallback,_int8SelectColumnsOfBFallback]
+ # Necessary for mozintgemm linking. This prepares the `wasmMemory` variable ahead of time as
+ # opposed to delegating that task to the wasm binary itself. This way we can link MozIntGEMM
+ # module to the same memory as the main bergamot-translator module.
+ -sIMPORTED_MEMORY=1
+ # Dynamic execution is either frowned upon or blocked inside browser extensions
+ -sDYNAMIC_EXECUTION=0
+ )
+endif(COMPILE_WASM)
+
+# Needs to be enabled before including the folder containing tests (src/tests)
+if(COMPILE_TESTS)
+ enable_testing()
+endif(COMPILE_TESTS)
+
+add_subdirectory(3rd_party)
+add_subdirectory(src)
+
+if(COMPILE_WASM)
+ add_subdirectory(wasm)
+endif(COMPILE_WASM)
+
+option(COMPILE_PYTHON "Compile python bindings. Intended to be activated with setup.py" OFF)
+if(COMPILE_PYTHON)
+ add_subdirectory(bindings/python)
+endif(COMPILE_PYTHON)
+
diff --git a/inference/cmake/GetVersionFromFile.cmake b/inference/cmake/GetVersionFromFile.cmake
new file mode 100644
index 000000000..47c35bc23
--- /dev/null
+++ b/inference/cmake/GetVersionFromFile.cmake
@@ -0,0 +1,60 @@
+##
+# This CMake modules sets the project version from a version file.
+#
+# The module sets the following variables:
+#
+# * PROJECT_VERSION_STRING
+# * PROJECT_VERSION_STRING_FULL
+# * PROJECT_VERSION_MAJOR
+# * PROJECT_VERSION_MINOR
+# * PROJECT_VERSION_PATCH
+# * PROJECT_VERSION_TWEAK
+# * PROJECT_VERSION_GIT_SHA
+#
+# This module is public domain, use it as it fits you best.
+##
+
+# Get full string version from file
+if(PROJECT_VERSION_FILE)
+ file(STRINGS ${PROJECT_VERSION_FILE} PROJECT_VERSION_STRING)
+else()
+ file(STRINGS ${CMAKE_CURRENT_SOURCE_DIR}/BERGAMOT_VERSION PROJECT_VERSION_STRING)
+endif()
+
+# Get current commit SHA from git
+execute_process(COMMAND ${GIT_EXECUTABLE} rev-parse --short HEAD
+ WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+ OUTPUT_VARIABLE PROJECT_VERSION_GIT_SHA
+ OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+# Get partial versions into a list
+string(REGEX MATCHALL "-.*$|[0-9]+" PROJECT_PARTIAL_VERSION_LIST
+ ${PROJECT_VERSION_STRING})
+
+# Set the version numbers
+list(GET PROJECT_PARTIAL_VERSION_LIST 0 PROJECT_VERSION_MAJOR)
+list(GET PROJECT_PARTIAL_VERSION_LIST 1 PROJECT_VERSION_MINOR)
+list(GET PROJECT_PARTIAL_VERSION_LIST 2 PROJECT_VERSION_PATCH)
+
+# The tweak part is optional, so check if the list contains it
+list(LENGTH PROJECT_PARTIAL_VERSION_LIST PROJECT_PARTIAL_VERSION_LIST_LEN)
+if(PROJECT_PARTIAL_VERSION_LIST_LEN GREATER 3)
+ list(GET PROJECT_PARTIAL_VERSION_LIST 3 PROJECT_VERSION_TWEAK)
+ string(SUBSTRING ${PROJECT_VERSION_TWEAK} 1 -1 PROJECT_VERSION_TWEAK)
+endif()
+
+# Unset the list
+unset(PROJECT_PARTIAL_VERSION_LIST)
+
+# Set full project version string
+set(PROJECT_VERSION_STRING_FULL
+ ${PROJECT_VERSION_STRING}+${PROJECT_VERSION_GIT_SHA})
+
+# Print all variables for debugging
+#message(STATUS ${PROJECT_VERSION_STRING_FULL})
+#message(STATUS ${PROJECT_VERSION_STRING})
+#message(STATUS ${PROJECT_VERSION_MAJOR})
+#message(STATUS ${PROJECT_VERSION_MINOR})
+#message(STATUS ${PROJECT_VERSION_PATCH})
+#message(STATUS ${PROJECT_VERSION_TWEAK})
+#message(STATUS ${PROJECT_VERSION_GIT_SHA})
diff --git a/inference/examples/run-native.sh b/inference/examples/run-native.sh
new file mode 100644
index 000000000..84e1302f0
--- /dev/null
+++ b/inference/examples/run-native.sh
@@ -0,0 +1,19 @@
+# In source-root folder
+
+# Obtain an example model from the web.
+mkdir -p models
+wget --quiet --continue --directory models/ \
+ https://data.statmt.org/bergamot/models/deen/ende.student.tiny11.v2.93821e13b3c511b5.tar.gz
+(cd models && tar -xzf ende.student.tiny11.v2.93821e13b3c511b5.tar.gz)
+
+# Patch the config-files generated from marian for use in bergamot.
+python3 bergamot-translator-tests/tools/patch-marian-for-bergamot.py \
+ --config-path models/ende.student.tiny11/config.intgemm8bitalpha.yml \
+ --ssplit-prefix-file $(realpath 3rd_party/ssplit-cpp/nonbreaking_prefixes/nonbreaking_prefix.en)
+
+# Patched config file will be available with .bergamot.yml suffix.
+CONFIG=models/ende.student.tiny11/config.intgemm8bitalpha.yml.bergamot.yml
+
+build/app/bergamot --model-config-paths $CONFIG --cpu-threads 4 <<< "Hello World!"
+# Hallo Welt!
+
diff --git a/inference/patches/01-marian-fstream-for-macos.patch b/inference/patches/01-marian-fstream-for-macos.patch
new file mode 100644
index 000000000..6b521ba7e
--- /dev/null
+++ b/inference/patches/01-marian-fstream-for-macos.patch
@@ -0,0 +1,13 @@
+diff --git a/3rd_party/browsermt-marian-dev/src/3rd_party/zstr/strict_fstream.hpp b/3rd_party/browsermt-marian-dev/src/3rd_party/zstr/strict_fstream.hpp
+index 7b1173931df977e69021f3995fa064a492f89d38..948e91eaf99b6b29ce41cf793fba6717f3b5f5b5 100644
+--- a/3rd_party/browsermt-marian-dev/src/3rd_party/zstr/strict_fstream.hpp
++++ b/3rd_party/browsermt-marian-dev/src/3rd_party/zstr/strict_fstream.hpp
+@@ -27,7 +27,7 @@ static std::string strerror()
+ {
+ buff = "Unknown error";
+ }
+-#elif (_POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || __APPLE__) && ! _GNU_SOURCE
++#elif (_POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || __APPLE__)
+ // XSI-compliant strerror_r()
+ if (strerror_r(errno, &buff[0], buff.size()) != 0)
+ {
diff --git a/inference/scripts/build-local.sh b/inference/scripts/build-local.sh
new file mode 100755
index 000000000..ae64689fe
--- /dev/null
+++ b/inference/scripts/build-local.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+set -e
+
+# Run script from the context of inference directory
+cd "$(dirname $0)/.."
+
+# Ensure script is running within docker
+./scripts/detect-docker.sh inference-build
+
+# Return the number of available CPUs, or default to 1 if nproc is unavailable.
+detect_cpus() {
+ if command -v nproc >/dev/null 2>&1; then
+ nproc
+ else
+ echo 1
+ fi
+}
+
+# Parse command-line arguments for the --test flag
+COMPILE_TESTS=OFF
+while [[ "$#" -gt 0 ]]; do
+ case $1 in
+ "--test") COMPILE_TESTS=ON ;;
+ *) echo "Unknown parameter passed: $1"; exit 1 ;;
+ esac
+ shift
+done
+
+if [ ! -d "build-local" ]; then
+ echo "Creating build-local directory..."
+ mkdir build-local
+else
+ echo "build-local directory already exists. Skipping creation."
+fi
+
+cd build-local || exit
+
+# Run cmake with optional COMPILE_TESTS flag
+echo "Running cmake for build-local..."
+if [ "$COMPILE_TESTS" = "ON" ]; then
+ cmake ../ -DCOMPILE_TESTS=ON
+else
+ cmake ../
+fi
+
+# Run make using the detected number of CPUs
+CPUS=$(detect_cpus)
+echo "Running make for build-local with $CPUS CPUs..."
+make -j ${CPUS}
+
diff --git a/inference/scripts/build-wasm.sh b/inference/scripts/build-wasm.sh
new file mode 100755
index 000000000..c21eea985
--- /dev/null
+++ b/inference/scripts/build-wasm.sh
@@ -0,0 +1,69 @@
+#!/usr/bin/env bash
+set -e
+
+# Run script from the context of inference directory
+cd "$(dirname $0)/.."
+
+# Ensure script is running within docker
+./scripts/detect-docker.sh inference-build-wasm
+
+set -x
+
+# Prerequisite: Download and Install Emscripten using following instructions (unless the EMSDK env var is already set)
+if [ "$EMSDK" == "" ]; then
+ EMSDK_UPDATE_REQUIRED=0
+ if [ ! -d "emsdk" ]; then
+ git clone https://github.com/emscripten-core/emsdk.git
+ EMSDK_UPDATE_REQUIRED=1
+ else
+ cd emsdk
+ git fetch
+ # Only pull if necessary
+ if [ $(git rev-parse HEAD) != $(git rev-parse @{u}) ]; then
+ git pull --ff-only
+ EMSDK_UPDATE_REQUIRED=1
+ fi
+ cd -
+ fi
+ if [ "$EMSDK_UPDATE_REQUIRED" == "1" ]; then
+ cd emsdk
+ ./emsdk install 3.1.8
+ ./emsdk activate 3.1.8
+ cd -
+ fi
+ source ./emsdk/emsdk_env.sh
+fi
+
+# Compile
+# 1. Create a folder where you want to build all the artifacts and compile
+BUILD_DIRECTORY="build-wasm"
+if [ ! -d ${BUILD_DIRECTORY} ]; then
+ mkdir ${BUILD_DIRECTORY}
+fi
+cd ${BUILD_DIRECTORY}
+
+emcmake cmake -DCOMPILE_WASM=on ../
+emmake make -j2
+
+# 2. Import GEMM library from a separate wasm module
+bash ../wasm/patch-artifacts-import-gemm-module.sh
+
+set +x
+echo ""
+echo "Build complete"
+echo ""
+echo " ./build-wasm/bergamot-translator-worker.js"
+echo " ./build-wasm/bergamot-translator-worker.wasm"
+
+WASM_SIZE=$(wc -c bergamot-translator-worker.wasm | awk '{print $1}')
+GZIP_SIZE=$(gzip -c bergamot-translator-worker.wasm | wc -c | xargs) # xargs trims the whitespace
+
+# Convert it to human readable.
+WASM_SIZE="$(awk 'BEGIN {printf "%.2f",'$WASM_SIZE'/1048576}')M ($WASM_SIZE bytes)"
+GZIP_SIZE="$(awk 'BEGIN {printf "%.2f",'$GZIP_SIZE'/1048576}')M ($GZIP_SIZE bytes)"
+
+echo " Uncompressed wasm size: $WASM_SIZE"
+echo " Compressed wasm size: $GZIP_SIZE"
+
+# The artifacts (.js and .wasm files) will be available in the build directory
+exit 0
diff --git a/inference/scripts/clean.sh b/inference/scripts/clean.sh
new file mode 100755
index 000000000..73f5ae5eb
--- /dev/null
+++ b/inference/scripts/clean.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+set -e
+
+# Run script from the context of inference directory
+cd "$(dirname $0)/.."
+
+# Ensure script is running within docker
+./scripts/detect-docker.sh inference-clean
+
+# List of directories to clean
+dirs=("build-local" "build-wasm" "emsdk")
+
+# Flag to track if any directories were cleaned
+cleaned=false
+
+# Check and remove directories
+for dir in "${dirs[@]}"; do
+ if [ -d "$dir" ]; then
+ echo "Removing $dir..."
+ rm -rf "$dir"
+ cleaned=true
+ fi
+done
+
+# If no directories were cleaned, print a message
+if [ "$cleaned" = false ]; then
+ echo "Nothing to clean"
+fi
+
diff --git a/inference/scripts/detect-docker.sh b/inference/scripts/detect-docker.sh
new file mode 100755
index 000000000..c1065349a
--- /dev/null
+++ b/inference/scripts/detect-docker.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+help_task=$1
+
+if [ -z "${IS_DOCKER}" ]; then
+ if [ "${ALLOW_RUN_ON_HOST}" != "1" ]; then
+ echo >&2
+ echo "Error: This script needs to be run inside Docker, or you must set ALLOW_RUN_ON_HOST=1." >&2
+ echo >&2
+ if [ -n "${help_task}" ]; then
+ echo " Help: To run this script directly in docker, run: task ${help_task}" >&2
+ fi
+ echo " Help: To enter docker, run: task docker" >&2
+ exit 1
+ else
+ echo >&2
+ echo "ALLOW_RUN_ON_HOST is set to 1. Continuing..." >&2
+ fi
+fi
diff --git a/inference/scripts/unit-tests.sh b/inference/scripts/unit-tests.sh
new file mode 100755
index 000000000..dd8be9925
--- /dev/null
+++ b/inference/scripts/unit-tests.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+set -e
+
+# Run script from the context of inference directory
+cd "$(dirname $0)/.."
+
+# Ensure script is running within docker
+./scripts/detect-docker.sh inference-test
+
+# Check if build-local/src/tests/units directory exists
+if [ ! -d "build-local/src/tests/units" ]; then
+ echo "Directory build-local/src/tests/units does not exist. Running build."
+ ./scripts/build-local.sh --test
+else
+ echo "Directory build-local/src/tests/units already exists. Skipping build."
+fi
+
+# Change to the unit tests directory
+cd build-local/src/tests/units
+
+# List of test commands
+tests=(
+ "./run_annotation_tests"
+ "./run_cache_tests"
+ "./run_html_tests"
+ "./run_quality_estimator_tests"
+ "./run_xh_scanner_tests"
+)
+
+# Run all tests, collect failures
+failures=0
+
+for test in "${tests[@]}"; do
+ echo "Running $test..."
+ if ! $test; then
+ echo "$test failed!"
+ failures=$((failures + 1))
+ fi
+done
+
+# If any test failed, exit with a non-zero status
+if [ $failures -gt 0 ]; then
+ echo "$failures test(s) failed."
+ exit 1
+else
+ echo "All tests passed successfully."
+ exit 0
+fi
+
diff --git a/inference/src/CMakeLists.txt b/inference/src/CMakeLists.txt
new file mode 100644
index 000000000..856831be9
--- /dev/null
+++ b/inference/src/CMakeLists.txt
@@ -0,0 +1,6 @@
+add_subdirectory(translator)
+
+if (COMPILE_TESTS)
+ add_subdirectory(tests)
+endif(COMPILE_TESTS)
+
diff --git a/inference/src/tests/CMakeLists.txt b/inference/src/tests/CMakeLists.txt
new file mode 100644
index 000000000..cd0e4c777
--- /dev/null
+++ b/inference/src/tests/CMakeLists.txt
@@ -0,0 +1,24 @@
+# Unit tests
+
+# Include Catch explicitly from marian.
+set(CATCH_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/3rd_party/browsermt-marian-dev/3rd-party)
+add_library(Catch INTERFACE)
+target_include_directories(Catch INTERFACE ${CATCH_INCLUDE_DIR})
+
+if (COMPILE_UNIT_TESTS)
+ add_subdirectory(units)
+endif (COMPILE_UNIT_TESTS)
+
+
+
+if(NOT MSVC)
+ # Testing apps
+ set(TEST_BINARIES async blocking intgemm-resolve wasm)
+ foreach(binary ${TEST_BINARIES})
+ add_executable("${binary}" "${binary}.cpp")
+ target_link_libraries("${binary}" bergamot-translator)
+ set_target_properties("${binary}" PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tests/")
+ endforeach(binary)
+
+endif(NOT MSVC)
+
diff --git a/inference/src/tests/async.cpp b/inference/src/tests/async.cpp
new file mode 100644
index 000000000..25ba334ae
--- /dev/null
+++ b/inference/src/tests/async.cpp
@@ -0,0 +1,27 @@
+#include "common.h"
+#include "translator/parser.h"
+#include "translator/service.h"
+#include "translator/translation_model.h"
+
+using namespace marian::bergamot;
+
+int main(int argc, char *argv[]) {
+  ConfigParser<AsyncService> configParser("AsyncService test-suite", /*multiOpMode=*/true);
+ configParser.parseArgs(argc, argv);
+ auto &config = configParser.getConfig();
+
+ AsyncService service(config.serviceConfig);
+
+  std::vector<std::shared_ptr<TranslationModel>> models;
+
+ for (auto &modelConfigPath : config.modelConfigPaths) {
+ TranslationModel::Config modelConfig = parseOptionsFromFilePath(modelConfigPath);
+    std::shared_ptr<TranslationModel> model = service.createCompatibleModel(modelConfig);
+ models.push_back(model);
+ }
+
+  TestSuite<AsyncService> testSuite(service);
+ testSuite.run(config.opMode, models);
+
+ return 0;
+}
diff --git a/inference/src/tests/blocking.cpp b/inference/src/tests/blocking.cpp
new file mode 100644
index 000000000..3bbb45634
--- /dev/null
+++ b/inference/src/tests/blocking.cpp
@@ -0,0 +1,25 @@
+#include "common.h"
+using namespace marian::bergamot;
+
+int main(int argc, char *argv[]) {
+  ConfigParser<BlockingService> configParser("BlockingService test-suite", /*multiOpMode=*/true);
+ configParser.parseArgs(argc, argv);
+
+ auto &config = configParser.getConfig();
+ BlockingService service(config.serviceConfig);
+
+  TestSuite<BlockingService> testSuite(service);
+  std::vector<std::shared_ptr<TranslationModel>> models;
+
+ for (auto &modelConfigPath : config.modelConfigPaths) {
+ TranslationModel::Config modelConfig = parseOptionsFromFilePath(modelConfigPath);
+    std::shared_ptr<TranslationModel> model = std::make_shared<TranslationModel>(modelConfig);
+ models.push_back(model);
+ }
+
+ /// WASM is one special case where WASM path is being checked, involving translateMultiple and a multi-line feed.
+ /// Hence we do not bind it at a single input-blob single Response constraint imposed by the TestSuite.
+ testSuite.run(config.opMode, models);
+
+ return 0;
+}
diff --git a/inference/src/tests/common-impl.cpp b/inference/src/tests/common-impl.cpp
new file mode 100644
index 000000000..431ddaa71
--- /dev/null
+++ b/inference/src/tests/common-impl.cpp
@@ -0,0 +1,316 @@
+
+#ifndef BERGAMOT_TESTS_COMMON_IMPL
+#error "This is an impl file and must not be included directly!"
+#endif
+
+Response Bridge<BlockingService>::translate(BlockingService &service, std::shared_ptr<TranslationModel> &model,
+ std::string &&source, const ResponseOptions &responseOptions) {
+ // project source to a vector of std::string, send in, unpack the first element from
+ // vector, return.
+ std::vector<std::string> sources = {source};
+ std::vector<ResponseOptions> options = {responseOptions};
+ return service.translateMultiple(model, std::move(sources), options).front();
+}
+
+Response Bridge<AsyncService>::translate(AsyncService &service, std::shared_ptr<TranslationModel> &model,
+ std::string &&source, const ResponseOptions &responseOptions) {
+ // downgrade to blocking via promise, future, wait and return response;
+ std::promise<Response> responsePromise;
+ std::future<Response> responseFuture = responsePromise.get_future();
+
+ auto callback = [&responsePromise](Response &&response) { responsePromise.set_value(std::move(response)); };
+ service.translate(model, std::move(source), callback, responseOptions);
+
+ responseFuture.wait();
+
+ Response response = responseFuture.get();
+ return response;
+}
+
+Response Bridge<BlockingService>::pivot(BlockingService &service, std::shared_ptr<TranslationModel> &sourceToPivot,
+ std::shared_ptr<TranslationModel> &pivotToTarget, std::string &&source,
+ const ResponseOptions &responseOptions) {
+ std::vector<std::string> sources = {source};
+ std::vector<ResponseOptions> options = {responseOptions};
+ return service.pivotMultiple(sourceToPivot, pivotToTarget, std::move(sources), options).front();
+}
+
+Response Bridge<AsyncService>::pivot(AsyncService &service, std::shared_ptr<TranslationModel> &sourceToPivot,
+ std::shared_ptr<TranslationModel> &pivotToTarget, std::string &&source,
+ const ResponseOptions &responseOptions) {
+ std::promise<Response> responsePromise;
+ std::future<Response> responseFuture = responsePromise.get_future();
+
+ auto callback = [&responsePromise](Response &&response) { responsePromise.set_value(std::move(response)); };
+ service.pivot(sourceToPivot, pivotToTarget, std::move(source), callback, responseOptions);
+ responseFuture.wait();
+ Response response = responseFuture.get();
+ return response;
+}
+
+template <class Service>
+TestSuite<Service>::TestSuite(Service &service) : service_{service} {}
+
+template <class Service>
+void TestSuite<Service>::run(const std::string &opModeAsString, std::vector<Ptr<TranslationModel>> &models) {
+ if (opModeAsString == "decoder") {
+ benchmarkDecoder(models.front());
+ } else if (opModeAsString == "test-response-source-sentences") {
+ annotatedTextSentences(models.front(), /*source=*/true);
+ } else if (opModeAsString == "test-response-target-sentences") {
+ annotatedTextSentences(models.front(), /*source=*/false);
+ } else if (opModeAsString == "test-response-source-words") {
+ annotatedTextWords(models.front(), /*source=*/true);
+ } else if (opModeAsString == "test-response-target-words") {
+ annotatedTextWords(models.front(), /*source=*/false);
+ } else if (opModeAsString == "test-forward-backward") {
+ forwardAndBackward(models);
+ } else if (opModeAsString == "test-quality-estimator-words") {
+ qualityEstimatorWords(models.front());
+ } else if (opModeAsString == "test-quality-estimator-scores") {
+ qualityEstimatorScores(models.front());
+ } else if (opModeAsString == "test-translation-cache") {
+ translationCache(models.front());
+ } else if (opModeAsString == "test-pivot") {
+ pivotTranslate(models);
+ } else if (opModeAsString == "test-pivot-with-html") {
+ pivotTranslateWithHTML(models);
+ } else if (opModeAsString == "test-html-translation") {
+ htmlTranslation(models.front());
+ } else {
+ std::cerr << "Incompatible test mode. Choose from the one of the valid test-modes";
+ std::abort();
+ }
+}
+
+template <class Service>
+void TestSuite<Service>::benchmarkDecoder(Ptr<TranslationModel> &model) {
+ marian::timer::Timer decoderTimer;
+ std::string source = readFromStdin();
+
+ ResponseOptions responseOptions;
+ Response response = bridge_.translate(service_, model, std::move(source), responseOptions);
+
+ for (size_t sentenceIdx = 0; sentenceIdx < response.size(); sentenceIdx++) {
+ std::cout << response.target.sentence(sentenceIdx) << "\n";
+ }
+
+ std::cerr << "Total time: " << std::setprecision(5) << decoderTimer.elapsed() << "s wall" << std::endl;
+}
+
+// Reads from stdin and translates. Prints the tokens separated by space for each sentence. Prints words from source
+// side text annotation if source=true, target annotation otherwise.
+template <class Service>
+void TestSuite<Service>::annotatedTextWords(Ptr<TranslationModel> model, bool sourceSide /*=true*/) {
+ ResponseOptions responseOptions;
+ std::string source = readFromStdin();
+ Response response = bridge_.translate(service_, model, std::move(source), responseOptions);
+ AnnotatedText &annotatedText = sourceSide ? response.source : response.target;
+ for (size_t s = 0; s < annotatedText.numSentences(); s++) {
+ for (size_t w = 0; w < annotatedText.numWords(s); w++) {
+ std::cout << (w == 0 ? "" : "\t");
+ std::cout << annotatedText.word(s, w);
+ }
+ std::cout << "\n";
+ }
+}
+
+// Reads from stdin and translates the read content. Prints the sentences in source or target in constructed response
+// in each line, depending on source = true or false respectively.
+template <class Service>
+void TestSuite<Service>::annotatedTextSentences(Ptr<TranslationModel> model, bool sourceSide /*=true*/) {
+ ResponseOptions responseOptions;
+ std::string source = readFromStdin();
+ Response response = bridge_.translate(service_, model, std::move(source), responseOptions);
+ AnnotatedText &annotatedText = sourceSide ? response.source : response.target;
+ for (size_t s = 0; s < annotatedText.numSentences(); s++) {
+ std::cout << annotatedText.sentence(s) << "\n";
+ }
+}
+
+template <class Service>
+void TestSuite<Service>::forwardAndBackward(std::vector<Ptr<TranslationModel>> &models) {
+ ABORT_IF(models.size() != 2, "Forward and backward test needs two models.");
+ ResponseOptions responseOptions;
+ std::string source = readFromStdin();
+ Response forwardResponse = bridge_.translate(service_, models.front(), std::move(source), responseOptions);
+
+ // Make a copy of target
+ std::string target = forwardResponse.target.text;
+ Response backwardResponse = bridge_.translate(service_, models.back(), std::move(target), responseOptions);
+
+ // Print both onto the command-line
+ std::cout << forwardResponse.source.text;
+ std::cout << "----------------\n";
+ std::cout << forwardResponse.target.text;
+ std::cout << "----------------\n";
+ std::cout << backwardResponse.target.text;
+}
+
+// Reads from stdin and translates the read content. Prints the quality words for each sentence.
+template <class Service>
+void TestSuite<Service>::qualityEstimatorWords(Ptr<TranslationModel> model) {
+ ResponseOptions responseOptions;
+ responseOptions.qualityScores = true;
+ std::string source = readFromStdin();
+ const Response response = bridge_.translate(service_, model, std::move(source), responseOptions);
+
+ for (size_t sentenceIdx = 0; sentenceIdx < response.qualityScores.size(); ++sentenceIdx) {
+ const auto &sentenceQualityEstimate = response.qualityScores[sentenceIdx];
+ std::cout << "[SentenceBegin]\n";
+
+ for (const auto &wordByteRange : getWordByteRanges(response, sentenceIdx)) {
+ const string_view word(response.target.text.data() + wordByteRange.begin, wordByteRange.size());
+ std::cout << word << "\n";
+ }
+ std::cout << "[SentenceEnd]\n\n";
+ }
+}
+
+template <class Service>
+void TestSuite<Service>::htmlTranslation(Ptr<TranslationModel> model) {
+ ResponseOptions responseOptions;
+ responseOptions.HTML = true;
+ responseOptions.alignment = true;
+ std::string source = readFromStdin();
+ const Response response = bridge_.translate(service_, model, std::move(source), responseOptions);
+
+ std::cout << response.target.text;
+}
+
+// Reads from stdin and translates the read content. Prints the quality scores for each sentence.
+template <class Service>
+void TestSuite<Service>::qualityEstimatorScores(Ptr<TranslationModel> model) {
+ ResponseOptions responseOptions;
+ responseOptions.qualityScores = true;
+
+ std::string source = readFromStdin();
+ const Response response = bridge_.translate(service_, model, std::move(source), responseOptions);
+
+ for (const auto &sentenceQualityEstimate : response.qualityScores) {
+ std::cout << std::fixed << std::setprecision(3) << sentenceQualityEstimate.sentenceScore << "\n";
+
+ for (const float &wordScore : sentenceQualityEstimate.wordScores) {
+ std::cout << std::fixed << std::setprecision(3) << wordScore << "\n";
+ }
+ std::cout << "\n";
+ }
+}
+
+template <class Service>
+void TestSuite<Service>::translationCache(Ptr<TranslationModel> model) {
+ ResponseOptions responseOptions;
+
+ // Read a large input text blob from stdin
+ const std::string source = readFromStdin();
+
+ // Round 1
+ std::string buffer = source;
+ Response firstResponse = bridge_.translate(service_, model, std::move(buffer), responseOptions);
+
+ auto statsFirstRun = service_.cacheStats();
+ LOG(info, "Cache Hits/Misses = {}/{}", statsFirstRun.hits, statsFirstRun.misses);
+ ABORT_IF(statsFirstRun.hits != 0, "Expecting no cache hits, but hits found.");
+
+ // Round 2; There should be cache hits
+ buffer = source;
+ Response secondResponse = bridge_.translate(service_, model, std::move(buffer), responseOptions);
+
+ auto statsSecondRun = service_.cacheStats();
+ LOG(info, "Cache Hits/Misses = {}/{}", statsSecondRun.hits, statsSecondRun.misses);
+ ABORT_IF(statsSecondRun.hits <= 0, "At least one hit expected, none found.");
+ if (statsSecondRun.hits != statsFirstRun.misses) {
+ std::cerr << "Mismatch in expected hits (Hits, Misses = " << statsSecondRun.hits << ", " << statsSecondRun.misses
+ << "). This can happen due to random eviction." << std::endl;
+ }
+
+ ABORT_IF(firstResponse.target.text != secondResponse.target.text,
+ "Recompiled string provided different output when operated with cache. On the same hardware while using "
+ "same path, this is expected to be same.");
+
+ std::cout << firstResponse.target.text;
+}
+
+template <class Service>
+void TestSuite<Service>::pivotTranslateWithHTML(std::vector<Ptr<TranslationModel>> &models) {
+ ABORT_IF(models.size() != 2, "Forward and backward test needs two models.");
+ ResponseOptions responseOptions;
+ responseOptions.HTML = true;
+ std::string source = readFromStdin();
+ std::promise<Response> responsePromise;
+ std::future<Response> responseFuture = responsePromise.get_future();
+ Response response = bridge_.pivot(service_, models.front(), models.back(), std::move(source), responseOptions);
+ std::cout << response.source.text;
+ std::cout << response.target.text;
+}
+
+template <class Service>
+void TestSuite<Service>::pivotTranslate(std::vector<Ptr<TranslationModel>> &models) {
+ // We expect a source -> pivot; pivot -> source model to get source -> source and build this test using accuracy of
+ // matches.
+ ABORT_IF(models.size() != 2, "Forward and backward test needs two models.");
+ ResponseOptions responseOptions;
+ responseOptions.alignment = true;
+ std::string source = readFromStdin();
+ std::promise<Response> responsePromise;
+ std::future<Response> responseFuture = responsePromise.get_future();
+
+ Response response = bridge_.pivot(service_, models.front(), models.back(), std::move(source), responseOptions);
+
+ const float EPS = 1e-5;
+ size_t totalOutcomes = 0;
+ size_t favourableOutcomes = 0;
+
+ for (size_t sentenceId = 0; sentenceId < response.source.numSentences(); sentenceId++) {
+ std::cout << "> " << response.source.sentence(sentenceId) << "\n";
+ std::cout << "< " << response.target.sentence(sentenceId) << "\n\n";
+
+ // Assert what we have is a probability distribution over source-tokens given a target token.
+ for (size_t t = 0; t < response.alignments[sentenceId].size(); t++) {
+ float sum = 0.0f;
+ for (size_t s = 0; s < response.alignments[sentenceId][t].size(); s++) {
+ sum += response.alignments[sentenceId][t][s];
+ }
+
+ std::cerr << fmt::format("Sum @ (target-token = {}, sentence = {}) = {}", t, sentenceId, sum) << std::endl;
+ ABORT_IF((std::abs(sum - 1.0f) > EPS), "Not a probability distribution, something's going wrong");
+ }
+
+ // For each target token, find argmax s, i.e find argmax p(s | t), max p(s | t)
+ for (size_t t = 0; t < response.alignments[sentenceId].size(); t++) {
+ bool valid = false;
+ float maxV = 0.0f;
+ auto argmaxV = std::make_pair(-1, -1);
+ for (size_t s = 0; s < response.alignments[sentenceId][t].size(); s++) {
+ auto v = response.alignments[sentenceId][t][s];
+ if (v > maxV) {
+ maxV = v;
+ argmaxV = std::make_pair(t, s);
+ }
+ }
+
+ auto sourceToken = response.source.word(sentenceId, argmaxV.second);
+ auto targetToken = response.target.word(sentenceId, argmaxV.first);
+ if (sourceToken == targetToken) {
+ favourableOutcomes += 1;
+ }
+
+ std::cerr << sourceToken << " " << targetToken << " " << maxV << std::endl;
+
+ totalOutcomes += 1;
+ }
+
+ // Assert each alignment over target is a valid probability distribution.
+ }
+
+ // Measure accuracy of word match.
+ float accuracy = static_cast<float>(favourableOutcomes) / static_cast<float>(totalOutcomes);
+
+ // This is arbitrary value chosen by @jerinphilip, but should be enough to check if things fail.
+ // This value is calibrated on bergamot input in BRT. All this is supposed to do is let the developers know if
+ // something's largely amiss to the point alignments are not working.
+ ABORT_IF(accuracy < 0.70, "Accuracy {} not enough. Please check if something's off.", accuracy * 100);
+
+ std::cout << response.source.text;
+ std::cout << response.target.text;
+}
diff --git a/inference/src/tests/common.h b/inference/src/tests/common.h
new file mode 100644
index 000000000..238a62357
--- /dev/null
+++ b/inference/src/tests/common.h
@@ -0,0 +1,100 @@
+#pragma once
+#include <future>
+#include <iomanip>
+#include <iostream>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "common/definitions.h"
+#include "common/timer.h"
+#include "common/utils.h"
+#include "marian.h"
+#include "translator/byte_array_util.h"
+#include "translator/parser.h"
+#include "translator/response.h"
+#include "translator/response_options.h"
+#include "translator/service.h"
+#include "translator/utils.h"
+
+namespace marian::bergamot {
+
+/// Because the extension (WASM) and native paths do not agree on an API (e.g., translateMultiple vs translate) and
+/// use different underlying caches, we provide the following "bridge" at the test-application level - taking into
+/// account the fact that the most commonly used primitive across both Services is a single text blob in and a
+/// corresponding Response out, in a blocking fashion.
+///
+/// The following contraption constrains a single sentence to single Response parameterized by Service, in a test-suite
+/// below. This allows sharing of code for test-suite between WebAssembly's workflows and Native's workflows.
+///
+/// The intention here is to use templating to achieve the same thing an ifdef would have at compile-time. Also mandates
+/// after bridge layer, both WebAssembly and Native paths compile correctly (this does not guarantee outputs are the
+/// same through both code-paths, or that both are tested at runtime - only that both compile and work with a bridge).
+///
+/// For any complex workflows involving non-blocking concurrent translation, it is required to write something not
+/// constrained by the following.
+
+template <class Service>
+struct Bridge : public std::false_type {};
+
+template <>
+struct Bridge<BlockingService> : public std::true_type {
+ Response translate(BlockingService &service, std::shared_ptr<TranslationModel> &model, std::string &&source,
+ const ResponseOptions &responseOptions);
+ Response pivot(BlockingService &service, std::shared_ptr<TranslationModel> &sourceToPivot,
+ std::shared_ptr<TranslationModel> &pivotToTarget, std::string &&source,
+ const ResponseOptions &responseOptions);
+};
+
+template <>
+struct Bridge<AsyncService> : public std::true_type {
+ Response translate(AsyncService &service, std::shared_ptr<TranslationModel> &model, std::string &&source,
+ const ResponseOptions &responseOptions);
+ Response pivot(AsyncService &service, std::shared_ptr<TranslationModel> &sourceToPivot,
+ std::shared_ptr<TranslationModel> &pivotToTarget, std::string &&source,
+ const ResponseOptions &responseOptions);
+};
+
+template <class Service>
+class TestSuite {
+ private:
+ Bridge<Service> bridge_;
+ Service &service_;
+
+ public:
+ TestSuite(Service &service);
+ void run(const std::string &opModeAsString, std::vector<Ptr<TranslationModel>> &models);
+
+ private:
+ void benchmarkDecoder(Ptr<TranslationModel> &model);
+
+ // Reads from stdin and translates. Prints the tokens separated by space for each sentence. Prints words from source
+ // side text annotation if source=true, target annotation otherwise.
+ void annotatedTextWords(Ptr<TranslationModel> model, bool sourceSide = true);
+
+ // Reads from stdin and translates the read content. Prints the sentences in source or target in constructed response
+ // in each line, depending on source = true or false respectively.
+ void annotatedTextSentences(Ptr<TranslationModel> model, bool sourceSide = true);
+
+ void forwardAndBackward(std::vector<Ptr<TranslationModel>> &models);
+
+ // Reads from stdin and translates the read content. Prints the quality words for each sentence.
+ void qualityEstimatorWords(Ptr<TranslationModel> model);
+
+ // Reads from stdin and translates the read content. Prints the quality scores for each sentence.
+ void qualityEstimatorScores(Ptr<TranslationModel> model);
+
+ void translationCache(Ptr<TranslationModel> model);
+
+ void pivotTranslate(std::vector<Ptr<TranslationModel>> &models);
+
+ void pivotTranslateWithHTML(std::vector<Ptr<TranslationModel>> &models);
+
+ void htmlTranslation(Ptr<TranslationModel> model);
+};
+
+#define BERGAMOT_TESTS_COMMON_IMPL
+#include "common-impl.cpp"
+#undef BERGAMOT_TESTS_COMMON_IMPL
+
+} // namespace marian::bergamot
diff --git a/inference/src/tests/intgemm-resolve.cpp b/inference/src/tests/intgemm-resolve.cpp
new file mode 100644
index 000000000..f95d0c449
--- /dev/null
+++ b/inference/src/tests/intgemm-resolve.cpp
@@ -0,0 +1,8 @@
+#include <iostream>
+
+#include "intgemm/intgemm.h"
+
+int main() {
+ std::cout << static_cast<int>(intgemm::kCPU) << "\n";
+ return 0;
+}
diff --git a/inference/src/tests/units/CMakeLists.txt b/inference/src/tests/units/CMakeLists.txt
new file mode 100644
index 000000000..9cfb50006
--- /dev/null
+++ b/inference/src/tests/units/CMakeLists.txt
@@ -0,0 +1,25 @@
+# Unit tests
+set(UNIT_TESTS
+ annotation_tests
+ cache_tests
+ quality_estimator_tests
+ html_tests
+ xh_scanner_tests)
+
+foreach(test ${UNIT_TESTS})
+ add_executable("run_${test}" run_tests.cpp "${test}.cpp")
+ target_include_directories("run_${test}" PRIVATE ${CATCH_INCLUDE_DIR} "${CMAKE_SOURCE_DIR}/src")
+
+ if(CUDA_FOUND)
+ target_link_libraries("run_${test}" ${EXT_LIBS} marian ${EXT_LIBS} marian_cuda ${EXT_LIBS} Catch bergamot-translator)
+ else(CUDA_FOUND)
+ target_link_libraries("run_${test}" marian ${EXT_LIBS} Catch bergamot-translator)
+ endif(CUDA_FOUND)
+
+ if(msvc)
+ # disable c4305: truncation from 'double' to '_ty'
+ target_compile_options("run_${test}" public /wd4305)
+ endif(msvc)
+
+ add_test(NAME ${test} COMMAND "run_${test}")
+endforeach(test)
diff --git a/inference/src/tests/units/annotation_tests.cpp b/inference/src/tests/units/annotation_tests.cpp
new file mode 100644
index 000000000..d7178f4df
--- /dev/null
+++ b/inference/src/tests/units/annotation_tests.cpp
@@ -0,0 +1,214 @@
+#include <random>
+#include <vector>
+
+#include "catch.hpp"
+#include "translator/annotation.h"
+
+using namespace marian::bergamot;
+
+TEST_CASE("Test Annotation API with random sentences") {
+ /// Objective here is to test insertion for sentences, and that whatever comes
+ /// out adheres to the way it was inserted. Towards this, we keep externally
+ /// which sentence went in where and try to use accessor methods on
+ /// AnnotatedText to check if what we have as ground-truth by construction is
+ /// consistent with what is returned.
+ size_t sentences = 500;
+ size_t maxWords = 40;
+
+ // Set in case needed to see output. The output is in lines of #sentences +
+ // header, which can be split and compared for easy understanding. The ideal
+ // way to inspect what is going wrong is to redirect output and use to split
+ // the different stages by sentences + 1 lines and check the diff.
+ bool debug{false};
+
+ std::mt19937 randomIntGen_;
+ randomIntGen_.seed(42);
+
+ // External book-keeping so we have ground truths. Each element represents a
+ // sentence.
+
+ // word byte ranges - for testAnnotation.word(sId, wId)
+ std::vector<std::vector<ByteRange>> groundTruthWords;
+ // sentence byte ranges - for testAnnotation.sentence(sId, wId)
+ std::vector<ByteRange> groundTruthSentences;
+
+ // Prepare the text and construct ByteRanges as intended for sentences and
+ // words. The ByteRanges we construct here are expected to be the
+ // ground-truths for words and sentences. The string being constructed is like
+ // as follows:
+ //
+ // 0-0 0-1 0-2 0-3
+ // 1-0 1-1 1-2 1-3 1-4
+ // 2-0 2-1
+ //
+ // 4-0 4-1 4-2 4-3
+ //
+ // Tokens are contiguous because that's how SentencePiece works.
+ //
+ // Below, we accumulate the text with intended structure as above, and
+ // ground-truth tables populated to be aware of the ByteRanges where they are
+ // meant to be.
+ if (debug) {
+ std::cout << "Preparing text and ground truth-tables" << std::endl;
+ }
+ std::string text;
+ for (size_t idx = 0; idx < sentences; idx++) {
+ if (idx != 0) text += "\n";
+
+ // Words can be zero, we need to support empty word sentences as well.
+ size_t numWords = randomIntGen_() % maxWords;
+
+ std::vector<ByteRange> wordByteRanges;
+ wordByteRanges.reserve(numWords);
+
+ // For empty sentence, we expect it to be empty and marked in position where
+ // the existing string is if needed to be pointed out.
+ size_t before = text.size() - 1;
+ size_t sentenceBegin{before}, sentenceEnd{before};
+
+ for (size_t idw = 0; idw < numWords; idw++) {
+ // Get new beginning, accounting for space above.
+ before = text.size();
+
+ // Add the word
+ std::string word = std::to_string(idx) + "-" + std::to_string(idw);
+ text += word;
+
+ // Do math, before, before + new-word's size.
+ wordByteRanges.push_back((ByteRange){before, before + word.size()});
+
+ if (debug) {
+ std::cout << word;
+ }
+
+ if (idw == 0) {
+ sentenceBegin = before;
+ }
+ if (idw == numWords - 1) {
+ sentenceEnd = before + word.size();
+ }
+ }
+ if (debug) {
+ std::cout << std::endl;
+ }
+
+ groundTruthWords.push_back(wordByteRanges);
+ groundTruthSentences.push_back((ByteRange){sentenceBegin, sentenceEnd});
+ }
+
+ AnnotatedText testAnnotation(std::move(text)); // This the container we add through API and
+ // check if the access is correct.
+
+ // We prepare string_views now with the known ByteRanges and use the
+ // string_view based AnnotatedText.addSentence(...) API to add sentences to
+ // transparently convert from string_views to ByteRanges, rebasing/working out
+ // the math underneath.
+
+ if (debug) {
+ std::cout << "Inserting words onto container and save ground-truth-table:" << std::endl;
+ }
+
+ std::vector<std::vector<marian::string_view>> wordStringViews;
+ std::vector<ByteRange>::const_iterator sentence_iter = groundTruthSentences.begin();
+ for (auto &sentence : groundTruthWords) {
+ std::vector<marian::string_view> wordByteRanges;
+ bool first{true};
+ for (auto &word : sentence) {
+ marian::string_view wordView(&testAnnotation.text[word.begin], word.size());
+ wordByteRanges.push_back(wordView);
+ if (debug) {
+ if (first) {
+ first = false;
+ } else {
+ std::cout << " ";
+ }
+ std::cout << std::string(wordView);
+ }
+ }
+ testAnnotation.recordExistingSentence(wordByteRanges.begin(), wordByteRanges.end(),
+ testAnnotation.text.data() + sentence_iter->begin);
+ ++sentence_iter;
+ wordStringViews.push_back(wordByteRanges);
+ if (debug) {
+ std::cout << std::endl;
+ }
+ }
+
+ if (debug) {
+ std::cout << "Inserting sentences onto container and save ground-truth-table" << std::endl;
+ }
+ std::vector<marian::string_view> sentenceStringViews;
+ for (auto &sentenceByteRange : groundTruthSentences) {
+ char *data = &(testAnnotation.text[sentenceByteRange.begin]);
+ marian::string_view sentenceView(data, sentenceByteRange.size());
+ sentenceStringViews.push_back(sentenceView);
+
+ if (debug) {
+ std::cout << sentenceView << std::endl;
+ }
+ }
+
+ // Access from the sentence(sentenceIdx) API and confirm that the ground truth
+ // we expect is same as what comes out of the container.
+ if (debug) {
+ std::cout << "From container: Sentences" << std::endl;
+ }
+ for (int idx = 0; idx < groundTruthSentences.size(); idx++) {
+ ByteRange expected = groundTruthSentences[idx];
+ ByteRange obtained = testAnnotation.sentenceAsByteRange(idx);
+ if (debug) {
+ std::cout << std::string(testAnnotation.sentence(idx)) << std::endl;
+ }
+ CHECK(expected.begin == obtained.begin);
+ CHECK(expected.end == obtained.end);
+ std::string expected_string = std::string(sentenceStringViews[idx]);
+ std::string obtained_string = std::string(testAnnotation.sentence(idx));
+ CHECK(expected_string == obtained_string);
+ }
+
+ /// Access the word(sentenceIdx, wordIdx) API and confirm what we hold as
+ /// expected words are the same as those obtained from the container.
+ if (debug) {
+ std::cout << "From container: Words" << std::endl;
+ }
+
+ CHECK(groundTruthWords.size() == testAnnotation.numSentences());
+ for (int idx = 0; idx < groundTruthWords.size(); idx++) {
+ CHECK(groundTruthWords[idx].size() == testAnnotation.numWords(idx));
+ }
+
+ for (int idx = 0; idx < groundTruthWords.size(); idx++) {
+ for (int idw = 0; idw < groundTruthWords[idx].size(); idw++) {
+ ByteRange expected = groundTruthWords[idx][idw];
+ ByteRange obtained = testAnnotation.wordAsByteRange(idx, idw);
+ if (debug) {
+ std::cout << std::string(testAnnotation.word(idx, idw)) << " ";
+ }
+ CHECK(expected.begin == obtained.begin);
+ CHECK(expected.end == obtained.end);
+
+ std::string expected_string = std::string(wordStringViews[idx][idw]);
+ std::string obtained_string = std::string(testAnnotation.word(idx, idw));
+ CHECK(expected_string == obtained_string);
+ }
+ if (debug) {
+ std::cout << std::endl;
+ }
+ }
+
+ // Try inserting an empty Sentence. This is ensuring we check for empty
+ // Sentence if the random test above does not cover it for some reason.
+ int emptySentenceIdx = sentences;
+ std::vector<marian::string_view> emptySentence;
+ testAnnotation.recordExistingSentence(emptySentence.begin(), emptySentence.end(),
+ testAnnotation.text.data() + testAnnotation.text.size());
+
+ // There are no words.
+ CHECK(testAnnotation.numWords(emptySentenceIdx) == 0);
+
+ // Empty sentence expected at output.
+ std::string expectedEmptyString = "";
+ marian::string_view emptyView = testAnnotation.sentence(emptySentenceIdx);
+ std::string obtainedString = std::string(emptyView.data(), emptyView.size());
+ CHECK(expectedEmptyString == obtainedString);
+}
diff --git a/inference/src/tests/units/cache_tests.cpp b/inference/src/tests/units/cache_tests.cpp
new file mode 100644
index 000000000..f2f1b19ed
--- /dev/null
+++ b/inference/src/tests/units/cache_tests.cpp
@@ -0,0 +1,56 @@
+
+#include <random>
+#include <thread>
+
+#include "catch.hpp"
+#include "translator/cache.h"
+#include "translator/history.h"
+
+using namespace marian::bergamot;
+
+TEST_CASE("Test Cache in a threaded setting") {
+ size_t numThreads = 100;
+ size_t numIters = 10000;
+ using Key = int;
+ using Value = int;
+ using TestCache = AtomicCache<Key, Value>;
+
+ TestCache cache(/*size=*/300, /*mutexBuckets=*/16);
+
+ auto op = [numIters, &cache]() {
+ std::mt19937_64 randomGenerator;
+ randomGenerator.seed(42); // reproducible outputs
+ Value randMax = 2000;
+
+ for (size_t i = 0; i < numIters; i++) {
+ Key query = randomGenerator() % randMax;
+ std::pair<bool, Value> result = cache.find(query);
+ if (result.first) {
+ REQUIRE(result.second == query);
+ }
+
+ Value value = query;
+ cache.store(/*key=*/query, std::move(value));
+ }
+ };
+
+ std::vector<std::thread> workers;
+ for (size_t t = 0; t < numThreads; t++) {
+ workers.emplace_back(op);
+ }
+
+ for (size_t t = 0; t < numThreads; t++) {
+ workers[t].join();
+ }
+
+ TestCache::Stats stats = cache.stats();
+ float hitRate = static_cast<float>(stats.hits) / static_cast<float>(stats.hits + stats.misses);
+
+ // This is non-deterministic due to threads.
+ std::cout << "Hit-Rate:" << hitRate << "\n";
+ std::cout << "(Hits, Misses) = " << stats.hits << " " << stats.misses << "\n";
+
+ // Can we create a specialization of the actual cache-type we want? Does it compile, at least?
+ // We already have Ptr<History>, it's easier to move Ptr<History> to cache.
+ TranslationCache translationCache(/*size=*/300, /*mutexBuckets=*/16);
+}
diff --git a/inference/src/tests/units/html_tests.cpp b/inference/src/tests/units/html_tests.cpp
new file mode 100644
index 000000000..96eff5aad
--- /dev/null
+++ b/inference/src/tests/units/html_tests.cpp
@@ -0,0 +1,880 @@
+#include "html_tests.h"
+
+#include <algorithm>
+
+#include "catch.hpp"
+#include "data/types.h" // for marian::string_view
+#include "translator/html.h"
+#include "translator/response.h"
+
+using namespace marian::bergamot;
+using marian::string_view;
+
+class MarianThrowsExceptionsFixture {
+ protected:
+ MarianThrowsExceptionsFixture() : prev_(marian::getThrowExceptionOnAbort()) {
+ marian::setThrowExceptionOnAbort(true);
+ }
+ ~MarianThrowsExceptionsFixture() { marian::setThrowExceptionOnAbort(prev_); }
+
+ private:
+ bool prev_;
+};
+
+std::ostream &operator<<(std::ostream &out, std::pair<ByteRange, ByteRange> const &b) {
+ return out << '(' << b.first << ',' << b.second << ')';
+}
+
+std::ostream &operator<<(std::ostream &out, ByteRange const &b) { return out << '{' << b.begin << ',' << b.end << '}'; }
+
+std::vector<ByteRange> asByteRanges(AnnotatedText const &annotation) {
+ std::vector<ByteRange> words;
+ words.emplace_back(annotation.annotation.gap(0));
+ for (size_t sentenceIdx = 0; sentenceIdx < annotation.numSentences(); ++sentenceIdx) {
+ for (size_t wordIdx = 0; wordIdx < annotation.numWords(sentenceIdx); ++wordIdx)
+ words.emplace_back(annotation.wordAsByteRange(sentenceIdx, wordIdx));
+ words.emplace_back(annotation.annotation.gap(sentenceIdx + 1));
+ }
+ return words;
+}
+
+std::vector<std::string> asTokens(AnnotatedText const &annotation) {
+ std::vector<std::string> words;
+ words.emplace_back(annotation.gap(0));
+ for (size_t sentenceIdx = 0; sentenceIdx < annotation.numSentences(); ++sentenceIdx) {
+ for (size_t wordIdx = 0; wordIdx < annotation.numWords(sentenceIdx); ++wordIdx)
+ words.emplace_back(annotation.word(sentenceIdx, wordIdx));
+ words.emplace_back(annotation.gap(sentenceIdx + 1));
+ }
+ return words;
+}
+
+void recordSentenceFromByteRange(AnnotatedText &text, std::vector<ByteRange> const &ranges) {
+ assert(ranges.size() > 0);
+
+ std::vector<string_view> tokens;
+ tokens.reserve(ranges.size());
+
+ for (auto &&range : ranges) tokens.emplace_back(text.text.data() + range.begin, range.size());
+
+ text.recordExistingSentence(tokens.begin(), tokens.end(), text.text.data() + ranges[0].begin);
+}
+
+template <typename T>
+std::vector<std::vector<T>> identity_matrix(size_t size) {
+ std::vector<std::vector<T>> rows(size);
+ for (size_t row = 0; row < size; ++row) {
+ rows[row].resize(size, T(0));
+ rows[row][row] = T(1);
+ }
+ return rows;
+}
+
+TEST_CASE("Ignore HTML if process_markup is false") {
+ std::string html_code("
This text & has HTML in it
");
+
+ std::string input(html_code);
+ HTML html(std::move(input), false);
+ CHECK(input == html_code);
+
+ Response response;
+ response.source.text = html_code;
+ response.target.text = html_code;
+ // Note: response.alignments is empty, which is allowed in this case
+ html.restore(response);
+
+ // Assert that restore() does not mess with my HTML code
+ CHECK(response.source.text == html_code);
+}
+
+TEST_CASE_METHOD(MarianThrowsExceptionsFixture, "Abort if alignments are missing") {
+ std::string input("
\n");
+ HTML html(std::move(input), true);
+
+ AnnotatedText source("hello world\n");
+ recordSentenceFromByteRange(source, {
+ ByteRange{0, 4}, // 0.0 "hell"
+ ByteRange{4, 5}, // 0.1 "o"
+ ByteRange{5, 11}, // 0.2 " world"
+ ByteRange{11, 11} // 0.3 ""
+ });
+
+ AnnotatedText target("hallo Welt\n");
+ recordSentenceFromByteRange(target, {
+ ByteRange{0, 4}, // 0.0 "hall"
+ ByteRange{4, 5}, // 0.1 "o"
+ ByteRange{5, 10}, // 0.2 " Welt"
+ ByteRange{10, 10} // 0.3 ""
+ });
+
+ Response response;
+ response.source = source;
+ response.target = target;
+
+ // If the model is misconfigured to not give any alignment information,
+ // response will have entries for each target word, but they will all be empty.
+ response.alignments = {{{}, {}, {}, {}}};
+
+ CHECK_THROWS_WITH(
+ html.restore(response),
+ "Response object does not contain alignments. TranslationModel or ResponseOptions is misconfigured?");
+}
+
+TEST_CASE("Do not abort if the input is just empty") {
+ std::string input("");
+ HTML html(std::move(input), true);
+ CHECK(input == "");
+
+ Response response;
+ html.restore(response);
+ CHECK(response.source.text == "");
+ CHECK(response.target.text == "");
+}
+
+TEST_CASE("Do not abort if the input is just empty element") {
+ std::string input("");
+ HTML html(std::move(input), true);
+ CHECK(input == "");
+
+ Response response;
+ html.restore(response);
+ CHECK(response.source.text == "");
+ CHECK(response.target.text == "");
+}
+
+TEST_CASE("Tag names are case insensitive") {
+ // Tests
vs
and should be recognized as a void tag .
+ // should be recognized as inline.
+ std::string test_str("
Space please?
");
+
+ std::string input(test_str);
+ HTML html(std::move(input), true);
+ CHECK(input == "Spa ce\n\nplease?");
+}
+
+TEST_CASE("Test case html entities") {
+ // These are all entities I would expect in innerHTML, since all other entities
+ // can be encoded as UTF-8 so there's no need to encode them through &...; when
+ // innerHTML encodes the DOM as HTML.
+ std::string input("
This is a sentence <with> named & entities
");
+ HTML html(std::move(input), true);
+ CHECK(input == "This is a sentence named & entities");
+}
+
+TEST_CASE("Test self-closing tags should be treated as paragraph break") {
+ std::string test_str("
Space please?
");
+
+ std::string input(test_str);
+ HTML html(std::move(input), true);
+ CHECK(input == "Space\n\nplease?");
+
+ Response response;
+ std::string source_str("Space\n\nplease?");
+ std::vector<string_view> source_tokens{
+ string_view(source_str.data() + 0, 5), // Space
+ string_view(source_str.data() + 5, 0), // [EOS]
+ string_view(source_str.data() + 5, 2), // \n\n
+ string_view(source_str.data() + 7, 1), // p
+ string_view(source_str.data() + 8, 5), // lease
+ string_view(source_str.data() + 13, 1), // ?
+ string_view(source_str.data() + 14, 0), // EOS
+ };
+ response.source.appendSentence("", source_tokens.begin(), source_tokens.begin() + 2);
+ response.source.appendSentence("\n\n", source_tokens.begin() + 3, source_tokens.end());
+
+ std::string target_str("Platz\n\nbitte?");
+ std::vector<string_view> target_tokens{
+ string_view(target_str.data() + 0, 5), // Platz
+ string_view(target_str.data() + 5, 0), // [EOS]
+ string_view(target_str.data() + 5, 2), // \n\n
+ string_view(target_str.data() + 7, 5), // bitte
+ string_view(target_str.data() + 12, 1), // ?
+ string_view(target_str.data() + 13, 0), // [EOS]
+ };
+ response.target.appendSentence("", target_tokens.begin(), target_tokens.begin() + 2);
+ response.target.appendSentence("", target_tokens.begin() + 3, target_tokens.end());
+ response.alignments = {{
+ {1.0, 0.0}, // Platz <- Space
+ {0.0, 1.0} // [EOS] <- [EOS]
+ },
+ {
+ {0.1, 0.9, 0.0, 0.0}, // _bitte <- _p + lease
+ {0.0, 0.0, 1.0, 0.0}, // ? <- ?
+ {0.0, 0.0, 0.0, 1.0}, // [EOS] <- [EOS]
+ }};
+
+ // Main focus of this test is that the space that was introduced in the text
+ // that was being translated does not end up in the translation.
+ html.restore(response);
+ CHECK(response.source.text == "
");
+ HTML html(std::move(input), true);
+ CHECK(input == "hello world and other creatures"); // Note double space between "hello" and "world"
+}
+
+TEST_CASE("Test self-closing tag (XHTML)") {
+ std::string input("
";
+ markup::instream in(html_str.data());
+ markup::Scanner scanner(in);
+
+ CHECK(scanner.next() == markup::Scanner::TT_TAG_START);
+ CHECK(scanner.tag() == "div");
+ CHECK(scanner.next() == markup::Scanner::TT_ATTRIBUTE);
+ CHECK(scanner.attribute() == "id");
+ CHECK(scanner.value() == "test-id");
+ CHECK(scanner.next() == markup::Scanner::TT_ATTRIBUTE);
+ CHECK(scanner.attribute() == "class");
+ CHECK(scanner.value() == "a b c ");
+ CHECK(scanner.next() == markup::Scanner::TT_TEXT);
+ CHECK(scanner.value() == "\n");
+ CHECK(scanner.next() == markup::Scanner::TT_TAG_START);
+ CHECK(scanner.tag() == "span");
+ CHECK(scanner.next() == markup::Scanner::TT_ATTRIBUTE);
+ CHECK(scanner.attribute() == "x-custom-attribute");
+ CHECK(scanner.value() == "Hello "world""); // We do not decode entities in attributes
+ CHECK(scanner.next() == markup::Scanner::TT_COMMENT_START);
+ CHECK(scanner.next() == markup::Scanner::TT_DATA);
+ CHECK(scanner.value() == "\nthis is a comment ");
+ CHECK(scanner.next() == markup::Scanner::TT_COMMENT_END);
+ CHECK(scanner.next() == markup::Scanner::TT_TEXT);
+ CHECK(scanner.value() == "this is ");
+ CHECK(scanner.next() == markup::Scanner::TT_TEXT);
+ CHECK(scanner.value() == "&");
+ CHECK(scanner.next() == markup::Scanner::TT_TEXT);
+ CHECK(scanner.value() == " text\n");
+ CHECK(scanner.next() == markup::Scanner::TT_TAG_END);
+ CHECK(scanner.tag() == "span");
+ CHECK(scanner.next() == markup::Scanner::TT_TAG_END);
+ CHECK(scanner.tag() == "div");
+ CHECK(scanner.next() == markup::Scanner::TT_EOF);
+}
+
+TEST_CASE("test long text (#273)") {
+ std::string test_str;
+ for (size_t i = 0; i < 1024; ++i) test_str.append("testing ");
+
+ markup::instream in(test_str.data());
+ markup::Scanner scanner(in);
+
+ CHECK(scanner.next() == markup::Scanner::TT_TEXT);
+ CHECK(scanner.value() == test_str);
+ CHECK(scanner.next() == markup::Scanner::TT_EOF);
+}
+
+TEST_CASE("scan self-closing element") {
+ markup::instream in("before <img src=\"#\"/> after");
+ markup::Scanner scanner(in);
+
+ CHECK(scanner.next() == markup::Scanner::TT_TEXT);
+ CHECK(scanner.value() == "before ");
+ CHECK(scanner.next() == markup::Scanner::TT_TAG_START);
+ CHECK(scanner.tag() == "img");
+ CHECK(scanner.next() == markup::Scanner::TT_ATTRIBUTE);
+ CHECK(scanner.attribute() == "src");
+ CHECK(scanner.value() == "#");
+ CHECK(scanner.next() == markup::Scanner::TT_TAG_END);
+ CHECK(scanner.tag() == "img");
+ CHECK(scanner.next() == markup::Scanner::TT_TEXT);
+ CHECK(scanner.value() == " after");
+ CHECK(scanner.next() == markup::Scanner::TT_EOF);
+}
+
+TEST_CASE("scan script") {
+ markup::instream in("<script async>true && document.body.length > 10</script>");
+ markup::Scanner scanner(in);
+
+ CHECK(scanner.next() == markup::Scanner::TT_TAG_START);
+ CHECK(scanner.tag() == "script");
+ CHECK(scanner.next() == markup::Scanner::TT_ATTRIBUTE);
+ CHECK(scanner.attribute() == "async");
+ CHECK(scanner.value() == "");
+ CHECK(scanner.next() == markup::Scanner::TT_DATA);
+ CHECK(scanner.value() == "true && document.body.length > 10");
+ CHECK(scanner.next() == markup::Scanner::TT_TAG_END);
+ CHECK(scanner.next() == markup::Scanner::TT_EOF);
+}
+
+TEST_CASE("scan style") {
+ markup::instream in("<style>body { background: url(test.png); }</style>");
+ markup::Scanner scanner(in);
+
+ CHECK(scanner.next() == markup::Scanner::TT_TAG_START);
+ CHECK(scanner.tag() == "style");
+ CHECK(scanner.next() == markup::Scanner::TT_DATA);
+ CHECK(scanner.value() == "body { background: url(test.png); }");
+ CHECK(scanner.next() == markup::Scanner::TT_TAG_END);
+ CHECK(scanner.next() == markup::Scanner::TT_EOF);
+}
+
+TEST_CASE("scan processing instruction") {
+ // Based on https://searchfox.org/mozilla-central/source/dom/base/nsContentUtils.cpp#8961
+ // element.outerHTML can produce processing instructions in the html. These
+ // should be treated similar to .
+ markup::instream in("<?xml version=\"1.0\"?>");
+ markup::Scanner scanner(in);
+
+ CHECK(scanner.next() == markup::Scanner::TT_PROCESSING_INSTRUCTION_START);
+ CHECK(scanner.next() == markup::Scanner::TT_DATA);
+ CHECK(scanner.value() == "xml version=\"1.0\"");
+ CHECK(scanner.next() == markup::Scanner::TT_PROCESSING_INSTRUCTION_END);
+ CHECK(scanner.next() == markup::Scanner::TT_EOF);
+}
\ No newline at end of file
diff --git a/inference/src/tests/wasm.cpp b/inference/src/tests/wasm.cpp
new file mode 100644
index 000000000..97f0fc801
--- /dev/null
+++ b/inference/src/tests/wasm.cpp
@@ -0,0 +1,54 @@
+#include "common.h"
+using namespace marian::bergamot;
+
+void wasm(BlockingService &service, std::shared_ptr<TranslationModel> &model) {
+ std::vector<ResponseOptions> responseOptions;
+ std::vector<std::string> texts;
+
+ // WASM always requires HTML and alignment.
+ // TODO(jerinphilip): Fix this, bring in actual tests.
+ // responseOptions.HTML = true;
+ // responseOptions.alignment = true; // Necessary for HTML
+
+ // Hide the translateMultiple operation
+ for (std::string line; std::getline(std::cin, line);) {
+ texts.emplace_back(line);
+ responseOptions.emplace_back();
+ }
+
+ auto results = service.translateMultiple(model, std::move(texts), responseOptions);
+
+ for (auto &result : results) {
+ std::cout << result.getTranslatedText() << std::endl;
+ }
+}
+
+int main(int argc, char *argv[]) {
+ ConfigParser configParser("WebAssembly test-suite", /*multiOpMode=*/true);
+ configParser.parseArgs(argc, argv);
+
+ auto &config = configParser.getConfig();
+ BlockingService service(config.serviceConfig);
+
+ TestSuite testSuite(service);
+ std::vector<std::shared_ptr<TranslationModel>> models;
+
+ for (auto &modelConfigPath : config.modelConfigPaths) {
+ TranslationModel::Config modelConfig = parseOptionsFromFilePath(modelConfigPath);
+ // Anything WASM is expected to use the byte-array-loads. So we hard-code grabbing MemoryBundle from FS and use the
+ // MemoryBundle capable constructor.
+ MemoryBundle memoryBundle = getMemoryBundleFromConfig(modelConfig);
+ std::shared_ptr<TranslationModel> model = std::make_shared<TranslationModel>(modelConfig, std::move(memoryBundle));
+ models.push_back(model);
+ }
+
+ /// WASM is one special case where WASM path is being checked, involving translateMultiple and a multi-line feed.
+ /// Hence we do not bind it at a single input-blob single Response constraint imposed by the TestSuite.
+ if (config.opMode == "wasm") {
+ wasm(service, models.front());
+ } else {
+ testSuite.run(config.opMode, models);
+ }
+
+ return 0;
+}
diff --git a/inference/src/translator/CMakeLists.txt b/inference/src/translator/CMakeLists.txt
new file mode 100644
index 000000000..1d773b46b
--- /dev/null
+++ b/inference/src/translator/CMakeLists.txt
@@ -0,0 +1,45 @@
+# Generate version file
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/project_version.h.in
+ ${CMAKE_CURRENT_BINARY_DIR}/project_version.h @ONLY)
+
+add_library(bergamot-translator STATIC
+ byte_array_util.cpp
+ text_processor.cpp
+ translation_model.cpp
+ request.cpp
+ batching_pool.cpp
+ aggregate_batching_pool.cpp
+ response_builder.cpp
+ quality_estimator.cpp
+ batch.cpp
+ annotation.cpp
+ service.cpp
+ parser.cpp
+ response.cpp
+ html.cpp
+ xh_scanner.cpp
+)
+if (USE_WASM_COMPATIBLE_SOURCE)
+ # Using wasm compatible sources should include this compile definition;
+ # Has to be done here because we are including marian headers + some sources
+ # in local repository use these definitions
+ target_compile_definitions(bergamot-translator PUBLIC USE_SSE2 WASM_COMPATIBLE_SOURCE)
+endif()
+
+if(COMPILE_WASM)
+ target_compile_definitions(bergamot-translator PUBLIC WASM)
+ # Enable code that is required for generating JS bindings
+ target_compile_definitions(bergamot-translator PRIVATE WASM_BINDINGS)
+ target_compile_options(bergamot-translator PRIVATE ${WASM_COMPILE_FLAGS})
+ target_link_options(bergamot-translator PRIVATE ${WASM_LINK_FLAGS})
+endif(COMPILE_WASM)
+
+if(ENABLE_CACHE_STATS)
+ target_compile_definitions(bergamot-translator PUBLIC ENABLE_CACHE_STATS)
+endif(ENABLE_CACHE_STATS)
+
+target_link_libraries(bergamot-translator marian ssplit)
+
+target_include_directories(bergamot-translator
+ PUBLIC ${PROJECT_SOURCE_DIR}
+ ${PROJECT_SOURCE_DIR}/src)
diff --git a/inference/src/translator/aggregate_batching_pool.cpp b/inference/src/translator/aggregate_batching_pool.cpp
new file mode 100644
index 000000000..5f405110a
--- /dev/null
+++ b/inference/src/translator/aggregate_batching_pool.cpp
@@ -0,0 +1,36 @@
+
+#include "aggregate_batching_pool.h"
+
+namespace marian {
+namespace bergamot {
+
+AggregateBatchingPool::AggregateBatchingPool() {
+ // TODO(@jerinphilip): Set aggregate limits
+}
+
+size_t AggregateBatchingPool::enqueueRequest(Ptr<TranslationModel> model, Ptr<Request> request) {
+ size_t sentencesEnqueued = model->enqueueRequest(request);
+ aggregateQueue_.insert(model);
+ return sentencesEnqueued;
+}
+
+size_t AggregateBatchingPool::generateBatch(Ptr<TranslationModel>& model, Batch& batch) {
+ while (!aggregateQueue_.empty()) {
+ auto candidateItr = aggregateQueue_.begin();
+ Ptr<TranslationModel> candidate = *candidateItr;
+ size_t numSentences = candidate->generateBatch(batch);
+ if (numSentences > 0) {
+ model = candidate;
+ return numSentences;
+ } else {
+ // Try the next model's batching pool.
+ aggregateQueue_.erase(candidateItr);
+ }
+ }
+ return /*numSentences=*/0;
+}
+
+void AggregateBatchingPool::clear() { aggregateQueue_.clear(); }
+
+} // namespace bergamot
+} // namespace marian
diff --git a/inference/src/translator/aggregate_batching_pool.h b/inference/src/translator/aggregate_batching_pool.h
new file mode 100644
index 000000000..6775591e0
--- /dev/null
+++ b/inference/src/translator/aggregate_batching_pool.h
@@ -0,0 +1,72 @@
+#ifndef SRC_BERGAMOT_AGGREGATE_BATCHING_POOL_H_
+#define SRC_BERGAMOT_AGGREGATE_BATCHING_POOL_H_
+
+#include <memory>
+#include <unordered_set>
+
+#include "data/types.h"
+#include "translation_model.h"
+
+namespace marian {
+namespace bergamot {
+
+/// Hashes a pointer to an object using the address the pointer points to. If two pointers point to the same address,
+/// they hash to the same value. Useful to put widely shared_ptrs of entities (eg: TranslationModel, Vocab, Shortlist)
+/// etc into containers which require the members to be hashable (std::unordered_set, std::unordered_map).
+template <class T>
+struct HashPtr {
+ size_t operator()(const std::shared_ptr<T>& t) const {
+ size_t address = reinterpret_cast<size_t>(t.get());
+ return std::hash<size_t>()(address);
+ }
+};
+
+/// Aggregates request queueing and generation of batches from multiple TranslationModels (BatchingPools within,
+/// specifically), thereby acting as an intermediary to enable multiple translation model capability in BlockingService
+/// and AsyncService.
+///
+/// A simple queue containing shared owning references to TranslationModels are held here from which batches are
+/// generated on demand. Since a queue is involved, the ordering is first-come first serve on requests except there are
+/// leaks effectively doing priority inversion if an earlier request with the same TranslationModel is pending
+/// to be consumed for translation.
+//
+/// Actual storage for the request and batch generation are within the respective TranslationModels, which owns its own
+/// BatchingPool.
+///
+/// Matches API provided by BatchingPool except arguments additionally parameterized by TranslationModel.
+///
+/// Note: This class is not thread-safe. You may use this class wrapped with ThreadsafeBatchingPool for a thread-safe
+/// equivalent of this class, if needed.
+class AggregateBatchingPool {
+ public:
+ /// Create an AggregateBatchingPool with (tentatively) global (across all BatchingPools) limits
+ /// imposed here.
+ AggregateBatchingPool();
+
+ /// Enqueue an existing request onto model, also keep account of that this model and request are now pending.
+ ///
+ /// @param [in] model: Model to use in translation. A shared ownership to this model is accepted by this object to
+ /// keep the model alive until translation is complete.
+ /// @param [in] request: A request to be enqueued to model.
+ /// @returns number of sentences added for translation.
+ size_t enqueueRequest(Ptr<TranslationModel> model, Ptr<Request> request);
+
+ /// Generate a batch from pending requests, obtained from available TranslationModels.
+ ///
+ /// @param [out] model: TranslationModel
+ /// @param [out] batch: Batch to write onto, which is consumed at translation elsewhere.
+ /// @returns Number of sentences in the generated batch.
+ size_t generateBatch(Ptr<TranslationModel>& model, Batch& batch);
+
+ /// Clear the aggregate queue. Does not clear the underlying model/request pairs but the next call
+ /// to `generateBatch()` will return 0. (Unless `enqueueRequest()` was called in the mean time.)
+ void clear();
+
+ private:
+ std::unordered_set<std::shared_ptr<TranslationModel>, HashPtr<TranslationModel>> aggregateQueue_;
+};
+
+} // namespace bergamot
+} // namespace marian
+
+#endif // SRC_BERGAMOT_AGGREGATE_BATCHING_POOL_H_
diff --git a/inference/src/translator/aligned.h b/inference/src/translator/aligned.h
new file mode 100644
index 000000000..73e82edc3
--- /dev/null
+++ b/inference/src/translator/aligned.h
@@ -0,0 +1,92 @@
+#pragma once
+#include <cstdlib>
+#include <new>
+#ifdef _MSC_VER
+// Ensure _HAS_EXCEPTIONS is defined
+#include <vcruntime.h>
+#include <malloc.h>
+#endif
+
+#if !((defined(_MSC_VER) && !defined(__clang__)) ? (_HAS_EXCEPTIONS) : (__EXCEPTIONS))
+#include <cstdlib>
+#endif
+
+// Aligned simple vector.
+
+namespace marian {
+namespace bergamot {
+
+template <class T> class AlignedVector {
+ public:
+ AlignedVector() : mem_(nullptr), size_(0) {}
+
+ explicit AlignedVector(std::size_t size, std::size_t alignment = 64 /* CPU cares about this */)
+ : size_(size) {
+#ifdef _MSC_VER
+ mem_ = static_cast<T*>(_aligned_malloc(size * sizeof(T), alignment));
+ if (!mem_) {
+# if (defined(_MSC_VER) && !defined(__clang__)) ? (_HAS_EXCEPTIONS) : (__EXCEPTIONS)
+ throw std::bad_alloc();
+# else
+ std::abort();
+# endif
+ }
+#else
+ if (posix_memalign(reinterpret_cast<void **>(&mem_), alignment, size * sizeof(T))) {
+# if (defined(_MSC_VER) && !defined(__clang__)) ? (_HAS_EXCEPTIONS) : (__EXCEPTIONS)
+ throw std::bad_alloc();
+# else
+ std::abort();
+# endif
+ }
+#endif
+ }
+
+ AlignedVector(AlignedVector &&from) : mem_(from.mem_), size_(from.size_) {
+ from.mem_ = nullptr;
+ from.size_ = 0;
+ }
+
+ AlignedVector &operator=(AlignedVector &&from) {
+ if (this == &from) return *this;
+ release();
+ mem_ = from.mem_;
+ size_ = from.size_;
+ from.mem_ = nullptr;
+ from.size_ = 0;
+ return *this;
+ }
+
+ AlignedVector(const AlignedVector&) = delete;
+ AlignedVector& operator=(const AlignedVector&) = delete;
+
+ ~AlignedVector() { release(); }
+
+ std::size_t size() const { return size_; }
+
+ T &operator[](std::size_t offset) { return mem_[offset]; }
+ const T &operator[](std::size_t offset) const { return mem_[offset]; }
+
+ T *begin() { return mem_; }
+ const T *begin() const { return mem_; }
+ T *end() { return mem_ + size_; }
+ const T *end() const { return mem_ + size_; }
+
+ template <class ReturnType>
+ ReturnType *as() { return reinterpret_cast<ReturnType *>(mem_); }
+
+ private:
+ T *mem_;
+ std::size_t size_;
+
+ void release() {
+#ifdef _MSC_VER
+ _aligned_free(mem_);
+#else
+ std::free(mem_);
+#endif
+ }
+};
+
+} // namespace bergamot
+} // namespace marian
diff --git a/inference/src/translator/annotation.cpp b/inference/src/translator/annotation.cpp
new file mode 100644
index 000000000..e05a6a77d
--- /dev/null
+++ b/inference/src/translator/annotation.cpp
@@ -0,0 +1,70 @@
+#include "annotation.h"
+
+#include <cassert>
+
+namespace marian {
+namespace bergamot {
+
+AnnotatedText::AnnotatedText(std::string &&t) : text(std::move(t)) {
+ // Treat the entire text as a gap that recordExistingSentence will break.
+ annotation.token_begin_.back() = text.size();
+}
+
+void AnnotatedText::appendSentence(string_view prefix, std::vector<string_view>::iterator begin,
+ std::vector<string_view>::iterator end) {
+ assert(annotation.token_begin_.back() == text.size());
+
+ // prefix is just end of the previous one.
+ appendEndingWhitespace(prefix);
+
+ // Appending sentence text.
+ std::size_t offset = text.size();
+ for (std::vector<string_view>::iterator token = begin; token != end; ++token) {
+ offset += token->size();
+ annotation.token_begin_.push_back(offset);
+ }
+ if (begin != end) {
+ text.append(begin->data(), (end - 1)->data() + (end - 1)->size());
+ assert(offset == text.size()); // Tokens should be contiguous.
+ }
+
+ // Add the gap after the sentence. This is empty for now, but will be
+ // extended with appendEndingWhitespace or another appendSentence.
+ annotation.gap_.push_back(annotation.token_begin_.size() - 1);
+ annotation.token_begin_.push_back(offset);
+}
+
+void AnnotatedText::appendEndingWhitespace(string_view whitespace) {
+ text.append(whitespace.data(), whitespace.size());
+ annotation.token_begin_.back() = text.size();
+}
+
+void AnnotatedText::recordExistingSentence(std::vector<string_view>::iterator begin,
+ std::vector<string_view>::iterator end, const char *sentence_begin) {
+ assert(sentence_begin >= text.data());
+ assert(sentence_begin <= text.data() + text.size());
+ assert(begin == end || sentence_begin == begin->data());
+ assert(!annotation.token_begin_.empty());
+ assert(annotation.token_begin_.back() == text.size());
+ // Clip off size token ending.
+ annotation.token_begin_.pop_back();
+ for (std::vector<string_view>::iterator i = begin; i != end; ++i) {
+ assert(i->data() >= text.data()); // In range.
+ assert(i->data() + i->size() <= text.data() + text.size()); // In range
+ assert(i + 1 == end || i->data() + i->size() == (i + 1)->data()); // Contiguous
+ annotation.token_begin_.push_back(i->data() - text.data());
+ }
+ // Gap token after sentence.
+ annotation.gap_.push_back(annotation.token_begin_.size());
+ if (begin != end) {
+ annotation.token_begin_.push_back((end - 1)->data() + (end - 1)->size() - text.data());
+ } else {
+ // empty sentence.
+ annotation.token_begin_.push_back(sentence_begin - text.data());
+ }
+ // Add back size token ending.
+ annotation.token_begin_.push_back(text.size());
+}
+
+} // namespace bergamot
+} // namespace marian
diff --git a/inference/src/translator/annotation.h b/inference/src/translator/annotation.h
new file mode 100644
index 000000000..5a17dfcfe
--- /dev/null
+++ b/inference/src/translator/annotation.h
@@ -0,0 +1,232 @@
+#ifndef BERGAMOT_SENTENCE_RANGES_H_
+#define BERGAMOT_SENTENCE_RANGES_H_
+
+#include <cassert>
+#include <string>
+#include <vector>
+
+#include "data/types.h"
+#include "definitions.h"
+
+namespace marian {
+namespace bergamot {
+
+/// Annotation expresses sentence and token boundary information as ranges of
+/// bytes in a string, but does not itself own the string.
+///
+/// See also AnnotatedText, which owns Annotation and the string. AnnotatedText
+/// wraps these ByteRange functions to provide a string_view interface.
+///
+/// Text is divided into gaps (whitespace between sentences) and sentences like
+/// so:
+/// gap sentence gap sentence gap
+/// Because gaps appear at the beginning and end of the text, there's always
+/// one more gap than there are sentences.
+///
+/// The entire text is a unbroken sequence of tokens (i.e. the end of a token
+/// is the beginning of the next token). A gap is exactly one token containing
+/// whatever whitespace is between the sentences. A sentence is a sequence of
+/// tokens.
+///
+/// Since we are using SentencePiece, a token can include whitespace. The term
+/// "word" is used, somewhat incorrectly, as a synonym of token.
+///
+/// A gap can be empty (for example there may not have been whitespace at the
+/// beginning). A sentence can also be empty (typically the translation system
+/// produced empty output). That's fine, these are just empty ranges as you
+/// would expect.
+class Annotation {
+ public:
+ /// Initially an empty string. Populated by AnnotatedText.
+ Annotation() {
+ token_begin_.push_back(0);
+ token_begin_.push_back(0);
+ gap_.push_back(0);
+ }
+
+ size_t numSentences() const { return gap_.size() - 1; }
+
+ /// Returns number of words in the sentence identified by `sentenceIdx`.
+ size_t numWords(size_t sentenceIdx) const {
+ return gap_[sentenceIdx + 1] - gap_[sentenceIdx] - 1 /* minus the gap */;
+ }
+
+ /// Returns a ByteRange representing `wordIdx` in sentence indexed by
+ /// `sentenceIdx`. `wordIdx` follows 0-based indexing, and should be less than
+ /// `.numWords()` for `sentenceIdx` for defined behaviour.
+ ByteRange word(size_t sentenceIdx, size_t wordIdx) const {
+ size_t tokenIdx = gap_[sentenceIdx] + 1 + wordIdx;
+ return ByteRange{token_begin_[tokenIdx], token_begin_[tokenIdx + 1]};
+ }
+
+ /// Returns a ByteRange representing sentence corresponding to `sentenceIdx`.
+ /// `sentenceIdx` follows 0-based indexing, and behaviour is defined only when
+ /// less than `.numSentences()`.
+ ByteRange sentence(size_t sentenceIdx) const {
+ return ByteRange{
+ token_begin_[gap_[sentenceIdx] + 1], /*end of whitespace before */
+ token_begin_[gap_[sentenceIdx + 1]] /*beginning of whitespace after */
+ };
+ }
+
+ ByteRange gap(size_t gapIdx) const {
+ size_t tokenIdx = gap_[gapIdx];
+ return ByteRange{token_begin_[tokenIdx], token_begin_[tokenIdx + 1]};
+ }
+
+ private:
+ friend class AnnotatedText;
+ /// Map from token index to byte offset at which it begins. Token i is:
+ /// [token_begin_[i], token_begin_[i+1])
+ /// The vector is padded so that these indices are always valid, even at the
+ /// end. So tokens_begin_.size() is the number of tokens plus 1.
+ std::vector<size_t> token_begin_;
+
+ /// Indices of tokens that correspond to gaps between sentences. These are
+ /// indices into token_begin_.
+ /// Gap g is byte range:
+ /// [token_begin_[gap_[w]], token_begin_[gap_[w]+1])
+ /// Sentence s is byte range:
+ /// [token_begin_[gap_[s]+1], token_begin_[gap_[s+1]])
+ /// A sentence does not include whitespace at the beginning or end.
+ ///
+ /// gap_.size() == numSentences() + 1.
+ ///
+ /// Example: empty text "" -> just an empty gap.
+ /// token_begin_ = {0, 0};
+ /// gap_ = {0};
+ ///
+ /// Example: only space " " -> just a gap containing the space.
+ /// token_begin_ = {0, 1};
+ /// gap_ = {0};
+ ///
+ /// Example: one token "hi" -> empty gap, sentence with one token, empty gap
+ /// token_begin_ = {0, 0, 2, 2};
+ /// gap_ = {0, 2};
+ std::vector<size_t> gap_;
+};
+
+/// AnnotatedText is effectively std::string text + Annotation, providing the
+/// following additional desiderata.
+///
+/// 1. Access to processed string_views for convenience rather than ByteRanges
+/// (which only provides index information).
+///
+/// 2. Transparently convert string_views into ByteRanges for the Annotation
+/// referring to the text bound by this structure.
+///
+/// 3. Bind the text and annotations together, to move around as a meaningful
+/// unit.
+struct AnnotatedText {
+ public:
+ std::string text; ///< Blob of string elements in annotation refers to.
+ Annotation annotation; ///< sentence and (sub-) word annotations.
+
+ /// Construct an empty AnnotatedText. This is useful when the target string or
+ /// ByteRanges are not known yet, but the public members can be used to
+ /// populate it. One use-case, when translated-text is created decoding from
+ /// histories and the ByteRanges only known after the string has been
+ /// constructed.
+ AnnotatedText() {}
+
+ /// Construct moving in a string (for efficiency purposes, copying string
+ /// constructor is disallowed).
+ AnnotatedText(std::string &&text);
+
+ /// Appends a sentence to the existing text and transparently rebases
+ /// string_views. Since this tracks only prefix, remember
+ /// appendEndingWhitespace.
+ /// The string_views must not already be in text.
+ void appendSentence(string_view prefix, std::vector<string_view>::iterator tokens_begin,
+ std::vector<string_view>::iterator tokens_end);
+
+ /// Append the whitespace at the end of input. string_view must not be in
+ /// text.
+ void appendEndingWhitespace(string_view whitespace);
+
+ /// Record the existence of a sentence that is already in text. The
+ /// iterators are over string_views for each token that must be in text
+ /// already. This function must be called to record sentences in order.
+ /// Normally the beginning of the sentence can be inferred from
+ /// tokens_begin->data() but the tokens could be empty, so sentence_begin is
+ /// required to know where the sentence is.
+ void recordExistingSentence(std::vector<string_view>::iterator tokens_begin,
+ std::vector<string_view>::iterator tokens_end, const char *sentence_begin);
+
+ /// Returns the number of sentences in the annotation structure.
+ const size_t numSentences() const { return annotation.numSentences(); }
+
+ /// Returns number of words in the sentece identified by sentenceIdx.
+ const size_t numWords(size_t sentenceIdx) const { return annotation.numWords(sentenceIdx); }
+
+ /// Returns a string_view representing wordIdx in sentenceIdx
+ string_view word(size_t sentenceIdx, size_t wordIdx) const {
+ return asStringView(annotation.word(sentenceIdx, wordIdx));
+ }
+
+ /// Returns a string_view representing sentence corresponding to sentenceIdx.
+ string_view sentence(size_t sentenceIdx) const { return asStringView(annotation.sentence(sentenceIdx)); }
+
+ /// Returns the string_view of the gap between two sentences in the container.
+ ///
+ /// More precisely where `i = sentenceIdx, N = numSentences()` for brevity:
+ ///
+ /// * For `i = 0`: The gap between the start of text and the 0th sentence.
+ /// * For `i = 1...N-1`, returns the text comprising of the gap
+ /// between the `i`-th and `i+1`-th sentence.
+ /// * For `i = N`, the gap between the last (N-1th) sentence and end of
+ /// text.
+ /// @param sentenceIdx: Can be between `[0, numSentences()]`.
+ string_view gap(size_t sentenceIdx) const { return asStringView(annotation.gap(sentenceIdx)); }
+
+ /// Returns a ByteRange representing wordIdx in sentenceIdx
+ ByteRange wordAsByteRange(size_t sentenceIdx, size_t wordIdx) const { return annotation.word(sentenceIdx, wordIdx); }
+
+ /// Returns a ByteRange representing sentence corresponding to sentenceIdx.
+ ByteRange sentenceAsByteRange(size_t sentenceIdx) const { return annotation.sentence(sentenceIdx); }
+
+ /// Utility function to call `fun` on each word (subword token effectively) in
+ /// an `AnnotatedText`. `fun` is called with the `ByteRange`, the `string_view`
+ /// with the word, and a `bool` to indicate whether it is the last word in the
+ /// `AnnotatedText`, which is also the ending whitespace slot of AnnotatedText.
+ template <typename Fun>
+ AnnotatedText apply(Fun fun) const {
+ AnnotatedText out;
+
+ for (size_t sentenceIdx = 0; sentenceIdx < numSentences(); ++sentenceIdx) {
+ std::string sentence;
+ std::vector<ByteRange> tokens;
+
+ std::string prefix = fun(annotation.gap(sentenceIdx), gap(sentenceIdx), false);
+
+ for (size_t wordIdx = 0; wordIdx < numWords(sentenceIdx); ++wordIdx) {
+ std::string token = fun(wordAsByteRange(sentenceIdx, wordIdx), word(sentenceIdx, wordIdx), false);
+ tokens.push_back(ByteRange{sentence.size(), sentence.size() + token.size()});
+ sentence += token;
+ }
+
+ // Convert our ByteRanges to string_views since that's what appendSentence
+ // expects
+ std::vector<string_view> views(tokens.size());
+ std::transform(tokens.begin(), tokens.end(), views.begin(), [&](ByteRange const &range) {
+ return marian::string_view(sentence.data() + range.begin, range.size());
+ });
+
+ out.appendSentence(prefix, views.begin(), views.end());
+ }
+
+ out.appendEndingWhitespace(fun(annotation.gap(numSentences()), gap(numSentences()), true));
+
+ return out;
+ }
+
+ private:
+ string_view asStringView(const ByteRange &byteRange) const {
+ return string_view(text.data() + byteRange.begin, byteRange.size());
+ }
+};
+
+} // namespace bergamot
+} // namespace marian
+
+#endif // BERGAMOT_SENTENCE_RANGES_H_
diff --git a/inference/src/translator/batch.cpp b/inference/src/translator/batch.cpp
new file mode 100644
index 000000000..08d3d02c6
--- /dev/null
+++ b/inference/src/translator/batch.cpp
@@ -0,0 +1,26 @@
+#include "batch.h"
+
+#include "request.h"
+
+namespace marian {
+namespace bergamot {
+
+void Batch::log() {
+  size_t numTokens{0}, maxLength{0};
+  for (auto &sentence : sentences_) {
+    numTokens += sentence.numTokens();
+    maxLength = std::max(maxLength, static_cast<size_t>(sentence.numTokens()));
+  }
+
+  LOG(info, "Batch(tokens={}, max-length={}, sentences_={})", numTokens, maxLength, sentences_.size());
+}
+
+void Batch::add(const RequestSentence &sentence) { sentences_.push_back(sentence); }
+
+void Batch::completeBatch(const Histories &histories) {
+ for (size_t i = 0; i < sentences_.size(); i++) {
+ sentences_[i].completeSentence(histories[i]);
+ }
+}
+} // namespace bergamot
+} // namespace marian
diff --git a/inference/src/translator/batch.h b/inference/src/translator/batch.h
new file mode 100644
index 000000000..2f67252be
--- /dev/null
+++ b/inference/src/translator/batch.h
@@ -0,0 +1,43 @@
+#ifndef SRC_BERGAMOT_BATCH_H
+#define SRC_BERGAMOT_BATCH_H
+
+#include "request.h"
+#include "translator/beam_search.h"
+
+namespace marian {
+namespace bergamot {
+
+// An empty batch is poison.
+class Batch {
+ public:
+ Batch() {}
+ void clear() { sentences_.clear(); }
+
+ size_t size() const { return sentences_.size(); }
+
+ void add(const RequestSentence &sentence);
+
+ // Accessors to read from a Batch. For use in BatchTranslator (consumer on a
+ // PCQueue holding batches).
+ //
+ // sentences() are used to access sentences to construct marian internal
+ // batch.
+ const RequestSentences &sentences() { return sentences_; }
+
+ // On obtaining Histories after translating a batch, completeBatch can be
+ // called with Histories , which forwards the call to Request through
+ // RequestSentence and triggers completion, by setting the promised value to
+ // the future given to client.
+ void completeBatch(const Histories &histories);
+
+ // Convenience function to log batch-statistics. numTokens, max-length.
+ void log();
+
+ private:
+ RequestSentences sentences_;
+};
+
+} // namespace bergamot
+} // namespace marian
+
+#endif // SRC_BERGAMOT_BATCH_H_
diff --git a/inference/src/translator/batching_pool.cpp b/inference/src/translator/batching_pool.cpp
new file mode 100644
index 000000000..61dd1920e
--- /dev/null
+++ b/inference/src/translator/batching_pool.cpp
@@ -0,0 +1,89 @@
+#include "batching_pool.h"
+
+#include <cassert>
+
+#include "batch.h"
+#include "common/logging.h"
+
+namespace marian {
+namespace bergamot {
+
+BatchingPool::BatchingPool(Ptr<Options> options)
+    : miniBatchWords_(options->get<int>("mini-batch-words")), maxActiveBucketLength_(0) {
+  size_t maxLengthBreak = options->get<size_t>("max-length-break");
+  float maxLengthFactor = options->get<float>("max-length-factor", 3.0);
+
+  // For the time being, we add some slack, which only BatchingPool is aware of. Since the TextProcessor still wraps at
+  // first request in, most of the Batches generated will be under max-length break.
+  //
+  // In the unlikely event of a few sentences overflowing, this allows the exceeding words to be put in the slack area.
+  // Very few batches are expected to be generated at a higher length.
+  size_t pivotSlack = maxLengthBreak * maxLengthFactor - maxLengthBreak;
+  bucket_.resize(maxLengthBreak + pivotSlack + 1);
+
+  ABORT_IF(bucket_.size() - 1 > miniBatchWords_,
+           "Fatal: max-length-break > mini-batch-words will lead to sentences "
+           "longer than what can fit in a batch.");
+}
+
+size_t BatchingPool::generateBatch(Batch &batch) {
+ // For now simply iterates on buckets and converts batches greedily. This
+ // has to be enhanced with optimizing over priority. The baseline
+ // implementation should at least be as fast as marian's maxi-batch with full
+ // corpus size as maxi-batch size.
+ batch.clear();
+ size_t paddedBatchSize = 0;
+
+ for (size_t length = 0; length <= maxActiveBucketLength_; length++) {
+ auto p = bucket_[length].begin();
+ while (p != bucket_[length].end()) {
+ paddedBatchSize = (batch.size() + 1) * length;
+ if (paddedBatchSize <= miniBatchWords_) {
+ auto q = p++;
+ batch.add(*q);
+ bucket_[length].erase(q);
+ } else {
+ // Check if elements exist
+ assert(batch.size() > 0);
+ return batch.size();
+ }
+ }
+ }
+
+ return batch.size();
+}
+
+size_t BatchingPool::enqueueRequest(Ptr<Request> request) {
+  size_t toBeFreshlyTranslated = 0;
+  for (size_t i = 0; i < request->numSegments(); i++) {
+    if (!request->cacheHitPrefilled(i)) {
+      RequestSentence sentence(i, request);
+      size_t bucket_id = sentence.numTokens();
+
+      // Due to a workaround for pivoting, unless we can discipline the
+      // vocabulary to get stronger static requirements, it is difficult to
+      // rework the rest of the components. Instead, we allow dynamic growth
+      // here. We let std::vector take care of the dynamic growth.
+      // https://en.cppreference.com/w/cpp/container/vector/resize#Complexity
+      if (bucket_id >= bucket_.size()) {
+        bucket_.resize(bucket_id + 1);
+      }
+
+      bucket_[bucket_id].insert(sentence);
+      maxActiveBucketLength_ = std::max(bucket_id, maxActiveBucketLength_);
+
+      toBeFreshlyTranslated += 1;
+    }
+  }
+
+  return toBeFreshlyTranslated;
+}
+
+void BatchingPool::clear() {
+ for (size_t length = 0; length < bucket_.size(); length++) {
+ bucket_[length].clear();
+ }
+}
+
+} // namespace bergamot
+} // namespace marian
diff --git a/inference/src/translator/batching_pool.h b/inference/src/translator/batching_pool.h
new file mode 100644
index 000000000..58cd2ca8b
--- /dev/null
+++ b/inference/src/translator/batching_pool.h
@@ -0,0 +1,42 @@
+#ifndef SRC_BERGAMOT_BATCHING_POOL_H_
+#define SRC_BERGAMOT_BATCHING_POOL_H_
+
+#include <set>
+#include <vector>
+
+#include "batch.h"
+#include "common/options.h"
+#include "data/corpus_base.h"
+#include "definitions.h"
+#include "request.h"
+
+namespace marian {
+namespace bergamot {
+
+class BatchingPool {
+ public:
+  explicit BatchingPool(Ptr<Options> options);
+
+  // RequestSentence incorporates (tentative) notions of priority with each
+  // sentence. This method inserts the sentence into the internal data-structure
+  // which maintains priority among sentences from multiple concurrent requests.
+  size_t enqueueRequest(Ptr<Request> request);
+
+  // Loads sentences with sentences compiled from (tentatively) multiple
+  // requests optimizing for both padding and priority.
+  size_t generateBatch(Batch &batch);
+
+  // Removes any pending requests from the pool.
+  void clear();
+
+ private:
+  size_t miniBatchWords_;
+  std::vector<std::set<RequestSentence>> bucket_;
+  size_t batchNumber_{0};
+  size_t maxActiveBucketLength_;
+};
+
+} // namespace bergamot
+} // namespace marian
+
+#endif // SRC_BERGAMOT_BATCHING_POOL_H_
diff --git a/inference/src/translator/byte_array_util.cpp b/inference/src/translator/byte_array_util.cpp
new file mode 100644
index 000000000..c7515e797
--- /dev/null
+++ b/inference/src/translator/byte_array_util.cpp
@@ -0,0 +1,178 @@
+#include "byte_array_util.h"
+
+#include <memory>
+#include <unordered_map>
+
+#include "common/io.h"
+#include "data/shortlist.h"
+
+namespace marian {
+namespace bergamot {
+
+namespace {
+// This is a basic validator that checks if the file has not been truncated
+// it basically loads up the header and checks
+
+// This struct and the getter are copied from the marian source, because it's located
+// inside src/common/binary.cpp:15 and we can't include it.
+struct Header {
+ uint64_t nameLength;
+ uint64_t type;
+ uint64_t shapeLength;
+ uint64_t dataLength;
+};
+
+// cast current void pointer to T pointer and move forward by num elements
+template <class T>
+const T* get(const void*& current, uint64_t num = 1) {
+  const T* ptr = (const T*)current;
+  current = (const T*)current + num;
+  return ptr;
+}
+} // Anonymous namespace
+
+bool validateBinaryModel(const AlignedMemory& model, uint64_t fileSize) {
+  const void* current = model.begin();
+  uint64_t memoryNeeded =
+      sizeof(uint64_t) * 2;  // We keep track of how much memory we would need if we have a complete file
+  uint64_t numHeaders;
+  if (fileSize >= memoryNeeded) {  // We have enough filesize to fetch the headers.
+    uint64_t binaryFileVersion = *get<uint64_t>(current);
+    numHeaders = *get<uint64_t>(current);  // number of item headers that follow
+  } else {
+    return false;
+  }
+  memoryNeeded += numHeaders * sizeof(Header);
+  const Header* headers;
+  if (fileSize >= memoryNeeded) {
+    headers = get<Header>(current, numHeaders);  // read that many headers
+  } else {
+    return false;
+  }
+
+  // Calculate how many bytes we are going to for reading just the names and the shape
+  for (uint64_t i = 0; i < numHeaders; i++) {
+    memoryNeeded += headers[i].nameLength + headers[i].shapeLength * sizeof(int);
+    // Advance the pointers.
+    get<char>(current, headers[i].nameLength);
+    get<int>(current, headers[i].shapeLength);
+  }
+
+  // Before we start reading the data, there is a small padding to ensure alignment
+  // Read that in, before calculating the actual tensor memory requirements.
+  uint64_t aligned_offset;
+  if (fileSize >= memoryNeeded) {
+    aligned_offset = *get<uint64_t>(current);  // Offset to align memory to 256 size
+    memoryNeeded += aligned_offset + sizeof(uint64_t);
+  } else {
+    return false;
+  }
+
+  // Finally the tensor size:
+  for (uint64_t i = 0; i < numHeaders; i++) {
+    memoryNeeded += headers[i].dataLength;
+  }
+
+  // If this final check passes, the file is at least big enough to contain the model
+  if (fileSize >= memoryNeeded) {
+    return true;
+  } else {
+    return false;
+  }
+}
+
+AlignedMemory loadFileToMemory(const std::string& path, size_t alignment) {
+  uint64_t fileSize = filesystem::fileSize(path);
+  io::InputFileStream in(path);
+  ABORT_IF(in.bad(), "Failed opening file stream: {}", path);
+  AlignedMemory alignedMemory(fileSize, alignment);
+  in.read(reinterpret_cast<char*>(alignedMemory.begin()), fileSize);
+  ABORT_IF(alignedMemory.size() != fileSize, "Error reading file {}", path);
+  return alignedMemory;
+}
+
+std::vector<AlignedMemory> getModelMemoryFromConfig(marian::Ptr<marian::Options> options) {
+  auto models = options->get<std::vector<std::string>>("models");
+
+  std::vector<AlignedMemory> modelMemories(models.size());
+  for (size_t i = 0; i < models.size(); ++i) {
+    const auto model = models[i];
+    if (marian::io::isBin(model)) {
+      modelMemories[i] = loadFileToMemory(model, 256);
+    } else if (marian::io::isNpz(model)) {
+      // if any of the models are npz format, we revert to loading from file for all models.
+      LOG(debug, "Encountered an npz file {}; will use file loading for {} models", model, models.size());
+      return {};
+    } else {
+      ABORT("Unknown extension for model: {}, should be one of `.bin` or `.npz`", model);
+    }
+  }
+
+  return modelMemories;
+}
+
+AlignedMemory getShortlistMemoryFromConfig(marian::Ptr<marian::Options> options) {
+  auto shortlist = options->get<std::vector<std::string>>("shortlist");
+  if (!shortlist.empty()) {
+    ABORT_IF(!marian::data::isBinaryShortlist(shortlist[0]),
+             "Loading non-binary shortlist file into memory is not supported");
+    return loadFileToMemory(shortlist[0], 64);
+  }
+  return AlignedMemory();
+}
+
+void getVocabsMemoryFromConfig(marian::Ptr<marian::Options> options,
+                               std::vector<std::shared_ptr<AlignedMemory>>& vocabMemories) {
+  auto vfiles = options->get<std::vector<std::string>>("vocabs");
+  ABORT_IF(vfiles.size() < 2, "Insufficient number of vocabularies.");
+  vocabMemories.resize(vfiles.size());
+  std::unordered_map<std::string, std::shared_ptr<AlignedMemory>> vocabMap;
+  for (size_t i = 0; i < vfiles.size(); ++i) {
+    ABORT_IF(marian::filesystem::Path(vfiles[i]).extension() != marian::filesystem::Path(".spm"),
+             "Loading non-SentencePiece vocab files into memory is not supported");
+    auto m = vocabMap.emplace(std::make_pair(vfiles[i], std::shared_ptr<AlignedMemory>()));
+    if (m.second) {
+      m.first->second = std::make_shared<AlignedMemory>(loadFileToMemory(vfiles[i], 64));
+    }
+    vocabMemories[i] = m.first->second;
+  }
+}
+
+AlignedMemory getQualityEstimatorModel(const marian::Ptr<marian::Options>& options) {
+  const auto qualityEstimatorPath = options->get<std::string>("quality", "");
+  if (qualityEstimatorPath.empty()) {
+    return {};
+  }
+  return loadFileToMemory(qualityEstimatorPath, 64);
+}
+
+AlignedMemory getQualityEstimatorModel(MemoryBundle& memoryBundle, const marian::Ptr<marian::Options>& options) {
+  if (memoryBundle.qualityEstimatorMemory.size() == 0) {
+    return getQualityEstimatorModel(options);
+  }
+
+  return std::move(memoryBundle.qualityEstimatorMemory);
+}
+
+MemoryBundle getMemoryBundleFromConfig(marian::Ptr<marian::Options> options) {
+  MemoryBundle memoryBundle;
+  memoryBundle.models = getModelMemoryFromConfig(options);
+  memoryBundle.shortlist = getShortlistMemoryFromConfig(options);
+  getVocabsMemoryFromConfig(options, memoryBundle.vocabs);
+  memoryBundle.ssplitPrefixFile = getSsplitPrefixFileMemoryFromConfig(options);
+  memoryBundle.qualityEstimatorMemory = getQualityEstimatorModel(options);
+
+  return memoryBundle;
+}
+
+AlignedMemory getSsplitPrefixFileMemoryFromConfig(marian::Ptr<marian::Options> options) {
+  std::string fpath = options->get<std::string>("ssplit-prefix-file", "");
+  if (!fpath.empty()) {
+    return loadFileToMemory(fpath, 64);
+  }
+  // Return empty AlignedMemory
+  return AlignedMemory();
+}
+
+} // namespace bergamot
+} // namespace marian
diff --git a/inference/src/translator/byte_array_util.h b/inference/src/translator/byte_array_util.h
new file mode 100644
index 000000000..851a175fd
--- /dev/null
+++ b/inference/src/translator/byte_array_util.h
@@ -0,0 +1,18 @@
+#include "definitions.h"
+#include "marian.h"
+
+namespace marian {
+namespace bergamot {
+
+AlignedMemory loadFileToMemory(const std::string& path, size_t alignment);
+std::vector<AlignedMemory> getModelMemoryFromConfig(marian::Ptr<marian::Options> options);
+AlignedMemory getQualityEstimatorModel(const marian::Ptr<marian::Options>& options);
+AlignedMemory getQualityEstimatorModel(MemoryBundle& memoryBundle, const marian::Ptr<marian::Options>& options);
+AlignedMemory getShortlistMemoryFromConfig(marian::Ptr<marian::Options> options);
+AlignedMemory getSsplitPrefixFileMemoryFromConfig(marian::Ptr<marian::Options> options);
+void getVocabsMemoryFromConfig(marian::Ptr<marian::Options> options,
+                               std::vector<std::shared_ptr<AlignedMemory>>& vocabMemories);
+bool validateBinaryModel(const AlignedMemory& model, uint64_t fileSize);
+MemoryBundle getMemoryBundleFromConfig(marian::Ptr<marian::Options> options);
+} // namespace bergamot
+} // namespace marian
diff --git a/inference/src/translator/cache.h b/inference/src/translator/cache.h
new file mode 100644
index 000000000..ceeca5d32
--- /dev/null
+++ b/inference/src/translator/cache.h
@@ -0,0 +1,91 @@
+#pragma once
+#include <atomic>
+#include <memory>
+#include <mutex>
+#include <vector>
+
+#include "definitions.h"
+#include "translator/history.h"
+
+namespace marian::bergamot {
+
+template <class Key, class Value, class Hash = std::hash<Key>, class Equals = std::equal_to<Key>>
+class AtomicCache {
+ public:
+  struct Stats {
+    size_t hits{0};
+    size_t misses{0};
+  };
+
+  explicit AtomicCache(size_t size, size_t buckets) : records_(size), mutexBuckets_(buckets) {}
+
+  std::pair<bool, Value> find(const Key &key) const {
+    Value value;
+    bool found = atomicLoad(key, value);
+    return std::make_pair(found, value);
+  }
+
+  void store(const Key &key, Value value) { atomicStore(key, value); }
+
+  const Stats stats() const {
+#ifdef ENABLE_CACHE_STATS
+    return Stats{hits_.load(), misses_.load()};
+#else
+    ABORT("Cache statistics requested without enabling in builds. Please use -DENABLE_CACHE_STATS with cmake.");
+    return Stats{0, 0};
+#endif
+  }
+
+ private:
+  using Record = std::pair<Key, Value>;
+
+  bool atomicLoad(const Key &key, Value &value) const {
+    // No probing, direct map onto records_
+    size_t index = hash_(key) % records_.size();
+    size_t mutexId = index % mutexBuckets_.size();
+
+    std::lock_guard<std::mutex> lock(mutexBuckets_[mutexId]);
+    const Record &candidate = records_[index];
+    if (equals_(key, candidate.first)) {
+      value = candidate.second;
+#ifdef ENABLE_CACHE_STATS
+      ++hits_;
+#endif
+      return true;
+    } else {
+#ifdef ENABLE_CACHE_STATS
+      ++misses_;
+#endif
+    }
+
+    return false;
+  }
+
+  void atomicStore(const Key &key, Value value) {
+    // No probing, direct map onto records_
+    size_t index = hash_(key) % records_.size();
+    size_t mutexId = index % mutexBuckets_.size();
+
+    std::lock_guard<std::mutex> lock(mutexBuckets_[mutexId]);
+    Record &candidate = records_[index];
+
+    candidate.first = key;
+    candidate.second = value;
+  }
+
+  std::vector<Record> records_;
+
+  mutable std::vector<std::mutex> mutexBuckets_;
+
+#ifdef ENABLE_CACHE_STATS
+  mutable std::atomic<size_t> hits_{0};
+  mutable std::atomic<size_t> misses_{0};
+#endif
+
+  Hash hash_;
+  Equals equals_;
+};
+
+typedef AtomicCache<size_t, Ptr<History>> TranslationCache;
+
+} // namespace marian::bergamot
diff --git a/inference/src/translator/definitions.h b/inference/src/translator/definitions.h
new file mode 100644
index 000000000..efba3f9f6
--- /dev/null
+++ b/inference/src/translator/definitions.h
@@ -0,0 +1,78 @@
+#ifndef SRC_BERGAMOT_DEFINITIONS_H_
+#define SRC_BERGAMOT_DEFINITIONS_H_
+
+#include <vector>
+
+#include "aligned.h"
+#include "data/types.h"
+#include "data/vocab_base.h"
+
+namespace marian {
+namespace bergamot {
+
+typedef marian::Words Segment;
+typedef std::vector<Segment> Segments;
+
+/// Shortcut to AlignedVector<char> for byte arrays
+typedef AlignedVector<char> AlignedMemory;
+
+/// Memory bundle for all byte-arrays.
+/// Can be a set/subset of model, shortlist, vocabs and ssplitPrefixFile bytes.
+struct MemoryBundle {
+  std::vector<AlignedMemory> models{};  ///< Byte-array of model (each element is aligned to 256)
+  AlignedMemory shortlist{};            ///< Byte-array of shortlist (aligned to 64)
+
+  /// Vector of vocabulary memories (aligned to 64).
+  /// If two vocabularies are the same (based on the filenames), two entries (shared
+  /// pointers) will be generated which share the same AlignedMemory object.
+  std::vector<std::shared_ptr<AlignedMemory>> vocabs{};
+
+  /// @todo Not implemented yet
+  AlignedMemory ssplitPrefixFile{};
+
+  AlignedMemory qualityEstimatorMemory;  ///< Byte-array of qe model (aligned to 64)
+};
+
+/// ByteRange stores indices for half-interval [begin, end) in a string. Can be
+/// used to represent a sentence, word.
+struct ByteRange {
+  size_t begin;
+  size_t end;
+  size_t size() const { return end - begin; }
+  bool operator==(ByteRange other) const { return begin == other.begin && end == other.end; }
+};
+
+/// A Subword range is mechanically the same as a `ByteRange`, but instead of
+/// describing a span of bytes, it describes a span of Subword tokens. Using
+/// `Annotation.word()` you can switch between the two.
+struct SubwordRange {
+  size_t begin;
+  size_t end;
+  size_t size() const { return end - begin; }
+  bool operator==(SubwordRange other) const { return begin == other.begin && end == other.end; }
+};
+
+class Response;
+using CallbackType = std::function<void(Response &&)>;
+
+} // namespace bergamot
+} // namespace marian
+
+// @TODO at the moment the usage of string_view in this repository is a hot mess and a disaster waiting to happen.
+// ssplit uses std::string_view if the compiler supports c++17, else falls back to c++11 and absl::string_view
+// bergamot-translator uses, depending on the source file std::string_view (which will break if ssplit-cpp uses
+// absl::string_view) and marian::string_view which is an export of (confusingly) the sentencepiece module that
+// marian has. marian::string_view is our addition to the marian fork, which will make merging even nicer. Not.
+// This is just an ugly patchwork that allos gcc5, our lowest targetted gcc to run. We don't actually try to run
+// on older compilers.
+
+#if defined(__GNUC__) && __GNUC__ < 6 && !defined(__clang__)
+#include <experimental/string_view>
+namespace std {
+using string_view = std::experimental::string_view;
+}  // namespace std
+#else
+#include <string_view>
+#endif
+
+#endif // SRC_BERGAMOT_DEFINITIONS_H_
diff --git a/inference/src/translator/html.cpp b/inference/src/translator/html.cpp
new file mode 100644
index 000000000..421074aa1
--- /dev/null
+++ b/inference/src/translator/html.cpp
@@ -0,0 +1,811 @@
+#include "html.h"
+
+#include <algorithm>
+
+#include "response.h"
+#include "translator/definitions.h"
+#include "xh_scanner.h"
+
+namespace {
+using marian::bergamot::AnnotatedText;
+using marian::bergamot::ByteRange;
+using marian::bergamot::HTML;
+using marian::bergamot::Response;
+
+/// Encodes the minimum of HTML entities.
+void encodeEntities(marian::string_view const &input, std::string &output) {
+  output.clear();
+  output.reserve(input.size());  // assumes there are no entities in most cases
+
+  for (char it : input) {
+    switch (it) {
+      case '&':
+        output.append("&amp;");
+        break;
+      case '<':
+        output.append("&lt;");
+        break;
+      case '>':
+        output.append("&gt;");
+        break;
+      // case ' ':
+      //   output.append("&nbsp;");
+      //   break;
+      // case '"':
+      //   output.append("&quot;");
+      //   break;
+      // case '\'':
+      //   output.append("&apos;");
+      //   break;
+      default:
+        output.push_back(it);
+        break;
+    }
+  }
+}
+
+/// Counts number of whitespace characters at the start of the input. Used
+/// for determining where to insert an open or close tag.
+size_t countPrefixWhitespaces(marian::string_view const &input) {
+  size_t size = 0;
+  while (size < input.size() && std::isspace(static_cast<unsigned char>(input[size]))) ++size;
+  return size;
+}
+
+std::string toLowerCase(std::string_view const &input) {
+ std::string out;
+ out.resize(input.size());
+ std::transform(input.begin(), input.end(), out.begin(), [](unsigned char c) { return std::tolower(c); });
+ return out;
+}
+
+/// Very simple replacement for std::format introduced in C++20. Only supports
+/// replacing `{}` in the template string with whatever `operator<<` for that
+/// type turns it into.
+std::string format(std::string const &formatTemplate) { return formatTemplate; }
+
+template <typename Arg>
+std::string format(std::string const &formatTemplate, Arg arg) {
+  std::ostringstream os;
+  auto index = formatTemplate.find("{}");
+  assert(index != std::string::npos);
+  os << formatTemplate.substr(0, index) << arg << formatTemplate.substr(index + 2);
+  return os.str();
+}
+
+template <typename Arg, typename... Args>
+std::string format(std::string const &formatTemplate, Arg arg, Args... args) {
+  std::ostringstream os;
+  auto index = formatTemplate.find("{}");
+  assert(index != std::string::npos);
+  os << formatTemplate.substr(0, index) << arg << format(formatTemplate.substr(index + 2), std::forward<Args>(args)...);
+  return os.str();
+}
+
+/// Syntactic sugar around rbegin() and rend() that allows me to write
+/// `for (auto &&item : reversed(container))` instead of the needlessly verbose
+/// `for (auto it = container.rbegin(); it != container.rend(); ++it)`
+template <typename T>
+class Reversed {
+ public:
+  using iterator = typename T::const_reverse_iterator;
+  explicit Reversed(T const &container) : container_(container){};
+  iterator begin() const { return container_.rbegin(); }
+  iterator end() const { return container_.rend(); }
+
+ private:
+  T const &container_;
+};
+
+/// When comparing two tag stacks, determine which tags need to be closed and
+/// opened to get from one stack to the other.
+void diffTags(HTML::TagStack const &prev, HTML::TagStack const &curr, HTML::TagStack &opening,
+ HTML::TagStack &closing) {
+ opening.clear();
+ closing.clear();
+
+ size_t i = 0;
+
+ // Find first difference
+ for (; i < prev.size(); ++i)
+ if (i >= curr.size() || prev[i] != curr[i]) break;
+
+ // Only nodes of type ELEMENT can have children and thus would need a closing tag.
+ // NOLINTNEXTLINE(bugprone-narrowing-conversions)
+ std::copy_if(prev.begin() + i, prev.end(), std::back_inserter(closing),
+ [&](HTML::Tag *tag) { return tag->type == HTML::Tag::ELEMENT; });
+
+ // NOLINTNEXTLINE(bugprone-narrowing-conversions)
+ opening.insert(opening.end(), curr.begin() + i, curr.end());
+}
+
+bool intersects(ByteRange const &range, HTML::Span const &span) {
+ return range.begin <= span.end && range.end >= span.begin;
+};
+
+bool contains(HTML::TagNameSet const &set, std::string_view const &name) { return set.find(name) != set.end(); }
+
+bool contains(HTML::TagStack const &stack, HTML::Tag const *tag) {
+ return std::find(stack.rbegin(), stack.rend(), tag) != stack.rend();
+}
+
+/// Is tag stack B an extended version of A? I.e. same tags, but maybe a few
+/// more nested deeper.
+bool extends(HTML::TagStack const &b, HTML::TagStack const &a) {
+ if (a.size() > b.size()) return false;
+
+ for (auto i = a.begin(), j = b.begin(); i != a.end(); ++i, ++j)
+ if (*i != *j) return false;
+
+ return true;
+}
+
+/// Tests whether `response` has alignment info associated with it or not.
+bool hasAlignments(Response const &response) {
+ // Test for each sentence individually as a sentence may be empty (or there)
+ // might be no sentences, so just testing for alignments.empty() would not be
+ // sufficient.
+ for (size_t sentenceIdx = 0; sentenceIdx < response.target.numSentences(); ++sentenceIdx) {
+ // If response.alignments is just empty, this might catch it.
+ if (response.alignments.size() <= sentenceIdx ||
+ response.alignments[sentenceIdx].size() != response.target.numWords(sentenceIdx))
+ return false;
+
+ // If response.alignments is "empty" because the model did not provide alignments,
+ // it still has entries for each target word. But all these entries are empty.
+ for (size_t wordIdx = 0; wordIdx < response.target.numWords(sentenceIdx); ++wordIdx)
+ if (response.alignments[sentenceIdx][wordIdx].size() != response.source.numWords(sentenceIdx)) return false;
+ }
+ return true;
+}
+
+/// Helper class to append HTML tags to a token. Also makes sure the token is
+/// encoded as valid HTML.
+class TokenFormatter {
+ public:
+  explicit TokenFormatter(marian::string_view token)
+      : offset_(0), whitespaceOffset_(0), whitespaceSize_(countPrefixWhitespaces(token)), closeLeft_(true) {
+    // Do encoding of any entities that popped up in the translation
+    encodeEntities(token, html_);
+  }
+
+  std::string &&html() { return std::move(html_); }
+
+  // Append the markup necessary for moving from `prev` set of tags to `curr`.
+  void append(HTML::TagStack const &prev, HTML::TagStack const &curr) {
+    HTML::TagStack opening, closing;
+
+    diffTags(prev, curr, opening, closing);
+
+    for (HTML::Tag const *tag : Reversed(closing)) {
+      assert(tag->type == HTML::Tag::ELEMENT);
+      std::string closeTag = format("</{}>", tag->name);
+      html_.insert(offset_ + (closeLeft_ ? 0 : whitespaceSize_), closeTag);
+      offset_ += closeTag.size();
+      if (closeLeft_) whitespaceOffset_ += closeTag.size();
+    }
+
+    for (HTML::Tag const *tag : opening) {
+      std::string openTag;
+      switch (tag->type) {
+        case HTML::Tag::ELEMENT:
+        case HTML::Tag::VOID_ELEMENT:
+          openTag = format("<{}{}>{}", tag->name, tag->attributes, tag->data);
+          break;
+        case HTML::Tag::COMMENT:
+          openTag = format("<!--{}-->", tag->data);
+          break;
+        case HTML::Tag::PROCESSING_INSTRUCTION:
+          openTag = format("<?{}?>", tag->data);
+          break;
+        case HTML::Tag::WHITESPACE: {
+          // Try to eat two newlines (paragraph break) from our segment
+          auto pos = html_.find("\n\n", whitespaceOffset_);
+          if (pos != std::string::npos && pos < whitespaceOffset_ + whitespaceSize_) {
+            html_.erase(pos, 2);
+            whitespaceSize_ -= 2;
+          }
+        } break;
+      }
+
+      html_.insert(offset_ + whitespaceSize_, openTag);
+      offset_ += openTag.size();
+      closeLeft_ = closeLeft_ && openTag.empty();
+    }
+  }
+
+ private:
+  std::string html_;         // Output html
+  size_t offset_;            // Size added by prepending HTML
+  size_t whitespaceOffset_;  // position of prefix whitespace characters
+                             // (it moves as closing tags are prepended)
+  size_t whitespaceSize_;    // number of prefix whitespace characters
+
+  // Close tags we want to show up left (before) the token, but open tags
+  // ideally come directly after any prefix whitespace. However, some tokens
+  // match multiple spans. If a previous span has added an open tag, after any
+  // whitespace, and the next span closes said tag again, we need to close
+  // it after the whitespace. So after the first open tag, any closing tag
+  // should also align right, after whitespace, not before. Hence this bool.
+  bool closeLeft_;
+};
+
+/// Count the number of tokens in an AnnotatedText. Used to assert we're not
+/// running out of sync when creating vectors that describe each token.
+size_t debugCountTokens(AnnotatedText const &text) {
+ size_t tokens = 1; // for the ending gap
+ for (size_t sentenceIdx = 0; sentenceIdx < text.numSentences(); ++sentenceIdx) {
+ tokens += 1 + text.numWords(sentenceIdx); // pre-sentence prefix/gap + each word
+ }
+ return tokens;
+}
+
+/// Helper function that consumes a tag as if it is a special tag, except that
+/// it takes nesting into account. I.e. `` will be consumed to the
+// last ``. Assumes TT_TAG_START is already consumed, which was necessary
+/// to determine whether this was an element that needed to be ignored.
+void consumeIgnoredTag(markup::Scanner &scanner, HTML::Tag &tag, std::string const &name) {
+  // Only full elements can be consumed this way. With void tags we don't know
+  // where to stop scanning. All other types cannot be nested anyway.
+  assert(tag.type == HTML::Tag::ELEMENT);
+
+  // TT_TAG_START is already consumed.
+  markup::Scanner::TokenType token;
+  size_t inside = 0;
+
+  // Consume the full open tag, i.e. all its attributes
+  while (!inside) {
+    token = scanner.next();
+    switch (token) {
+      case markup::Scanner::TT_ERROR:
+        ABORT("HTML parse error");
+      case markup::Scanner::TT_EOF:
+        ABORT("Did not find closing tag </{}>", name);
+      case markup::Scanner::TT_ATTRIBUTE:
+        tag.attributes += format(" {}=\"{}\"", scanner.attribute(), scanner.value());
+        break;
+      default:
+        // Not an attribute! Must be something inside the body or the closing
+        // tag already. Time to jump to the next loop.
+        ++inside;
+        break;
+    }
+  }
+
+  // Last token was something that would have triggered Scanner::scanBody(),
+  // which sets value() to start pointing at the body.
+  const char *start = scanner.start();
+
+  // Consume the rest of the HTML until (including) the final closing tag. We
+  // start with the token that caused the previous loop to fall into the default
+  // case.
+  while (inside) {
+    switch (token) {
+      case markup::Scanner::TT_ERROR:
+        ABORT("HTML parse error");
+      case markup::Scanner::TT_EOF:
+        ABORT("Did not find closing tag </{}>", name);
+      case markup::Scanner::TT_TAG_START:
+        // Note: Looking specifically for only our own type of tag so we don't
+        // have to care about whether other tags we encounter are void tags or
+        // not. Does assume the HTML is valid, as no stack is kept.
+        if (toLowerCase(scanner.tag()) == name) ++inside;
+        break;
+      case markup::Scanner::TT_TAG_END:
+        if (toLowerCase(scanner.tag()) == name) --inside;
+        break;
+      default:
+        break;
+    }
+
+    // Only continue scanning if we're still inside. We could have just read the
+    // TT_TAG_END token that ended this element, and we don't want to continue
+    // consuming tokens at that point.
+    if (inside) token = scanner.next();
+  }
+
+  // Only a TAG_END could have stopped the previous loop. We take the start
+  // of the final closing tag as the end of our data.
+  assert(token == markup::Scanner::TT_TAG_END);
+  const char *end = scanner.start();
+
+  // All data between the end of the first open element, and the start of the
+  // last close element, we just treat as raw data that will be printed when
+  // this tag is eventually printed.
+  assert(end >= start);
+  tag.data = std::string_view(start, end - start);
+}
+
+} // namespace
+
+namespace marian::bergamot {
+
+/// Formatters used for formatting error messages in ABORT() calls.
+std::ostream &operator<<(std::ostream &out, HTML::Tag const *tag) {
+  if (tag == nullptr) return out << "[nullptr]";
+  switch (tag->type) {
+    case HTML::Tag::ELEMENT:
+      return out << '<' << tag->name << tag->attributes << '>';
+    case HTML::Tag::VOID_ELEMENT:
+      return out << '<' << tag->name << tag->attributes << "/>";
+    case HTML::Tag::COMMENT:
+      return out << "<!--" << tag->data << "-->";
+    case HTML::Tag::PROCESSING_INSTRUCTION:
+      return out << "<?" << tag->data << "?>";
+    case HTML::Tag::WHITESPACE:
+      return out << "[inserted space]";
+  }
+  return out << "[Unknown tag type]";
+}
+
+std::ostream &operator<<(std::ostream &out, HTML::TagStack const &tags) {
+ for (auto it = tags.begin(); it != tags.end(); ++it) {
+ if (it != tags.begin()) out << ' ';
+ out << *it;
+ }
+ return out;
+}
+
+/// Parses `source` as HTML, strips all markup from it (the plain text is
+/// moved back into `source`), and records in `spans_` which tags were open
+/// for each span of the remaining text so restore() can re-insert them later.
+/// If `processMarkup` is false this is a no-op and `spans_` stays empty.
+HTML::HTML(std::string &&source, bool processMarkup, Options &&options) : options_(std::move(options)) {
+  if (!processMarkup) return;
+
+  std::string original = std::move(source);
+  markup::instream in(original.data(), original.data() + original.size());
+  markup::Scanner scanner(in);
+  source.clear();  // source is moved out of, so should be clear anyway
+
+  Tag *tag = nullptr;             // current tag (after opening at least)
+  TagStack stack;                 // stack of currently open tags
+  bool addSentenceBreak = false;  // whether to add a sentence break next text segment
+  bool addWordBreak = false;      // whether to add a word break next text segment
+
+  // Starting point: an empty span with no open tags.
+  spans_.push_back(Span{0, 0, {}});
+
+  bool stop = false;
+  while (!stop) {
+    switch (scanner.next()) {
+      case markup::Scanner::TT_ERROR:
+        ABORT("HTML parse error");
+
+      case markup::Scanner::TT_EOF:
+        stop = true;
+        break;
+
+      case markup::Scanner::TT_TEXT: {
+        // If the previous segment was the open or close tag of a block element
+        // we treat the text after it as a new sentence.
+        if (addSentenceBreak) {
+          // If there isn't already a \n\n at the end of source...
+          if (source.size() >= 2 && source.substr(source.size() - 2) != "\n\n") {
+            stack.push_back(makeTag({Tag::WHITESPACE}));
+            // Important: span->size() == 0 to make it behave as a void element.
+            // Also important: position before the \n\n tokens, not after, to
+            // make it easier to remove them later through apply().
+            spans_.push_back(Span{source.size(), source.size(), stack});
+            source.append("\n\n");  // Should work with ssplit-mode = wrapped_text
+            stack.pop_back();
+          }
+          addSentenceBreak = false;
+        }
+
+        // If the previous segment was an open or close tag, it might be best
+        // to add a space to make sure we don't append to the previous word.
+        if (addWordBreak) {
+          // Only add the space when it would be inside a word. Do not add it if
+          // it would be between a word and punctuation.
+          if (options_.substituteInlineTagsWithSpaces && isContinuation(source, scanner.value())) {
+            source.push_back(' ');
+          }
+          addWordBreak = false;
+        }
+
+        // Store which tags were open when this span of text was encountered.
+        auto begin = source.size();
+        source.append(scanner.value());
+        spans_.push_back(Span{begin, source.size(), stack});
+      } break;
+
+      case markup::Scanner::TT_TAG_START: {
+        std::string name = toLowerCase(scanner.tag());
+
+        // Tag *tag is used by attribute parsing
+        auto type = contains(options_.voidTags, name) ? Tag::VOID_ELEMENT : Tag::ELEMENT;
+        tag = makeTag({type, std::string(scanner.tag())});
+
+        stack.push_back(tag);
+
+        // Empty elements (e.g. <img>) are not applicable to a span of text
+        // so instead we "apply" them to an empty span in between, and then
+        // immediately remove them again from the stack.
+        if (tag->type == Tag::VOID_ELEMENT) {
+          spans_.push_back(Span{source.size(), source.size(), stack});
+          stack.pop_back();
+        }
+
+        // Ignored tags have same semantics as void tags with regards to moving
+        // them around with the rest of the content.
+        if (contains(options_.ignoredTags, name)) {
+          consumeIgnoredTag(scanner, *tag, name);
+          spans_.push_back(Span{source.size(), source.size(), stack});
+          stack.pop_back();
+        }
+
+        // Treat non-inline HTML tags as spaces that break up words.
+        if (!contains(options_.inlineTags, name)) {
+          addSentenceBreak = true;
+        } else if (!contains(options_.inWordTags, name)) {
+          addWordBreak = true;
+        }
+      } break;
+
+      case markup::Scanner::TT_TAG_END: {
+        std::string tagName = toLowerCase(scanner.tag());
+        // If this is the closing bit of a void tag, i.e. triggered by the "/>"
+        // bit of e.g. "<img/>", then completely ignore it.
+        if (contains(options_.voidTags, tagName)) break;
+
+        ABORT_IF(stack.empty(), "Encountered more closing tags ({}) than opening tags", scanner.tag());
+
+        ABORT_IF(toLowerCase(stack.back()->name) != toLowerCase(scanner.tag()),
+                 "Encountered unexpected closing tag </{}>, stack is {}", scanner.tag(), stack);
+
+        // What to do with the "<a></a>" case, where a tag is immediately closed
+        // so it never makes it into the taint of any of the spans? This adds
+        // an empty span so it still gets recorded in spans_.
+        if (spans_.empty() || !contains(spans_.back().tags, stack.back()))
+          spans_.push_back(Span{source.size(), source.size(), stack});
+
+        stack.pop_back();
+
+        // Add space if necessary
+        if (!contains(options_.inlineTags, tagName)) {
+          addSentenceBreak = true;
+        } else if (!contains(options_.inWordTags, tagName)) {
+          addWordBreak = true;
+        }
+      } break;
+
+      case markup::Scanner::TT_ATTRIBUTE:
+        assert(tag != nullptr);
+        tag->attributes += format(" {}=\"{}\"", scanner.attribute(), scanner.value());
+        break;
+
+      case markup::Scanner::TT_COMMENT_START:
+        // Tag *tag is used when TT_DATA is seen to add the comment's content.
+        tag = makeTag({Tag::COMMENT});
+        stack.push_back(tag);
+        spans_.push_back(Span{source.size(), source.size(), stack});
+        stack.pop_back();
+        break;
+
+      case markup::Scanner::TT_PROCESSING_INSTRUCTION_START:
+        // Tag *tag is used when TT_DATA is seen to add the PI's content.
+        tag = makeTag({Tag::PROCESSING_INSTRUCTION});
+        stack.push_back(tag);
+        spans_.push_back(Span{source.size(), source.size(), stack});
+        stack.pop_back();
+        break;
+
+      case markup::Scanner::TT_COMMENT_END:
+      case markup::Scanner::TT_PROCESSING_INSTRUCTION_END:
+        tag = nullptr;
+        break;
+
+      case markup::Scanner::TT_DATA:
+        assert(tag != nullptr);
+        tag->data = scanner.value();
+        break;
+
+      default:
+        ABORT("Unsupported scanner token type");
+    }
+  }
+
+  ABORT_IF(!stack.empty(), "Not all tags were closed: {}", stack);
+
+  // Add a trailing span (that's empty) to signify all closed tags.
+  spans_.emplace_back(Span{source.size(), source.size(), stack});
+}
+
+/// Re-inserts the HTML that was stripped by the constructor into both the
+/// source and target texts of `response`, using the word alignments to decide
+/// where tags belong in the translation. Requires alignment info.
+void HTML::restore(Response &response) {
+  // No-op if process_markup was false (and thus spans_ is empty)
+  // TODO: replace this with optional at a higher level
+  if (spans_.empty()) return;
+
+  // We need alignment info to transfer the HTML tags from the input to the
+  // translation. If those are not available, no HTML in translations for you.
+  ABORT_UNLESS(hasAlignments(response),
+               "Response object does not contain alignments. TranslationModel or ResponseOptions is misconfigured?");
+
+  // Reconstruction of HTML tags:
+  // 1. Map each token to a Span
+  // 2. Reconstruct the source HTML with these tainted tokens
+  // 3. Transfer the spans from the source tokens to the target tokens using alignment information
+  // 4. For spans that represent empty elements (e.g. <img>) figure out their position
+  // 5. Reconstruct the target HTML with these tainted tokens
+
+  // sourceTokenSpans is a vector with a pointer to a span for each token. We
+  // use iterators here to point to these positions so we can easily compare if
+  // one span comes before or after another, information we'll need when we need
+  // to figure out whether we've skipped spans (of empty elements) when
+  // reconstructing HTML in response.target.
+  std::vector<SpanIterator> sourceTokenSpans;
+
+  // RestoreSource re-inserts HTML into the source text, but also identifies
+  // which span each source token fits into best.
+  AnnotatedText source = restoreSource(response.source, sourceTokenSpans);
+  assert(sourceTokenSpans.size() == debugCountTokens(response.source));
+
+  // Find for every token in target the token in source that best matches.
+  std::vector<std::vector<size_t>> alignments;
+  hardAlignments(response, alignments, sourceTokenSpans);
+
+  std::vector<SpanIterator> targetTokenSpans;
+  copyTagStack(response, alignments, sourceTokenSpans, targetTokenSpans);
+  assert(targetTokenSpans.size() == debugCountTokens(response.target));
+
+  // Take the spans, and use them to make a taint for every word in the
+  // translation. Optionally add extra tags, like quality score metadata.
+  std::vector<TagStack> targetTokenTags;
+  annotateTagStack(response, targetTokenSpans, targetTokenTags);
+
+  AnnotatedText target = restoreTarget(response.target, targetTokenSpans, targetTokenTags);
+
+  response.source = source;
+  response.target = target;
+}
+
+/// Re-inserts the stripped markup into the source-side AnnotatedText. Also
+/// records, for every token, an iterator into `spans_` pointing at the last
+/// span that overlaps that token (one entry per token in `sourceTokenSpans`).
+AnnotatedText HTML::restoreSource(AnnotatedText const &in, std::vector<SpanIterator> &sourceTokenSpans) {
+  auto spanIt = spans_.begin();
+  auto prevIt = spans_.begin();  // safe because the first span is always the empty
+                                 // span, and the while-loop below will do the rest
+  assert(prevIt == spans_.end() || prevIt->tags.empty());
+
+  return in.apply([&](ByteRange range, string_view token, bool last) {
+    TokenFormatter formatter(token);
+
+    // Potential issue: spans and tokens can intersect, e.g.
+    //
+    //    text  <p> h <u> e </u> ll o </p>
+    //   spans     |1|   |2|    |3333|  (so only 2 is tainted with <p><u>, others only <p>)
+    //  tokens     |111111111111111|2|
+    //
+    // Now 1 covers span 1 to 3, so what taint should it get? Just `<p>`, or
+    // `<p><u>`?
+    // Note: only relevant if `substituteInlineTagsWithSpaces` is true. If we
+    // just insert spaces around all elements, every segment of `hello` will be
+    // a token.
+
+    // Seek to the last span that overlaps with this token
+    while (true) {
+      formatter.append(prevIt->tags, spanIt->tags);
+      prevIt = spanIt;
+
+      if (spanIt + 1 != spans_.end() && ((spanIt + 1)->begin < range.end || last)) {
+        spanIt++;
+        continue;
+      }
+
+      break;
+    }
+
+    // TODO: This is just the taint of the last span, not the ones in between.
+    // This makes us lose some markup of parts of tokens as described above.
+    sourceTokenSpans.emplace_back(prevIt);
+
+    return std::move(formatter.html());
+  });
+}
+
+/// Re-inserts HTML into the translated target text. For each target token it
+/// emits the tag changes between the previous token's tags and this token's
+/// (from `targetTokenTags`), plus any empty/void elements whose spans were
+/// skipped over ("stragglers"). Closes all remaining tags after the last token.
+AnnotatedText HTML::restoreTarget(AnnotatedText const &in, std::vector<SpanIterator> const &targetTokenSpans,
+                                  std::vector<TagStack> const &targetTokenTags) {
+  auto prevTags = spans_.cbegin()->tags;
+  auto stragglerSpanIt = spans_.cbegin();
+  auto targetSpanIt = targetTokenSpans.begin();
+  auto targetTagIt = targetTokenTags.begin();
+
+  AnnotatedText out = in.apply([&]([[maybe_unused]] ByteRange range, string_view token, bool last) {
+    TokenFormatter formatter(token);
+
+    // First we scan through spans_ to catch up to the span assigned to this
+    // token. We're only interested in empty spans (empty and void elements)
+    for (; stragglerSpanIt < *targetSpanIt; stragglerSpanIt++) {
+      // We're only interested in empty spans or spans that would otherwise get
+      // lost because they didn't align with anything between the spans in
+      // targetSpanIt
+      // TODO That std::find makes this O(N*N) NOT GOOD NOT GOOD
+      if (stragglerSpanIt->size() != 0 &&
+          std::find(targetTokenSpans.begin(), targetTokenSpans.end(), stragglerSpanIt) != targetTokenSpans.end())
+        continue;
+
+      formatter.append(prevTags, stragglerSpanIt->tags);
+      prevTags = stragglerSpanIt->tags;
+    }
+
+    // Now do the same thing but for our target set of tags. Note that we cannot
+    // combine this in the for-loop above (i.e. `span_it <= *targetSpanIt`)
+    // because there is no guarantee that the order in `targetTokenSpans` is
+    // the same as that of `spans`.
+
+    formatter.append(prevTags, *targetTagIt);
+
+    // If this is the last token of the response, close all open tags.
+    if (last) {
+      // Note: this assert is true due to our current implementation of
+      // HardAlignments() that always matches the last token of the input with
+      // the last token of the output. But lets assume someone someday changes
+      // HardAlignments(), and then this for-loop will be necessary.
+      // assert((*targetSpanIt)->tags.empty());
+      formatter.append(*targetTagIt, HTML::TagStack());
+    }
+
+    prevTags = *targetTagIt;
+    ++targetSpanIt;
+    ++targetTagIt;
+
+    return std::move(formatter.html());
+  });
+
+  // Assert that we did in fact use all our taints
+  assert(targetSpanIt == targetTokenSpans.end());
+
+  return out;
+}
+
+/// Moves `tag` into the pool that owns all Tag objects and returns its
+/// address. NOTE(review): assumes pool_ is a container whose elements keep a
+/// stable address after further push_front calls (e.g. forward_list) — the
+/// returned pointer is stored long-term in TagStacks. Confirm against html.h.
+HTML::Tag *HTML::makeTag(Tag &&tag) {
+  pool_.push_front(std::move(tag));
+  return &pool_.front();
+}
+
+/// Copies the span assigned to each source token over to the target tokens
+/// using the per-sentence token alignments in `alignments`. Also assigns spans
+/// to the sentence gap tokens (prefix gaps and the trailing whitespace token).
+void HTML::copyTagStack(Response const &response, std::vector<std::vector<size_t>> const &alignments,
+                        std::vector<SpanIterator> const &sourceTokenSpans,
+                        std::vector<SpanIterator> &targetTokenSpans) {
+  size_t offset = 0;  // Sentence offset in sourceTokenSpans
+
+  // Fill targetTokenSpans based on the alignments we just made up.
+  // NOTE: this should match the exact order of Apply()
+  for (size_t sentenceIdx = 0; sentenceIdx < response.target.numSentences(); ++sentenceIdx) {
+    targetTokenSpans.push_back(sourceTokenSpans[offset]);  // token_tag for sentence ending gap
+    for (size_t t = 0; t < response.target.numWords(sentenceIdx); ++t) {
+      size_t s = alignments[sentenceIdx][t];
+      assert(s < response.source.numWords(sentenceIdx));
+      targetTokenSpans.push_back(sourceTokenSpans[offset + 1 + s]);  // +1 for prefix gap
+    }
+
+    offset += response.source.numWords(sentenceIdx) + 1;  // +1 for prefix gap
+  }
+
+  assert(offset + 1 == sourceTokenSpans.size());
+  targetTokenSpans.push_back(sourceTokenSpans[offset]);  // token_tag for ending whitespace
+}
+
+/// Converts the per-token span iterators into concrete TagStacks, and — when
+/// quality scores are present in `response` — augments those stacks with
+/// sentence-level and word-level <font> tags carrying the score metadata.
+void HTML::annotateTagStack(Response const &response, std::vector<SpanIterator> const &targetTokenSpans,
+                            std::vector<TagStack> &targetTokenTags) {
+  auto spanIt = targetTokenSpans.begin();
+  for (size_t sentenceIdx = 0; sentenceIdx < response.target.numSentences(); ++sentenceIdx) {
+    // Sentence prefix
+    targetTokenTags.push_back((*spanIt)->tags);
+    spanIt++;
+
+    // Offset in targetTokenTags at which this sentence's tags start.
+    size_t tagOffset = targetTokenTags.size();
+
+    // Initially, just copy the span's tags to this token
+    for (size_t t = 0; t < response.target.numWords(sentenceIdx); ++t) {
+      targetTokenTags.emplace_back((*spanIt)->tags);
+      spanIt++;
+    }
+
+    // If we have quality score information, add that as metadata as well.
+    if (!response.qualityScores.empty()) {
+      auto const &sentenceQuality = response.qualityScores[sentenceIdx];
+      // Create a single tag for this sentence with sentence level info
+      Tag *sentenceTag = makeTag({Tag::ELEMENT, "font"});
+      sentenceTag->attributes += format(" x-bergamot-sentence-index=\"{}\" x-bergamot-sentence-score=\"{}\"",
+                                        sentenceIdx, sentenceQuality.sentenceScore);
+
+      // Add that tag to all tokens in this sentence.
+      for (size_t tokenIdx = 0; tokenIdx < response.target.numWords(sentenceIdx); ++tokenIdx) {
+        targetTokenTags[tagOffset + tokenIdx].push_back(sentenceTag);
+      }
+
+      // Add word level tags as well to all tokens that make up a word.
+      for (size_t wordIdx = 0; wordIdx < sentenceQuality.wordRanges.size(); ++wordIdx) {
+        Tag *wordTag = makeTag({Tag::ELEMENT, "font"});
+        wordTag->attributes += format(" x-bergamot-word-index=\"{}\" x-bergamot-word-score=\"{}\"", wordIdx,
+                                      sentenceQuality.wordScores[wordIdx]);
+        auto const &range = sentenceQuality.wordRanges[wordIdx];
+        for (size_t tokenIdx = range.begin; tokenIdx < range.end; ++tokenIdx) {
+          targetTokenTags[tagOffset + tokenIdx].push_back(wordTag);
+        }
+      }
+    }
+  }
+
+  // Suffix
+  targetTokenTags.push_back((*spanIt)->tags);
+  spanIt++;
+
+  assert(spanIt == targetTokenSpans.end());
+}
+
+// Reports if token `str` is likely to be a continuation of a word. This is used
+// to determine whether we should share the markup, or whether we should see
+// this token as a fresh start. This implementation will treat "hello[world]"
+// as 4 words, assuming its tokenised as something like `h ell o [ wor ld ]`.
+bool HTML::isContinuation(std::string_view prev, std::string_view str) const {
+  auto const &delimiters = options_.continuationDelimiters;
+  if (delimiters.empty() || prev.empty() || str.empty()) return false;
+  // A continuation requires that neither the boundary character of the
+  // previous token nor the first character of this token is a delimiter.
+  bool startsFresh = delimiters.find(str[0]) != std::string::npos;
+  bool endedWord = delimiters.find(prev.back()) != std::string::npos;
+  return !startsFresh && !endedWord;
+}
+
+/// Overload for marian::string_view: converts both arguments and delegates to
+/// the std::string_view implementation above.
+bool HTML::isContinuation(marian::string_view prev, marian::string_view str) const {
+  std::string_view prevView(prev.data(), prev.size());
+  std::string_view strView(str.data(), str.size());
+  return isContinuation(prevView, strView);
+}
+
+/// Selects for each token in `response.target` a best source token from
+/// `response.source` and writes this selection to `alignments`. The source
+/// token spans are used to also look at the markup applied to each token to
+/// figure out which source token best represents each target token.
+void HTML::hardAlignments(Response const &response, std::vector<std::vector<size_t>> &alignments,
+                          std::vector<SpanIterator> const &sourceTokenSpans) {
+  size_t offset = 0;  // sentence offset in sourceTokenSpans
+
+  // For each sentence...
+  for (size_t sentenceIdx = 0; sentenceIdx < response.target.numSentences(); ++sentenceIdx) {
+    alignments.emplace_back();
+
+    // Hard-align: find for each target token the most prevalent source token
+    // Note: only search from 0 to N-1 because token N is end-of-sentence token
+    // that can only align with the end-of-sentence token of the target
+    for (size_t t = 0; t + 1 < response.target.numWords(sentenceIdx); ++t) {
+      alignments.back().push_back(
+          std::max_element(response.alignments[sentenceIdx][t].begin(), response.alignments[sentenceIdx][t].end()) -
+          response.alignments[sentenceIdx][t].begin());
+    }
+
+    // Next, we try to smooth out these selected alignments with a few heuristics
+    for (size_t t = 1; t + 1 < response.target.numWords(sentenceIdx); ++t) {
+      // If this token is a continuation of a previous token, pick the tags from the most
+      // prevalent token for the whole word.
+      if (isContinuation(response.target.word(sentenceIdx, t - 1), response.target.word(sentenceIdx, t))) {
+        // Note: only looking at the previous token since that will already
+        // have this treatment applied to it.
+        // NOTE(review): despite their names, these two hold *source token*
+        // indices selected for target tokens t and t-1, not sentence indices.
+        size_t currSentenceIdx = alignments.back()[t];
+        size_t prevSentenceIdx = alignments.back()[t - 1];
+        float currScore = response.alignments[sentenceIdx][t][currSentenceIdx];
+        float prevScore = response.alignments[sentenceIdx][t - 1][prevSentenceIdx];
+
+        TagStack const &currTagStack = sourceTokenSpans[offset + 1 + currSentenceIdx]->tags;
+        TagStack const &prevTagStack = sourceTokenSpans[offset + 1 + prevSentenceIdx]->tags;
+
+        // If this token has more markup, or a better score than the previous
+        // token (and they together are part of a word-ish thing) then mark
+        // this word as aligning. Otherwise just copy the alignment source of
+        // the previous token.
+        if (extends(currTagStack, prevTagStack) || currScore >= prevScore) {
+          // Apply this to all previous tokens in the word
+          for (size_t i = t;; --i) {
+            alignments.back()[i] = currSentenceIdx;
+
+            // Stop if this was the first token or the beginning of the word
+            if (i == 0 ||
+                !isContinuation(response.target.word(sentenceIdx, i - 1), response.target.word(sentenceIdx, i)))
+              break;
+          }
+        } else {
+          alignments.back()[t] = prevSentenceIdx;
+        }
+      }
+    }
+
+    // Always align target end with source end
+    alignments.back().push_back(response.source.numWords(sentenceIdx) - 1);
+
+    offset += response.source.numWords(sentenceIdx) + 1;  // +1 for prefix gap
+  }
+}
+
+} // namespace marian::bergamot
diff --git a/inference/src/translator/html.h b/inference/src/translator/html.h
new file mode 100644
index 000000000..f3c6dad19
--- /dev/null
+++ b/inference/src/translator/html.h
@@ -0,0 +1,224 @@
+#ifndef SRC_BERGAMOT_HTML_H_
+#define SRC_BERGAMOT_HTML_H_
+
+#include
+#include
+#include
+#include
+#include
+
+#include "annotation.h"
+#include "data/types.h"
+#include "definitions.h"
+
+namespace marian::bergamot {
+
+struct Response;
+
+/// HTML class parses and removes HTML from input text, and places it back into
+/// the translated output text.
+///
+/// When parsing the HTML, it treats tags as markup, where a list of nested tags
+/// can be seen as a list of markups that are applicable to all the text that
+/// follows. This list is stored as a `TagStack`. Whenever an HTML tag opens or
+/// closes, a new TagStack is created to reflect that. TagStack used to be
+/// called `Taint` because it *tainted* the text it was associated with with
+/// those tags as markup. The text between tags themselves is stored in the
+/// input variable. In `spans_`, the TagStack that is associated with a
+/// substring of that text is stored.
+/// When transferring the HTML from the source text to the translated target
+/// text, the TagStacks are first associated with each of the subwords from the
+/// source text. Using hard alignment, each subword in the source text is linked
+/// to a subword in the target text. The TagStacks are then copied over these
+/// links. Finally, the HTML is inserted back into the target text by for each
+/// subword, comparing the TagStack from the previous word to that word, and
+/// opening and closing elements to make up for the difference.
+///
+/// There are a couple of complexities though:
+/// 1. Not all tags can be treated as markup applied to text. For example, an
+/// `` does not contain text itself. Or `` does not. We do want
+/// those tags to remain in the output though. We do this by associating
+/// them to an empty `Span`. When inserting HTML back into the translation
+/// input or output, we keep track of where in the `spans_` vector we are,
+/// and insert any elements from empty spans that we might have skipped over
+/// because empty spans are never linked to tokens/subwords. These are
+/// *stragglers* in some parts of the code, or *void* or *empty* elements in
+/// other parts.
+/// 2. Some tags should be treated as paragraph indicators, and break up
+/// sentences. These are the usual suspects like `
`, but also `
` and
+/// `
`, to make sure we don't translate two table cells into a single
+/// word. This is the `addSentenceBreak` flag in the HTML parsing bit.
+/// We mark these breaks with `\n\n` in the input text and with a special
+/// WHITESPACE tag that we treat as any other void tag. Hopefully this tag
+/// moves with the added `\n\n` and it is easy for us to remove it again.
+/// (in practise it is since these only occur at the end of sentences and
+/// the end of sentences are always aligned between source and target.)
+/// 3. We treat most tags as word-breaking. We do this by adding spaces just
+/// after where we saw the open or close tag occur. If there is already
+/// some whitespace in that place, we do not add extra spaces.
+/// 4. TODO
+class HTML {
+ public:
+ using TagNameSet = std::set>;
+
+ /// Options struct that controls how HTML is interpreted.
+ struct Options {
+ /// List of elements for which we do not expect a closing tag, or
+ /// self-closing elements in XHTML. We do not need to see a closing tag
+ /// for these elements, and they cannot contain text or tags themselves.
+ /// See also:
+ /// https://developer.mozilla.org/en-US/docs/Glossary/Empty_element.
+ /// More relevant source of this list:
+ /// https://searchfox.org/mozilla-central/rev/7d17fd1fe9f0005a2fb19e5d53da4741b06a98ba/dom/base/FragmentOrElement.cpp#1791
+ TagNameSet voidTags{"area", "base", "basefont", "bgsound", "br", "col", "embed", "frame", "hr",
+ "img", "input", "keygen", "link", "meta", "param", "source", "track", "wbr"};
+
+ /// List of elements that are treated as inline, meaning they do not break
+ /// up sentences. Any element *not* in this list will cause the text that
+ /// follows its open or close tag to be treated as a separate sentence.
+ TagNameSet inlineTags{"abbr", "a", "b", "em", "i", "kbd", "mark", "math",
+ "output", "q", "ruby", "small", "span", "strong", "sub", "sup",
+ "time", "u", "var", "wbr", "ins", "del", "img"};
+
+ /// List of elements that are, regardless of `substituteInlineTagsWithSpaces`,
+ /// not substituted with spaces. Technically almost all inline elements
+ /// should be treated like this, except ` ` maybe, But in practice it
+ /// seems to be more effective to limit this set to just that one tag that
+ /// that can only really be used *inside* words: ``.
+ /// See also: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/wbr
+ TagNameSet inWordTags{"wbr"};
+
+ /// List of elements we copy as is, but do parse as if they're HTML because
+ /// they could be nested. For because
+ /// the script tag may not be nested, but that is not the case for these
+ /// elements per se. Some tags, like
+ // ^-- or here
+ //
+ // ^-- or here
+ // comes after TT_COMMENT_START, TT_PI_START, or TT_TAG_START
+ // if the tag was
+ TokenType scanSpecial();
+
+ // Consumes
+ TokenType scanTag();
+
+ // Consumes '&' etc, emits parent_token_type
+ TokenType scanEntity(TokenType parentTokenType);
+
+ size_t skipWhitespace();
+
+ bool resolveEntity(string_ref const &buffer, string_ref &decoded) const;
+
+ static bool isWhitespace(char c);
+
+ private: /* data */
+ string_ref value_;
+ string_ref tagName_;
+ string_ref attributeName_;
+
+ ScanPtr scanFun_; // current 'reader'
+
+ instream &input_;
+
+ // Start position of a token.
+ const char *start_;
+
+ bool gotTail_; // aux flag used in scanComment, scanSpecial, scanProcessingInstruction
+};
+} // namespace markup
diff --git a/inference/wasm/CMakeLists.txt b/inference/wasm/CMakeLists.txt
new file mode 100644
index 000000000..ef8fd988a
--- /dev/null
+++ b/inference/wasm/CMakeLists.txt
@@ -0,0 +1,29 @@
+# WASM worker executable exposing the translator to JavaScript via the
+# embind bindings below.
+add_executable(bergamot-translator-worker
+  bindings/service_bindings.cpp
+  bindings/response_options_bindings.cpp
+  bindings/response_bindings.cpp
+)
+
+# Generate version file that can be included in the wasm artifacts
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/project_version.js.in
+               ${CMAKE_CURRENT_BINARY_DIR}/project_version.js @ONLY)
+
+# This header inclusion needs to go away later as path to public headers of bergamot
+# translator should be directly available from "bergamot-translator" target
+target_include_directories(bergamot-translator-worker
+    PRIVATE ${CMAKE_SOURCE_DIR}/src/translator
+    PRIVATE ${CMAKE_SOURCE_DIR}
+)
+
+# This compile definition is required for generating binding code properly
+target_compile_definitions(bergamot-translator-worker PRIVATE WASM_BINDINGS)
+target_compile_options(bergamot-translator-worker PRIVATE ${WASM_COMPILE_FLAGS})
+target_link_options(bergamot-translator-worker PRIVATE ${WASM_LINK_FLAGS})
+# Prepend the generated version stamp to the emitted JS worker.
+target_link_options(bergamot-translator-worker PRIVATE --extern-pre-js=${CMAKE_CURRENT_BINARY_DIR}/project_version.js)
+
+# Emit a .js wrapper (plus the .wasm binary) at the top of the build tree.
+set_target_properties(bergamot-translator-worker PROPERTIES
+                        SUFFIX ".js"
+                        RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}
+                      )
+
+target_link_libraries(bergamot-translator-worker bergamot-translator)
diff --git a/inference/wasm/bindings/response_bindings.cpp b/inference/wasm/bindings/response_bindings.cpp
new file mode 100644
index 000000000..51a46ab84
--- /dev/null
+++ b/inference/wasm/bindings/response_bindings.cpp
@@ -0,0 +1,32 @@
+/*
+ * Bindings for Response class
+ *
+ */
+
+#include
+
+#include
+
+#include "response.h"
+
+using Response = marian::bergamot::Response;
+using ByteRange = marian::bergamot::ByteRange;
+
+using namespace emscripten;
+
+// Binding code. The embind template arguments (class_<T>, value_object<T>,
+// register_vector<T>) were missing; restored from the using-declarations above.
+EMSCRIPTEN_BINDINGS(byte_range) {
+  // Expose ByteRange as a plain JS object with begin/end fields.
+  value_object<ByteRange>("ByteRange").field("begin", &ByteRange::begin).field("end", &ByteRange::end);
+}
+
+EMSCRIPTEN_BINDINGS(response) {
+  // Expose Response and the accessors the JS side needs.
+  class_<Response>("Response")
+      .constructor<>()
+      .function("size", &Response::size)
+      .function("getOriginalText", &Response::getOriginalText)
+      .function("getTranslatedText", &Response::getTranslatedText)
+      .function("getSourceSentence", &Response::getSourceSentenceAsByteRange)
+      .function("getTranslatedSentence", &Response::getTargetSentenceAsByteRange);
+
+  register_vector<Response>("VectorResponse");
+}
diff --git a/inference/wasm/bindings/response_options_bindings.cpp b/inference/wasm/bindings/response_options_bindings.cpp
new file mode 100644
index 000000000..06c152a7c
--- /dev/null
+++ b/inference/wasm/bindings/response_options_bindings.cpp
@@ -0,0 +1,21 @@
+/*
+ * Bindings for ResponseOptions class
+ *
+ */
+
+#include
+
+#include "response_options.h"
+
+using ResponseOptions = marian::bergamot::ResponseOptions;
+
+using namespace emscripten;
+
+// Binding code. Restored the stripped embind template arguments
+// (value_object<ResponseOptions>, register_vector<ResponseOptions>).
+EMSCRIPTEN_BINDINGS(response_options) {
+  // Expose ResponseOptions as a plain JS object.
+  value_object<ResponseOptions>("ResponseOptions")
+      .field("qualityScores", &ResponseOptions::qualityScores)
+      .field("alignment", &ResponseOptions::alignment)
+      .field("html", &ResponseOptions::HTML);
+  register_vector<ResponseOptions>("VectorResponseOptions");
+}
diff --git a/inference/wasm/bindings/service_bindings.cpp b/inference/wasm/bindings/service_bindings.cpp
new file mode 100644
index 000000000..54675a498
--- /dev/null
+++ b/inference/wasm/bindings/service_bindings.cpp
@@ -0,0 +1,93 @@
+/*
+ * Bindings for Service class
+ */
+
+#include
+
+#include "service.h"
+
+using namespace emscripten;
+
+using BlockingService = marian::bergamot::BlockingService;
+using TranslationModel = marian::bergamot::TranslationModel;
+using AlignedMemory = marian::bergamot::AlignedMemory;
+using MemoryBundle = marian::bergamot::MemoryBundle;
+
+/// Returns a JS typed-array view over the bytes owned by `alignedMemory`.
+/// NOTE(review): per emscripten docs, typed_memory_view aliases the wasm heap
+/// and is invalidated if the heap grows — callers must not hold it long-term.
+val getByteArrayView(AlignedMemory& alignedMemory) {
+  return val(typed_memory_view(alignedMemory.size(), alignedMemory.as<char>()));
+}
+
+EMSCRIPTEN_BINDINGS(aligned_memory) {
+  // Restored stripped template arguments; constructor takes (size, alignment).
+  class_<AlignedMemory>("AlignedMemory")
+      .constructor<std::size_t, std::size_t>()
+      .function("size", &AlignedMemory::size)
+      .function("getByteArrayView", &getByteArrayView);
+
+  register_vector<AlignedMemory*>("AlignedMemoryList");
+}
+
+// When source and target vocab files are same, only one memory object is passed from JS to
+// avoid allocating memory twice for the same file. However, the constructor of the Service
+// class still expects 2 entries in this case, where each entry has the shared ownership of the
+// same AlignedMemory object. This function prepares these smart pointer based AlignedMemory objects
+// for unique AlignedMemory objects passed from JS.
+std::vector<std::shared_ptr<AlignedMemory>> prepareVocabsSmartMemories(std::vector<AlignedMemory*>& vocabsMemories) {
+  auto sourceVocabMemory = std::make_shared<AlignedMemory>(std::move(*(vocabsMemories[0])));
+  std::vector<std::shared_ptr<AlignedMemory>> vocabsSmartMemories;
+  vocabsSmartMemories.push_back(sourceVocabMemory);
+  if (vocabsMemories.size() == 2) {
+    auto targetVocabMemory = std::make_shared<AlignedMemory>(std::move(*(vocabsMemories[1])));
+    vocabsSmartMemories.push_back(std::move(targetVocabMemory));
+  } else {
+    // Same file serves both directions: share ownership of the single copy.
+    vocabsSmartMemories.push_back(sourceVocabMemory);
+  }
+  return vocabsSmartMemories;
+}
+
+/// Assembles a MemoryBundle from the raw pointers handed over the JS boundary.
+/// The pointed-to memories are moved from, so the JS-side objects end up empty.
+MemoryBundle prepareMemoryBundle(AlignedMemory* modelMemory, AlignedMemory* shortlistMemory,
+                                 std::vector<AlignedMemory*> uniqueVocabsMemories,
+                                 AlignedMemory* qualityEstimatorMemory) {
+  MemoryBundle memoryBundle;
+  memoryBundle.models.emplace_back(std::move(*modelMemory));
+  memoryBundle.shortlist = std::move(*shortlistMemory);
+  // No std::move needed: the call already yields a prvalue (and moving a
+  // prvalue can inhibit copy elision).
+  memoryBundle.vocabs = prepareVocabsSmartMemories(uniqueVocabsMemories);
+  if (qualityEstimatorMemory != nullptr) {
+    memoryBundle.qualityEstimatorMemory = std::move(*qualityEstimatorMemory);
+  }
+
+  return memoryBundle;
+}
+
+// This allows only shared_ptrs to be operational in JavaScript, according to emscripten.
+// https://emscripten.org/docs/porting/connecting_cpp_and_javascript/embind.html#smart-pointers
+std::shared_ptr<TranslationModel> TranslationModelFactory(const std::string& config, AlignedMemory* model,
+                                                          AlignedMemory* shortlist, std::vector<AlignedMemory*> vocabs,
+                                                          AlignedMemory* qualityEstimator) {
+  MemoryBundle memoryBundle = prepareMemoryBundle(model, shortlist, vocabs, qualityEstimator);
+  return std::make_shared<TranslationModel>(config, std::move(memoryBundle));
+}
+
+EMSCRIPTEN_BINDINGS(translation_model) {
+  class_<TranslationModel>("TranslationModel")
+      .smart_ptr_constructor("TranslationModel", &TranslationModelFactory, allow_raw_pointers());
+}
+
+EMSCRIPTEN_BINDINGS(blocking_service_config) {
+  value_object<BlockingService::Config>("BlockingServiceConfig")
+      .field("cacheSize", &BlockingService::Config::cacheSize);
+}
+
+/// Creates a BlockingService with logging forced to "critical" so marian log
+/// output does not flood the browser console.
+std::shared_ptr<BlockingService> BlockingServiceFactory(const BlockingService::Config& config) {
+  auto copy = config;
+  copy.logger.level = "critical";
+  return std::make_shared<BlockingService>(copy);
+}
+
+EMSCRIPTEN_BINDINGS(blocking_service) {
+  class_<BlockingService>("BlockingService")
+      .smart_ptr_constructor("BlockingService", &BlockingServiceFactory)
+      .function("translate", &BlockingService::translateMultiple)
+      .function("translateViaPivoting", &BlockingService::pivotMultiple);
+
+  register_vector<std::string>("VectorString");
+}
diff --git a/inference/wasm/import-gemm-module.js b/inference/wasm/import-gemm-module.js
new file mode 100644
index 000000000..6430096dc
--- /dev/null
+++ b/inference/wasm/import-gemm-module.js
@@ -0,0 +1,46 @@
+
+/* Use an optimized gemm implementation if available, otherwise use the fallback
+ * implementation.
+ */
+function createWasmGemm() {
+ // A map of expected gemm function to the corresponding fallback gemm function names.
+ const GEMM_TO_FALLBACK_FUNCTIONS_MAP = {
+ "int8_prepare_a": "int8PrepareAFallback",
+ "int8_prepare_b": "int8PrepareBFallback",
+ "int8_prepare_b_from_transposed": "int8PrepareBFromTransposedFallback",
+ "int8_prepare_b_from_quantized_transposed": "int8PrepareBFromQuantizedTransposedFallback",
+ "int8_prepare_bias": "int8PrepareBiasFallback",
+ "int8_multiply_and_add_bias": "int8MultiplyAndAddBiasFallback",
+ "int8_select_columns_of_b": "int8SelectColumnsOfBFallback"
+ };
+
+ // Name of the optimized gemm implementation.
+ const OPTIMIZED_GEMM = "mozIntGemm";
+
+ const optimizedGemmModule = WebAssembly[OPTIMIZED_GEMM];
+ if (!optimizedGemmModule) {
+ return fallbackGemm(GEMM_TO_FALLBACK_FUNCTIONS_MAP);
+ }
+
+ const optimizedGemmModuleExports = new WebAssembly.Instance(optimizedGemmModule(), {"": {memory: wasmMemory}}).exports;
+ for (let key in GEMM_TO_FALLBACK_FUNCTIONS_MAP) {
+ if (!optimizedGemmModuleExports[key]) {
+ return fallbackGemm(GEMM_TO_FALLBACK_FUNCTIONS_MAP);
+ }
+ }
+ console.log(`Using optimized gemm (${OPTIMIZED_GEMM}) implementation`);
+ return optimizedGemmModuleExports;
+}
+
+// Return the fallback gemm implementation.
+function fallbackGemm(gemmToFallbackFunctionsMap) {
+ // The fallback gemm implementation
+ const FALLBACK_GEMM = "asm";
+
+ let fallbackGemmModuleExports = {};
+ for (let key in gemmToFallbackFunctionsMap) {
+ fallbackGemmModuleExports[key] = (...a) => Module[FALLBACK_GEMM][gemmToFallbackFunctionsMap[key]](...a)
+ }
+ console.log(`Using fallback gemm implementation`);
+ return fallbackGemmModuleExports;
+}
diff --git a/inference/wasm/module/main.js b/inference/wasm/module/main.js
new file mode 100644
index 000000000..d712a2199
--- /dev/null
+++ b/inference/wasm/module/main.js
@@ -0,0 +1,21 @@
+import * as readline from 'node:readline/promises';
+import {stdin, stdout} from 'node:process';
+import {BatchTranslator} from "./translator.js";
+
+const rl = readline.createInterface({input: stdin, output: stdout});
+
+const translator = new BatchTranslator();
+
+for await (const line of rl) {
+ const response = await translator.translate({
+ from: "en",
+ to: "es",
+ text: line,
+ html: false,
+ qualityScores: false
+ });
+
+ console.log(response.target.text);
+}
+
+translator.delete();
diff --git a/inference/wasm/module/package.json b/inference/wasm/module/package.json
new file mode 100644
index 000000000..f30464665
--- /dev/null
+++ b/inference/wasm/module/package.json
@@ -0,0 +1,39 @@
+{
+ "name": "@browsermt/bergamot-translator",
+ "version": "0.4.9",
+ "description": "Cross platform C++ library focusing on optimized machine translation on the consumer-grade device.",
+ "homepage": "https://github.com/browsermt/bergamot-translator#readme",
+ "repository": {
+ "type": "git",
+ "url": "git+ssh://git@github.com/browsermt/bergamot-translator.git"
+ },
+ "keywords": [
+ "machine",
+ "translation"
+ ],
+ "author": "",
+ "license": "MPL-2.0",
+ "bugs": {
+ "url": "https://github.com/browsermt/bergamot-translator/issues"
+ },
+  "type": "module",
+  "main": "translator.js",
+ "files": [
+ "worker/bergamot-translator-worker.js",
+ "worker/bergamot-translator-worker.wasm",
+ "worker/translator-worker.js",
+ "translator.js",
+ "main.js"
+ ],
+ "config": {
+ "emscripten_version": "3.1.8"
+ },
+ "scripts": {
+ "prepare": "test -f worker/bergamot-translator-worker.wasm || npm run build",
+ "build": "mkdir -p ../../build-wasm && docker run --rm -v $(realpath ../../):/src -v $(realpath ../../build-wasm):/build -v $(pwd)/worker:/dst -w /build emscripten/emsdk:$npm_package_config_emscripten_version sh -c \"emcmake cmake -DCOMPILE_WASM=on -DWORMHOLE=off /src && emmake make -j2 && cp bergamot-translator-worker.wasm bergamot-translator-worker.js /dst\"",
+ "test": "echo \"Hello world!\" | node main.js"
+ }
+}
diff --git a/inference/wasm/module/translator.js b/inference/wasm/module/translator.js
new file mode 100644
index 000000000..f27c07653
--- /dev/null
+++ b/inference/wasm/module/translator.js
@@ -0,0 +1,879 @@
+/**
+ * @typedef {Object} TranslationRequest
+ * @property {String} from
+ * @property {String} to
+ * @property {String} text
+ * @property {Boolean} html
+ * @property {Integer?} priority
+ */
+
+/**
+ * @typedef {Object} TranslationResponse
+ * @property {TranslationRequest} request
+ * @property {{text: string}} target
+ */
+
+/**
+ * NodeJS compatibility, a thin WebWorker layer around node:worker_threads.
+ */
+if (!(typeof window !== 'undefined' && window.Worker)) {
+ globalThis.Worker = class {
+ #worker;
+
+ constructor(url) {
+ this.#worker = new Promise(async (accept) => {
+ const {Worker} = await import(/* webpackIgnore: true */ 'node:worker_threads');
+ accept(new Worker(url));
+ });
+ }
+
+ addEventListener(eventName, callback) {
+ this.#worker.then(worker => worker.on(eventName, (data) => callback({data})));
+ }
+
+ postMessage(message) {
+ this.#worker.then(worker => worker.postMessage(message));
+ }
+
+ terminate() {
+ this.#worker.then(worker => worker.terminate());
+ }
+ }
+}
+
+/**
+ * Thrown when a pending translation is replaced by another newer pending
+ * translation.
+ */
+export class SupersededError extends Error {}
+
+
+/**
+ * Thrown when a translation was removed from the queue.
+ */
+export class CancelledError extends Error {}
+
+
+/**
+ * Wrapper around bergamot-translator loading and model management.
+ */
+ export class TranslatorBacking {
+
+ /**
+ * @param {{
+ * cacheSize?: number,
+ * useNativeIntGemm?: boolean,
+ * downloadTimeout?: number,
+ * registryUrl?: string
+ * pivotLanguage?: string?
+ * onerror?: (err: Error)
+ * }} options
+ */
+ constructor(options) {
+ this.options = options || {};
+
+ this.registryUrl = this.options.registryUrl || 'https://bergamot.s3.amazonaws.com/models/index.json';
+
+ this.downloadTimeout = 'downloadTimeout' in this.options ? parseInt(this.options.downloadTimeout) : 60000;
+
+ /**
+ * registry of all available models and their urls
+ * @type {Promise}
+ */
+ this.registry = this.loadModelRegistery();
+
+ /**
+ * Map of downloaded model data files as buffers per model.
+ * @type {Map<{from:string,to:string}, Promise