Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds async inference API #570

Merged
merged 1 commit into from
Mar 29, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 78 additions & 0 deletions serving/src/main/java/ai/djl/serving/cache/CacheManager.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
/*
* Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance
* with the License. A copy of the License is located at
*
* http://aws.amazon.com/apache2.0/
*
* or in the "license" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
* OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions
* and limitations under the License.
*/
package ai.djl.serving.cache;

import ai.djl.modality.Output;

import java.util.Map;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;

/**
 * A class that manages the response cache used for asynchronous inference.
 *
 * <p>Outputs are stored under a randomly generated UUID key so that clients can retrieve the
 * result later via the returned token. NOTE(review): the cache is unbounded and entries are only
 * evicted by an explicit {@link #remove(String)} call — confirm whether a TTL or size bound is
 * needed to avoid memory growth when clients never fetch their results.
 */
public class CacheManager {

    // Replaceable singleton; a custom implementation can be installed via setCacheManager().
    private static CacheManager instance = new CacheManager();

    // Thread-safe store mapping the UUID cache key to the cached response.
    private Map<String, Output> cache = new ConcurrentHashMap<>();

    // Protected so subclasses can provide alternative cache implementations.
    protected CacheManager() {}

    /**
     * Returns the registered {@code CacheManager} instance.
     *
     * @return the registered {@code CacheManager} instance
     */
    public static CacheManager getInstance() {
        return instance;
    }

    /**
     * Sets the {@code CacheManager} instance, replacing the default implementation.
     *
     * @param instance the {@code CacheManager} instance
     */
    public static void setCacheManager(CacheManager instance) {
        CacheManager.instance = instance;
    }

    /**
     * Adds the {@code Output} to the cache and returns the generated cache key.
     *
     * @param output the {@code Output} to be added in cache
     * @return the cache key (a random UUID string)
     */
    public String put(Output output) {
        String key = UUID.randomUUID().toString();
        cache.put(key, output);
        return key;
    }

    /**
     * Returns the cached {@code Output} object with the specified key.
     *
     * @param key the cache key
     * @return the cached {@code Output} object, or {@code null} if no entry exists for the key
     */
    public Output get(String key) {
        return cache.get(key);
    }

    /**
     * Removes the cache entry associated with the specified key.
     *
     * @param key the cache key
     */
    public void remove(String key) {
        cache.remove(key);
    }
}
14 changes: 14 additions & 0 deletions serving/src/main/java/ai/djl/serving/cache/package-info.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
/*
* Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance
* with the License. A copy of the License is located at
*
* http://aws.amazon.com/apache2.0/
*
* or in the "license" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
* OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions
* and limitations under the License.
*/
/** Contains classes that handle response caching. */
package ai.djl.serving.cache;
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import ai.djl.modality.Output;
import ai.djl.ndarray.BytesSupplier;
import ai.djl.repository.zoo.ModelNotFoundException;
import ai.djl.serving.cache.CacheManager;
import ai.djl.serving.models.ModelManager;
import ai.djl.serving.util.ConfigManager;
import ai.djl.serving.util.NettyUtils;
Expand All @@ -45,8 +46,11 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;

Expand All @@ -63,6 +67,11 @@ public class InferenceRequestHandler extends HttpRequestHandler {
private static final Pattern PATTERN =
Pattern.compile("/(ping|invocations|predictions)([/?].*)?|/models/.+/invoke");

private static final String X_SYNCHRONOUS = "x-synchronous";
private static final String X_STARTING_TOKEN = "x-starting-token";
private static final String X_NEXT_TOKEN = "x-next-token";
private static final String X_MAX_ITEMS = "x-max-items";

private RequestParser requestParser;

/** default constructor. */
Expand Down Expand Up @@ -114,7 +123,7 @@ protected void handleRequest(
});
break;
case "invocations":
handleInvocations(ctx, req, decoder);
handleInvocations(ctx, req, decoder, null);
break;
case "models":
handleInvocations(ctx, req, decoder, segments[2]);
Expand Down Expand Up @@ -147,12 +156,6 @@ private void handlePredictions(
predict(ctx, req, input, modelName, version);
}

private void handleInvocations(
ChannelHandlerContext ctx, FullHttpRequest req, QueryStringDecoder decoder)
throws ModelNotFoundException {
handleInvocations(ctx, req, decoder, null);
}

private void handleInvocations(
ChannelHandlerContext ctx,
FullHttpRequest req,
Expand Down Expand Up @@ -192,6 +195,17 @@ private void predict(
String workflowName,
String version)
throws ModelNotFoundException {
String startingToken = input.getProperty(X_STARTING_TOKEN, null);
if (startingToken != null && !HttpMethod.OPTIONS.equals(req.method())) {
CompletableFuture.runAsync(() -> getCacheResult(ctx, input, startingToken))
.exceptionally(
t -> {
onException(t.getCause(), ctx);
return null;
});
return;
}

ModelManager modelManager = ModelManager.getInstance();
ConfigManager config = ConfigManager.getInstance();
Workflow workflow = modelManager.getWorkflow(workflowName, version, true);
Expand Down Expand Up @@ -251,7 +265,20 @@ void runJob(
.whenCompleteAsync(
(o, t) -> {
if (o != null) {
sendOutput(o, ctx);
String sync = input.getProperty(X_SYNCHRONOUS, "true");
if (Boolean.parseBoolean(sync)) {
sendOutput(o, ctx);
return;
}

CacheManager cm = CacheManager.getInstance();
String nextToken = cm.put(o);
Output out = new Output();
out.setCode(o.getCode());
out.setMessage(o.getMessage());
out.getProperties().putAll(out.getProperties());
out.addProperty(X_NEXT_TOKEN, nextToken);
sendOutput(out, ctx);
}
})
.exceptionally(
Expand All @@ -261,6 +288,55 @@ void runJob(
});
}

/**
 * Serves a follow-up request for a previously cached asynchronous response.
 *
 * <p>Drains up to {@code x-max-items} chunks that are currently available from the cached
 * {@link ChunkedBytesSupplier} and returns them as one payload. If more chunks remain, the
 * response carries an {@code x-next-token} header so the client can poll again; otherwise the
 * cache entry is evicted.
 *
 * @param ctx the channel context to write the response to
 * @param input the request input carrying the paging properties
 * @param startingToken the cache key returned by an earlier asynchronous invocation
 */
private void getCacheResult(ChannelHandlerContext ctx, Input input, String startingToken) {
    int maxItems = Integer.parseInt(input.getProperty(X_MAX_ITEMS, "-1"));
    if (maxItems < 0) {
        // a negative (or absent) limit means "return everything available"
        maxItems = Integer.MAX_VALUE;
    }

    CacheManager manager = CacheManager.getInstance();
    Output cached = manager.get(startingToken);
    if (cached == null) {
        throw new BadRequestException("Invalid " + X_STARTING_TOKEN);
    }
    BytesSupplier data = cached.getData();
    if (!(data instanceof ChunkedBytesSupplier)) {
        // non-streaming output: reply with the full cached response as-is
        logger.warn("Output doesn't support async response");
        sendOutput(cached, ctx);
        return;
    }

    // Drain whatever chunks are immediately available, up to the requested limit.
    ChunkedBytesSupplier supplier = (ChunkedBytesSupplier) data;
    List<byte[]> chunks = new ArrayList<>();
    int total = 0;
    while (chunks.size() < maxItems) {
        byte[] chunk = supplier.poll();
        if (chunk == null) {
            break;
        }
        total += chunk.length;
        chunks.add(chunk);
    }

    // Merge the collected chunks into a single contiguous payload.
    byte[] merged = new byte[total];
    int offset = 0;
    for (byte[] chunk : chunks) {
        System.arraycopy(chunk, 0, merged, offset, chunk.length);
        offset += chunk.length;
    }

    Output reply = new Output();
    reply.setCode(cached.getCode());
    reply.setMessage(cached.getMessage());
    reply.getProperties().putAll(cached.getProperties());
    reply.add(merged);
    if (supplier.hasNext()) {
        // more data is pending: hand the same token back for the next poll
        reply.addProperty(X_NEXT_TOKEN, startingToken);
    } else {
        // stream fully consumed: clean up cache
        manager.remove(startingToken);
    }
    sendOutput(reply, ctx);
}

void sendOutput(Output output, ChannelHandlerContext ctx) {
/*
* We can load the models based on the configuration file. Since this Job is
Expand Down Expand Up @@ -322,7 +398,7 @@ void sendOutput(Output output, ChannelHandlerContext ctx) {

void onException(Throwable t, ChannelHandlerContext ctx) {
HttpResponseStatus status;
if (t instanceof TranslateException) {
if (t instanceof TranslateException || t instanceof BadRequestException) {
SERVER_METRIC.info("{}", RESPONSE_4_XX);
status = HttpResponseStatus.BAD_REQUEST;
} else if (t instanceof WlmException) {
Expand Down
38 changes: 38 additions & 0 deletions serving/src/test/java/ai/djl/serving/ModelServerTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,7 @@ public void test()
testListWorkflows(channel);
testDescribeModel(channel);
testUnregisterModel(channel);
testAsyncInference(channel);

testPredictionsInvalidRequestSize(channel);

Expand Down Expand Up @@ -727,6 +728,43 @@ private void testUnregisterModel(Channel channel) throws InterruptedException {
assertEquals(resp.getStatus(), "Model or workflow \"mlp_1\" unregistered");
}

/**
 * Exercises the async inference flow end to end: registers the echo test model, issues a
 * non-synchronous prediction to obtain a token, then pages through the cached chunked response
 * with {@code x-starting-token} until no {@code x-next-token} header is returned.
 */
private void testAsyncInference(Channel channel) throws InterruptedException {
    // Register the echo model from test resources.
    String url = URLEncoder.encode("file:src/test/resources/echo", StandardCharsets.UTF_8);
    url = "/models?model_name=echo&url=" + url;
    request(channel, new DefaultFullHttpRequest(HttpVersion.HTTP_1_1, HttpMethod.POST, url));
    assertEquals(httpStatus.code(), HttpResponseStatus.OK.code());

    // send request with x-synchronous=false: server should respond immediately with a token
    // instead of the prediction result ("delay" keeps the echo translator's token delay short).
    url = "/predictions/echo?stream=true";
    HttpRequest req = new DefaultFullHttpRequest(HttpVersion.HTTP_1_1, HttpMethod.POST, url);
    req.headers().add("x-synchronous", "false");
    req.headers().add("delay", "1");
    request(channel, req);
    assertEquals(httpStatus.code(), HttpResponseStatus.OK.code());
    String nextToken = headers.get("x-next-token");
    assertNotNull(nextToken);

    // First page: cap at one item; the echo translator emits "tok_0\n" as its first chunk.
    url = "/predictions/echo";
    req = new DefaultFullHttpRequest(HttpVersion.HTTP_1_1, HttpMethod.POST, url);
    req.headers().add("x-starting-token", nextToken);
    req.headers().add("x-max-items", "1");
    request(channel, req);
    assertEquals(result, "tok_0\n");

    // Keep polling with the returned token until the stream is exhausted; the absence of
    // x-next-token means the server consumed the stream and evicted the cache entry.
    while (headers.contains("x-next-token")) {
        nextToken = headers.get("x-next-token");
        req = new DefaultFullHttpRequest(HttpVersion.HTTP_1_1, HttpMethod.POST, url);
        req.headers().add("x-starting-token", nextToken);
        request(channel, req);
        assertEquals(httpStatus.code(), HttpResponseStatus.OK.code());
    }

    // Unregister model
    url = "/models/echo";
    request(channel, new DefaultFullHttpRequest(HttpVersion.HTTP_1_1, HttpMethod.DELETE, url));
    assertEquals(httpStatus.code(), HttpResponseStatus.OK.code());
}

private void testDescribeApi(Channel channel) throws InterruptedException {
request(
channel,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
/*
* Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance
* with the License. A copy of the License is located at
*
* http://aws.amazon.com/apache2.0/
*
* or in the "license" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
* OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions
* and limitations under the License.
*/
package ai.djl.serving.translator;

import ai.djl.modality.ChunkedBytesSupplier;
import ai.djl.modality.Input;
import ai.djl.modality.Output;
import ai.djl.ndarray.NDList;
import ai.djl.translate.Batchifier;
import ai.djl.translate.ServingTranslator;
import ai.djl.translate.TranslatorContext;

import java.nio.charset.StandardCharsets;
import java.util.Map;

/**
 * A test {@link ServingTranslator} that echos the request back, optionally as a simulated
 * streaming response driven by the {@code stream} and {@code delay} request properties.
 */
public class EchoTranslator implements ServingTranslator {

    /** {@inheritDoc} */
    @Override
    public void setArguments(Map<String, ?> arguments) {}

    /** {@inheritDoc} */
    @Override
    public Batchifier getBatchifier() {
        // no batching: each request is processed individually
        return null;
    }

    /** {@inheritDoc} */
    @Override
    public NDList processInput(TranslatorContext ctx, Input input) {
        // Stash the raw input so processOutput can echo it back; no real inference happens.
        ctx.setAttachment("input", input);
        return null;
    }

    /** {@inheritDoc} */
    @Override
    public Output processOutput(TranslatorContext ctx, NDList list) {
        Input input = (Input) ctx.getAttachment("input");
        boolean streaming = Boolean.parseBoolean(input.getAsString("stream"));
        long delay = Long.parseLong(input.getProperty("delay", "1000"));
        Output output = new Output();
        if (streaming) {
            // Emit tokens on a background thread to simulate a model streaming its output.
            ChunkedBytesSupplier cs = new ChunkedBytesSupplier();
            output.add(cs);
            new Thread(() -> sendToken(cs, delay)).start();
        } else {
            output.setProperties(input.getProperties());
            output.add(input.getData());
        }
        return output;
    }

    /**
     * Appends six tokens ({@code "tok_0\n"} ... {@code "tok_5\n"}) to the supplier, sleeping
     * {@code delay} milliseconds between tokens; the last append marks the stream complete.
     *
     * @param cs the chunk sink to append tokens to
     * @param delay the sleep in milliseconds between tokens
     */
    public void sendToken(ChunkedBytesSupplier cs, long delay) {
        try {
            for (int i = 0; i < 5; ++i) {
                cs.appendContent(("tok_" + i + '\n').getBytes(StandardCharsets.UTF_8), false);
                Thread.sleep(delay);
            }
            cs.appendContent(("tok_" + 5 + '\n').getBytes(StandardCharsets.UTF_8), true);
        } catch (InterruptedException e) {
            // Restore the interrupt flag instead of swallowing it, so callers (or the
            // thread pool) can observe that the stream was cut short.
            Thread.currentThread().interrupt();
        }
    }
}
4 changes: 4 additions & 0 deletions serving/src/test/resources/echo/serving.properties
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
engine=PyTorch
blockFactory=ai.djl.nn.IdentityBlockFactory
translator=ai.djl.serving.translator.EchoTranslator
option.hasParameter=false