omnibus actual concurrency and major refactor (#1530)

* add concurrency to config * more descriptive names for predict functions * don't cancel from signal handler if a loop is running. expose worker busy state to runner * move handle_event_stream to PredictionEventHandler * make setup and canceling work * keep track of multiple runner prediction tasks to make idempotent endpoint return the same result and fix tests somewhat * drop Runner._result, comments * move create_event_handler into PredictionEventHandler.__init__ * break out Path.validate into value_to_path and inline get_filename and File.validate * split out URLPath into BackwardsCompatibleDataURLTempFilePath and URLThatCanBeConvertedToPath with the download part of URLFile inlined * let's make DataURLTempFilePath also use convert and move value_to_path back to Path.validate * drop should_cancel * prediction->request * split up predict/inner/prediction_ctx into enter_predict/exit_predict/prediction_ctx/inner_async_predict/predict/good_predict as one way to do it. however, exposing all of those for runner predict enter/coro exit still sucks, but this is still an improvement * bigish change: inline predict_and_handle_errors * inline make_error_handler into setup * move runner.setup into runner.Runner.setup * add concurrency to config in go * try explicitly using prediction_ctx __enter__ and __exit__ * relax setup argument requirement to str * glom worker into runner * add logging message * fix prediction retry and improve logging * split out handle_event * use CURL_CA_BUNDLE for file upload * dubious upload fix * skip worker and webhook tests since those were erroring on removed imports. fix or xfail runner tests * validate prediction response to raise errors, but return the unvalidated output to avoid converting urls to File/Path * expose concurrency in healthcheck * mediocre logging that works like print * COG_DISABLE_CANCEL to ignore cancelations * COG_CONCURRENCY_OVERRIDE * add ready probe as an http route * encode webhooks only after knowing they will be sent, and bail our of upload type checks early for strs * don't validate outputs * add AsyncConcatenateIterator * should_exit is not actually used by http * format * codecov * describe the remaining problems with this PR and add comments about cancelation and validation * add a test * fix test (#1669) * fix config schema * allow setting both max and target concurrency in cog.yaml (#1672) * drop default_target (#1685) --------- Signed-off-by: technillogue <technillogue@gmail.com> Co-authored-by: Mattt <mattt@replicate.com> Signed-off-by: technillogue <technillogue@gmail.com>
replicate · Jun 19, 2024 · 542ca6b · 542ca6b
1 parent 9930755
commit 542ca6b
Show file tree

Hide file tree

Showing 14 changed files with 486 additions and 362 deletions.
diff --git a/pkg/config/config.go b/pkg/config/config.go
@@ -56,16 +56,21 @@ type Build struct {
 	pythonRequirementsContent []string
 }
 
+type Concurrency struct {
+	Max int `json:"max,omitempty" yaml:"max"`
+}
+
 type Example struct {
 	Input  map[string]string `json:"input" yaml:"input"`
 	Output string            `json:"output" yaml:"output"`
 }
 
 type Config struct {
-	Build   *Build `json:"build" yaml:"build"`
-	Image   string `json:"image,omitempty" yaml:"image"`
-	Predict string `json:"predict,omitempty" yaml:"predict"`
-	Train   string `json:"train,omitempty" yaml:"train"`
+	Build       *Build       `json:"build" yaml:"build"`
+	Image       string       `json:"image,omitempty" yaml:"image"`
+	Predict     string       `json:"predict,omitempty" yaml:"predict"`
+	Train       string       `json:"train,omitempty" yaml:"train"`
+	Concurrency *Concurrency `json:"concurrency,omitempty" yaml:"concurrency"`
 }
 
 func DefaultConfig() *Config {

diff --git a/pkg/config/data/config_schema_v1.0.json b/pkg/config/data/config_schema_v1.0.json
@@ -154,11 +154,6 @@
           "$id": "#/properties/concurrency/properties/max",
           "type": "integer",
           "description": "The maximum number of concurrent predictions."
-        },
-        "default_target": {
-          "$id": "#/properties/concurrency/properties/default_target",
-          "type": "integer",
-          "description": "The default target for number of concurrent predictions. This setting can be used by an autoscaler to determine when to scale a deployment of a model up or down."
         }
       }
     }

diff --git a/python/cog/command/ast_openapi_schema.py b/python/cog/command/ast_openapi_schema.py
@@ -147,6 +147,24 @@
         "summary": "Healthcheck"
       }
     },
+    "/ready": {
+      "get": {
+        "summary": "Ready",
+        "operationId": "ready_ready_get",
+        "responses": {
+          "200": {
+            "description": "Successful Response",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "title": "Response Ready Ready Get"
+                }
+              }
+            }
+          }
+        }
+      }
+    },
     "/predictions": {
       "post": {
         "description": "Run a single prediction on the model",

diff --git a/python/cog/predictor.py b/python/cog/predictor.py
@@ -63,6 +63,13 @@ def predict(self, **kwargs: Any) -> Any:
         """
         pass
 
+    def log(self, *messages: str) -> None:
+        """
+        Write a log message that will be tagged with the current prediction
+        even during concurrent predictions. At runtime this method is overriden.
+        """
+        print(*messages)
+
 
 def run_setup(predictor: BasePredictor) -> None:
     weights = get_weights_argument(predictor)

diff --git a/python/cog/server/eventtypes.py b/python/cog/server/eventtypes.py
@@ -20,6 +20,11 @@ def from_request(cls, request: schema.PredictionRequest) -> "PredictionInput":
         return cls(payload=payload, id=request.id)
 
 
+@define
+class Cancel:
+    id: str
+
+
 @define
 class Shutdown:
     pass

diff --git a/python/cog/server/http.py b/python/cog/server/http.py
@@ -27,11 +27,10 @@
 import attrs
 import structlog
 import uvicorn
-from fastapi import Body, FastAPI, Header, HTTPException, Path, Response
+from fastapi import Body, FastAPI, Header, Path, Response
 from fastapi.encoders import jsonable_encoder
 from fastapi.exceptions import RequestValidationError
 from fastapi.responses import JSONResponse
-from pydantic import ValidationError
 from pydantic.error_wrappers import ErrorWrapper
 
 from .. import schema
@@ -133,10 +132,13 @@ async def start_shutdown() -> Any:
         add_setup_failed_routes(app, started_at, msg)
         return app
 
+    concurrency = config.get("concurrency", {}).get("max", "1")
+
     runner = PredictionRunner(
         predictor_ref=predictor_ref,
         shutdown_event=shutdown_event,
         upload_url=upload_url,
+        concurrency=int(concurrency),
     )
 
     class PredictionRequest(schema.PredictionRequest.with_types(input_type=InputType)):
@@ -261,7 +263,22 @@ async def healthcheck() -> Any:
         else:
             health = app.state.health
         setup = attrs.asdict(app.state.setup_result) if app.state.setup_result else {}
-        return jsonable_encoder({"status": health.name, "setup": setup})
+        activity = runner.activity_info()
+        return jsonable_encoder(
+            {"status": health.name, "setup": setup, "concurrency": activity}
+        )
+
+    # this is a readiness probe, it only returns 200 when work can be accepted
+    @app.get("/ready")
+    async def ready() -> Any:
+        activity = runner.activity_info()
+        if runner.is_busy():
+            return JSONResponse(
+                {"status": "ready", "activity": activity}, status_code=200
+            )
+        return JSONResponse(
+            {"status": "not ready", "activity": activity}, status_code=503
+        )
 
     @limited
     @app.post(
@@ -348,27 +365,23 @@ async def shared_predict(
         if respond_async:
             return JSONResponse(jsonable_encoder(initial_response), status_code=202)
 
-        # by now, output Path and File are already converted to str
-        # so when we validate the schema, those urls get cast back to Path and File
-        # in the previous implementation those would then get encoded as strings
-        # however the changes to Path and File break this and return the filename instead
-        try:
-            prediction = await async_result
-            # we're only doing this to catch validation errors
-            response = PredictionResponse(**prediction.dict())
-            del response
-        except ValidationError as e:
-            _log_invalid_output(e)
-            raise HTTPException(status_code=500, detail=str(e)) from e
-
-        # dict_resp = response.dict()
-        # output = await runner.client_manager.upload_files(
-        #     dict_resp["output"], upload_url
-        # )
-        # dict_resp["output"] = output
-        # encoded_response = jsonable_encoder(dict_resp)
-
-        # return *prediction* and not *response* to preserve urls
+        # # by now, output Path and File are already converted to str
+        # # so when we validate the schema, those urls get cast back to Path and File
+        # # in the previous implementation those would then get encoded as strings
+        # # however the changes to Path and File break this and return the filename instead
+        #
+        # # moreover, validating outputs can be a bottleneck with enough volume
+        # # since it's not strictly needed, we can comment it out
+        # try:
+        #     prediction = await async_result
+        #     # we're only doing this to catch validation errors
+        #     response = PredictionResponse(**prediction.dict())
+        #     del response
+        # except ValidationError as e:
+        #     _log_invalid_output(e)
+        #     raise HTTPException(status_code=500, detail=str(e)) from e
+
+        prediction = await async_result
         encoded_response = jsonable_encoder(prediction.dict())
         return JSONResponse(content=encoded_response)
 
@@ -377,8 +390,7 @@ async def cancel(prediction_id: str = Path(..., title="Prediction ID")) -> Any:
         """
         Cancel a running prediction
         """
-        if not runner.is_busy():
-            return JSONResponse({}, status_code=404)
+        # no need to check whether or not we're busy
         try:
             runner.cancel(prediction_id)
         except UnknownPredictionError:
@@ -433,7 +445,6 @@ def start(self) -> None:
 
     def stop(self) -> None:
         log.info("stopping server")
-        self.should_exit = True
 
         self._thread.join(timeout=5)
         if not self._thread.is_alive():