update docs for 0.5.12/0.5.11 (#36)
tianweidut authored Sep 7, 2023
1 parent 82f5c02 commit 0d29932
Showing 15 changed files with 498 additions and 67 deletions.
19 changes: 19 additions & 0 deletions docs/reference/sdk/evaluation.md
@@ -517,6 +517,23 @@ class PipelineHandler(metaclass=ABCMeta):
- Equivalent to the `log_dataset_features` parameter in `@evaluation.predict`.
- Default is `None`, which records all features.

### PipelineHandler.run Decorator {#pl-run}

The `PipelineHandler.run` decorator describes the resources for the `predict` and `evaluate` methods, supporting the `replicas` and `resources` definitions:

- The `PipelineHandler.run` decorator can only decorate `predict` and `evaluate` methods in subclasses inheriting from `PipelineHandler`.
- The `predict` method can set the `replicas` parameter. The `replicas` value for the `evaluate` method is always 1.
- The `resources` parameter is defined and used in the same way as the `resources` parameter in `@evaluation.predict` or `@evaluation.evaluate`.
- The `PipelineHandler.run` decorator is optional.
- The `PipelineHandler.run` decorator only takes effect on Server and Cloud instances; Standalone instances do not support resource definitions, so the decorator is ignored there.

```python
@classmethod
def run(
    cls, resources: t.Optional[t.Dict[str, t.Any]] = None, replicas: int = 1
) -> t.Callable:
```

### Examples {#pl-example}

@@ -531,11 +548,13 @@

```python
import typing as t

import torch

from starwhale import PipelineHandler


class Example(PipelineHandler):
    def __init__(self) -> None:
        super().__init__()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = self._load_model(self.device)

    @PipelineHandler.run(replicas=4, resources={"memory": 1 * 1024 * 1024 * 1024, "nvidia.com/gpu": 1})  # 1GiB memory, 1 GPU
    def predict(self, data: t.Dict):
        data_tensor = self._pre(data["img"])
        output = self.model(data_tensor)
        return self._post(output)

    @PipelineHandler.run(resources={"memory": 1 * 1024 * 1024 * 1024})  # 1GiB memory
    def evaluate(self, ppl_result):
        result, label, pr = [], [], []
        for _data in ppl_result:
            ...
```
188 changes: 188 additions & 0 deletions docs/reference/sdk/job.md
@@ -0,0 +1,188 @@
---
title: Starwhale Job SDK
---

## job

Gets a `starwhale.Job` object through the Job URI parameter; the object represents a Job on a Standalone/Server/Cloud instance.

```python
def job(
    uri: str,
) -> Job:
```

### Parameters {#job-func-params}

- `uri`: (str, required)
  - The Job URI.

### Usage Example {#job-func-example}

```python
from starwhale import job

# get job object of uri=https://server/job/1
j1 = job("https://server/job/1")

# get job from standalone instance
j2 = job("local/project/self/job/xm5wnup")
j3 = job("xm5wnup")
```

## class starwhale.Job

`starwhale.Job` abstracts a Starwhale Job and provides operations for retrieving information about it.

### list

`list` is a classmethod that lists the jobs under a project.

```python
@classmethod
def list(
    cls,
    project: str = "",
    page_index: int = DEFAULT_PAGE_IDX,
    page_size: int = DEFAULT_PAGE_SIZE,
) -> Tuple[List[Job], Dict]:
```

### Parameters {#list-params}

- `project`: (str, optional)
  - The project URI; it can refer to a project on a Standalone/Server/Cloud instance.
  - If `project` is not specified, the project selected by `swcli project select` will be used.
- `page_index`: (int, optional)
  - When getting the jobs list from Server/Cloud instances, paging is supported. This parameter specifies the page number.
  - Default is 1.
  - Page numbers start from 1.
  - Standalone instances do not support paging. This parameter has no effect.
- `page_size`: (int, optional)
  - When getting the jobs list from Server/Cloud instances, paging is supported. This parameter specifies the number of jobs returned per page.
  - Default is 20.
  - Standalone instances do not support paging. This parameter has no effect.

### Usage Example {#list-example}

```python
from starwhale import Job

# list jobs of current selected project
jobs, pagination_info = Job.list()

# list jobs of starwhale/public project in the cloud.starwhale.cn instance
jobs, pagination_info = Job.list("https://cloud.starwhale.cn/project/starwhale:public")

# list jobs of id=1 project in the server instance, page index is 2, page size is 10
jobs, pagination_info = Job.list("https://server/project/1", page_index=2, page_size=10)
```
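
When the total number of jobs is unknown, the pages can be walked until an empty page is returned. The following is a minimal sketch, assuming `Job.list` returns an empty job list once `page_index` passes the last page (the project URI is illustrative):

```python
from starwhale import Job

# Collect all jobs of a project by walking the pages.
# Assumption: an out-of-range page_index yields an empty list.
all_jobs = []
page_index = 1
while True:
    jobs, _pagination = Job.list(
        "https://server/project/1", page_index=page_index, page_size=50
    )
    if not jobs:
        break
    all_jobs.extend(jobs)
    page_index += 1

print(f"fetched {len(all_jobs)} jobs")
```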

### get

`get` is a classmethod that gets information about a specific job and returns a `starwhale.Job` object. It has the same functionality and parameter definitions as the `starwhale.job` function.

### Usage Example {#get-example}

```python
from starwhale import Job

# get job object of uri=https://server/job/1
j1 = Job.get("https://server/job/1")

# get job from standalone instance
j2 = Job.get("local/project/self/job/xm5wnup")
j3 = Job.get("xm5wnup")
```

### summary

`summary` is a property that returns the data written to the summary table during job execution, as a dict.

```python
@property
def summary(self) -> Dict[str, Any]:
```

### Usage Example {#summary-example}

```python
from starwhale import job

j1 = job("https://server/job/1")

print(j1.summary)
```
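
For reading a single metric out of the summary dict, a small sketch (the `accuracy` key is hypothetical; the actual keys depend on what the evaluation wrote to the summary table):

```python
from starwhale import job

j1 = job("https://server/job/1")

# "accuracy" is a hypothetical key used for illustration; inspect
# j1.summary to see which keys your evaluation actually wrote.
accuracy = j1.summary.get("accuracy")
print(f"accuracy: {accuracy}")
```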

### tables

`tables` is a property that returns the names of the tables created during job execution (not including the summary table, which is created automatically at the project level), as a list.

```python
@property
def tables(self) -> List[str]:
```

### Usage Example {#tables-example}

```python
from starwhale import job

j1 = job("https://server/job/1")

print(j1.tables)
```

### get_table_rows

`get_table_rows` is a method that returns records from a data table, selected by the table name and the other parameters, as an iterator.

```python
def get_table_rows(
    self,
    name: str,
    start: Any = None,
    end: Any = None,
    keep_none: bool = False,
    end_inclusive: bool = False,
) -> Iterator[Dict[str, Any]]:
```

### Parameters {#rows-params}

- `name`: (str, required)
  - Datastore table name. Any of the table names returned by the `tables` property can be used.
- `start`: (Any, optional)
  - The starting ID value of the returned records.
  - Default is None, meaning start from the beginning of the table.
- `end`: (Any, optional)
  - The ending ID value of the returned records.
  - Default is None, meaning until the end of the table.
  - If both `start` and `end` are None, all records in the table will be returned as an iterator.
- `keep_none`: (bool, optional)
  - Whether to return records with `None` values.
  - Default is False.
- `end_inclusive`: (bool, optional)
  - When `end` is set, whether the iteration includes the `end` record.
  - Default is False.

### Usage Example {#rows-example}

```python
from starwhale import job

j = job("local/project/self/job/xm5wnup")

table_name = j.tables[0]

for row in j.get_table_rows(table_name):
    print(row)

rows = list(j.get_table_rows(table_name, start=0, end=100))

# return the first record from the results table
result = list(j.get_table_rows('results', start=0, end=1))[0]
```
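
The optional arguments compose as expected. A minimal sketch combining `keep_none` and `end_inclusive` (the ID range is illustrative):

```python
from starwhale import job

j = job("local/project/self/job/xm5wnup")

# Keep rows containing None values, and make the ID range closed on
# both ends: [0, 100] instead of the default half-open [0, 100).
rows = list(
    j.get_table_rows(
        j.tables[0],
        start=0,
        end=100,
        keep_none=True,
        end_inclusive=True,
    )
)
print(len(rows))
```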
4 changes: 4 additions & 0 deletions docs/reference/sdk/overview.md
@@ -10,6 +10,7 @@ Starwhale provides a series of Python SDKs to help manage datasets, models, eval
- `Context`: Passes context information during model evaluation, including Project, Task ID etc.
- `class Dataset`: Starwhale Dataset class.
- `class starwhale.api.service.Service`: The base class of online evaluation.
- `class Job`: Provides operations for Job.

## Functions

@@ -27,6 +28,8 @@ Starwhale provides a series of Python SDKs to help manage datasets, models, eval
- `@starwhale.api.service.api`: Decorator to provide a simple Web Handler input definition based on Gradio.
- `login`: Log in to the server/cloud instance.
- `logout`: Log out of the server/cloud instance.
- `job`: Get a `starwhale.Job` object by the Job URI.
- `@PipelineHandler.run`: Decorator to define the resources for the predict and evaluate methods in PipelineHandler subclasses.

## Data Types

@@ -55,5 +58,6 @@ Starwhale provides a series of Python SDKs to help manage datasets, models, eval

- If you want to build and manage datasets, read about the [Starwhale Dataset SDK](dataset) and [Starwhale Data Types](type).
- If you want to write model evaluation and fine-tuning programs, read about the [Starwhale Evaluation SDK](evaluation).
- If you want to get job info and evaluation results, read about the [Starwhale Job SDK](job).
- If you want to build and manage Starwhale model packages through Python scripts, read about the [Starwhale Model SDK](model).
- If you want to learn about other utility functions, read about [Other SDKs](other).
43 changes: 30 additions & 13 deletions docs/reference/swcli/dataset.md
@@ -41,8 +41,9 @@ Build Starwhale Dataset. This command only supports building standalone datasets.
|`-vf` or `--video` or `--video-folder`| N | String | | Build dataset from video folder, the folder should contain the video files. |
|`-h` or `--handler` or `--python-handler`| N | String | | Build dataset from python executor handler, the handler format is [module path]:[class or func name]. |
|`-f` or `--yaml` or `--dataset-yaml`| N | dataset.yaml in cwd | | Build dataset from dataset.yaml file. Default uses dataset.yaml in the work directory (cwd). |
|`-jf` or `--json-file`| N | String | | Build dataset from json file, the json file option is a json file path or a http downloaded url.The json content structure should be a list[dict] or tuple[dict]. |
|`-jf` or `--json`| N | String | | Build dataset from json or jsonl file. The option accepts a json/jsonl file path or an HTTP URL. The json content structure should be a list[dict] or tuple[dict]. |
|`-hf` or `--huggingface`| N | String | | Build dataset from huggingface dataset, the huggingface option is a huggingface repo name. |
|`-c` or `--csv`| N | String | | Build dataset from csv files. The option accepts a csv file path, a directory path, or an HTTP URL, and can be used multiple times.|

**Data source options are mutually exclusive; only one option is accepted.** If none is set, the `swcli dataset build` command will use the dataset yaml mode to build the dataset with the `dataset.yaml` in the cwd.

@@ -61,11 +62,18 @@
| `-w` or `--workdir` | N | Python Handler Mode | String | cwd | work dir to search handler. |
| `--auto-label`/`--no-auto-label` | N | Image/Video/Audio Folder Mode | Boolean | True | Whether to auto label by the sub-folder name. |
| `--field-selector` | N | JSON File Mode | String | | The field from which you would like to extract dataset array items. The field is split by the dot (.) symbol. |
| `--subset` | N | Huggingface Mode | String | | Huggingface dataset subset name. If the huggingface dataset has multiple subsets, you must specify the subset name. |
| `--split` | N | Huggingface Mode | String | | Huggingface dataset split name. If the split name is not specified, the all splits dataset will be built. |
| `--subset` | N | Huggingface Mode | String | | Huggingface dataset subset name. If the subset name is not specified, all subsets will be built. |
| `--split` | N | Huggingface Mode | String | | Huggingface dataset split name. If the split name is not specified, all splits will be built. |
| `--revision` | N | Huggingface Mode | String | main | Version of the dataset script to load. Defaults to 'main'. The option value accepts a tag name, branch name, or commit hash. |
| `--add-hf-info`/`--no-add-hf-info` | N | Huggingface Mode | Boolean | True | Whether to add huggingface dataset info to the dataset rows; currently supports adding the subset and split to the dataset rows. The subset uses the `_hf_subset` field name, and the split uses the `_hf_split` field name.|
| `--cache`/`--no-cache` | N | Huggingface Mode | Boolean | True | Whether to use the huggingface dataset cache (download + local hf dataset). |
| `-t` or `--tag` | N | Global | String | | Dataset tags, the option can be used multiple times. |
| `--encoding` | N | CSV/JSON/JSONL Mode | String | | File encoding. |
| `--dialect` | N | CSV Mode | String | `excel` | The csv file dialect; the default is `excel`. Currently supports `excel`, `excel-tab` and `unix` formats. |
| `--delimiter` | N | CSV Mode | String | `,` | A one-character string used to separate fields in the csv file. |
| `--quotechar` | N | CSV Mode | String | `"` | A one-character string used to quote fields containing special characters, such as the delimiter or quotechar, or which contain new-line characters. |
| `--skipinitialspace`/`--no-skipinitialspace` | N | CSV Mode | Bool | False | Whether to skip spaces after the delimiter in the csv file. |
| `--strict`/`--no-strict` | N | CSV Mode | Bool | False | When True, raise an exception if the csv is not well formed.|

### Examples for dataset building

@@ -90,16 +98,25 @@

```bash
swcli dataset build --audio-folder /path/to/audio/folder # build dataset from /path/to/audio/folder, search all audio type files.
#- from video folder
swcli dataset build --video-folder /path/to/video/folder # build dataset from /path/to/video/folder, search all video type files.

#- from json file
swcli dataset build --json-file /path/to/example.json
swcli dataset build --json-file http://example.com/example.json
swcli dataset build --json-file /path/to/example.json --field-selector a.b.c # extract the json_content["a"]["b"]["c"] field from the json file.
swcli dataset build --name qald9 --json-file https://raw.githubusercontent.com/ag-sc/QALD/master/9/data/qald-9-test-multilingual.json --field-selector questions
#- from json/jsonl file
swcli dataset build --json /path/to/example.json
swcli dataset build --json http://example.com/example.json
swcli dataset build --json /path/to/example.json --field-selector a.b.c # extract the json_content["a"]["b"]["c"] field from the json file.
swcli dataset build --name qald9 --json https://raw.githubusercontent.com/ag-sc/QALD/master/9/data/qald-9-test-multilingual.json --field-selector questions
swcli dataset build --json /path/to/test01.jsonl --json /path/to/test02.jsonl
swcli dataset build --json https://modelscope.cn/api/v1/datasets/damo/100PoisonMpts/repo\?Revision\=master\&FilePath\=train.jsonl

#- from huggingface dataset
swcli dataset build --huggingface mnist
swcli dataset build -hf mnist --no-cache
swcli dataset build -hf cais/mmlu --subset anatomy --split auxiliary_train --revision 7456cfb

#- from csv files
swcli dataset build --csv /path/to/example.csv
swcli dataset build --csv /path/to/example.csv --csv /path/to/example2.csv
swcli dataset build --csv /path/to/csv-dir
swcli dataset build --csv http://example.com/example.csv
swcli dataset build --name product-desc-modelscope --csv https://modelscope.cn/api/v1/datasets/lcl193798/product_description_generation/repo\?Revision\=master\&FilePath\=test.csv --encoding=utf-8-sig
```
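
The CSV parsing options from the table above can be combined with `--csv`; a short sketch with illustrative file paths:

```bash
#- from csv files with custom parsing options (illustrative paths)
swcli dataset build --csv /path/to/data.tsv --dialect excel-tab --quotechar "'"
swcli dataset build --csv /path/to/data.csv --delimiter ";" --skipinitialspace --encoding utf-8
```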

## swcli dataset copy {#copy}
@@ -318,11 +335,11 @@ Each dataset version can have any number of tags, but duplicated tag names are not allowed.

```bash
swcli dataset tag mnist

#- add tags for the mnist dataset
swcli dataset tag mnist -t t1 -t t2
swcli dataset tag cloud://cloud.starwhale.cn/project/public:starwhale/dataset/mnist/version/latest -t t1 --force-add
swcli dataset tag mnist -t t1 --quiet
swcli dataset tag mnist t1 t2
swcli dataset tag cloud://cloud.starwhale.cn/project/public:starwhale/dataset/mnist/version/latest t1 --force-add
swcli dataset tag mnist t1 --quiet

#- remove tags for the mnist dataset
swcli dataset tag mnist -r -t t1 -t t2
swcli dataset tag cloud://cloud.starwhale.cn/project/public:starwhale/dataset/mnist --remove -t t1
swcli dataset tag mnist -r t1 t2
swcli dataset tag cloud://cloud.starwhale.cn/project/public:starwhale/dataset/mnist --remove t1
```
5 changes: 0 additions & 5 deletions docs/reference/swcli/job.md
@@ -42,11 +42,6 @@ swcli [GLOBAL OPTIONS] job info [OPTIONS] <JOB>

`JOB` is a [job URI](../../swcli/uri.md#job).

| Option | Required | Type | Defaults | Description |
| --- | --- | --- | --- | --- |
| `--page` | N | Integer | 1 | The starting page number. Server and cloud instances only. |
| `--size` | N | Integer | 20 | The number of items in one page. Server and cloud instances only. |

## swcli job list {#list}

```bash
swcli [GLOBAL OPTIONS] job list [OPTIONS]
```
10 changes: 5 additions & 5 deletions docs/reference/swcli/model.md
@@ -357,11 +357,11 @@ Each model version can have any number of tags, but duplicated tag names are not allowed.

```bash
swcli model tag mnist

#- add tags for the mnist model
swcli model tag mnist -t t1 -t t2
swcli model tag cloud://cloud.starwhale.cn/project/public:starwhale/model/mnist/version/latest -t t1 --force-add
swcli model tag mnist -t t1 --quiet
swcli model tag mnist t1 t2
swcli model tag cloud://cloud.starwhale.cn/project/public:starwhale/model/mnist/version/latest t1 --force-add
swcli model tag mnist t1 --quiet

#- remove tags for the mnist model
swcli model tag mnist -r -t t1 -t t2
swcli model tag cloud://cloud.starwhale.cn/project/public:starwhale/model/mnist --remove -t t1
swcli model tag mnist -r t1 t2
swcli model tag cloud://cloud.starwhale.cn/project/public:starwhale/model/mnist --remove t1
```
10 changes: 5 additions & 5 deletions docs/reference/swcli/runtime.md
@@ -305,11 +305,11 @@ Each runtime version can have any number of tags, but duplicated tag names are not allowed.

```bash
swcli runtime tag pytorch

#- add tags for the pytorch runtime
swcli runtime tag mnist -t t1 -t t2
swcli runtime tag cloud://cloud.starwhale.cn/project/public:starwhale/runtime/pytorch/version/latest -t t1 --force-add
swcli runtime tag mnist -t t1 --quiet
swcli runtime tag pytorch t1 t2
swcli runtime tag cloud://cloud.starwhale.cn/project/public:starwhale/runtime/pytorch/version/latest t1 --force-add
swcli runtime tag pytorch t1 --quiet

#- remove tags for the pytorch runtime
swcli runtime tag mnist -r -t t1 -t t2
swcli runtime tag cloud://cloud.starwhale.cn/project/public:starwhale/runtime/pytorch --remove -t t1
swcli runtime tag pytorch -r t1 t2
swcli runtime tag cloud://cloud.starwhale.cn/project/public:starwhale/runtime/pytorch --remove t1
```