Skip to content

Commit

Permalink
Project import generated by Copybara.
Browse files Browse the repository at this point in the history
GitOrigin-RevId: fd1e1cd39b9a74a1b269fb9716dc8b491e75f6ac
  • Loading branch information
Gretel Team authored and johnnygreco committed Nov 21, 2024
1 parent ac55e06 commit d3a1070
Show file tree
Hide file tree
Showing 14 changed files with 1,253 additions and 389 deletions.
6 changes: 5 additions & 1 deletion src/gretel_client/navigator/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,6 @@
from gretel_client.navigator.data_designer.sample_to_dataset import (
DataDesignerFromSampleRecords,
)
from gretel_client.navigator.data_designer.factory import DataDesignerFactory
from gretel_client.navigator.data_designer.interface import DataDesigner
from gretel_client.navigator.workflow import NavigatorWorkflow
from gretel_client.navigator.workflow import DataDesignerWorkflow
1 change: 0 additions & 1 deletion src/gretel_client/navigator/client/remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,6 @@ def submit_batch_workflow(
logger.info(f"▶️ Starting your workflow run to generate {num_records} records:")
logger.info(f" |-- project_name: {project.name}")
logger.info(f" |-- project_id: {project.project_guid}")
logger.info(f" |-- workflow_id: {batch_response.workflow_id}")
logger.info(f" |-- workflow_run_id: {batch_response.workflow_run_id}")
logger.info(f"🔗 -> {workflow_run_url}")

Expand Down
27 changes: 20 additions & 7 deletions src/gretel_client/navigator/data_designer/data_column.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,25 @@ def generate_context_column_string(self, exclude: Optional[set[str]] = None) ->
+ "\n"
)

def get_system_prompt(
    self, special_system_instructions: Optional[str] = None
) -> str:
    """Build the system prompt used by this column's generation task.

    Args:
        special_system_instructions: Optional extra instructions that are
            spliced into the system prompt template. When None, the template's
            `special_instructions` slot is left empty.

    Returns:
        The fully formatted system prompt string for `self.llm_type`.
    """
    # Wrap the extra instructions in newlines so they sit on their own
    # lines inside the prompt template; omit entirely when not provided.
    if special_system_instructions is None:
        instructions = ""
    else:
        instructions = f"\n{special_system_instructions}\n"
    template = system_prompt_dict[self.llm_type]
    return template.format(special_instructions=instructions)

def to_generation_task(
self,
special_system_instructions: Optional[str] = None,
Expand All @@ -137,13 +156,7 @@ def to_generation_task(
response_column_name=self.name,
workflow_label=f"generating {self.name}",
llm_type=self.llm_type,
system_prompt=system_prompt_dict[self.llm_type].format(
special_instructions=(
""
if special_system_instructions is None
else f"\n{special_system_instructions}\n"
)
),
system_prompt=self.get_system_prompt(special_system_instructions),
client=client,
)

Expand Down
121 changes: 121 additions & 0 deletions src/gretel_client/navigator/data_designer/factory.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
import logging

from pathlib import Path
from typing import Optional, Union

import pandas as pd

from gretel_client.navigator.data_designer.interface import DataDesigner
from gretel_client.navigator.data_designer.sample_to_dataset import (
DataDesignerFromSampleRecords,
)
from gretel_client.navigator.log import get_logger
from gretel_client.navigator.tasks.types import (
DEFAULT_MODEL_SUITE,
ModelSuite,
RecordsT,
)

logger = get_logger(__name__, level=logging.INFO)


class DataDesignerFactory:
    """Factory for constructing DataDesigner instances.

    Each class method is an alternate entry point for building a DataDesigner,
    depending on your use case and desired workflow.

    Allowed session keyword arguments:
        api_key (str): Your Gretel API key. If set to "prompt" and no API key
            is found on the system, you will be prompted for the key.
        endpoint (str): Specifies the Gretel API endpoint. This must be a fully
            qualified URL. The default is "https://api.gretel.cloud".
        default_runner (str): Specifies the runner mode. Must be one of "cloud",
            "local", "manual", or "hybrid". The default is "cloud".
        artifact_endpoint (str): Specifies the endpoint for project and model
            artifacts. Defaults to "cloud" for running in Gretel Cloud. If
            working in hybrid mode, set to the URL of your artifact storage bucket.
        cache (str): Valid options are "yes" or "no". If set to "no", the session
            configuration will not be written to disk. If set to "yes", the
            session configuration will be written to disk only if one doesn't
            already exist. The default is "no".
        validate (bool): If `True`, will validate the login credentials at
            instantiation. The default is `False`.
        clear (bool): If `True`, existing Gretel credentials will be removed.
            The default is `False`.
    """

    @classmethod
    def from_blank_canvas(
        cls, model_suite: ModelSuite = DEFAULT_MODEL_SUITE, **kwargs
    ) -> DataDesigner:
        """Create an empty DataDesigner to be built up programmatically.

        Equivalent to instantiating a DataDesigner object directly.

        Args:
            model_suite: The model suite to use for generating synthetic data.
                Defaults to the apache-2.0 licensed model suite.
            **kwargs: Additional keyword arguments forwarded to the
                DataDesigner constructor.

        Returns:
            A DataDesigner instance with a blank canvas.
        """
        logger.info("🎨 Creating DataDesigner instance from blank canvas")
        designer = DataDesigner(model_suite=model_suite, **kwargs)
        return designer

    @classmethod
    def from_config(cls, config: dict, **kwargs) -> DataDesigner:
        """Create a DataDesigner from a configuration.

        Lets you specify your data design declaratively (e.g. via a YAML
        configuration file); the config is built into a DataDesigner the same
        way you would do so programmatically.

        Args:
            config: A YAML configuration file, dict, or string that fully
                specifies the data design.
            **kwargs: Additional keyword arguments forwarded to the
                DataDesigner constructor.

        Returns:
            A DataDesigner configured with the data seeds and generated data
            columns defined in the configuration.
        """
        logger.info("🎨 Creating DataDesigner instance from config")
        designer = DataDesigner.from_config(config, **kwargs)
        return designer

    @classmethod
    def from_sample_records(
        cls,
        sample_records: Union[str, Path, pd.DataFrame, RecordsT],
        *,
        subsample_size: Optional[int] = None,
        model_suite: ModelSuite = DEFAULT_MODEL_SUITE,
        **kwargs,
    ) -> DataDesigner:
        """Create a DataDesigner seeded from sample records.

        Use this entry point when you want to turn a few sample records into a
        rich, diverse synthetic dataset (Sample-to-Dataset).

        Args:
            sample_records: Sample records from which categorical data seeds
                will be extracted and optionally used to create generated data
                columns.
            subsample_size: The number of records to use from the sample
                records. If None, all records will be used. If the subsample
                size is larger than the sample records, the full sample will
                be used.
            model_suite: The model suite to use for generating synthetic data.
                Defaults to the apache-2.0 licensed model suite.
            **kwargs: Additional keyword arguments forwarded to the
                DataDesignerFromSampleRecords constructor.

        Returns:
            A DataDesigner configured to extract data seeds from the sample
            records and optionally create generated data columns for each
            field in the sample records.
        """
        logger.info("🎨 Creating DataDesigner instance from sample records")
        designer = DataDesignerFromSampleRecords(
            sample_records=sample_records,
            subsample_size=subsample_size,
            model_suite=model_suite,
            **kwargs,
        )
        return designer
Loading

0 comments on commit d3a1070

Please sign in to comment.