Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add fallback to starters pull on kedro new #3900

Merged
merged 40 commits into from
Jul 15, 2024
Merged
Show file tree
Hide file tree
Changes from 24 commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
48064c5
Add fallback to starters pull on kedro new
lrcouto May 29, 2024
791e821
Merge branch 'main' into fallback-to-starters-git-pull
lrcouto May 29, 2024
2ffa441
Lint
lrcouto May 29, 2024
4bd7ec9
Add types-requests for mypy
lrcouto May 29, 2024
41c7a60
allow checkout flag to be used for different starters version
lrcouto Jun 4, 2024
aa42e54
Attempt to fix e2e tests
lrcouto Jun 4, 2024
11e599b
Store starters version on env variable
lrcouto Jun 5, 2024
a53b459
Add unit tests
lrcouto Jun 5, 2024
b1b03ea
Fix unit tests
lrcouto Jun 5, 2024
6aca1f4
Add requests_mock
lrcouto Jun 5, 2024
2572d3d
Catch exception if request fails
lrcouto Jun 5, 2024
6867f92
Lint
lrcouto Jun 5, 2024
5088d84
Change error condition on _get_latest_starters_version
lrcouto Jun 5, 2024
18790ad
Merge branch 'main' into fallback-to-starters-git-pull
lrcouto Jun 6, 2024
15ba4da
Change env variable name to be more specific
lrcouto Jun 6, 2024
d2358af
Merge branch 'main' into fallback-to-starters-git-pull
lrcouto Jun 12, 2024
c3bc522
Merge branch 'main' into fallback-to-starters-git-pull
lrcouto Jun 13, 2024
2a3572f
Set all starter options to pull from main if versions don't match
lrcouto Jun 14, 2024
4a11ed2
Add tests to cover changes on _make_cookiecutter_args_and_fetch_template
lrcouto Jun 14, 2024
8298dbd
Merge branch 'main' into fallback-to-starters-git-pull
lrcouto Jun 18, 2024
a8d3631
Merge branch 'main' into fallback-to-starters-git-pull
lrcouto Jun 19, 2024
661b124
Merge branch 'main' into fallback-to-starters-git-pull
lrcouto Jun 20, 2024
c10b0ba
Merge branch 'main' into fallback-to-starters-git-pull
lrcouto Jun 20, 2024
7f658dd
remove redundant assignments
lrcouto Jun 21, 2024
486671d
Merge branch 'main' into fallback-to-starters-git-pull
lrcouto Jun 25, 2024
3e35466
Merge branch 'main' into fallback-to-starters-git-pull
lrcouto Jun 27, 2024
92b0058
Merge branch 'main' into fallback-to-starters-git-pull
lrcouto Jul 1, 2024
4077bad
Changes on logic for fetching templates
lrcouto Jul 1, 2024
c7b261d
Lint
lrcouto Jul 1, 2024
4ee2587
Update test to match changes on template fetching
lrcouto Jul 1, 2024
5cd8dc7
Merge branch 'main' into fallback-to-starters-git-pull
lrcouto Jul 2, 2024
715afe6
Merge branch 'main' into fallback-to-starters-git-pull
lrcouto Jul 4, 2024
c30e6eb
Merge branch 'main' into fallback-to-starters-git-pull
lrcouto Jul 5, 2024
eaacab8
Merge branch 'main' into fallback-to-starters-git-pull
lrcouto Jul 8, 2024
c279c6a
Change logic of version comparison, add tests
lrcouto Jul 8, 2024
d6246b3
Merge branch 'fallback-to-starters-git-pull' of github.com:kedro-org/…
lrcouto Jul 8, 2024
9ca8844
Merge branch 'main' into fallback-to-starters-git-pull
lrcouto Jul 9, 2024
34e60ba
Merge branch 'main' into fallback-to-starters-git-pull
lrcouto Jul 10, 2024
5a9142c
Merge branch 'main' into fallback-to-starters-git-pull
lrcouto Jul 10, 2024
8e4c8f0
Extract checkout logic to its own function
lrcouto Jul 10, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 50 additions & 7 deletions kedro/framework/cli/starters.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
"""
from __future__ import annotations

import logging
import os
import re
import shutil
Expand All @@ -17,6 +18,7 @@
from typing import Any, Callable

import click
import requests
import yaml
from attrs import define, field
from importlib_metadata import EntryPoints
Expand Down Expand Up @@ -95,7 +97,44 @@ class KedroStarterSpec:
KEDRO_PATH = Path(kedro.__file__).parent
TEMPLATE_PATH = KEDRO_PATH / "templates" / "project"

_STARTERS_REPO = "git+https://github.com/kedro-org/kedro-starters.git"

def _get_latest_starters_version() -> str:
if "KEDRO_STARTERS_VERSION" not in os.environ:
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
headers = {}
if GITHUB_TOKEN:
headers["Authorization"] = f"token {GITHUB_TOKEN}"

try:
response = requests.get(
"https://api.github.com/repos/kedro-org/kedro-starters/releases/latest",
headers=headers,
timeout=10,
)
response.raise_for_status() # Raise an HTTPError for bad status codes
latest_release = response.json()
except requests.exceptions.RequestException as e:
logging.error(f"Error fetching kedro-starters latest release version: {e}")
return ""

os.environ["KEDRO_STARTERS_VERSION"] = latest_release["tag_name"]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We started discussing the use of environment variables here to optimise time for different runs, such as in CI/CD pipelines. However, I read the following about os.environ:

The environment variable set using os.environ within a Python script is only set for the duration of the current process (i.e., the script execution) and does not persist beyond that. It will not be available to other processes or sessions, and once the script finishes executing, the environment variable will be lost.

If this is correct, it doesn't make sense to set this type of environment variable. Have you tested that it works?

return str(latest_release["tag_name"])
else:
return str(os.getenv("KEDRO_STARTERS_VERSION"))


def _kedro_and_starters_version_identical() -> bool:
starters_version = _get_latest_starters_version()
return True if version == starters_version else False


_STARTERS_REPO = (
"git+https://github.com/kedro-org/kedro-starters.git"
if _kedro_and_starters_version_identical()
else "https://github.com/kedro-org/kedro-starters.git@main"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What happens if we point to the main branch (_STARTERS_REPO="https://github.com/kedro-org/kedro-starters.git@main") but then below in _make_cookiecutter_args_and_fetch_template we set the checkout version as well? For example in this case:

if "PySpark" in tools and "Kedro Viz" in tools:
        # Use the spaceflights-pyspark-viz starter if both PySpark and Kedro Viz are chosen.
        cookiecutter_args["directory"] = "spaceflights-pyspark-viz"
        # Ensures we use the same tag version of kedro for kedro-starters
        cookiecutter_args["checkout"] = version

Will it then work as we expect I mean creating a project and passing the branch and the checkout version to the cookiecutter?

def _create_project(

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will lead us to cookiecutter shenanigans, which is where these parameters are ultimately used. You have to dig down four or five functions to see it, but cookiecutter is using the template parameter to determine the repo address and the checkout parameter to determine the branch, tag or commit ID. What it does is the following:

    if clone:
        try:
            subprocess.check_output(  # nosec
                [repo_type, 'clone', repo_url],
                cwd=clone_to_dir,
                stderr=subprocess.STDOUT,
            )
            if checkout is not None:
                checkout_params = [checkout]
                # Avoid Mercurial "--config" and "--debugger" injection vulnerability
                if repo_type == "hg":
                    checkout_params.insert(0, "--")
                subprocess.check_output(  # nosec
                    [repo_type, 'checkout', *checkout_params],
                    cwd=repo_dir,
                    stderr=subprocess.STDOUT,
                )

So if you passed the repo_url as the address to the main branch and, for example, passed the checkout value as a different branch, cookiecutter would clone the main branch and then checkout to the other branch.

Copy link
Contributor

@ElenaKhaustova ElenaKhaustova Jun 7, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you for explaining, that's what I thought as well. Doesn't it mean that in this case instead of main branch (which we aim for) we again will have the released branch (as the checkout argument is the installed kedro version in case it's not passed) which is not desired?

checkout = checkout or version

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's a good point, let me test that.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah I think you're right. The stuff passed to cookiecutter has to be formatted in a different way.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was thinking of refactoring the logic in _make_cookiecutter_args_and_fetch_template because now we have two places where we define the template which is a bit confusing. But that can be done in a separate PR.

Now, we can just identify the case when repos versions are not matched (as you do for setting the _STARTERS_REPO). In this case, we only need to use the template path provided but skip setting up the checkout parameter in _make_cookiecutter_args_and_fetch_template.

Copy link
Contributor

@ElenaKhaustova ElenaKhaustova Jun 7, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Or we can just do something like

checkout = checkout or version if _kedro_and_starters_version_identical() else None

checkout = checkout or version

Edit: updated link

Copy link
Contributor

@DimedS DimedS Jul 9, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you please comment why do we need that two lines here now:

    if _kedro_version_equal_or_lower_to_starters(version)
    else "https://github.com/kedro-org/kedro-starters.git@main"

)


_OFFICIAL_STARTER_SPECS = [
KedroStarterSpec("astro-airflow-iris", _STARTERS_REPO, "astro-airflow-iris"),
KedroStarterSpec("spaceflights-pandas", _STARTERS_REPO, "spaceflights-pandas"),
Expand Down Expand Up @@ -766,35 +805,39 @@ def _make_cookiecutter_args_and_fetch_template(
"extra_context": config,
}

if checkout:
cookiecutter_args["checkout"] = checkout
kedro_version_match_starters = _kedro_and_starters_version_identical()

if directory:
cookiecutter_args["directory"] = directory

tools = config["tools"]
example_pipeline = config["example_pipeline"]
starter_path = "git+https://github.com/kedro-org/kedro-starters.git"
checkout_version = version if kedro_version_match_starters else "main"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are we sure that the checkout key should be different for the cases below? In my understanding, we should:

  1. Use the checkout argument passed
  2. In case the checkout is None go with version if kedro version matches the starters
  3. Use main otherwise

Now we're mixing checkout and version which is a bit confusing. Please correct me if I'm missing any other logic behind it or if there's any particular reason why we don't want to use the provided checkout argument in case some tools are selected.


if "PySpark" in tools and "Kedro Viz" in tools:
# Use the spaceflights-pyspark-viz starter if both PySpark and Kedro Viz are chosen.
cookiecutter_args["directory"] = "spaceflights-pyspark-viz"
# Ensures we use the same tag version of kedro for kedro-starters
cookiecutter_args["checkout"] = version
cookiecutter_args["checkout"] = checkout_version
elif "PySpark" in tools:
# Use the spaceflights-pyspark starter if only PySpark is chosen.
cookiecutter_args["directory"] = "spaceflights-pyspark"
cookiecutter_args["checkout"] = version
cookiecutter_args["checkout"] = checkout_version
elif "Kedro Viz" in tools:
# Use the spaceflights-pandas-viz starter if only Kedro Viz is chosen.
cookiecutter_args["directory"] = "spaceflights-pandas-viz"
merelcht marked this conversation as resolved.
Show resolved Hide resolved
cookiecutter_args["checkout"] = checkout_version
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree with Merel, it's better to put cookiecutter_args["checkout"] = checkout_version on top and write it only once, if we can handle that section properly:

else:
        # Use the default template path for non PySpark, Viz or example options:
        starter_path = template_path

could you please explain checkout logic with that part, what should we do here in terms of checkout? As I understood here we are taking standard kedro template from kedro repo?

elif example_pipeline == "True":
# Use spaceflights-pandas starter if example was selected, but PySpark or Viz wasn't
cookiecutter_args["directory"] = "spaceflights-pandas"
cookiecutter_args["checkout"] = version
cookiecutter_args["checkout"] = checkout_version
else:
# Use the default template path for non PySpark, Viz or example options:
starter_path = template_path

cookiecutter_args["checkout"] = (
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Related to the previous comment.

If the checkout argument is provided and kedro version doesn't match starters, do we indeed want to sick to main?

In this case, there's also a discrepancy with _get_cookiecutter_dir method, where we use the provided checkout argument.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't this just be cookiecutter_args["checkout"] = checkout_version as well?

checkout if checkout and kedro_version_match_starters else "main"
)
return cookiecutter_args, starter_path


Expand Down
4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -71,12 +71,14 @@ test = [
"pytest-mock>=1.7.1, <4.0",
"pytest-xdist[psutil]~=2.2.1",
"pytest>=7.2,<9.0",
"s3fs>=2021.4",
"s3fs>=2021.4, <2024.6", # Upper bound set arbitrarily, to be reassessed in early 2024
"requests_mock",
"trufflehog~=2.1",
# mypy related dependencies
"pandas-stubs",
"types-PyYAML",
"types-cachetools",
"types-requests",
"types-toml"
]
docs = [
Expand Down
175 changes: 175 additions & 0 deletions tests/framework/cli/test_starters.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,13 @@
"""
from __future__ import annotations

import logging
import os
import shutil
from pathlib import Path

import pytest
import requests
import toml
import yaml
from click.testing import CliRunner
Expand All @@ -18,6 +21,7 @@
KedroStarterSpec,
_convert_tool_short_names_to_numbers,
_fetch_validate_parse_config_from_user_prompts,
_get_latest_starters_version,
_make_cookiecutter_args_and_fetch_template,
_parse_tools_input,
_parse_yes_no_to_bool,
Expand Down Expand Up @@ -1687,3 +1691,174 @@ def test_flag_is_absent(self, fake_kedro_cli):

telemetry_file_path = Path(repo_name + "/.telemetry")
assert not telemetry_file_path.exists()


@pytest.fixture
def mock_env_vars(mocker):
"""Fixture to mock environment variables"""
mocker.patch.dict(os.environ, {"GITHUB_TOKEN": "fake_token"}, clear=True)


class TestGetLatestStartersVersion:
def test_get_latest_starters_version(self, mock_env_vars, requests_mock):
"""Test _get_latest_starters_version when KEDRO_STARTERS_VERSION is not set"""
latest_version = "1.2.3"

# Mock the GitHub API response
requests_mock.get(
"https://api.github.com/repos/kedro-org/kedro-starters/releases/latest",
json={"tag_name": latest_version},
status_code=200,
)

result = _get_latest_starters_version()

assert result == latest_version
assert os.getenv("KEDRO_STARTERS_VERSION") == latest_version

def test_get_latest_starters_version_with_env_set(self, mocker):
"""Test _get_latest_starters_version when KEDRO_STARTERS_VERSION is already set"""
expected_version = "1.2.3"
mocker.patch.dict(os.environ, {"KEDRO_STARTERS_VERSION": expected_version})

result = _get_latest_starters_version()

assert result == expected_version

def test_get_latest_starters_version_error(
self, mock_env_vars, requests_mock, caplog
):
"""Test _get_latest_starters_version when the request raises an exception"""
with caplog.at_level(logging.ERROR):
# Mock the request to raise a RequestException
requests_mock.get(
"https://api.github.com/repos/kedro-org/kedro-starters/releases/latest",
exc=requests.exceptions.RequestException("Request failed"),
)

result = _get_latest_starters_version()

assert result == ""
assert (
"Error fetching kedro-starters latest release version: Request failed"
in caplog.text
)


class TestFetchCookiecutterArgsWhenKedroVersionDifferentFromStarters:
@pytest.fixture
def config(self):
return {
"tools": [],
"example_pipeline": "False",
"output_dir": "/my/output/dir",
}

def test_kedro_version_match_starters_true(self, config, mocker):
mocker.patch(
"kedro.framework.cli.starters._kedro_and_starters_version_identical",
return_value=True,
)

config["tools"] = ["PySpark", "Kedro Viz"]
checkout = ""
directory = "my-directory"
template_path = "my/template/path"

expected_args = {
"output_dir": "/my/output/dir",
"no_input": True,
"extra_context": config,
"checkout": version,
"directory": "spaceflights-pyspark-viz",
}
expected_path = "git+https://github.com/kedro-org/kedro-starters.git"

result_args, result_path = _make_cookiecutter_args_and_fetch_template(
config, checkout, directory, template_path
)

assert result_args == expected_args
assert result_path == expected_path

def test_kedro_version_match_starters_false(self, config, mocker):
mocker.patch(
"kedro.framework.cli.starters._kedro_and_starters_version_identical",
return_value=False,
)

config["tools"] = ["PySpark", "Kedro Viz"]
checkout = "my-checkout"
directory = "my-directory"
template_path = "my/template/path"

expected_args = {
"output_dir": "/my/output/dir",
"no_input": True,
"extra_context": config,
"checkout": "main",
"directory": "spaceflights-pyspark-viz",
}
expected_path = "git+https://github.com/kedro-org/kedro-starters.git"

result_args, result_path = _make_cookiecutter_args_and_fetch_template(
config, checkout, directory, template_path
)

assert result_args == expected_args
assert result_path == expected_path

def test_no_tools_and_example_pipeline(self, config, mocker):
mocker.patch(
"kedro.framework.cli.starters._kedro_and_starters_version_identical",
return_value=False,
)

config["tools"] = []
config["example_pipeline"] = "True"
checkout = ""
directory = ""
template_path = "my/template/path"

expected_args = {
"output_dir": "/my/output/dir",
"no_input": True,
"extra_context": config,
"checkout": "main",
"directory": "spaceflights-pandas",
}
expected_path = "git+https://github.com/kedro-org/kedro-starters.git"

result_args, result_path = _make_cookiecutter_args_and_fetch_template(
config, checkout, directory, template_path
)

assert result_args == expected_args
assert result_path == expected_path

def test_no_tools_no_example_pipeline(self, config, mocker):
mocker.patch(
"kedro.framework.cli.starters._kedro_and_starters_version_identical",
return_value=False,
)

config["tools"] = []
config["example_pipeline"] = "False"
checkout = ""
directory = ""
template_path = "my/template/path"

expected_args = {
"output_dir": "/my/output/dir",
"no_input": True,
"extra_context": config,
"checkout": "main",
}
expected_path = "my/template/path"

result_args, result_path = _make_cookiecutter_args_and_fetch_template(
config, checkout, directory, template_path
)

assert result_args == expected_args
assert result_path == expected_path