diff --git a/backends/src/test_utils/mod.rs b/backends/src/test_utils/mod.rs
index f915379e8..f04022627 100644
--- a/backends/src/test_utils/mod.rs
+++ b/backends/src/test_utils/mod.rs
@@ -1,2 +1,3 @@
 pub mod gateway;
+pub mod provisioner;
 pub mod resource_recorder;
diff --git a/backends/src/test_utils/provisioner.rs b/backends/src/test_utils/provisioner.rs
new file mode 100644
index 000000000..7816b3136
--- /dev/null
+++ b/backends/src/test_utils/provisioner.rs
@@ -0,0 +1,51 @@
+use std::net::{Ipv4Addr, SocketAddr};
+
+use async_trait::async_trait;
+use portpicker::pick_unused_port;
+use shuttle_proto::provisioner::{
+    provisioner_server::{Provisioner, ProvisionerServer},
+    DatabaseDeletionResponse, DatabaseRequest, DatabaseResponse, Ping, Pong,
+};
+use tonic::transport::Server;
+
+struct ProvisionerMock;
+
+#[async_trait]
+impl Provisioner for ProvisionerMock {
+    async fn provision_database(
+        &self,
+        _request: tonic::Request<DatabaseRequest>,
+    ) -> Result<tonic::Response<DatabaseResponse>, tonic::Status> {
+        panic!("no run tests should request a db");
+    }
+
+    async fn delete_database(
+        &self,
+        _request: tonic::Request<DatabaseRequest>,
+    ) -> Result<tonic::Response<DatabaseDeletionResponse>, tonic::Status> {
+        panic!("no run tests should delete a db");
+    }
+
+    async fn health_check(
+        &self,
+        _request: tonic::Request<Ping>,
+    ) -> Result<tonic::Response<Pong>, tonic::Status> {
+        panic!("no run tests should do a health check");
+    }
+}
+
+/// Start a mocked provisioner and return the port it started on
+pub async fn get_mocked_provisioner() -> u16 {
+    let provisioner = ProvisionerMock;
+
+    let port = pick_unused_port().unwrap();
+    let provisioner_addr = SocketAddr::new(Ipv4Addr::UNSPECIFIED.into(), port);
+    tokio::spawn(async move {
+        Server::builder()
+            .add_service(ProvisionerServer::new(provisioner))
+            .serve(provisioner_addr)
+            .await
+    });
+
+    port
+}
diff --git a/backends/src/test_utils/resource_recorder.rs b/backends/src/test_utils/resource_recorder.rs
index 447609748..2f7a69e4a 100644
--- a/backends/src/test_utils/resource_recorder.rs
+++ b/backends/src/test_utils/resource_recorder.rs
@@ -22,7 +22,7 @@ impl ResourceRecorder for MockedResourceRecorder {
         &self,
         request: Request<RecordRequest>,
     ) -> Result<Response<ResultResponse>, Status> {
-        println!("recording resources");
+        println!("recording resources: {request:?}");
 
         let RecordRequest {
             project_id,
@@ -149,14 +149,6 @@
             r#type,
         } = request.into_inner();
 
-        // Fail to delete a metadata resource if requested
-        if r#type == "metadata" {
-            return Ok(Response::new(ResultResponse {
-                success: false,
-                message: Default::default(),
-            }));
-        }
-
         self.resources.lock().unwrap().retain(|r| {
             !(r.project_id == project_id && r.service_id == service_id && r.r#type == r#type)
         });
@@ -169,8 +161,7 @@
 }
 
 /// Start a mocked resource recorder and return the port it started on
-/// This mock will function like a normal resource recorder. However, it will always fail to delete metadata resources
-/// if any tests need to simulate a failure.
+/// This mock will function like a normal resource recorder.
 pub async fn get_mocked_resource_recorder() -> u16 {
     let resource_recorder = MockedResourceRecorder {
         resources: Mutex::new(Vec::new()),
diff --git a/common/src/models/deployment.rs b/common/src/models/deployment.rs
index fdbf6a390..29c848ae3 100644
--- a/common/src/models/deployment.rs
+++ b/common/src/models/deployment.rs
@@ -17,7 +17,7 @@ pub const GIT_STRINGS_MAX_LENGTH: usize = 80;
 pub const CREATE_SERVICE_BODY_LIMIT: usize = 50_000_000;
 const GIT_OPTION_NONE_TEXT: &str = "N/A";
 
-#[derive(Deserialize, Serialize)]
+#[derive(Deserialize, Serialize, Debug)]
 pub struct Response {
     pub id: Uuid,
     pub service_id: String,
diff --git a/gateway/src/api/latest.rs b/gateway/src/api/latest.rs
index 14965ad29..2d2b9f575 100644
--- a/gateway/src/api/latest.rs
+++ b/gateway/src/api/latest.rs
@@ -40,7 +40,7 @@ use tokio::sync::mpsc::Sender;
 use tokio::sync::{Mutex, MutexGuard};
 use tower::ServiceBuilder;
 use tower_http::cors::CorsLayer;
-use tracing::{error, field, instrument, trace, Span};
+use tracing::{debug, error, field, info, instrument, trace, warn, Span};
 use ttl_cache::TtlCache;
 use ulid::Ulid;
 use uuid::Uuid;
@@ -309,49 +309,30 @@ async fn delete_project(
     let project_id =
         Ulid::from_string(&project.id).expect("stored project id to be a valid ULID");
 
-    // Try to startup destroyed, errored or outdated projects
-    let project_deletable = project.state.is_ready() || project.state.is_stopped();
-    let current_version: semver::Version = env!("CARGO_PKG_VERSION")
-        .parse()
-        .expect("to have a valid semver gateway version");
-
-    let version = project
-        .state
-        .container()
-        .and_then(|container_inspect_response| {
-            container_inspect_response.image.and_then(|inner| {
-                inner
-                    .strip_prefix("public.ecr.aws/shuttle/deployer:v")
-                    .and_then(|x| x.parse::<semver::Version>().ok())
-            })
-        })
-        // Defaulting to a version that introduced a breaking change.
-        // This was the last one that introduced it at the present
-        // moment.
-        .unwrap_or(semver::Version::new(0, 39, 0));
-    // We restart the project before deletion everytime
-    // we detect it is outdated, so that we avoid by default
-    // breaking changes that can happen on the deployer
-    // side in the future.
-    if !project_deletable || version < current_version {
-        let handle = state
-            .service
-            .new_task()
-            .project(project_name.clone())
-            .and_then(task::restart(project_id))
-            .and_then(task::run_until_done())
-            .send(&state.sender)
-            .await?;
+    let handle = state
+        .service
+        .new_task()
+        .project(project_name.clone())
+        .and_then(task::destroy()) // This destroy might only recover the project from an errored state
+        .and_then(task::run_until_destroyed())
+        .and_then(task::restart(project_id))
+        .and_then(task::run_until_ready())
+        .and_then(task::destroy())
+        .and_then(task::run_until_destroyed())
+        .and_then(task::restart(project_id))
+        .and_then(task::run_until_ready())
+        .send(&state.sender)
+        .await?;
 
-        // Wait for the project to be ready
-        handle.await;
+    // Wait for the project to be ready
+    handle.await;
 
-        let new_state = state.service.find_project_by_name(&project_name).await?;
+    let new_state = state.service.find_project_by_name(&project_name).await?;
 
-        if !new_state.state.is_ready() {
-            return Err(ProjectCorrupted.into());
-        }
+    if !new_state.state.is_ready() {
+        warn!(state = ?new_state.state, "failed to restart project");
+        return Err(ProjectCorrupted.into());
     }
 
     let service = state.service.clone();
@@ -360,8 +341,10 @@
     let project_caller =
         ProjectCaller::new(state.clone(), scoped_user.clone(), req.headers()).await?;
 
+    trace!("getting deployments");
     // check that a deployment is not running
     let mut deployments = project_caller.get_deployment_list().await?;
+    debug!(?deployments, "got deployments");
     deployments.sort_by_key(|d| d.last_update);
 
     // Make sure no deployment is in the building pipeline
@@ -376,6 +359,7 @@
     });
 
     if has_bad_state {
+        warn!("has bad state");
         return Err(ProjectHasBuildingDeployment.into());
     }
 
@@ -384,6 +368,7 @@
         .filter(|d| d.state == deployment::State::Running);
 
     for running_deployment in running_deployments {
+        info!(%running_deployment, "stopping running deployment");
         let res = project_caller
             .stop_deployment(&running_deployment.id)
             .await?;
@@ -393,11 +378,13 @@
         }
     }
 
+    trace!("getting resources");
     // check if any resources exist
     let resources = project_caller.get_resources().await?;
     let mut delete_fails = Vec::new();
 
     for resource in resources {
+        info!(?resource, "deleting resource");
         let resource_type = resource.r#type.to_string();
         let res = project_caller.delete_resource(&resource_type).await?;
 
@@ -410,6 +397,7 @@
         return Err(ProjectHasResources(delete_fails).into());
     }
 
+    trace!("deleting container");
    let task = service
         .new_task()
         .project(project_name.clone())
@@ -418,6 +406,7 @@
         .await?;
     task.await;
+    trace!("removing project from state");
     service.delete_project(&project_name).await?;
 
     Ok(AxumJson("project successfully deleted".to_owned()))
 }
@@ -692,7 +681,7 @@
     };
 
     // Compute provisioner status.
-    let provisioner_status = if let Ok(channel) = service.provisioner_host().connect().await {
+    let provisioner_status = if let Ok(channel) = service.provisioner_uri().connect().await {
         let channel = ServiceBuilder::new().service(channel);
         let mut provisioner_client = ProvisionerClient::new(channel);
         if provisioner_client.health_check(Ping {}).await.is_ok() {
@@ -1740,20 +1729,6 @@
         );
     }
 
-    #[test_context(TestProject)]
-    #[tokio::test]
-    async fn api_delete_project_that_has_resources_but_fails_to_remove_them(
-        project: &mut TestProject,
-    ) {
-        project.deploy("../examples/axum/metadata").await;
-        project.stop_service().await;
-
-        assert_eq!(
-            project.router_call(Method::DELETE, "/delete").await,
-            StatusCode::INTERNAL_SERVER_ERROR
-        );
-    }
-
     #[test_context(TestProject)]
     #[tokio::test]
     async fn api_delete_project_that_has_running_deployment(project: &mut TestProject) {
@@ -1771,11 +1746,11 @@
         project.just_deploy("../examples/axum/hello-world").await;
 
         // Wait a bit to it to progress in the queue
-        sleep(Duration::from_secs(2)).await;
+        sleep(Duration::from_secs(10)).await;
 
         assert_eq!(
             project.router_call(Method::DELETE, "/delete").await,
-            StatusCode::BAD_REQUEST
+            StatusCode::OK
         );
     }
diff --git a/gateway/src/args.rs b/gateway/src/args.rs
index 015c600ce..d8c2c1caf 100644
--- a/gateway/src/args.rs
+++ b/gateway/src/args.rs
@@ -60,8 +60,8 @@ pub struct ServiceArgs {
     pub prefix: String,
     /// The address at which an active runtime container will find
     /// the provisioner service
-    #[arg(long, default_value = "provisioner")]
-    pub provisioner_host: String,
+    #[arg(long, default_value = "http://provisioner:8000")]
+    pub provisioner_uri: String,
     /// Address to reach the authentication service at
     #[arg(long, default_value = "http://127.0.0.1:8008")]
     pub auth_uri: Uri,
diff --git a/gateway/src/lib.rs b/gateway/src/lib.rs
index 7dfefd1e1..d9296f5aa 100644
--- a/gateway/src/lib.rs
+++ b/gateway/src/lib.rs
@@ -168,6 +168,7 @@ pub mod tests {
     use ring::signature::{self, Ed25519KeyPair, KeyPair};
     use shuttle_backends::auth::ConvertResponse;
     use shuttle_backends::test_utils::gateway::PermissionsMock;
+    use shuttle_backends::test_utils::provisioner::get_mocked_provisioner;
     use shuttle_backends::test_utils::resource_recorder::get_mocked_resource_recorder;
     use shuttle_common::claims::{AccountTier, Claim};
     use shuttle_common::models::deployment::DeploymentRequest;
@@ -399,6 +400,7 @@
         let auth: SocketAddr = format!("0.0.0.0:{auth_port}").parse().unwrap();
         let auth_uri: Uri = format!("http://{auth}").parse().unwrap();
         let resource_recorder_port = get_mocked_resource_recorder().await;
+        let provisioner_port = get_mocked_provisioner().await;
 
         let auth_service = AuthService::new(auth);
         auth_service
@@ -418,8 +420,6 @@
         let network_name =
             env::var("SHUTTLE_TESTS_NETWORK").unwrap_or_else(|_| "shuttle_default".to_string());
 
-        let provisioner_host = "provisioner".to_string();
-
         let docker_host = "/var/run/docker.sock".to_string();
 
         let args = StartArgs {
@@ -432,9 +432,9 @@
             docker_host,
             image,
             prefix,
-            provisioner_host,
+            provisioner_uri: format!("http://host.docker.internal:{provisioner_port}"),
             // The started containers need to reach auth on the host.
-            // For this to work, the firewall should not be blocking traffic on the `SHUTTLE_TEST_NETWORK` interface.
+            // For this to work, the firewall should not be blocking traffic on the `SHUTTLE_TESTS_NETWORK` interface.
             // The following command can be used on NixOs to allow traffic on the interface.
             // ```
             // sudo iptables -I nixos-fw -i <interface name> -j nixos-fw-accept
diff --git a/gateway/src/project.rs b/gateway/src/project.rs
index e46cfadb2..b553e5310 100644
--- a/gateway/src/project.rs
+++ b/gateway/src/project.rs
@@ -860,7 +860,7 @@ impl ProjectCreating {
         let ContainerSettings {
             image: default_image,
             prefix,
-            provisioner_host,
+            provisioner_uri,
             auth_uri,
             resource_recorder_uri,
             extra_hosts,
@@ -902,7 +902,7 @@
                 "--api-address",
                 format!("0.0.0.0:{RUNTIME_API_PORT}"),
                 "--provisioner-address",
-                format!("http://{provisioner_host}:8000"),
+                provisioner_uri,
                 "--artifacts-path",
                 "/opt/shuttle",
                 "--state",
@@ -1732,7 +1732,7 @@ where
 
 #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
 pub struct ProjectDestroyed {
-    destroyed: Option<ContainerInspectResponse>,
+    pub destroyed: Option<ContainerInspectResponse>,
 }
 
 #[async_trait]
diff --git a/gateway/src/service.rs b/gateway/src/service.rs
index 7d6ecfd1b..e60ce9f73 100644
--- a/gateway/src/service.rs
+++ b/gateway/src/service.rs
@@ -145,7 +145,7 @@ impl From for ApiError {
 pub struct ContainerSettingsBuilder {
     prefix: Option<String>,
     image: Option<String>,
-    provisioner: Option<String>,
+    provisioner_uri: Option<String>,
     auth_uri: Option<String>,
     resource_recorder_uri: Option<String>,
     network_name: Option<String>,
@@ -162,7 +162,7 @@ impl ContainerSettingsBuilder {
         let ServiceArgs {
             prefix,
             network_name,
-            provisioner_host,
+            provisioner_uri,
             auth_uri,
             resource_recorder_uri,
             image,
@@ -172,7 +172,7 @@
         } = args;
         self.prefix(prefix)
             .image(image)
-            .provisioner_host(provisioner_host)
+            .provisioner_uri(provisioner_uri)
             .auth_uri(auth_uri)
             .resource_recorder_uri(resource_recorder_uri)
             .network_name(network_name)
@@ -192,8 +192,8 @@
         self
     }
 
-    pub fn provisioner_host<S: ToString>(mut self, host: S) -> Self {
-        self.provisioner = Some(host.to_string());
+    pub fn provisioner_uri<S: ToString>(mut self, provisioner_uri: S) -> Self {
+        self.provisioner_uri = Some(provisioner_uri.to_string());
         self
     }
 
@@ -225,7 +225,7 @@
     pub async fn build(mut self) -> ContainerSettings {
         let prefix = self.prefix.take().unwrap();
         let image = self.image.take().unwrap();
-        let provisioner_host = self.provisioner.take().unwrap();
+        let provisioner_uri = self.provisioner_uri.take().unwrap();
         let auth_uri = self.auth_uri.take().unwrap();
         let resource_recorder_uri = self.resource_recorder_uri.take().unwrap();
         let extra_hosts = self.extra_hosts.take().unwrap();
@@ -236,7 +236,7 @@
         ContainerSettings {
             prefix,
             image,
-            provisioner_host,
+            provisioner_uri,
             auth_uri,
             resource_recorder_uri,
             network_name,
@@ -250,7 +250,7 @@
 pub struct ContainerSettings {
     pub prefix: String,
     pub image: String,
-    pub provisioner_host: String,
+    pub provisioner_uri: String,
     pub auth_uri: String,
     pub resource_recorder_uri: String,
     pub network_name: String,
@@ -279,7 +279,7 @@ pub struct GatewayService {
     hard_container_limit: u32,
 
     // We store these because we'll need them for the health checks
-    provisioner_host: Endpoint,
+    provisioner_uri: Endpoint,
     auth_host: Uri,
 }
 
@@ -342,7 +342,7 @@
             task_router,
             state_dir,
             permit_client,
-            provisioner_host: Endpoint::new(format!("http://{}:8000", args.provisioner_host))
+            provisioner_uri: Endpoint::new(args.provisioner_uri)
                 .expect("to have a valid provisioner endpoint"),
             auth_host: args.auth_uri,
             cch_container_limit: args.cch_container_limit,
@@ -1056,8 +1056,8 @@
             .expect("Can not parse admin credentials from path")
     }
 
-    pub fn provisioner_host(&self) -> &Endpoint {
-        &self.provisioner_host
+    pub fn provisioner_uri(&self) -> &Endpoint {
+        &self.provisioner_uri
     }
 
     pub fn auth_uri(&self) -> &Uri {
         &self.auth_host
diff --git a/gateway/src/task.rs b/gateway/src/task.rs
index 9c42f87b7..030bcb1f1 100644
--- a/gateway/src/task.rs
+++ b/gateway/src/task.rs
@@ -168,6 +168,28 @@ pub fn restart(project_id: Ulid) -> impl Task<ProjectContext, Output = Project, Error = Error>
     })
 }
 
+pub fn run_until_ready() -> impl Task<ProjectContext, Output = Project, Error = Error> {
+    run(|ctx| async move {
+        match ctx.state {
+            Project::Ready(_) | Project::Errored(_) => TaskResult::Done(ctx.state),
+            _ => TaskResult::Pending(ctx.state.next(&ctx.gateway).await.unwrap()),
+        }
+    })
+}
+
+pub fn run_until_destroyed() -> impl Task<ProjectContext, Output = Project, Error = Error> {
+    run(|ctx| async move {
+        match ctx.state {
+            Project::Errored(_) => TaskResult::Done(ctx.state),
+            Project::Destroyed(_) => {
+                // Set `destroyed` to None to prevent starting up the container with the old key
+                TaskResult::Done(Project::Destroyed(ProjectDestroyed { destroyed: None }))
+            }
+            _ => TaskResult::Pending(ctx.state.next(&ctx.gateway).await.unwrap()),
+        }
+    })
+}
+
 pub fn start_idle_deploys() -> impl Task<ProjectContext, Output = Project, Error = Error> {
     run(|ctx| async move {
         match ctx.state {
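Illustrative sketch (not part of the patch above): the new run_until_ready and run_until_destroyed combinators follow the gateway's existing polling pattern, where each tick either finishes with TaskResult::Done or advances the project state machine one step and returns TaskResult::Pending. The code below is a minimal, self-contained model of that control flow; the Project enum, next(), TaskResult, and run_until here are simplified synchronous stand-ins, not the gateway's real async types.

// Simplified stand-in for the gateway's project state machine.
#[derive(Debug, PartialEq)]
enum Project {
    Creating,
    Starting,
    Ready,
    Errored,
}

impl Project {
    // Stand-in for `ctx.state.next(&ctx.gateway)`: advance the state machine by one step.
    fn next(self) -> Project {
        match self {
            Project::Creating => Project::Starting,
            Project::Starting => Project::Ready,
            done => done,
        }
    }
}

// Stand-in for the gateway's task result: either finished, or needs another tick.
enum TaskResult {
    Pending(Project),
    Done(Project),
}

// Poll the state machine until `is_done` accepts the current state, mirroring how each tick
// of a `run_until_*` task either completes (`Done`) or schedules another step (`Pending`).
fn run_until(mut state: Project, is_done: impl Fn(&Project) -> bool) -> Project {
    loop {
        let result = if is_done(&state) {
            TaskResult::Done(state)
        } else {
            TaskResult::Pending(state.next())
        };
        match result {
            TaskResult::Done(state) => return state,
            TaskResult::Pending(next) => state = next,
        }
    }
}

fn main() {
    // Analogue of `run_until_ready`: stop once the project is Ready or Errored.
    let state = run_until(Project::Creating, |s| {
        matches!(s, Project::Ready | Project::Errored)
    });
    assert_eq!(state, Project::Ready);

    // An Errored project also counts as done, mirroring the real combinators' early exit.
    let errored = run_until(Project::Errored, |s| {
        matches!(s, Project::Ready | Project::Errored)
    });
    assert_eq!(errored, Project::Errored);

    println!("stopped in states {state:?} and {errored:?}");
}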