-
Notifications
You must be signed in to change notification settings - Fork 825
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Test in place upgrades run tests #3991
Changes from all commits
acc2a9d
04000c0
eb34600
afbc1c6
00bc3eb
cc8e2b2
20a1f4d
6d7c728
fd9a886
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -27,6 +27,7 @@ | |
*.iml | ||
bin | ||
*.o | ||
*.sln | ||
tmp | ||
terraform.tfvars | ||
terraform.tfstate* | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,14 +24,17 @@ import ( | |
"os" | ||
"os/exec" | ||
"regexp" | ||
"strconv" | ||
"strings" | ||
"time" | ||
|
||
v1 "k8s.io/api/core/v1" | ||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||
"k8s.io/apimachinery/pkg/util/wait" | ||
"k8s.io/client-go/informers" | ||
"k8s.io/client-go/kubernetes" | ||
"k8s.io/client-go/rest" | ||
"k8s.io/client-go/tools/cache" | ||
) | ||
|
||
const ( | ||
|
@@ -50,13 +53,11 @@ const ( | |
// AgonesRegistry is the public registry for Agones releases | ||
AgonesRegistry = "us-docker.pkg.dev/agones-images/release" | ||
// TestRegistry is the public registry for upgrade test container files | ||
// TODO: Create Test Registry in agones-images/ci | ||
TestRegistry = "us-docker.pkg.dev/agones-images/ci/sdk-client-test" | ||
) | ||
|
||
var ( | ||
// Dev is the current development version of Agones | ||
// TODO: Get the build version of dev (i.e. 1.44.0-dev-b765f49) | ||
Dev = os.Getenv("Dev") | ||
// ReleaseVersion is the latest released version of Agones (DEV - 1). | ||
ReleaseVersion = os.Getenv("ReleaseVersion") | ||
|
@@ -70,10 +71,21 @@ var ( | |
|
||
func main() { | ||
ctx := context.Background() | ||
cfg, err := rest.InClusterConfig() | ||
if err != nil { | ||
log.Fatal("Could not create in cluster config", cfg) | ||
} | ||
|
||
validConfigs := configTestSetup(ctx) | ||
kubeClient, err := kubernetes.NewForConfig(cfg) | ||
if err != nil { | ||
log.Fatal("Could not create the kubernetes api clientset", err) | ||
} | ||
|
||
validConfigs := configTestSetup(ctx, kubeClient) | ||
go watchGameServerPods(kubeClient, make(chan struct{}), make(map[string]podLog), len(validConfigs)*2) | ||
addAgonesRepo() | ||
runConfigWalker(ctx, validConfigs) | ||
cleanUpResources() | ||
} | ||
|
||
type versionMappings struct { | ||
|
@@ -92,9 +104,16 @@ type configTest struct { | |
gameServerPath string | ||
} | ||
|
||
// gameServerTemplate holds the values createGameServerFile prepares for the gameserver.yaml
// template. CountsAndLists can be removed from the template once CountsAndLists is GA in all
// tested versions.
type gameServerTemplate struct {
	AgonesVersion  string
	Registry       string
	CountsAndLists bool
}
|
||
// podLog records the version details of a game server pod that was seen in CrashLoopBackOff.
type podLog struct {
	// SdkVersion is taken from the pod's agones.dev/sdk-version annotation.
	SdkVersion string
	// GameServerVersion is taken from the pod's agonesVersion label.
	GameServerVersion string
}
|
||
type helmStatuses []struct { | ||
|
@@ -108,11 +127,11 @@ type helmStatuses []struct { | |
} | ||
|
||
// Determine test scenario to run | ||
func configTestSetup(ctx context.Context) []*configTest { | ||
func configTestSetup(ctx context.Context, kubeClient *kubernetes.Clientset) []*configTest { | ||
versionMap := versionMappings{} | ||
|
||
// Find the Kubernetes version of the node that this test is running on. | ||
k8sVersion := findK8sVersion(ctx) | ||
k8sVersion := findK8sVersion(ctx, kubeClient) | ||
|
||
// Get the mappings of valid Kubernetes, Agones, and Feature Gate versions from the configmap. | ||
err := json.Unmarshal([]byte(VersionMappings), &versionMap) | ||
|
@@ -124,38 +143,48 @@ func configTestSetup(ctx context.Context) []*configTest { | |
configTests := []*configTest{} | ||
for _, agonesVersion := range versionMap.K8sToAgonesVersions[k8sVersion] { | ||
ct := configTest{} | ||
// TODO: create different valid config based off of available feature gates. | ||
// containsCountsAndLists will need to be updated to return true for when CountsAndLists=true. | ||
countsAndLists := containsCountsAndLists(agonesVersion) | ||
ct.agonesVersion = agonesVersion | ||
if agonesVersion == "DEV" { | ||
if agonesVersion == "Dev" { | ||
ct.agonesVersion = Dev | ||
// Game server container cannot be created at the DEV version because go.mod can only access | ||
// published Agones versions. Use N-1 for DEV. | ||
ct.gameServerPath = createGameServerFile(ReleaseVersion) | ||
ct.gameServerPath = createGameServerFile(ReleaseVersion, countsAndLists) | ||
} else { | ||
ct.gameServerPath = createGameServerFile(agonesVersion) | ||
ct.gameServerPath = createGameServerFile(agonesVersion, countsAndLists) | ||
} | ||
// TODO: create different valid config based off of available feature gates | ||
configTests = append(configTests, &ct) | ||
} | ||
|
||
return configTests | ||
} | ||
|
||
// Finds the Kubernetes version of the Kubelet on the node that the current pod is running on. | ||
// The Kubelet version is the same version as the node. | ||
func findK8sVersion(ctx context.Context) string { | ||
cfg, err := rest.InClusterConfig() | ||
if err != nil { | ||
log.Fatal("Could not create in cluster config", cfg) | ||
// agonesMajorMinor captures the first "<major>.<minor>" pair found in an Agones version string.
var agonesMajorMinor = regexp.MustCompile(`(\d+)\.(\d+)`)

// containsCountsAndLists returns true if the agonesVersion >= 1.41.0, the release in which the
// CountsAndLists feature entered Beta (on by default). "Dev" is always treated as containing it.
//
// The major and minor components are compared as integers. Parsing "major.minor" as a float
// misorders versions: 1.9 would compare greater than 1.40, and 1.100 would collapse to 1.1.
// The process exits via log.Fatalf if no major.minor pair can be parsed from agonesVersion.
func containsCountsAndLists(agonesVersion string) bool {
	if agonesVersion == "Dev" {
		return true
	}

	parts := agonesMajorMinor.FindStringSubmatch(agonesVersion)
	if parts == nil {
		log.Fatalf("Could not find a major.minor version in agonesVersion %s", agonesVersion)
	}
	major, err := strconv.Atoi(parts[1])
	if err != nil {
		log.Fatalf("Could not convert major version of agonesVersion %s to int: %s", agonesVersion, err)
	}
	minor, err := strconv.Atoi(parts[2])
	if err != nil {
		log.Fatalf("Could not convert minor version of agonesVersion %s to int: %s", agonesVersion, err)
	}

	if major != 1 {
		return major > 1
	}
	return minor > 40
}
|
||
// Finds the Kubernetes version of the Kubelet on the node that the current pod is running on. | ||
// The Kubelet version is the same version as the node. | ||
func findK8sVersion(ctx context.Context, kubeClient *kubernetes.Clientset) string { | ||
// Wait to get pod and node as these may take a while to start on a new Autopilot cluster. | ||
var pod *v1.Pod | ||
err = wait.PollUntilContextTimeout(ctx, 5*time.Second, 7*time.Minute, true, func(ctx context.Context) (done bool, err error) { | ||
err := wait.PollUntilContextTimeout(ctx, 5*time.Second, 7*time.Minute, true, func(ctx context.Context) (done bool, err error) { | ||
pod, err = kubeClient.CoreV1().Pods(PodNamespace).Get(ctx, PodName, metav1.GetOptions{}) | ||
if err != nil { | ||
return false, nil | ||
|
@@ -251,6 +280,8 @@ func installAgonesRelease(version, registry, featureGates, imagePullPolicy, side | |
} | ||
|
||
func runConfigWalker(ctx context.Context, validConfigs []*configTest) { | ||
cancelCtx, cancel := context.WithCancel(ctx) | ||
|
||
for _, config := range validConfigs { | ||
registry := AgonesRegistry | ||
chart := HelmChart | ||
|
@@ -277,7 +308,14 @@ func runConfigWalker(ctx context.Context, validConfigs []*configTest) { | |
log.Fatalf("PollUntilContextTimeout timed out while attempting upgrade to Agones version %s. Helm Status %s", | ||
config.agonesVersion, helmStatus) | ||
} | ||
|
||
go createGameServers(cancelCtx, config.gameServerPath) | ||
// Allow some soak time at the Agones version before next upgrade | ||
time.Sleep(1 * time.Minute) | ||
} | ||
cancel() | ||
// TODO: Replace sleep with wait for the existing healthy Game Servers finish naturally by reaching their shutdown phase. | ||
time.Sleep(30 * time.Second) | ||
} | ||
|
||
// checkHelmStatus returns the status of the Helm release at a specified agonesVersion if it exists. | ||
|
@@ -304,8 +342,8 @@ func checkHelmStatus(agonesVersion string) string { | |
|
||
// Creates a gameserver yaml file from the mounted gameserver.yaml template. The name of the new | ||
// gameserver yaml is based on the Agones version, i.e. gs1440.yaml for Agones version 1.44.0 | ||
func createGameServerFile(agonesVersion string) string { | ||
gsTmpl := gameServerTemplate{Registry: TestRegistry, AgonesVersion: agonesVersion} | ||
func createGameServerFile(agonesVersion string, countsAndLists bool) string { | ||
gsTmpl := gameServerTemplate{Registry: TestRegistry, AgonesVersion: agonesVersion, CountsAndLists: countsAndLists} | ||
|
||
gsTemplate, err := template.ParseFiles("gameserver.yaml") | ||
if err != nil { | ||
|
@@ -337,3 +375,96 @@ func createGameServerFile(agonesVersion string) string { | |
|
||
return gsPath | ||
} | ||
|
||
// Create a game server every five seconds until the context is cancelled. The game server container | ||
// be the same binary version as the game server file. The SDK version is always the same as the | ||
// version of the Agones controller that created it. The Game Server shuts itself down after the | ||
// tests have run as part of the `sdk-client-test` logic. | ||
func createGameServers(ctx context.Context, gsPath string) { | ||
args := []string{"create", "-f", gsPath} | ||
ticker := time.NewTicker(5 * time.Second) | ||
|
||
for { | ||
select { | ||
case <-ctx.Done(): | ||
ticker.Stop() | ||
return | ||
case <-ticker.C: | ||
_, err := runExecCommand(KubectlCmd, args...) | ||
// TODO: Do not ignore error if unable to create due to something other than cluster scale up | ||
if err != nil { | ||
log.Printf("Could not create Gameserver %s: %s", gsPath, err) | ||
} | ||
} | ||
} | ||
} | ||
|
||
// watchGameServerPods watches all game server pods for CrashLoopBackOff. Errors if the number of | ||
// pods in CrashLoopBackOff exceeds the number of acceptedFailures. | ||
func watchGameServerPods(kubeClient *kubernetes.Clientset, stopCh chan struct{}, failedPods map[string]podLog, acceptedFailures int) { | ||
// Filter by label agones.dev/role=gameserver to only game server pods | ||
labelOptions := informers.WithTweakListOptions(func(opts *metav1.ListOptions) { | ||
opts.LabelSelector = "agones.dev/role=gameserver" | ||
}) | ||
kubeInformerFactory := informers.NewSharedInformerFactoryWithOptions(kubeClient, 5*time.Second, | ||
informers.WithNamespace("default"), labelOptions) | ||
podInformer := kubeInformerFactory.Core().V1().Pods().Informer() | ||
|
||
_, err := podInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{ | ||
UpdateFunc: func(_, newObj interface{}) { | ||
newPod := newObj.(*v1.Pod) | ||
for _, cs := range newPod.Status.ContainerStatuses { | ||
if cs.Name != "sdk-client-test" || cs.State.Waiting == nil || cs.State.Waiting.Reason != "CrashLoopBackOff" { | ||
continue | ||
} | ||
gsVersion := newPod.Labels["agonesVersion"] | ||
sdkVersion := newPod.Annotations["agones.dev/sdk-version"] | ||
log.Printf("%s for pod: %s with game server binary version %s, and SDK version %s", cs.State.Waiting.Reason, newPod.Name, gsVersion, sdkVersion) | ||
// Put failed pods into the map until it reaches capacity. | ||
failedPods[newPod.Name] = podLog{GameServerVersion: gsVersion, SdkVersion: sdkVersion} | ||
if len(failedPods) > acceptedFailures { | ||
log.Fatalf("Too many Game Server pods in CrashLoopBackOff: %v", failedPods) | ||
} | ||
} | ||
}, | ||
}) | ||
if err != nil { | ||
log.Fatal("Not able to create AddEventHandler", err) | ||
} | ||
|
||
go podInformer.Run(stopCh) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I still don't quite get the usage of the `stopCh` channel. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It's a required parameter for the .Run() function. I think it's used instead of context.Context to stop the informer anywhere in the code. We don't use it since we want the informer to run until the main thread completes. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Oh I see, I thought it's optional :) |
||
if !cache.WaitForCacheSync(stopCh, podInformer.HasSynced) { | ||
log.Fatal("Timed out waiting for caches to sync") | ||
} | ||
} | ||
|
||
// Deletes any remaining Game Servers, Uninstalls Agones, and Deletes agones-system namespace. | ||
func cleanUpResources() { | ||
args := []string{"delete", "gs", "-l", "app=sdk-client-test"} | ||
_, err := runExecCommand(KubectlCmd, args...) | ||
if err != nil { | ||
log.Println("Could not delete game servers", err) | ||
} | ||
|
||
args = []string{"uninstall", "agones", "-n", "agones-system"} | ||
_, err = runExecCommand(HelmCmd, args...) | ||
if err != nil { | ||
log.Println("Could not Helm uninstall Agones", err) | ||
} | ||
|
||
// Apiservice v1.allocation.agones.dev, which is part of Service agones-system/agones-controller-service, | ||
// does not always get cleaned up on Helm uninstall, and needs to be deleted (if it exists) before | ||
// the agones-system namespace can be removed. | ||
// Ignore the error, because an "error" means Helm already uninstall the apiservice. | ||
args = []string{"delete", "apiservice", "v1.allocation.agones.dev"} | ||
out, err := runExecCommand(KubectlCmd, args...) | ||
if err == nil { | ||
fmt.Println(string(out)) | ||
} | ||
|
||
args = []string{"delete", "ns", "agones-system"} | ||
_, err = runExecCommand(KubectlCmd, args...) | ||
if err != nil { | ||
log.Println("Could not delete agones-system namespace", err) | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How was this value decided? Just curious whether making it configurable would make it easier to use.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
From manually testing different values. 5 seconds led to more unhealthy game servers, and the same for 10 seconds. It would be nice to have it configurable, but that might be hard to do since the `sdk-client-test` binary is built at a release version (1.38, 1.39, etc.), and not at run time.