-
Notifications
You must be signed in to change notification settings - Fork 91
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[DNM] Fixes for podman v4 #1573
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,7 +6,7 @@ | |
getRandomPort, | ||
getSha256, | ||
validateImageUrl, | ||
} from "@zombienet/utils"; | ||
Check warning on line 9 in javascript/packages/orchestrator/src/configGenerator.ts GitHub Actions / build (18.x)
Check warning on line 9 in javascript/packages/orchestrator/src/configGenerator.ts GitHub Actions / build (19.x)
|
||
import { | ||
ARGS_TO_REMOVE, | ||
DEFAULT_ADDER_COLLATOR_BIN, | ||
|
@@ -41,6 +41,7 @@ | |
Node, | ||
ZombieRole, | ||
} from "./sharedTypes"; | ||
import { generateNamespace } from "@zombienet/utils"; | ||
Check warning on line 44 in javascript/packages/orchestrator/src/configGenerator.ts GitHub Actions / build (18.x)
Check warning on line 44 in javascript/packages/orchestrator/src/configGenerator.ts GitHub Actions / build (19.x)
|
||
|
||
const debug = require("debug")("zombie::config-manager"); | ||
|
||
|
@@ -496,21 +497,20 @@ | |
} | ||
|
||
interface UsedNames { | ||
[properyName: string]: number; | ||
[properyName: string]: boolean; | ||
} | ||
|
||
const mUsedNames: UsedNames = {}; | ||
|
||
export function getUniqueName(name: string): string { | ||
let uniqueName; | ||
if (!mUsedNames[name]) { | ||
mUsedNames[name] = 1; | ||
uniqueName = name; | ||
} else { | ||
uniqueName = `${name}-${mUsedNames[name]}`; | ||
mUsedNames[name] += 1; | ||
let uniqueName = `${name}-${generateNamespace()}`; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This was causing naming collisions right? There is another issue with the cleanup but using some random here sounds good :) |
||
|
||
Check failure on line 507 in javascript/packages/orchestrator/src/configGenerator.ts GitHub Actions / build (18.x)
Check failure on line 507 in javascript/packages/orchestrator/src/configGenerator.ts GitHub Actions / build (19.x)
|
||
while (mUsedNames[uniqueName]) { | ||
uniqueName = `${name}-${generateNamespace()}`; | ||
} | ||
|
||
mUsedNames[uniqueName] = true; | ||
|
||
return uniqueName; | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -56,6 +56,7 @@ const TMP_DONE = "echo done > /tmp/zombie-tmp-done"; | |
const TRANSFER_CONTAINER_WAIT_LOG = "waiting for tar to finish"; | ||
const NODE_CONTAINER_WAIT_LOG = "waiting for copy files to finish"; | ||
const WAIT_UNTIL_SCRIPT_SUFIX = `until [ -f ${FINISH_MAGIC_FILE} ]; do echo ${NODE_CONTAINER_WAIT_LOG}; sleep 1; done; echo copy files has finished`; | ||
const PODMAN_WAIT_UNTIL_SCRIPT_SUFIX = `sleep 5`; // FIXME | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I was surprised to find out that putting the magic file is a no-op for podman provider. I'm not sure how it was supposed to even work, so just put a long enough delay for now. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, I think there was a change on how the |
||
const K8S_WAIT_UNTIL_SCRIPT_SUFIX = `until [ -f ${FINISH_MAGIC_FILE} ]; do /cfg/coreutils echo "${NODE_CONTAINER_WAIT_LOG}"; /cfg/coreutils sleep 1; done; /cfg/coreutils echo "copy files has finished"`; | ||
const TRANSFER_CONTAINER_NAME = "transfer-files-container"; | ||
const ZOMBIE_BUCKET = "zombienet-logs"; | ||
|
@@ -171,6 +172,7 @@ export { | |
METRICS_URI_PATTERN, | ||
NODE_CONTAINER_WAIT_LOG, | ||
P2P_PORT, | ||
PODMAN_WAIT_UNTIL_SCRIPT_SUFIX, | ||
PROMETHEUS_PORT, | ||
REGULAR_BIN_PATH, | ||
RPC_HTTP_PORT, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -56,6 +56,7 @@ | |
remoteDir: string; | ||
dataDir: string; | ||
isTearingDown: boolean; | ||
podSuffix?: string; | ||
|
||
constructor(configPath: string, namespace: string, tmpDir: string) { | ||
super(configPath, namespace, tmpDir, "podman", "podman"); | ||
|
@@ -215,7 +216,7 @@ | |
): Promise<string> { | ||
const args = ["logs"]; | ||
if (since && since > 0) args.push(...["--since", `${since}s`]); | ||
args.push(`${podName}_pod-${podName}`); | ||
args.push(`${podName}${this.podSuffix}-${podName}`); | ||
|
||
const result = await this.runCommand(args, { scoped: false }); | ||
return result.stdout; | ||
|
@@ -240,7 +241,7 @@ | |
} | ||
|
||
async getPortMapping(port: number, podName: string): Promise<number> { | ||
const args = ["inspect", `${podName}_pod-${podName}`, "--format", "json"]; | ||
const args = ["inspect", `${podName}${this.podSuffix}-${podName}`, "--format", "json"]; | ||
Check failure on line 244 in javascript/packages/orchestrator/src/providers/podman/podmanClient.ts GitHub Actions / build (18.x)
Check failure on line 244 in javascript/packages/orchestrator/src/providers/podman/podmanClient.ts GitHub Actions / build (19.x)
Check failure on line 244 in javascript/packages/orchestrator/src/providers/podman/podmanClient.ts GitHub Actions / build (20.x)
|
||
const result = await this.runCommand(args, { scoped: false }); | ||
const resultJson = JSON.parse(result.stdout); | ||
const hostPort = | ||
|
@@ -249,7 +250,7 @@ | |
} | ||
|
||
async getNodeIP(podName: string): Promise<string> { | ||
const args = ["inspect", `${podName}_pod-${podName}`, "--format", "json"]; | ||
const args = ["inspect", `${podName}${this.podSuffix}-${podName}`, "--format", "json"]; | ||
Check failure on line 253 in javascript/packages/orchestrator/src/providers/podman/podmanClient.ts GitHub Actions / build (18.x)
Check failure on line 253 in javascript/packages/orchestrator/src/providers/podman/podmanClient.ts GitHub Actions / build (19.x)
Check failure on line 253 in javascript/packages/orchestrator/src/providers/podman/podmanClient.ts GitHub Actions / build (20.x)
|
||
const result = await this.runCommand(args, { scoped: false }); | ||
const resultJson = JSON.parse(result.stdout); | ||
const podIp = | ||
|
@@ -353,6 +354,7 @@ | |
async spawnFromDef( | ||
podDef: any, | ||
filesToCopy: fileMap[] = [], | ||
waitExit: boolean = false, | ||
keystore?: string, | ||
chainSpecId?: string, | ||
dbSnapshot?: string, | ||
|
@@ -432,7 +434,7 @@ | |
|
||
await this.createResource(podDef, false, false); | ||
|
||
await this.wait_pod_ready(name); | ||
await this.wait_pod(name, waitExit ? ["Exited"] : ["Running"]); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Race conditions are a big problem for podman. If we're spinning up a node, we want to wait until the pod is running. But if we're generating a chainspec, we need to wait until the pod has finished its job. Every second time the chainspec was generated, I ended up with an empty chainspec in the network directory because we only waited for the pod to start, and we tried to copy the chainspec at once, but it was not ready yet. The pod started writing it but didn't finish, so the empty file is copied. To address that, I added an explicit waiting condition, and now races are gone (mostly; see other comments). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, we can reuse the same approach from |
||
await this.addNodeToPrometheus(name); | ||
|
||
logTable = new CreateLogTable({ | ||
|
@@ -449,6 +451,15 @@ | |
localFilePath: string, | ||
): Promise<void> { | ||
debug(`cp ${this.tmpDir}/${identifier}${podFilePath} ${localFilePath}`); | ||
while (true) { | ||
Check failure on line 454 in javascript/packages/orchestrator/src/providers/podman/podmanClient.ts GitHub Actions / build (18.x)
Check failure on line 454 in javascript/packages/orchestrator/src/providers/podman/podmanClient.ts GitHub Actions / build (19.x)
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. To copy a file we need that file to be ready; it was not always the case. Sometimes, it's just not there yet, and sometimes, the pod starts writing something, but the file is still zero-length when we try to copy it. I don't remember the exact reason why in this very case I couldn't use |
||
try { | ||
let stat = await fs.stat(`${this.tmpDir}/${identifier}${podFilePath}`); | ||
Check failure on line 456 in javascript/packages/orchestrator/src/providers/podman/podmanClient.ts GitHub Actions / build (18.x)
Check failure on line 456 in javascript/packages/orchestrator/src/providers/podman/podmanClient.ts GitHub Actions / build (19.x)
|
||
if (typeof stat !== "undefined" && stat.size > 0) { | ||
break; | ||
} | ||
} catch {} | ||
Check failure on line 460 in javascript/packages/orchestrator/src/providers/podman/podmanClient.ts GitHub Actions / build (18.x)
Check failure on line 460 in javascript/packages/orchestrator/src/providers/podman/podmanClient.ts GitHub Actions / build (19.x)
|
||
await new Promise(r => setTimeout(r, 1000)); | ||
Check failure on line 461 in javascript/packages/orchestrator/src/providers/podman/podmanClient.ts GitHub Actions / build (18.x)
Check failure on line 461 in javascript/packages/orchestrator/src/providers/podman/podmanClient.ts GitHub Actions / build (19.x)
|
||
} | ||
await fs.copyFile( | ||
`${this.tmpDir}/${identifier}${podFilePath}`, | ||
localFilePath, | ||
|
@@ -476,24 +487,31 @@ | |
{ scoped: false }, | ||
); | ||
|
||
if (waitReady) await this.wait_pod_ready(name); | ||
if (waitReady) await this.wait_pod(name); | ||
|
||
if (typeof this.podSuffix === "undefined") { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. podman v3 and podman v4 resource naming schemes are different. Here, I try to guess which one is used (because I believe the naming changed not exactly in v4.0.0, but in v4.something). Beware: this wasn't tested with podman v3. I only made sure it works for v4. |
||
const args = ["inspect", `${name}-${name}`, "--format", "json"]; | ||
const result = await this.runCommand(args, { scoped: false }); | ||
const resultJson = JSON.parse(result.stdout); | ||
this.podSuffix = typeof resultJson[0] === "undefined" ? "_pod" : ""; | ||
debug(`Determined pod suffix [${this.podSuffix}]`); | ||
} | ||
} | ||
|
||
async wait_pod_ready(podName: string, allowDegraded = true): Promise<void> { | ||
async wait_pod(podName: string, status: [string] = ["Running"]): Promise<void> { | ||
// loop until ready | ||
let t = this.timeout; | ||
const args = ["pod", "ps", "-f", `name=${podName}`, "--format", "json"]; | ||
do { | ||
const result = await this.runCommand(args, { scoped: false }); | ||
const resultJson = JSON.parse(result.stdout); | ||
if (resultJson[0].Status === "Running") return; | ||
if (allowDegraded && resultJson[0].Status === "Degraded") return; | ||
if (status.includes(resultJson[0].Status)) return; | ||
|
||
await new Promise((resolve) => setTimeout(resolve, 3000)); | ||
t -= 3; | ||
} while (t > 0); | ||
|
||
throw new Error(`Timeout(${this.timeout}) for pod : ${podName}`); | ||
throw new Error(`Timeout(${this.timeout}) waiting ${status} for pod : ${podName}`); | ||
} | ||
|
||
async isPodMonitorAvailable(): Promise<boolean> { | ||
|
@@ -504,7 +522,7 @@ | |
getPauseArgs(name: string): string[] { | ||
return [ | ||
"exec", | ||
`${name}_pod-${name}`, | ||
`${name}${this.podSuffix}-${name}`, | ||
"bash", | ||
"-c", | ||
"echo pause > /tmp/zombiepipe", | ||
|
@@ -513,15 +531,15 @@ | |
getResumeArgs(name: string): string[] { | ||
return [ | ||
"exec", | ||
`${name}_pod-${name}`, | ||
`${name}${this.podSuffix}-${name}`, | ||
"bash", | ||
"-c", | ||
"echo resume > /tmp/zombiepipe", | ||
]; | ||
} | ||
|
||
async restartNode(name: string, timeout: number | null): Promise<boolean> { | ||
const args = ["exec", `${name}_pod-${name}`, "bash", "-c"]; | ||
const args = ["exec", `${name}${this.podSuffix}-${name}`, "bash", "-c"]; | ||
const cmd = timeout | ||
? `echo restart ${timeout} > /tmp/zombiepipe` | ||
: `echo restart > /tmp/zombiepipe`; | ||
|
@@ -537,7 +555,7 @@ | |
} | ||
|
||
getLogsCommand(name: string): string { | ||
return `podman logs -f ${name}_pod-${name}`; | ||
return `podman logs -f ${name}${this.podSuffix}-${name}`; | ||
} | ||
|
||
// NOOP | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -34,7 +34,7 @@ export interface ContainerPort { | |
export interface Container { | ||
image: string; | ||
name: string; | ||
imagePullPolicy: "Always"; | ||
imagePullPolicy: string; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Forcing this to be There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, this should be configurable through the network config and use |
||
volumeMounts: VolumeMount[]; | ||
ports: ContainerPort[]; | ||
command?: string[]; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
podman's
--filter
option expects a regular expression rather than an exact matching string. Thus, doingpodman pod ps -f name=temp
will return an array of statuses for every pod which containstemp
in its name, so you'll gettemp
,temp-2
, and eventempo
, and just checking the first element of that array doesn't guarantee you see the info for that very pod you're looking for.My naive attempt to wrap it like
name='^temp$'
didn't succeed for some reason, so I ended up just assigning unique names to nodes. This should definitely be improved.