-
Notifications
You must be signed in to change notification settings - Fork 1.8k
sharedstorage support remote umount and fix bug #3456
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -180,7 +180,7 @@ export class RemoteEnvironmentService extends EnvironmentService { | |
} else { | ||
environment.setStatus('FAILED'); | ||
} | ||
this.releaseEnvironmentResource(environment); | ||
await this.releaseEnvironmentResource(environment); | ||
} | ||
} | ||
} | ||
|
@@ -193,7 +193,16 @@ export class RemoteEnvironmentService extends EnvironmentService { | |
* If a environment is finished, release the connection resource | ||
* @param environment remote machine environment job detail | ||
*/ | ||
private releaseEnvironmentResource(environment: EnvironmentInformation): void { | ||
private async releaseEnvironmentResource(environment: EnvironmentInformation): Promise<void> { | ||
if (environment.useSharedStorage) { | ||
const executor = await this.getExecutor(environment.id); | ||
const remoteUmountCommand = component.get<SharedStorageService>(SharedStorageService).remoteUmountCommand; | ||
const result = await executor.executeScript(remoteUmountCommand, false, false); | ||
if (result.exitCode !== 0) { | ||
this.log.error(`Umount shared storage on remote machine failed.\n ERROR: ${result.stderr}`); | ||
} | ||
} | ||
|
||
const executorManager = this.environmentExecutorManagerMap.get(environment.id); | ||
if (executorManager === undefined) { | ||
throw new Error(`ExecutorManager is not assigned for environment ${environment.id}`); | ||
|
@@ -248,17 +257,20 @@ export class RemoteEnvironmentService extends EnvironmentService { | |
} | ||
this.environmentExecutorManagerMap.set(environment.id, executorManager); | ||
const executor = await this.getExecutor(environment.id); | ||
let remoteWorkingRoot: string; | ||
Review comment: remoteWorkingRoot -> remoteWorkingRootDir. Reply: modify it. |
||
if (environment.useSharedStorage) { | ||
const environmentRoot = component.get<SharedStorageService>(SharedStorageService).remoteWorkingRoot; | ||
environment.runnerWorkingFolder = executor.joinPath(environmentRoot, 'envs', environment.id) | ||
const remoteMountCommand = component.get<SharedStorageService>(SharedStorageService).remoteMountCommand; | ||
await executor.executeScript(remoteMountCommand, false, false); | ||
remoteWorkingRoot = component.get<SharedStorageService>(SharedStorageService).remoteWorkingRoot; | ||
environment.runnerWorkingFolder = executor.joinPath(remoteWorkingRoot, 'envs', environment.id); | ||
const remoteMountCommand = component.get<SharedStorageService>(SharedStorageService).remoteMountCommand.replace(/echo -e /g, `echo `).replace(/echo /g, `echo -e `); | ||
const result = await executor.executeScript(remoteMountCommand, false, false); | ||
if (result.exitCode !== 0) { | ||
throw new Error(`Mount shared storage on remote machine failed.\n ERROR: ${result.stderr}`); | ||
} | ||
} else { | ||
environment.runnerWorkingFolder = | ||
executor.joinPath(executor.getRemoteExperimentRootDir(getExperimentId()), | ||
'envs', environment.id) | ||
remoteWorkingRoot = executor.getRemoteExperimentRootDir(getExperimentId()); | ||
environment.runnerWorkingFolder = executor.joinPath(remoteWorkingRoot, 'envs', environment.id); | ||
Review comment: duplicated with line 263. Reply: fix it. |
||
} | ||
environment.command = `cd ${environment.runnerWorkingFolder} && \ | ||
environment.command = `cd ${remoteWorkingRoot} && \ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why change to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. because There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. command also contains |
||
${environment.command} --job_pid_file ${environment.runnerWorkingFolder}/pid \ | ||
1>${environment.runnerWorkingFolder}/trialrunner_stdout 2>${environment.runnerWorkingFolder}/trialrunner_stderr \ | ||
&& echo $? \`date +%s%3N\` >${environment.runnerWorkingFolder}/code`; | ||
|
@@ -305,14 +317,14 @@ export class RemoteEnvironmentService extends EnvironmentService { | |
|
||
if (environment.status === 'UNKNOWN') { | ||
environment.status = 'USER_CANCELED'; | ||
this.releaseEnvironmentResource(environment); | ||
await this.releaseEnvironmentResource(environment); | ||
return | ||
} | ||
|
||
const jobpidPath: string = `${environment.runnerWorkingFolder}/pid`; | ||
try { | ||
await executor.killChildProcesses(jobpidPath); | ||
this.releaseEnvironmentResource(environment); | ||
await this.releaseEnvironmentResource(environment); | ||
} catch (error) { | ||
this.log.error(`stopEnvironment: ${error}`); | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -79,6 +79,7 @@ export class AzureBlobSharedStorageService extends SharedStorageService { | |
private log: Logger; | ||
private internalStorageService: MountedStorageService; | ||
private experimentId: string; | ||
private localMounted?: string; | ||
|
||
private storageType?: SharedStorageType; | ||
private storageAccountName?: string; | ||
|
@@ -113,10 +114,10 @@ export class AzureBlobSharedStorageService extends SharedStorageService { | |
this.log.error(errorMessage); | ||
return Promise.reject(errorMessage); | ||
} | ||
|
||
if (azureblobConfig.localMounted === 'nnimount') { | ||
this.localMounted = azureblobConfig.localMounted; | ||
if (this.localMounted === 'nnimount') { | ||
await this.helpLocalMount(); | ||
} else if (azureblobConfig.localMounted === 'nomount') { | ||
} else if (this.localMounted === 'nomount') { | ||
const errorMessage = `${this.storageType} Shared Storage: ${this.storageType} not Support 'nomount' yet.`; | ||
this.log.error(errorMessage); | ||
return Promise.reject(errorMessage); | ||
|
@@ -154,6 +155,15 @@ export class AzureBlobSharedStorageService extends SharedStorageService { | |
} | ||
} | ||
|
||
/**
 * Shell command used to unmount the shared storage on a remote machine.
 *
 * @returns `sudo umount -l <remoteMountPoint>` when `remoteMountPoint` is set;
 *          otherwise logs an error and returns an empty string.
 */
public get remoteUmountCommand(): string {
    // Guard first: without a mount point there is nothing to unmount.
    if (!this.remoteMountPoint) {
        this.log.error(`${this.storageType} Shared Storage: remoteMountPoint is not initialized.`);
        return '';
    }
    return `sudo umount -l ${this.remoteMountPoint}`;
}
|
||
private getCommand(mountPoint: string): string { | ||
const install = `rm -f nni_install_fuseblob.sh && touch nni_install_fuseblob.sh && echo "${INSTALL_BLOBFUSE.replace(/\$/g, `\\$`).replace(/\n/g, `\\n`).replace(/"/g, `\\"`)}" >> nni_install_fuseblob.sh && bash nni_install_fuseblob.sh`; | ||
const prepare = `sudo mkdir /mnt/resource/nniblobfusetmp -p && rm -f nni_fuse_connection.cfg && touch nni_fuse_connection.cfg && echo "accountName ${this.storageAccountName}\\naccountKey ${this.storageAccountKey}\\ncontainerName ${this.containerName}" >> nni_fuse_connection.cfg`; | ||
|
@@ -206,4 +216,21 @@ export class AzureBlobSharedStorageService extends SharedStorageService { | |
return Promise.reject(errorMessage); | ||
} | ||
} | ||
|
||
/**
 * Unmounts the local shared-storage mount point.
 *
 * Does nothing unless `localMounted` is `'nnimount'`. Otherwise runs
 * `sudo umount -l` on `localMountPoint`; any output on stderr is treated
 * as a failure.
 *
 * @returns a promise that resolves when the unmount succeeded or was not needed,
 *          and rejects with the logged error message (a string) when it failed.
 */
public async cleanUp(): Promise<void> {
    if (this.localMounted !== 'nnimount') {
        return;
    }
    try {
        const result = await cpp.exec(`sudo umount -l ${this.localMountPoint}`);
        if (result.stderr) {
            throw new Error(result.stderr);
        }
    } catch (error) {
        // The failing operation here is the unmount, so the message must say so
        // (it previously claimed that fetching the account key had failed).
        const errorMessage = `${this.storageType} Shared Storage: umount ${this.localMountPoint} failed, error is ${error}`;
        this.log.error(errorMessage);
        // Reject with the message string, as before, so existing callers see the same value.
        return Promise.reject(errorMessage);
    }
}
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -59,6 +59,7 @@ export class NFSSharedStorageService extends SharedStorageService { | |
private log: Logger; | ||
private internalStorageService: MountedStorageService; | ||
private experimentId: string; | ||
private localMounted?: string; | ||
|
||
private storageType?: SharedStorageType; | ||
private nfsServer?: string; | ||
|
@@ -83,9 +84,10 @@ export class NFSSharedStorageService extends SharedStorageService { | |
this.storageType = nfsConfig.storageType; | ||
this.nfsServer = nfsConfig.nfsServer; | ||
this.exportedDirectory = nfsConfig.exportedDirectory; | ||
if (nfsConfig.localMounted === 'nnimount') { | ||
this.localMounted = nfsConfig.localMounted; | ||
if (this.localMounted === 'nnimount') { | ||
await this.helpLocalMount(); | ||
} else if (nfsConfig.localMounted === 'nomount') { | ||
} else if (this.localMounted === 'nomount') { | ||
const errorMessage = `${this.storageType} Shared Storage: ${this.storageType} not Support 'nomount'.`; | ||
this.log.error(errorMessage); | ||
return Promise.reject(errorMessage); | ||
|
@@ -122,6 +124,15 @@ export class NFSSharedStorageService extends SharedStorageService { | |
} | ||
} | ||
|
||
/**
 * Shell command used to unmount the NFS shared storage on a remote machine.
 *
 * @returns `sudo umount -f -l <remoteMountPoint>` when `remoteMountPoint` is set;
 *          otherwise logs an error and returns an empty string.
 */
public get remoteUmountCommand(): string {
    // Guard first: without a mount point there is nothing to unmount.
    if (!this.remoteMountPoint) {
        this.log.error(`${this.storageType} Shared Storage: remoteMountPoint is not initialized.`);
        return '';
    }
    return `sudo umount -f -l ${this.remoteMountPoint}`;
}
|
||
private getCommand(mountPoint: string): string { | ||
const install = `rm -f nni_install_nfsclient.sh && touch nni_install_nfsclient.sh && echo "${INSTALL_NFS_CLIENT.replace(/\$/g, `\\$`).replace(/\n/g, `\\n`).replace(/"/g, `\\"`)}" >> nni_install_nfsclient.sh && bash nni_install_nfsclient.sh`; | ||
const mount = `mkdir -p ${mountPoint} && sudo mount ${this.nfsServer}:${this.exportedDirectory} ${mountPoint}`; | ||
|
@@ -157,4 +168,21 @@ export class NFSSharedStorageService extends SharedStorageService { | |
|
||
return Promise.resolve(); | ||
} | ||
|
||
/**
 * Unmounts the local NFS mount point.
 *
 * Does nothing unless `localMounted` is `'nnimount'`. Otherwise runs
 * `sudo umount -f -l` on `localMountPoint`; any output on stderr is treated
 * as a failure.
 *
 * @returns a promise that resolves when the unmount succeeded or was not needed,
 *          and rejects with the logged error message (a string) when it failed.
 */
public async cleanUp(): Promise<void> {
    if (this.localMounted !== 'nnimount') {
        return;
    }
    try {
        const result = await cpp.exec(`sudo umount -f -l ${this.localMountPoint}`);
        if (result.stderr) {
            throw new Error(result.stderr);
        }
    } catch (error) {
        // The failing operation here is the unmount, not the mount, so report it as such.
        const errorMessage = `${this.storageType} Shared Storage: umount ${this.localMountPoint} failed, error is ${error}`;
        this.log.error(errorMessage);
        // Reject with the message string, as before, so existing callers see the same value.
        return Promise.reject(errorMessage);
    }
}
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -348,6 +348,11 @@ class TrialDispatcher implements TrainingService { | |
for (const commandChannel of this.commandChannelSet) { | ||
await commandChannel.stop(); | ||
} | ||
if (this.useSharedStorage) { | ||
this.log.info(`stopping shared storage...`) | ||
component.get<SharedStorageService>(SharedStorageService).cleanUp(); | ||
Review comment: await? Reply: yes, add it. |
||
this.log.info(`shared storage stopped.`) | ||
} | ||
} | ||
|
||
private async environmentMaintenanceLoop(): Promise<void> { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It looks like this is resolving escape characters, but I can't understand exactly what it is doing.
Please describe what kind of script you are trying to escape or unescape. If possible, we should avoid handling escapes manually.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is meant to handle the case where `bash -c` wraps a nested `echo "some command \\"something\\""`. This happens in the shared storage mount command.