Skip to content
This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

sharedstorage support remote umount and fix bug #3456

Merged
merged 4 commits into from
Apr 7, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,12 @@ class LinuxCommands extends OsCommands {
command = `bash '${script}'`;
} else {
script = script.replace(/"/g, '\\"');
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like resolving escape characters but can't understand what exactly it is doing.
Please describe what kind of script are you trying to escape or unescape. And if it is possible, we should avoid handling escape manually.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This wants to solve the situation that bash -c nested echo "some command \\"something\\"". This happens in sharedstorage mount command.

const result = script.match(/[^\\]\\\\"/g);
if (result) {
result.forEach((res) => {
script = script.replace(res, res.replace(/"$/g, '\\"'));
})
}
command = `bash -c "${script}"`;
}
return command;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ export class RemoteEnvironmentService extends EnvironmentService {
} else {
environment.setStatus('FAILED');
}
this.releaseEnvironmentResource(environment);
await this.releaseEnvironmentResource(environment);
}
}
}
Expand All @@ -194,7 +194,16 @@ export class RemoteEnvironmentService extends EnvironmentService {
* If a environment is finished, release the connection resource
* @param environment remote machine environment job detail
*/
private releaseEnvironmentResource(environment: EnvironmentInformation): void {
private async releaseEnvironmentResource(environment: EnvironmentInformation): Promise<void> {
if (environment.useSharedStorage) {
const executor = await this.getExecutor(environment.id);
const remoteUmountCommand = component.get<SharedStorageService>(SharedStorageService).remoteUmountCommand;
const result = await executor.executeScript(remoteUmountCommand, false, false);
if (result.exitCode !== 0) {
this.log.error(`Umount shared storage on remote machine failed.\n ERROR: ${result.stderr}`);
}
}

const executorManager = this.environmentExecutorManagerMap.get(environment.id);
if (executorManager === undefined) {
throw new Error(`ExecutorManager is not assigned for environment ${environment.id}`);
Expand Down Expand Up @@ -251,8 +260,11 @@ export class RemoteEnvironmentService extends EnvironmentService {
const executor = await this.getExecutor(environment.id);
if (environment.useSharedStorage) {
this.remoteExperimentRootDir = component.get<SharedStorageService>(SharedStorageService).remoteWorkingRoot;
const remoteMountCommand = component.get<SharedStorageService>(SharedStorageService).remoteMountCommand;
await executor.executeScript(remoteMountCommand, false, false);
const remoteMountCommand = component.get<SharedStorageService>(SharedStorageService).remoteMountCommand.replace(/echo -e /g, `echo `).replace(/echo /g, `echo -e `);
const result = await executor.executeScript(remoteMountCommand, false, false);
if (result.exitCode !== 0) {
throw new Error(`Mount shared storage on remote machine failed.\n ERROR: ${result.stderr}`);
}
} else {
this.remoteExperimentRootDir = executor.getRemoteExperimentRootDir(getExperimentId());
}
Expand Down Expand Up @@ -304,14 +316,14 @@ export class RemoteEnvironmentService extends EnvironmentService {

if (environment.status === 'UNKNOWN') {
environment.status = 'USER_CANCELED';
this.releaseEnvironmentResource(environment);
await this.releaseEnvironmentResource(environment);
return
}

const jobpidPath: string = `${environment.runnerWorkingFolder}/pid`;
try {
await executor.killChildProcesses(jobpidPath);
this.releaseEnvironmentResource(environment);
await this.releaseEnvironmentResource(environment);
} catch (error) {
this.log.error(`stopEnvironment: ${error}`);
}
Expand Down
2 changes: 2 additions & 0 deletions ts/nni_manager/training_service/reusable/sharedStorage.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ export abstract class SharedStorageService {
public abstract get storageService(): StorageService;
public abstract get localMountCommand(): string;
public abstract get remoteMountCommand(): string;
public abstract get remoteUmountCommand(): string;
public abstract get localWorkingRoot(): string;
public abstract get remoteWorkingRoot(): string;
public abstract cleanUp(): Promise<void>;
}
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ export class AzureBlobSharedStorageService extends SharedStorageService {
private log: Logger;
private internalStorageService: MountedStorageService;
private experimentId: string;
private localMounted?: string;

private storageType?: SharedStorageType;
private storageAccountName?: string;
Expand Down Expand Up @@ -113,10 +114,10 @@ export class AzureBlobSharedStorageService extends SharedStorageService {
this.log.error(errorMessage);
return Promise.reject(errorMessage);
}

if (azureblobConfig.localMounted === 'nnimount') {
this.localMounted = azureblobConfig.localMounted;
if (this.localMounted === 'nnimount') {
await this.helpLocalMount();
} else if (azureblobConfig.localMounted === 'nomount') {
} else if (this.localMounted === 'nomount') {
const errorMessage = `${this.storageType} Shared Storage: ${this.storageType} not Support 'nomount' yet.`;
this.log.error(errorMessage);
return Promise.reject(errorMessage);
Expand Down Expand Up @@ -154,6 +155,15 @@ export class AzureBlobSharedStorageService extends SharedStorageService {
}
}

public get remoteUmountCommand(): string {
if (this.remoteMountPoint) {
return `sudo umount -l ${this.remoteMountPoint}`;
} else {
this.log.error(`${this.storageType} Shared Storage: remoteMountPoint is not initialized.`);
return '';
}
}

private getCommand(mountPoint: string): string {
const install = `rm -f nni_install_fuseblob.sh && touch nni_install_fuseblob.sh && echo "${INSTALL_BLOBFUSE.replace(/\$/g, `\\$`).replace(/\n/g, `\\n`).replace(/"/g, `\\"`)}" >> nni_install_fuseblob.sh && bash nni_install_fuseblob.sh`;
const prepare = `sudo mkdir /mnt/resource/nniblobfusetmp -p && rm -f nni_fuse_connection.cfg && touch nni_fuse_connection.cfg && echo "accountName ${this.storageAccountName}\\naccountKey ${this.storageAccountKey}\\ncontainerName ${this.containerName}" >> nni_fuse_connection.cfg`;
Expand Down Expand Up @@ -206,4 +216,21 @@ export class AzureBlobSharedStorageService extends SharedStorageService {
return Promise.reject(errorMessage);
}
}

public async cleanUp(): Promise<void> {
if (this.localMounted !== 'nnimount') {
return Promise.resolve();
}
try {
const result = await cpp.exec(`sudo umount -l ${this.localMountPoint}`);
if (result.stderr) {
throw new Error(result.stderr);
}
} catch (error) {
const errorMessage: string = `${this.storageType} Shared Storage: Umount ${this.localMountPoint} failed, error is ${error}`;
this.log.error(errorMessage);
return Promise.reject(errorMessage);
}
return Promise.resolve();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ export class NFSSharedStorageService extends SharedStorageService {
private log: Logger;
private internalStorageService: MountedStorageService;
private experimentId: string;
private localMounted?: string;

private storageType?: SharedStorageType;
private nfsServer?: string;
Expand All @@ -83,9 +84,10 @@ export class NFSSharedStorageService extends SharedStorageService {
this.storageType = nfsConfig.storageType;
this.nfsServer = nfsConfig.nfsServer;
this.exportedDirectory = nfsConfig.exportedDirectory;
if (nfsConfig.localMounted === 'nnimount') {
this.localMounted = nfsConfig.localMounted;
if (this.localMounted === 'nnimount') {
await this.helpLocalMount();
} else if (nfsConfig.localMounted === 'nomount') {
} else if (this.localMounted === 'nomount') {
const errorMessage = `${this.storageType} Shared Storage: ${this.storageType} not Support 'nomount'.`;
this.log.error(errorMessage);
return Promise.reject(errorMessage);
Expand Down Expand Up @@ -122,6 +124,15 @@ export class NFSSharedStorageService extends SharedStorageService {
}
}

public get remoteUmountCommand(): string {
if (this.remoteMountPoint) {
return `sudo umount -f -l ${this.remoteMountPoint}`;
} else {
this.log.error(`${this.storageType} Shared Storage: remoteMountPoint is not initialized.`);
return '';
}
}

private getCommand(mountPoint: string): string {
const install = `rm -f nni_install_nfsclient.sh && touch nni_install_nfsclient.sh && echo "${INSTALL_NFS_CLIENT.replace(/\$/g, `\\$`).replace(/\n/g, `\\n`).replace(/"/g, `\\"`)}" >> nni_install_nfsclient.sh && bash nni_install_nfsclient.sh`;
const mount = `mkdir -p ${mountPoint} && sudo mount ${this.nfsServer}:${this.exportedDirectory} ${mountPoint}`;
Expand Down Expand Up @@ -157,4 +168,21 @@ export class NFSSharedStorageService extends SharedStorageService {

return Promise.resolve();
}

public async cleanUp(): Promise<void> {
if (this.localMounted !== 'nnimount') {
return Promise.resolve();
}
try {
const result = await cpp.exec(`sudo umount -f -l ${this.localMountPoint}`);
if (result.stderr) {
throw new Error(result.stderr);
}
} catch (error) {
const errorMessage: string = `${this.storageType} Shared Storage: Umount ${this.localMountPoint} failed, error is ${error}`;
this.log.error(errorMessage);
return Promise.reject(errorMessage);
}
return Promise.resolve();
}
}
5 changes: 5 additions & 0 deletions ts/nni_manager/training_service/reusable/trialDispatcher.ts
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,11 @@ class TrialDispatcher implements TrainingService {
for (const commandChannel of this.commandChannelSet) {
await commandChannel.stop();
}
if (this.useSharedStorage) {
this.log.info(`stopping shared storage...`)
await component.get<SharedStorageService>(SharedStorageService).cleanUp();
this.log.info(`shared storage stopped.`)
}
}

private async environmentMaintenanceLoop(): Promise<void> {
Expand Down