diff --git a/.changeset/twelve-olives-scream.md b/.changeset/twelve-olives-scream.md new file mode 100644 index 0000000000..e0ad5de9e9 --- /dev/null +++ b/.changeset/twelve-olives-scream.md @@ -0,0 +1,6 @@ +--- +"@scow/portal-server": patch +"@scow/mis-server": patch +--- + +在门户和管理系统启动时只检查启用中集群登录节点的 ssh 连接,在管理系统启用集群操作中检查登录节点的 ssh 连接 diff --git a/apps/mis-server/src/bl/PriceMap.ts b/apps/mis-server/src/bl/PriceMap.ts index baa696a2d4..1a9c60b0fd 100644 --- a/apps/mis-server/src/bl/PriceMap.ts +++ b/apps/mis-server/src/bl/PriceMap.ts @@ -15,12 +15,13 @@ import { Logger } from "@ddadaal/tsgrpc-server"; import { MySqlDriver, SqlEntityManager } from "@mikro-orm/mysql"; import { Partition } from "@scow/scheduler-adapter-protos/build/protos/config"; import { calculateJobPrice } from "src/bl/jobPrice"; -import { configClusters } from "src/config/clusters"; import { misConfig } from "src/config/mis"; import { JobPriceInfo } from "src/entities/JobInfo"; import { AmountStrategy, JobPriceItem } from "src/entities/JobPriceItem"; import { ClusterPlugin } from "src/plugins/clusters"; +import { getActivatedClusters } from "./clustersUtils"; + export interface JobInfo { // cluster job id jobId: number; @@ -88,18 +89,30 @@ export async function createPriceMap( return price; }; - // partitions info for all clusters - const partitionsForClusters: Record = {}; - // call for all config clusters - const reply = await clusterPlugin.callOnAll( - configClusters, - logger, - async (client) => await asyncClientCall(client.config, "getClusterConfig", {}), - ); - reply.forEach((x) => { - partitionsForClusters[x.cluster] = x.result.partitions; + + // call for all activated clusters + const activatedClusters = await getActivatedClusters(em, logger).catch((e) => { + logger.info("!!![important] No available activated clusters.This will skip creating price map in cluster!!!"); + logger.info(e); + return {}; }); + + // partitions info for activated clusters + const partitionsForClusters: Record = {}; + + await Promise.allSettled(Object.keys(activatedClusters).map(async (cluster) => { + try { + const result = await clusterPlugin.callOnOne( + cluster, + logger, + async (client) => await asyncClientCall(client.config, "getClusterConfig", {}), + ); + partitionsForClusters[cluster] = result.partitions; + } catch (error) { + logger.info(`Can not get cluster's (clusterId: ${cluster}) config info from adapter.`, error); + }; + })); return { @@ -109,7 +122,14 @@ export async function createPriceMap( const missingPaths = [] as string[]; - for (const cluster in configClusters) { + for (const cluster in activatedClusters) { + + if (!partitionsForClusters[cluster]) { + logger.info( + `Can not get missing default price items from partitions of cluster (clusterId: ${cluster}) currently.`); + continue; + } + for (const partition of partitionsForClusters[cluster]) { const path = [cluster, partition.name]; const { qos } = partition; diff --git a/apps/mis-server/src/bl/clustersUtils.ts b/apps/mis-server/src/bl/clustersUtils.ts index 82e7d00a73..38736c6cdf 100644 --- a/apps/mis-server/src/bl/clustersUtils.ts +++ b/apps/mis-server/src/bl/clustersUtils.ts @@ -84,7 +84,7 @@ export async function getClustersRuntimeInfo( }); const clusterDatabaseList = clustersFromDb.map((x) => { - return `(Cluster ID: ${x.clusterId}) : ${x.activationStatus}`; + return `Cluster ID: ${x.clusterId}, Current Status: ${x.activationStatus}`; }).join("; "); logger.info("Current clusters list: %s", clusterDatabaseList); diff --git a/apps/mis-server/src/plugins/clusters.ts b/apps/mis-server/src/plugins/clusters.ts index f54c73fca8..b351affe02 100644 --- a/apps/mis-server/src/plugins/clusters.ts +++ b/apps/mis-server/src/plugins/clusters.ts @@ -17,7 +17,7 @@ import { ClusterConfigSchema, getLoginNode } from "@scow/config/build/cluster"; import { getSchedulerAdapterClient, SchedulerAdapterClient } from "@scow/lib-scheduler-adapter"; import { scowErrorMetadata } from "@scow/lib-server/build/error"; import { testRootUserSshLogin } from "@scow/lib-ssh"; -import { updateCluster } from "src/bl/clustersUtils"; +import { getActivatedClusters, updateCluster } from "src/bl/clustersUtils"; import { configClusters } from "src/config/clusters"; import { rootKeyPair } from "src/config/env"; @@ -52,8 +52,20 @@ export const ADAPTER_CALL_ON_ONE_ERROR = "ADAPTER_CALL_ON_ONE_ERROR"; export const clustersPlugin = plugin(async (f) => { + // initial clusters database + const configClusterIds = Object.keys(configClusters); + await updateCluster(f.ext.orm.em.fork(), configClusterIds, f.logger); + if (process.env.NODE_ENV === "production") { - await Promise.all(Object.values(configClusters).map(async ({ displayName, loginNodes }) => { + + // only check activated clusters' root user login when system is starting + const activatedClusters = await getActivatedClusters(f.ext.orm.em.fork(), f.logger).catch((e) => { + f.logger.info("!!![important] No available activated clusters.This will skip root ssh login check in cluster!!!"); + f.logger.info(e); + return {}; + }); + + await Promise.all(Object.values(activatedClusters).map(async ({ displayName, loginNodes }) => { const loginNode = getLoginNode(loginNodes[0]); const address = loginNode.address; const node = loginNode.name; @@ -66,11 +78,8 @@ export const clustersPlugin = plugin(async (f) => { f.logger.info("Root can login to %s by login node %s", displayName, node); } })); - } - // initial clusters database - const configClusterIds = Object.keys(configClusters); - await updateCluster(f.ext.orm.em.fork(), configClusterIds, f.logger); + } // adapterClient of all config clusters const adapterClientForClusters = Object.entries(configClusters).reduce((prev, [cluster, c]) => { diff --git a/apps/mis-server/src/services/init.ts b/apps/mis-server/src/services/init.ts index 72c6bb86f8..0402748b1e 100644 --- a/apps/mis-server/src/services/init.ts +++ b/apps/mis-server/src/services/init.ts @@ -17,6 +17,7 @@ import { UniqueConstraintViolationException } from "@mikro-orm/core"; import { createUser } from "@scow/lib-auth"; import { InitServiceServer, InitServiceService } from "@scow/protos/build/server/init"; import { authUrl } from "src/config"; +import { configClusters } from "src/config/clusters"; import { SystemState } from "src/entities/SystemState"; import { PlatformRole, TenantRole, User } from "src/entities/User"; import { DEFAULT_TENANT_NAME } from "src/utils/constants"; @@ -72,7 +73,8 @@ export const initServiceServer = plugin((server) => { server.logger) .then(async () => { // 插入公钥失败也认为是创建用户成功 - await insertKeyToNewUser(userId, password, server.logger) + // 在所有集群下执行 + await insertKeyToNewUser(userId, password, server.logger, configClusters) .catch(() => null); return true; }) diff --git a/apps/mis-server/src/services/misConfig.ts b/apps/mis-server/src/services/misConfig.ts index f0be6550f8..cfb6c2d240 100644 --- a/apps/mis-server/src/services/misConfig.ts +++ b/apps/mis-server/src/services/misConfig.ts @@ -13,9 +13,13 @@ import { asyncClientCall } from "@ddadaal/tsgrpc-client"; import { plugin } from "@ddadaal/tsgrpc-server"; import { ServiceError, status } from "@grpc/grpc-js"; +import { getLoginNode } from "@scow/config/build/cluster"; +import { testRootUserSshLogin } from "@scow/lib-ssh"; import { ClusterRuntimeInfo_LastActivationOperation, ConfigServiceServer, ConfigServiceService } from "@scow/protos/build/server/config"; import { getActivatedClusters, getClustersRuntimeInfo } from "src/bl/clustersUtils"; +import { configClusters } from "src/config/clusters"; +import { rootKeyPair } from "src/config/env"; import { Cluster, ClusterActivationStatus } from "src/entities/Cluster"; export const misConfigServiceServer = plugin((server) => { @@ -75,52 +79,77 @@ export const misConfigServiceServer = plugin((server) => { activateCluster: async ({ request, em, logger }) => { const { clusterId, operatorId } = request; - const cluster = await em.findOne(Cluster, { clusterId }); - - if (!cluster) { - throw { - code: status.NOT_FOUND, message: `Cluster( Cluster ID: ${clusterId}) is not found`, - } as ServiceError; - } + return await em.transactional(async (em) => { + const cluster = await em.findOne(Cluster, { clusterId }); - // check current scheduler adapter connection state - // do not need check cluster's activation - await server.ext.clusters.callOnOne( - clusterId, - logger, - async (client) => await asyncClientCall(client.config, "getClusterConfig", {}), - ).catch((e) => { - logger.info("Cluster Connection Error ( Cluster ID : %s , Details: %s ) .", cluster, e); - throw { - code: status.FAILED_PRECONDITION, - message: `Activate cluster failed, Cluster( Cluster ID: ${clusterId}) is currently unreachable.`, - } as ServiceError; - }); + if (!cluster) { + throw { + code: status.NOT_FOUND, message: `Cluster( Cluster ID: ${clusterId}) is not found`, + } as ServiceError; + } - // when the cluster has already been activated - if (cluster.activationStatus === ClusterActivationStatus.ACTIVATED) { - logger.info("Cluster (Cluster ID: %s) has already been activated", + // check current scheduler adapter connection state + // do not need check cluster's activation + await server.ext.clusters.callOnOne( + clusterId, + logger, + async (client) => await asyncClientCall(client.config, "getClusterConfig", {}), + ).catch((e) => { + logger.info("Cluster Connection Error ( Cluster ID : %s , Details: %s ) .", cluster, e); + throw { + code: status.FAILED_PRECONDITION, + message: `Activate cluster failed, Cluster( Cluster ID: ${clusterId}) is currently unreachable.`, + } as ServiceError; + }); + + // when the cluster has already been activated + if (cluster.activationStatus === ClusterActivationStatus.ACTIVATED) { + logger.info("Cluster (Cluster ID: %s) has already been activated", + clusterId, + ); + return [{ executed: false }]; + } + + // check root user ssh login in the target cluster + const targetClusterLoginNodes = configClusters[clusterId].loginNodes; + + const loginNode = getLoginNode(targetClusterLoginNodes[0]); + const address = loginNode.address; + const node = loginNode.name; + logger.info("Checking if root can login to cluster (clusterId: %s) by login node %s", + clusterId, node); + const error = await testRootUserSshLogin(address, rootKeyPair, logger); + + if (error) { + logger.info("Root cannot login to cluster (clusterId: %s) by login node %s. err: %o", + clusterId, node, error); + throw { + code: status.FAILED_PRECONDITION, + message: `Activate cluster failed, root login check failed in Cluster( Cluster ID: ${clusterId}) .`, + } as ServiceError; + } else { + logger.info("Root can login to cluster (clusterId: %s) by login node %s", clusterId, node); + } + + cluster.activationStatus = ClusterActivationStatus.ACTIVATED; + + // save operator userId in lastActivationOperation + const lastActivationOperationMap: ClusterRuntimeInfo_LastActivationOperation = {}; + + lastActivationOperationMap.operatorId = operatorId; + cluster.lastActivationOperation = lastActivationOperationMap; + + await em.persistAndFlush(cluster); + + logger.info("Cluster (Cluster ID: %s) is successfully activated by user (User Id: %s)", clusterId, + operatorId, ); - return [{ executed: false }]; - } - - cluster.activationStatus = ClusterActivationStatus.ACTIVATED; - - // save operator userId in lastActivationOperation - const lastActivationOperationMap: ClusterRuntimeInfo_LastActivationOperation = {}; - - lastActivationOperationMap.operatorId = operatorId; - cluster.lastActivationOperation = lastActivationOperationMap; - await em.persistAndFlush(cluster); - logger.info("Cluster (Cluster ID: %s) is successfully activated by user (User Id: %s)", - clusterId, - operatorId, - ); + return [{ executed: true }]; - return [{ executed: true }]; + }); }, diff --git a/apps/mis-server/src/services/tenant.ts b/apps/mis-server/src/services/tenant.ts index 3e62552074..485a7d2a37 100644 --- a/apps/mis-server/src/services/tenant.ts +++ b/apps/mis-server/src/services/tenant.ts @@ -20,6 +20,7 @@ import { TenantServiceServer, TenantServiceService } from "@scow/protos/build/se import { blockAccount, unblockAccount } from "src/bl/block"; import { getActivatedClusters } from "src/bl/clustersUtils"; import { authUrl } from "src/config"; +import { configClusters } from "src/config/clusters"; import { Account } from "src/entities/Account"; import { Tenant } from "src/entities/Tenant"; import { TenantRole, User } from "src/entities/User"; @@ -147,7 +148,9 @@ export const tenantServiceServer = plugin((server) => { { identityId: user.userId, id: user.id, mail: user.email, name: user.name, password: userPassword }, logger) .then(async () => { - await insertKeyToNewUser(userId, userPassword, logger) + // 插入公钥失败也认为是创建用户成功 + // 在所有集群下执行 + await insertKeyToNewUser(userId, userPassword, logger, configClusters) .catch(() => { }); return true; }) diff --git a/apps/mis-server/src/services/user.ts b/apps/mis-server/src/services/user.ts index 490113dea7..6263b38418 100644 --- a/apps/mis-server/src/services/user.ts +++ b/apps/mis-server/src/services/user.ts @@ -34,6 +34,7 @@ import { import { blockUserInAccount, unblockUserInAccount } from "src/bl/block"; import { getActivatedClusters } from "src/bl/clustersUtils"; import { authUrl } from "src/config"; +import { configClusters } from "src/config/clusters"; import { Account } from "src/entities/Account"; import { Tenant } from "src/entities/Tenant"; import { PlatformRole, TenantRole, User } from "src/entities/User"; @@ -440,7 +441,9 @@ export const userServiceServer = plugin((server) => { server.logger) .then(async () => { // insert public key - await insertKeyToNewUser(identityId, password, server.logger) + // 插入公钥失败也认为是创建用户成功 + // 在所有集群下执行 + await insertKeyToNewUser(identityId, password, server.logger, configClusters) .catch(() => {}); return true; }) diff --git a/apps/mis-server/src/tasks/fetch.ts b/apps/mis-server/src/tasks/fetch.ts index aba6609f89..2fcc541fb6 100644 --- a/apps/mis-server/src/tasks/fetch.ts +++ b/apps/mis-server/src/tasks/fetch.ts @@ -80,7 +80,11 @@ export async function fetchJobs( const persistJobAndCharge = async (jobs: ({ cluster: string } & ClusterJobInfo)[]) => { const result = await em.transactional(async (em) => { - const currentActivatedClusters = await getActivatedClusters(em, logger); + const currentActivatedClusters = await getActivatedClusters(em, logger).catch((e) => { + logger.info("!!![important] No available activated clusters.This will skip fetching Jobs in cluster!!!"); + logger.info(e); + return {}; + }); // Calculate prices for new info and persist const pricedJobs: JobInfo[] = []; diff --git a/apps/mis-server/src/utils/createUser.ts b/apps/mis-server/src/utils/createUser.ts index 2ce8d85861..237f3fc9fc 100644 --- a/apps/mis-server/src/utils/createUser.ts +++ b/apps/mis-server/src/utils/createUser.ts @@ -15,7 +15,7 @@ import { ServiceError } from "@grpc/grpc-js"; import { Status } from "@grpc/grpc-js/build/src/constants"; import { UniqueConstraintViolationException } from "@mikro-orm/core"; import { MySqlDriver, SqlEntityManager } from "@mikro-orm/mysql"; -import { getLoginNode } from "@scow/config/build/cluster"; +import { ClusterConfigSchema, getLoginNode } from "@scow/config/build/cluster"; import { insertKeyAsUser } from "@scow/lib-ssh"; import { configClusters } from "src/config/clusters"; import { rootKeyPair } from "src/config/env"; @@ -66,11 +66,12 @@ export async function insertKeyToNewUser( userId: string, password: string, logger: Logger, + currentClusters: Record, ) { // Making an ssh Request to the login node as the user created. if (process.env.NODE_ENV === "production") { - await Promise.all(Object.values(configClusters).map(async ({ displayName, loginNodes }) => { + await Promise.all(Object.values(currentClusters).map(async ({ displayName, loginNodes }) => { const node = getLoginNode(loginNodes[0]); logger.info("Checking if user can login to %s by login node %s", displayName, node.name); diff --git a/apps/mis-web/src/pageComponents/admin/ClusterManagementTable.tsx b/apps/mis-web/src/pageComponents/admin/ClusterManagementTable.tsx index 39975096db..8559271487 100644 --- a/apps/mis-web/src/pageComponents/admin/ClusterManagementTable.tsx +++ b/apps/mis-web/src/pageComponents/admin/ClusterManagementTable.tsx @@ -188,7 +188,6 @@ export const ClusterManagementTable: React.FC = ({ } { r.hpcEnabled && r.activationStatus === ClusterActivationStatus.DEACTIVATED - && r.connectionStatus === ClusterConnectionStatus.AVAILABLE && ( <> { diff --git a/apps/mis-web/src/utils/route.ts b/apps/mis-web/src/utils/route.ts index e6ff73d2b7..f38985416c 100644 --- a/apps/mis-web/src/utils/route.ts +++ b/apps/mis-web/src/utils/route.ts @@ -28,7 +28,7 @@ export const route: typeof typeboxRoute = (schema, handler) => { const SCOW_ERROR = e.metadata.get("IS_SCOW_ERROR"); if (!SCOW_ERROR) { throw e; } - const code = e.metadata.get("SCOW_ERROR_CODE")[0].toString(); + const code = e.metadata.get("SCOW_ERROR_CODE")?.[0]?.toString(); const details = e.details; // 如果包含集群详细错误信息 diff --git a/apps/portal-server/src/app.ts b/apps/portal-server/src/app.ts index 826c502efa..55c24d3449 100644 --- a/apps/portal-server/src/app.ts +++ b/apps/portal-server/src/app.ts @@ -12,6 +12,7 @@ import { Server } from "@ddadaal/tsgrpc-server"; import { omitConfigSpec } from "@scow/lib-config"; +import { libGetCurrentActivatedClusters } from "@scow/lib-server"; import { readVersionFile } from "@scow/utils/build/version"; import { configClusters } from "src/config/clusters"; import { config } from "src/config/env"; @@ -28,6 +29,8 @@ import { setupProxyGateway } from "src/utils/proxy"; import { initShellFile } from "src/utils/shell"; import { checkClustersRootUserLogin } from "src/utils/ssh"; +import { commonConfig } from "./config/common"; + export async function createServer() { const server = new Server({ @@ -51,17 +54,20 @@ export async function createServer() { await server.register(dashboardServiceServer); await server.register(fileServiceServer); await server.register(desktopServiceServer); - if (process.env.NODE_ENV === "production") { - await checkClustersRootUserLogin(server.logger); - await Promise.all(Object.entries(configClusters).map(async ([id]) => { + const activatedClusters = await libGetCurrentActivatedClusters( + server.logger, + configClusters, + config.MIS_SERVER_URL, + commonConfig.scowApi?.auth?.token); + + await checkClustersRootUserLogin(server.logger, activatedClusters); + await Promise.all(Object.entries(activatedClusters).map(async ([id]) => { await initShellFile(id, server.logger); })); - await setupProxyGateway(server.logger); + await setupProxyGateway(server.logger, activatedClusters); } - - return server; } diff --git a/apps/portal-server/src/utils/proxy.ts b/apps/portal-server/src/utils/proxy.ts index 1e6e265df2..316bcbab3c 100644 --- a/apps/portal-server/src/utils/proxy.ts +++ b/apps/portal-server/src/utils/proxy.ts @@ -10,6 +10,7 @@ * See the Mulan PSL v2 for more details. */ +import { ClusterConfigSchema } from "@scow/config/build/cluster"; import { loggedExec, sftpWriteFile } from "@scow/lib-ssh"; import { dirname } from "path"; import { configClusters } from "src/config/clusters"; @@ -17,12 +18,12 @@ import { config } from "src/config/env"; import { sshConnect } from "src/utils/ssh"; import { Logger } from "ts-log"; -export const setupProxyGateway = async (logger: Logger) => { +export const setupProxyGateway = async (logger: Logger, activatedClusters: Record) => { let portalBasePath = config.PORTAL_BASE_PATH; if (!portalBasePath.endsWith("/")) { portalBasePath += "/"; } - for (const id of Object.keys(configClusters)) { + for (const id of Object.keys(activatedClusters)) { const proxyGatewayConfig = configClusters[id].proxyGateway; diff --git a/apps/portal-server/src/utils/ssh.ts b/apps/portal-server/src/utils/ssh.ts index 39f225d6aa..6e3156f611 100644 --- a/apps/portal-server/src/utils/ssh.ts +++ b/apps/portal-server/src/utils/ssh.ts @@ -12,7 +12,7 @@ import { ServiceError } from "@ddadaal/tsgrpc-common"; import { status } from "@grpc/grpc-js"; -import { getLoginNode } from "@scow/config/build/cluster"; +import { ClusterConfigSchema, getLoginNode } from "@scow/config/build/cluster"; import { scowErrorMetadata } from "@scow/lib-server/build/error"; import { SftpError, sshConnect as libConnect, SshConnectError, testRootUserSshLogin } from "@scow/lib-ssh"; import { NodeSSH } from "node-ssh"; @@ -34,7 +34,7 @@ export function getConfigClusterLoginNode(cluster: string): string | undefined { return loginNode?.address; } -// TODO: 不要?在线集群节点信息 +// 获取集群中各节点信息 export function getClusterLoginNode(cluster: string): string | undefined { const loginNode = getLoginNode(configClusters[cluster]?.loginNodes?.[0]); return loginNode?.address; @@ -119,8 +119,11 @@ export async function sshConnect( /** * Check whether all clusters can be logged in as root user */ -export async function checkClustersRootUserLogin(logger: Logger) { - await Promise.all(Object.values(configClusters).map(async ({ displayName, loginNodes }) => { +export async function checkClustersRootUserLogin( + logger: Logger, + activatedClusters: Record, +) { + await Promise.all(Object.values(activatedClusters).map(async ({ displayName, loginNodes }) => { const node = getLoginNode(loginNodes[0]); logger.info("Checking if root can login to %s by login node %s", displayName, node.name); const error = await testRootUserSshLogin(node.address, rootKeyPair, console); diff --git a/apps/portal-web/src/utils/route.ts b/apps/portal-web/src/utils/route.ts index f7cdba7d40..b3a747ab0e 100644 --- a/apps/portal-web/src/utils/route.ts +++ b/apps/portal-web/src/utils/route.ts @@ -30,7 +30,7 @@ export const route: typeof typeboxRoute = (schema, handler) => { const SCOW_CAUSE = (e.metadata as Metadata).get("cause"); if (SCOW_ERROR.length === 0) { throw e; } - const code = e.metadata.get("SCOW_ERROR_CODE")[0].toString(); + const code = e.metadata.get("SCOW_ERROR_CODE")?.[0]?.toString(); const details = e.details; const message = SCOW_CAUSE[0];