From 51c6e4c95e21af40f9d93b2e9c4891c40b279184 Mon Sep 17 00:00:00 2001 From: 0x5459 <0x5459@protonmail.com> Date: Thu, 29 Jun 2023 11:10:52 +0800 Subject: [PATCH 01/18] feat: add wdpost task manager --- damocles-manager/go.mod | 2 +- .../modules/impl/prover/worker/task_mgr.go | 56 ++++ .../modules/impl/prover/worker/task_mgr_kv.go | 306 ++++++++++++++++++ .../modules/impl/sectors/state_mgr.go | 2 +- damocles-manager/pkg/kvstore/kvstore.go | 55 +++- 5 files changed, 415 insertions(+), 6 deletions(-) create mode 100644 damocles-manager/modules/impl/prover/worker/task_mgr.go create mode 100644 damocles-manager/modules/impl/prover/worker/task_mgr_kv.go diff --git a/damocles-manager/go.mod b/damocles-manager/go.mod index b48684a3e..7809a6ae8 100644 --- a/damocles-manager/go.mod +++ b/damocles-manager/go.mod @@ -5,6 +5,7 @@ go 1.18 require ( contrib.go.opencensus.io/exporter/prometheus v0.4.0 github.com/BurntSushi/toml v1.2.1 + github.com/cespare/xxhash/v2 v2.2.0 github.com/dgraph-io/badger/v2 v2.2007.3 github.com/docker/go-units v0.5.0 github.com/dtynn/dix v0.1.2 @@ -64,7 +65,6 @@ require ( github.com/beorn7/perks v1.0.1 // indirect github.com/bluele/gcache v0.0.0-20190518031135-bc40bd653833 // indirect github.com/cespare/xxhash v1.1.0 // indirect - github.com/cespare/xxhash/v2 v2.2.0 // indirect github.com/cilium/ebpf v0.4.0 // indirect github.com/containerd/cgroups v1.0.4 // indirect github.com/coreos/go-systemd/v22 v22.4.0 // indirect diff --git a/damocles-manager/modules/impl/prover/worker/task_mgr.go b/damocles-manager/modules/impl/prover/worker/task_mgr.go new file mode 100644 index 000000000..368be5f17 --- /dev/null +++ b/damocles-manager/modules/impl/prover/worker/task_mgr.go @@ -0,0 +1,56 @@ +package worker + +import ( + "context" + "encoding/base64" + "encoding/binary" + "time" + + "github.com/cespare/xxhash/v2" + "github.com/ipfs-force-community/damocles/damocles-manager/pkg/extproc/stage" +) + +type TaskState string + +const ( + TaskReadyToRun TaskState = "ready2run" + TaskRunning TaskState = "running" + TaskFinished TaskState = "finished" +) + +type Task struct { + ID string + Input stage.WindowPoSt + Output *stage.WindowPoStOutput + tryNum uint32 + ErrorReason string + WorkerName string + StartedAt uint64 + HeartbeatAt uint64 + FinishedAt uint64 + CreatedAt uint64 + UpdatedAt uint64 +} + +type AllocatedTask struct { + ID string + Input stage.WindowPoSt +} + +type TaskManager interface { + All(ctx context.Context, state TaskState, limit uint32, filter func(*Task) bool) ([]*Task, error) + ListByTaskIDs(ctx context.Context, state TaskState, taskIDs ...string) ([]*Task, error) + Create(ctx context.Context, input stage.WindowPoSt) (*Task, error) + AllocateTasks(ctx context.Context, n uint32, workName string) (allocatedTasks []AllocatedTask, err error) + Heartbeat(ctx context.Context, taskID []string, workerName string) error + Finish(ctx context.Context, taskID string, output *stage.WindowPoStOutput, errorReason string) error + MakeTasksDie(ctx context.Context, shouldDeadDur time.Duration, limit uint32) error + CleanupExpiredTasks(ctx context.Context, taskLifetime time.Duration, limit uint32) error + RetryFailedTasks(ctx context.Context, maxTry, limit uint32) error +} + +func genTaskID(rawInput []byte) string { + b := make([]byte, 8) + binary.LittleEndian.PutUint64(b, xxhash.Sum64(rawInput)) + return base64.URLEncoding.EncodeToString(b) +} diff --git a/damocles-manager/modules/impl/prover/worker/task_mgr_kv.go b/damocles-manager/modules/impl/prover/worker/task_mgr_kv.go new file mode 
100644 index 000000000..c974d4db5 --- /dev/null +++ b/damocles-manager/modules/impl/prover/worker/task_mgr_kv.go @@ -0,0 +1,306 @@ +package worker + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "time" + + "github.com/ipfs-force-community/damocles/damocles-manager/pkg/extproc/stage" + "github.com/ipfs-force-community/damocles/damocles-manager/pkg/kvstore" +) + +var _ TaskManager = (*kvTaskManager)(nil) + +func NewKVTaskStore(kv kvstore.ExtendKV) *kvTaskManager { + return &kvTaskManager{ + kv: kv, + } +} + +type kvTaskManager struct { + kv kvstore.ExtendKV +} + +// TODO(0x5459): Consider putting `txn` into context? +func (tm *kvTaskManager) filter(ctx context.Context, txn kvstore.ExtendTxn, state TaskState, limit uint32, f func(*Task) bool) (tasks []*Task, err error) { + var it kvstore.Iter + it, err = txn.Scan([]byte(makeWdPoStPrefix(state))) + if err != nil { + return + } + defer it.Close() + var task Task + for it.Next() && len(tasks) <= int(limit) { + if err = it.View(ctx, kvstore.LoadJson(&task)); err != nil { + return + } + if f(&task) { + tasks = append(tasks, &task) + } + } + return +} + +func (tm *kvTaskManager) All(ctx context.Context, state TaskState, limit uint32, filter func(*Task) bool) (tasks []*Task, err error) { + err = tm.kv.ViewMustNoConflict(ctx, func(txn kvstore.ExtendTxn) error { + tasks, err = tm.filter(ctx, txn, state, limit, filter) + return err + }) + return +} + +func (tm *kvTaskManager) ListByTaskIDs(ctx context.Context, state TaskState, taskIDs ...string) ([]*Task, error) { + tasks := make([]*Task, 0, len(taskIDs)) + err := tm.kv.ViewMustNoConflict(ctx, func(txn kvstore.ExtendTxn) error { + for _, taskID := range taskIDs { + var task Task + err := txn.Peek(kvstore.Key(makeWdPoStKey(state, taskID)), kvstore.LoadJson(&task)) + if errors.Is(err, kvstore.ErrKeyNotFound) { + continue + } + if err != nil { + return err + } + tasks = append(tasks, &task) + } + return nil + }) + return tasks, err +} + +func (tm *kvTaskManager) Create(ctx context.Context, input stage.WindowPoSt) (*Task, error) { + var ( + taskID string + task *Task + ) + err := tm.kv.UpdateMustNoConflict(ctx, func(txn kvstore.ExtendTxn) error { + rawInput, err := json.Marshal(input) + if err != nil { + return err + } + taskID = genTaskID(rawInput) + // check if task exists + err = txn.PeekAny( + kvstore.LoadJson(task), + kvstore.Key(makeWdPoStKey(TaskReadyToRun, taskID)), + kvstore.Key(makeWdPoStKey(TaskRunning, taskID)), + kvstore.Key(makeWdPoStKey(TaskFinished, taskID)), + ) + if err == nil { + // return if it is exists + return nil + } + if !errors.Is(err, kvstore.ErrKeyNotFound) { + return err + } + + now := time.Now().Unix() + task = &Task{ + ID: taskID, + Input: input, + Output: nil, + tryNum: 0, + ErrorReason: "", + WorkerName: "", + StartedAt: 0, + HeartbeatAt: 0, + FinishedAt: 0, + CreatedAt: uint64(now), + UpdatedAt: uint64(now), + } + return txn.PutJson([]byte(makeWdPoStKey(TaskReadyToRun, taskID)), task) + }) + + if err == nil { + log.Infof("wdPoSt task created: %s", taskID) + } + return task, err +} + +func (tm *kvTaskManager) AllocateTasks(ctx context.Context, n uint32, workName string) (allocatedTasks []AllocatedTask, err error) { + var readyToRun []*Task + err = tm.kv.UpdateMustNoConflict(ctx, func(txn kvstore.ExtendTxn) error { + readyToRun, err = tm.filter(ctx, txn, TaskReadyToRun, n, func(t *Task) bool { return true }) + if err != nil { + return err + } + now := uint64(time.Now().Unix()) + for _, task := range readyToRun { + task.tryNum++ + task.StartedAt = now + 
task.WorkerName = workName + task.HeartbeatAt = now + task.UpdatedAt = now + // Moving ready to run tasks to running tasks + if err := txn.Del([]byte(makeWdPoStKey(TaskReadyToRun, task.ID))); err != nil { + return err + } + if err := txn.PutJson([]byte(makeWdPoStKey(TaskRunning, task.ID)), task); err != nil { + return err + } + allocatedTasks = append(allocatedTasks, AllocatedTask{ + ID: task.ID, + Input: task.Input, + }) + } + return nil + }) + + if err == nil { + for _, task := range readyToRun { + log.Infof("allocated wdPoSt task: %s; try_num: %d", task.ID, task.tryNum) + } + } + return +} + +func (tm *kvTaskManager) Heartbeat(ctx context.Context, taskIDs []string, workerName string) error { + err := tm.kv.UpdateMustNoConflict(ctx, func(txn kvstore.ExtendTxn) error { + for _, taskID := range taskIDs { + var task Task + if err := txn.Peek([]byte(makeWdPoStKey(TaskRunning, taskID)), kvstore.LoadJson(&task)); err != nil { + return err + } + now := uint64(time.Now().Unix()) + task.HeartbeatAt = now + task.WorkerName = workerName + task.UpdatedAt = now + if err := txn.PutJson([]byte(makeWdPoStKey(TaskRunning, taskID)), &task); err != nil { + return err + } + } + return nil + }) + if err == nil { + log.With("worker_name", workerName).Debug("wdPoSt tasks heartbeat", taskIDs) + } + return err +} + +func (tm *kvTaskManager) Finish(ctx context.Context, taskID string, output *stage.WindowPoStOutput, errorReason string) error { + err := tm.kv.UpdateMustNoConflict(ctx, func(txn kvstore.ExtendTxn) error { + runningKey := []byte(makeWdPoStKey(TaskRunning, taskID)) + var task Task + if err := txn.Peek(runningKey, kvstore.LoadJson(&task)); err != nil { + return err + } + if err := txn.Del(runningKey); err != nil { + return err + } + now := uint64(time.Now().Unix()) + task.Output = output + task.ErrorReason = errorReason + task.FinishedAt = now + task.UpdatedAt = now + return txn.PutJson([]byte(makeWdPoStKey(TaskFinished, taskID)), &task) + }) + + if err == nil { + if len(errorReason) == 0 { + log.Infof("wdPoSt task succeeded: %s", taskID) + } else { + log.Warnf("wdPoSt task failed: %s; error_reason: %s", taskID, errorReason) + } + } + return err +} + +func (ts *kvTaskManager) MakeTasksDie(ctx context.Context, heartbeatTimeout time.Duration, limit uint32) error { + var shouldDead []*Task + shouldDeadTime := time.Now().Add(-heartbeatTimeout) + + err := ts.kv.UpdateMustNoConflict(ctx, func(txn kvstore.ExtendTxn) error { + var err error + shouldDead, err = ts.filter(ctx, txn, TaskRunning, limit, func(t *Task) bool { + return t.HeartbeatAt > 0 && time.Unix(int64(t.HeartbeatAt), 0).Before(shouldDeadTime) + }) + if err != nil { + return err + } + now := uint64(time.Now().Unix()) + for _, task := range shouldDead { + if err := txn.Del([]byte(makeWdPoStKey(TaskRunning, task.ID))); err != nil { + return err + } + task.FinishedAt = now + task.ErrorReason = "heartbeat timeout" + task.UpdatedAt = now + if err := txn.PutJson([]byte(makeWdPoStKey(TaskFinished, task.ID)), task); err != nil { + return err + } + } + return nil + }) + + return err +} + +func (tm *kvTaskManager) CleanupExpiredTasks(ctx context.Context, taskLifetime time.Duration, limit uint32) error { + var shouldClean []*Task + shouldCleanTime := time.Now().Add(-taskLifetime) + + err := tm.kv.UpdateMustNoConflict(ctx, func(txn kvstore.ExtendTxn) error { + var err error + shouldClean, err = tm.filter(ctx, txn, TaskFinished, limit, func(t *Task) bool { + return time.Unix(int64(t.CreatedAt), 0).Before(shouldCleanTime) + }) + if err != nil { + return err + } 
+ for _, task := range shouldClean { + if err := txn.Del([]byte(makeWdPoStKey(TaskFinished, task.ID))); err != nil { + return err + } + } + return nil + }) + + if err == nil { + for _, task := range shouldClean { + log.Infof("cleanup expired wdPoSt task: %s; created_at: %s", task.ID, time.Unix(int64(task.CreatedAt), 0).Format(time.RFC3339)) + } + } + return err +} + +func (tm *kvTaskManager) RetryFailedTasks(ctx context.Context, maxTry, limit uint32) error { + var shouldRetry []*Task + err := tm.kv.UpdateMustNoConflict(ctx, func(txn kvstore.ExtendTxn) error { + var err error + shouldRetry, err = tm.filter(ctx, txn, TaskFinished, limit, func(t *Task) bool { + return len(t.ErrorReason) != 0 && t.tryNum > maxTry + }) + if err != nil { + return err + } + now := uint64(time.Now().Unix()) + for _, task := range shouldRetry { + task.ErrorReason = "" + task.StartedAt = 0 + task.FinishedAt = 0 + task.UpdatedAt = now + if err := txn.PutJson([]byte(makeWdPoStKey(TaskFinished, task.ID)), task); err != nil { + return err + } + } + return nil + }) + + if err == nil { + for _, task := range shouldRetry { + log.Debugf("retry wdPoSt task: %d; try_num: %d, error_reason: %s", task.ID, task.tryNum) + } + } + + return err +} + +func makeWdPoStPrefix(state TaskState) string { + return fmt.Sprintf("wdpost-%s-", state) +} + +func makeWdPoStKey(state TaskState, taskID string) string { + return fmt.Sprintf("%s%s", makeWdPoStPrefix(state), taskID) +} diff --git a/damocles-manager/modules/impl/sectors/state_mgr.go b/damocles-manager/modules/impl/sectors/state_mgr.go index bec3984cc..17824c923 100644 --- a/damocles-manager/modules/impl/sectors/state_mgr.go +++ b/damocles-manager/modules/impl/sectors/state_mgr.go @@ -209,7 +209,7 @@ func (sm *StateManager) InitWith(ctx context.Context, sectors []*core.AllocatedS return fmt.Errorf("init: %w", err) } - kvExtend := kvstore.NewExtend(kv) + kvExtend := kvstore.NewExtendKV(kv) err = kvExtend.MustNoConflict(func() error { return kv.Update(ctx, func(txn kvstore.Txn) error { for _, sector := range sectors { diff --git a/damocles-manager/pkg/kvstore/kvstore.go b/damocles-manager/pkg/kvstore/kvstore.go index b7d04c737..1fa32ea85 100644 --- a/damocles-manager/pkg/kvstore/kvstore.go +++ b/damocles-manager/pkg/kvstore/kvstore.go @@ -1,6 +1,8 @@ package kvstore import ( + "context" + "encoding/json" "errors" pluginkvstore "github.com/ipfs-force-community/damocles/manager-plugin/kvstore" @@ -27,17 +29,23 @@ type ( Txn = pluginkvstore.Txn ) -func NewExtend(kvStore KVStore) *Extend { - return &Extend{ +var LoadJson = func(target any) func(Val) error { + return func(data Val) error { + return json.Unmarshal(data, target) + } +} + +func NewExtendKV(kvStore KVStore) *ExtendKV { + return &ExtendKV{ KVStore: kvStore, } } -type Extend struct { +type ExtendKV struct { KVStore } -func (kv *Extend) MustNoConflict(f func() error) error { +func (kv *ExtendKV) MustNoConflict(f func() error) error { if kv.NeedRetryTransactions() { for { err := f() @@ -49,3 +57,42 @@ func (kv *Extend) MustNoConflict(f func() error) error { return f() } } + +func (kv *ExtendKV) UpdateMustNoConflict(ctx context.Context, f func(txn ExtendTxn) error) error { + return kv.MustNoConflict(func() error { + return kv.Update(ctx, func(t Txn) error { + return f(ExtendTxn{Txn: t}) + }) + }) +} + +func (kv *ExtendKV) ViewMustNoConflict(ctx context.Context, f func(txn ExtendTxn) error) error { + return kv.MustNoConflict(func() error { + return kv.View(ctx, func(t Txn) error { + return f(ExtendTxn{Txn: t}) + }) + }) +} + +type 
ExtendTxn struct { + Txn +} + +func (et ExtendTxn) PeekAny(f func(Val) error, keys ...Key) error { + for _, k := range keys { + err := et.Peek(k, f) + if errors.Is(err, ErrKeyNotFound) { + continue + } + return err + } + return ErrKeyNotFound +} + +func (et ExtendTxn) PutJson(k Key, v any) error { + b, err := json.Marshal(v) + if err != nil { + return err + } + return et.Put(k, b) +} From b1aba884613ee48ac8f65ef0f61cad5eadefca84 Mon Sep 17 00:00:00 2001 From: 0x5459 <0x5459@protonmail.com> Date: Thu, 29 Jun 2023 13:51:59 +0800 Subject: [PATCH 02/18] feat: impl worker prover --- damocles-manager/cmd/plugin/internal/build.go | 2 +- .../modules/impl/prover/ext/prover.go | 63 +----- .../modules/impl/prover/prover.go | 72 +++++++ .../modules/impl/prover/worker/prover.go | 180 ++++++++++++++++++ .../modules/impl/prover/worker/task_mgr.go | 14 +- .../modules/impl/prover/worker/task_mgr_kv.go | 46 ++--- .../modules/impl/sectors/state_mgr.go | 4 +- damocles-manager/pkg/kvstore/kvstore.go | 24 +-- 8 files changed, 309 insertions(+), 96 deletions(-) create mode 100644 damocles-manager/modules/impl/prover/worker/prover.go diff --git a/damocles-manager/cmd/plugin/internal/build.go b/damocles-manager/cmd/plugin/internal/build.go index 05fb2c24f..bfa13d26f 100644 --- a/damocles-manager/cmd/plugin/internal/build.go +++ b/damocles-manager/cmd/plugin/internal/build.go @@ -86,7 +86,7 @@ func goBuild(ctx context.Context, goc, srcDir, outDir string) error { if err != nil { return fmt.Errorf("read pkg %s's manifest failure: %w", srcDir, err) } - manifest["buildTime"] = time.Now().Format("2006.01.02 15:04:05") + manifest["buildTime"] = time.Now().Format(time.RFC3339) pluginName := manifest["name"].(string) tmpl, err := template.New("gen-plugin").Parse(codeTemplate) diff --git a/damocles-manager/modules/impl/prover/ext/prover.go b/damocles-manager/modules/impl/prover/ext/prover.go index 1d411d692..28d6f11de 100644 --- a/damocles-manager/modules/impl/prover/ext/prover.go +++ b/damocles-manager/modules/impl/prover/ext/prover.go @@ -80,71 +80,20 @@ func (*Prover) AggregateSealProofs(ctx context.Context, aggregateInfo core.Aggre } func (p *Prover) GenerateWindowPoSt(ctx context.Context, minerID abi.ActorID, sectors prover.SortedPrivateSectorInfo, randomness abi.PoStRandomness) ([]builtin.PoStProof, []abi.SectorID, error) { - randomness[31] &= 0x3f if p.windowProc == nil { return prover.Prover.GenerateWindowPoSt(ctx, minerID, sectors, randomness) } - sectorInners := sectors.Values() - if len(sectorInners) == 0 { - return nil, nil, nil - } - - proofType := sectorInners[0].PoStProofType - data := stage.WindowPoSt{ - MinerID: minerID, - ProofType: stage.ProofType2String(proofType), - } - copy(data.Seed[:], randomness[:]) - - for i := range sectorInners { - inner := sectorInners[i] - - if pt := inner.PoStProofType; pt != proofType { - return nil, nil, fmt.Errorf("proof type not match for sector %d of miner %d: want %s, got %s", inner.SectorNumber, minerID, stage.ProofType2String(proofType), stage.ProofType2String(pt)) - } - - commR, err := util.CID2ReplicaCommitment(inner.SealedCID) + return prover.ExtGenerateWindowPoSt(minerID, sectors, randomness)(func(data stage.WindowPoSt) (stage.WindowPoStOutput, error) { + var res stage.WindowPoStOutput + err := p.windowProc.Process(ctx, data, &res) if err != nil { - return nil, nil, fmt.Errorf("invalid selaed cid %s for sector %d of miner %d: %w", inner.SealedCID, inner.SectorNumber, minerID, err) + return res, fmt.Errorf("WindowPoStProcessor.Process: %w", err) } + return res, nil 
+ }) - data.Replicas = append(data.Replicas, stage.PoStReplicaInfo{ - SectorID: inner.SectorNumber, - CommR: commR, - CacheDir: inner.CacheDirPath, - SealedFile: inner.SealedSectorPath, - }) - } - - var res stage.WindowPoStOutput - - err := p.windowProc.Process(ctx, data, &res) - if err != nil { - return nil, nil, fmt.Errorf("WindowPoStProcessor.Process: %w", err) - } - - if faultCount := len(res.Faults); faultCount != 0 { - faults := make([]abi.SectorID, faultCount) - for fi := range res.Faults { - faults[fi] = abi.SectorID{ - Miner: minerID, - Number: res.Faults[fi], - } - } - - return nil, faults, fmt.Errorf("got %d fault sectors", faultCount) - } - - proofs := make([]builtin.PoStProof, len(res.Proofs)) - for pi := range res.Proofs { - proofs[pi] = builtin.PoStProof{ - PoStProof: proofType, - ProofBytes: res.Proofs[pi], - } - } - return proofs, nil, nil } func (p *Prover) GenerateWinningPoSt(ctx context.Context, minerID abi.ActorID, sectors prover.SortedPrivateSectorInfo, randomness abi.PoStRandomness) ([]builtin.PoStProof, error) { diff --git a/damocles-manager/modules/impl/prover/prover.go b/damocles-manager/modules/impl/prover/prover.go index 1e568958d..09ced102e 100644 --- a/damocles-manager/modules/impl/prover/prover.go +++ b/damocles-manager/modules/impl/prover/prover.go @@ -1,7 +1,13 @@ package prover import ( + "fmt" + + "github.com/filecoin-project/go-state-types/abi" + "github.com/filecoin-project/venus/venus-shared/actors/builtin" "github.com/ipfs-force-community/damocles/damocles-manager/core" + "github.com/ipfs-force-community/damocles/damocles-manager/modules/util" + "github.com/ipfs-force-community/damocles/damocles-manager/pkg/extproc/stage" ) var _ core.Prover = Prover @@ -13,3 +19,69 @@ var Prover prover type ( SortedPrivateSectorInfo = core.SortedPrivateSectorInfo ) + +type ExtDoWindowPoStFunc func(stage.WindowPoSt) (stage.WindowPoStOutput, error) + +func ExtGenerateWindowPoSt(minerID abi.ActorID, sectors SortedPrivateSectorInfo, randomness abi.PoStRandomness) func(ExtDoWindowPoStFunc) ([]builtin.PoStProof, []abi.SectorID, error) { + randomness[31] &= 0x3f + return func(doWork ExtDoWindowPoStFunc) ([]builtin.PoStProof, []abi.SectorID, error) { + sectorInners := sectors.Values() + if len(sectorInners) == 0 { + return nil, nil, nil + } + + // build stage.WindowPoSt + proofType := sectorInners[0].PoStProofType + data := stage.WindowPoSt{ + MinerID: minerID, + ProofType: stage.ProofType2String(proofType), + } + copy(data.Seed[:], randomness[:]) + + for i := range sectorInners { + inner := sectorInners[i] + + if pt := inner.PoStProofType; pt != proofType { + return nil, nil, fmt.Errorf("proof type not match for sector %d of miner %d: want %s, got %s", inner.SectorNumber, minerID, stage.ProofType2String(proofType), stage.ProofType2String(pt)) + } + + commR, err := util.CID2ReplicaCommitment(inner.SealedCID) + if err != nil { + return nil, nil, fmt.Errorf("invalid selaed cid %s for sector %d of miner %d: %w", inner.SealedCID, inner.SectorNumber, minerID, err) + } + + data.Replicas = append(data.Replicas, stage.PoStReplicaInfo{ + SectorID: inner.SectorNumber, + CommR: commR, + CacheDir: inner.CacheDirPath, + SealedFile: inner.SealedSectorPath, + }) + } + + output, err := doWork(data) + if err != nil { + return nil, nil, err + } + + if faultCount := len(output.Faults); faultCount != 0 { + faults := make([]abi.SectorID, faultCount) + for fi := range output.Faults { + faults[fi] = abi.SectorID{ + Miner: minerID, + Number: output.Faults[fi], + } + } + + return nil, faults, 
fmt.Errorf("got %d fault sectors", faultCount) + } + + proofs := make([]builtin.PoStProof, len(output.Proofs)) + for pi := range output.Proofs { + proofs[pi] = builtin.PoStProof{ + PoStProof: proofType, + ProofBytes: output.Proofs[pi], + } + } + return proofs, nil, nil + } +} diff --git a/damocles-manager/modules/impl/prover/worker/prover.go b/damocles-manager/modules/impl/prover/worker/prover.go new file mode 100644 index 000000000..694830398 --- /dev/null +++ b/damocles-manager/modules/impl/prover/worker/prover.go @@ -0,0 +1,180 @@ +package worker + +import ( + "context" + "fmt" + "sync" + "time" + + "github.com/filecoin-project/go-state-types/abi" + "github.com/filecoin-project/lotus/chain/actors/builtin" + "github.com/ipfs-force-community/damocles/damocles-manager/core" + "github.com/ipfs-force-community/damocles/damocles-manager/modules/impl/prover" + "github.com/ipfs-force-community/damocles/damocles-manager/pkg/extproc/stage" + "github.com/ipfs-force-community/damocles/damocles-manager/pkg/logging" +) + +var log = logging.New("worker prover") + +type workerProver struct { + taskMgr TaskManager + + inflightTasks map[string][]chan<- struct { + output *stage.WindowPoStOutput + err string + } + inflightTasksLock *sync.Mutex + + retryFailedTasksInterval time.Duration + taskMaxTry uint32 + heartbeatTimeout time.Duration + + cleanupExpiredTasksJobInterval time.Duration + taskLifetime time.Duration +} + +func NewProver(taskMgr TaskManager) core.Prover { + return &workerProver{ + taskMgr: taskMgr, + inflightTasks: make(map[string][]chan<- struct { + output *stage.WindowPoStOutput + err string + }), + inflightTasksLock: &sync.Mutex{}, + + // TODO(0x5459): make them configurable + retryFailedTasksInterval: 10 * time.Second, + taskMaxTry: 2, + heartbeatTimeout: 15 * time.Second, + cleanupExpiredTasksJobInterval: 30 * time.Minute, + taskLifetime: 25 * time.Hour, + } +} + +func (p *workerProver) StartJob(ctx context.Context) { + go p.runNotifyTaskDoneJob(ctx) + go p.runRetryFailedTasksJob(ctx) + go p.runCleanupExpiredTasksJob(ctx) +} + +func (p *workerProver) runNotifyTaskDoneJob(ctx context.Context) { + ticker := time.NewTicker(3 * time.Second) + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + p.inflightTasksLock.Lock() + inflightTaskIDs := make([]string, 0, len(p.inflightTasks)) + for taskID := range p.inflightTasks { + inflightTaskIDs = append(inflightTaskIDs, taskID) + } + p.inflightTasksLock.Unlock() + + finishedTasks, err := p.taskMgr.ListByTaskIDs(ctx, TaskFinished, inflightTaskIDs...) 
+ if err != nil { + log.Errorf("failed to list tasks: %s", err) + } + + p.inflightTasksLock.Lock() + for _, task := range finishedTasks { + chs, ok := p.inflightTasks[task.ID] + if !ok { + continue + } + if !task.Finished(p.taskMaxTry) { + continue + } + for _, ch := range chs { + ch <- struct { + output *stage.WindowPoStOutput + err string + }{ + output: task.Output, + err: task.ErrorReason, + } + } + } + p.inflightTasksLock.Unlock() + } + } +} + +func (p *workerProver) runRetryFailedTasksJob(ctx context.Context) { + ticker := time.NewTicker(p.retryFailedTasksInterval) + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + if err := p.taskMgr.MakeTasksDie(ctx, p.heartbeatTimeout, 128); err != nil { + log.Errorf("failed to make tasks die: %s", err) + } + if err := p.taskMgr.RetryFailedTasks(ctx, p.taskMaxTry, 128); err != nil { + log.Errorf("failed to retry failed tasks: %s", err) + } + } + } +} + +func (p *workerProver) runCleanupExpiredTasksJob(ctx context.Context) { + ticker := time.NewTicker(p.cleanupExpiredTasksJobInterval) + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + if err := p.taskMgr.CleanupExpiredTasks(ctx, p.taskLifetime, 128); err != nil { + log.Errorf("failed to cleanup expired tasks: %s", err) + } + } + } +} + +func (p *workerProver) AggregateSealProofs(ctx context.Context, aggregateInfo core.AggregateSealVerifyProofAndInfos, proofs [][]byte) ([]byte, error) { + return prover.Prover.AggregateSealProofs(ctx, aggregateInfo, proofs) +} + +func (p *workerProver) GenerateWindowPoSt(ctx context.Context, minerID abi.ActorID, sectors core.SortedPrivateSectorInfo, randomness abi.PoStRandomness) (proof []builtin.PoStProof, skipped []abi.SectorID, err error) { + + return prover.ExtGenerateWindowPoSt(minerID, sectors, randomness)(func(input stage.WindowPoSt) (stage.WindowPoStOutput, error) { + task, err := p.taskMgr.Create(ctx, input) + if err != nil { + return stage.WindowPoStOutput{}, fmt.Errorf("create wdPoSt task: %w", err) + } + + ch := make(chan struct { + output *stage.WindowPoStOutput + err string + }, 1) + + p.inflightTasksLock.Lock() + p.inflightTasks[task.ID] = append(p.inflightTasks[task.ID], ch) + p.inflightTasksLock.Unlock() + + result, ok := <-ch + if !ok { + return stage.WindowPoStOutput{}, fmt.Errorf("wdPoSt result channel was closed unexpectedly") + } + if result.err != "" { + return stage.WindowPoStOutput{}, fmt.Errorf("error from worker: %s", result.err) + } + return *result.output, nil + }) +} + +func (p *workerProver) GenerateWinningPoSt(ctx context.Context, minerID abi.ActorID, sectors core.SortedPrivateSectorInfo, randomness abi.PoStRandomness) ([]builtin.PoStProof, error) { + return prover.Prover.GenerateWinningPoSt(ctx, minerID, sectors, randomness) +} + +func (p *workerProver) GeneratePoStFallbackSectorChallenges(ctx context.Context, proofType abi.RegisteredPoStProof, minerID abi.ActorID, randomness abi.PoStRandomness, sectorIds []abi.SectorNumber) (*core.FallbackChallenges, error) { + return prover.Prover.GeneratePoStFallbackSectorChallenges(ctx, proofType, minerID, randomness, sectorIds) +} + +func (p *workerProver) GenerateSingleVanillaProof(ctx context.Context, replica core.FFIPrivateSectorInfo, challenges []uint64) ([]byte, error) { + return prover.Prover.GenerateSingleVanillaProof(ctx, replica, challenges) +} + +func (p *workerProver) GenerateWinningPoStWithVanilla(ctx context.Context, proofType abi.RegisteredPoStProof, minerID abi.ActorID, randomness abi.PoStRandomness, proofs [][]byte) ([]core.PoStProof, error) { 
+ return prover.Prover.GenerateWinningPoStWithVanilla(ctx, proofType, minerID, randomness, proofs) +} diff --git a/damocles-manager/modules/impl/prover/worker/task_mgr.go b/damocles-manager/modules/impl/prover/worker/task_mgr.go index 368be5f17..9a089a9a0 100644 --- a/damocles-manager/modules/impl/prover/worker/task_mgr.go +++ b/damocles-manager/modules/impl/prover/worker/task_mgr.go @@ -32,6 +32,18 @@ type Task struct { UpdatedAt uint64 } +func (t *Task) Finished(maxTry uint32) bool { + if t.FinishedAt == 0 { + return false + } + + if t.ErrorReason != "" && t.tryNum < maxTry { + return false + } + + return true +} + type AllocatedTask struct { ID string Input stage.WindowPoSt @@ -49,7 +61,7 @@ type TaskManager interface { RetryFailedTasks(ctx context.Context, maxTry, limit uint32) error } -func genTaskID(rawInput []byte) string { +func GenTaskID(rawInput []byte) string { b := make([]byte, 8) binary.LittleEndian.PutUint64(b, xxhash.Sum64(rawInput)) return base64.URLEncoding.EncodeToString(b) diff --git a/damocles-manager/modules/impl/prover/worker/task_mgr_kv.go b/damocles-manager/modules/impl/prover/worker/task_mgr_kv.go index c974d4db5..439d06a31 100644 --- a/damocles-manager/modules/impl/prover/worker/task_mgr_kv.go +++ b/damocles-manager/modules/impl/prover/worker/task_mgr_kv.go @@ -11,29 +11,27 @@ import ( "github.com/ipfs-force-community/damocles/damocles-manager/pkg/kvstore" ) -var _ TaskManager = (*kvTaskManager)(nil) - -func NewKVTaskStore(kv kvstore.ExtendKV) *kvTaskManager { +func NewKVTaskStore(kv kvstore.KVExt) TaskManager { return &kvTaskManager{ kv: kv, } } type kvTaskManager struct { - kv kvstore.ExtendKV + kv kvstore.KVExt } // TODO(0x5459): Consider putting `txn` into context? -func (tm *kvTaskManager) filter(ctx context.Context, txn kvstore.ExtendTxn, state TaskState, limit uint32, f func(*Task) bool) (tasks []*Task, err error) { +func (tm *kvTaskManager) filter(ctx context.Context, txn kvstore.TxnExt, state TaskState, limit uint32, f func(*Task) bool) (tasks []*Task, err error) { var it kvstore.Iter it, err = txn.Scan([]byte(makeWdPoStPrefix(state))) if err != nil { return } defer it.Close() - var task Task for it.Next() && len(tasks) <= int(limit) { - if err = it.View(ctx, kvstore.LoadJson(&task)); err != nil { + var task Task + if err = it.View(ctx, kvstore.LoadJSON(&task)); err != nil { return } if f(&task) { @@ -44,7 +42,7 @@ func (tm *kvTaskManager) filter(ctx context.Context, txn kvstore.ExtendTxn, stat } func (tm *kvTaskManager) All(ctx context.Context, state TaskState, limit uint32, filter func(*Task) bool) (tasks []*Task, err error) { - err = tm.kv.ViewMustNoConflict(ctx, func(txn kvstore.ExtendTxn) error { + err = tm.kv.ViewMustNoConflict(ctx, func(txn kvstore.TxnExt) error { tasks, err = tm.filter(ctx, txn, state, limit, filter) return err }) @@ -53,10 +51,10 @@ func (tm *kvTaskManager) All(ctx context.Context, state TaskState, limit uint32, func (tm *kvTaskManager) ListByTaskIDs(ctx context.Context, state TaskState, taskIDs ...string) ([]*Task, error) { tasks := make([]*Task, 0, len(taskIDs)) - err := tm.kv.ViewMustNoConflict(ctx, func(txn kvstore.ExtendTxn) error { + err := tm.kv.ViewMustNoConflict(ctx, func(txn kvstore.TxnExt) error { for _, taskID := range taskIDs { var task Task - err := txn.Peek(kvstore.Key(makeWdPoStKey(state, taskID)), kvstore.LoadJson(&task)) + err := txn.Peek(kvstore.Key(makeWdPoStKey(state, taskID)), kvstore.LoadJSON(&task)) if errors.Is(err, kvstore.ErrKeyNotFound) { continue } @@ -75,15 +73,15 @@ func (tm *kvTaskManager) 
Create(ctx context.Context, input stage.WindowPoSt) (*T taskID string task *Task ) - err := tm.kv.UpdateMustNoConflict(ctx, func(txn kvstore.ExtendTxn) error { + err := tm.kv.UpdateMustNoConflict(ctx, func(txn kvstore.TxnExt) error { rawInput, err := json.Marshal(input) if err != nil { return err } - taskID = genTaskID(rawInput) + taskID = GenTaskID(rawInput) // check if task exists err = txn.PeekAny( - kvstore.LoadJson(task), + kvstore.LoadJSON(task), kvstore.Key(makeWdPoStKey(TaskReadyToRun, taskID)), kvstore.Key(makeWdPoStKey(TaskRunning, taskID)), kvstore.Key(makeWdPoStKey(TaskFinished, taskID)), @@ -121,7 +119,7 @@ func (tm *kvTaskManager) Create(ctx context.Context, input stage.WindowPoSt) (*T func (tm *kvTaskManager) AllocateTasks(ctx context.Context, n uint32, workName string) (allocatedTasks []AllocatedTask, err error) { var readyToRun []*Task - err = tm.kv.UpdateMustNoConflict(ctx, func(txn kvstore.ExtendTxn) error { + err = tm.kv.UpdateMustNoConflict(ctx, func(txn kvstore.TxnExt) error { readyToRun, err = tm.filter(ctx, txn, TaskReadyToRun, n, func(t *Task) bool { return true }) if err != nil { return err @@ -157,10 +155,10 @@ func (tm *kvTaskManager) AllocateTasks(ctx context.Context, n uint32, workName s } func (tm *kvTaskManager) Heartbeat(ctx context.Context, taskIDs []string, workerName string) error { - err := tm.kv.UpdateMustNoConflict(ctx, func(txn kvstore.ExtendTxn) error { + err := tm.kv.UpdateMustNoConflict(ctx, func(txn kvstore.TxnExt) error { for _, taskID := range taskIDs { var task Task - if err := txn.Peek([]byte(makeWdPoStKey(TaskRunning, taskID)), kvstore.LoadJson(&task)); err != nil { + if err := txn.Peek([]byte(makeWdPoStKey(TaskRunning, taskID)), kvstore.LoadJSON(&task)); err != nil { return err } now := uint64(time.Now().Unix()) @@ -180,10 +178,10 @@ func (tm *kvTaskManager) Heartbeat(ctx context.Context, taskIDs []string, worker } func (tm *kvTaskManager) Finish(ctx context.Context, taskID string, output *stage.WindowPoStOutput, errorReason string) error { - err := tm.kv.UpdateMustNoConflict(ctx, func(txn kvstore.ExtendTxn) error { + err := tm.kv.UpdateMustNoConflict(ctx, func(txn kvstore.TxnExt) error { runningKey := []byte(makeWdPoStKey(TaskRunning, taskID)) var task Task - if err := txn.Peek(runningKey, kvstore.LoadJson(&task)); err != nil { + if err := txn.Peek(runningKey, kvstore.LoadJSON(&task)); err != nil { return err } if err := txn.Del(runningKey); err != nil { @@ -207,13 +205,13 @@ func (tm *kvTaskManager) Finish(ctx context.Context, taskID string, output *stag return err } -func (ts *kvTaskManager) MakeTasksDie(ctx context.Context, heartbeatTimeout time.Duration, limit uint32) error { +func (tm *kvTaskManager) MakeTasksDie(ctx context.Context, heartbeatTimeout time.Duration, limit uint32) error { var shouldDead []*Task shouldDeadTime := time.Now().Add(-heartbeatTimeout) - err := ts.kv.UpdateMustNoConflict(ctx, func(txn kvstore.ExtendTxn) error { + err := tm.kv.UpdateMustNoConflict(ctx, func(txn kvstore.TxnExt) error { var err error - shouldDead, err = ts.filter(ctx, txn, TaskRunning, limit, func(t *Task) bool { + shouldDead, err = tm.filter(ctx, txn, TaskRunning, limit, func(t *Task) bool { return t.HeartbeatAt > 0 && time.Unix(int64(t.HeartbeatAt), 0).Before(shouldDeadTime) }) if err != nil { @@ -225,6 +223,7 @@ func (ts *kvTaskManager) MakeTasksDie(ctx context.Context, heartbeatTimeout time return err } task.FinishedAt = now + task.Output = nil task.ErrorReason = "heartbeat timeout" task.UpdatedAt = now if err := 
txn.PutJson([]byte(makeWdPoStKey(TaskFinished, task.ID)), task); err != nil { @@ -241,7 +240,7 @@ func (tm *kvTaskManager) CleanupExpiredTasks(ctx context.Context, taskLifetime t var shouldClean []*Task shouldCleanTime := time.Now().Add(-taskLifetime) - err := tm.kv.UpdateMustNoConflict(ctx, func(txn kvstore.ExtendTxn) error { + err := tm.kv.UpdateMustNoConflict(ctx, func(txn kvstore.TxnExt) error { var err error shouldClean, err = tm.filter(ctx, txn, TaskFinished, limit, func(t *Task) bool { return time.Unix(int64(t.CreatedAt), 0).Before(shouldCleanTime) @@ -267,7 +266,7 @@ func (tm *kvTaskManager) CleanupExpiredTasks(ctx context.Context, taskLifetime t func (tm *kvTaskManager) RetryFailedTasks(ctx context.Context, maxTry, limit uint32) error { var shouldRetry []*Task - err := tm.kv.UpdateMustNoConflict(ctx, func(txn kvstore.ExtendTxn) error { + err := tm.kv.UpdateMustNoConflict(ctx, func(txn kvstore.TxnExt) error { var err error shouldRetry, err = tm.filter(ctx, txn, TaskFinished, limit, func(t *Task) bool { return len(t.ErrorReason) != 0 && t.tryNum > maxTry @@ -278,6 +277,7 @@ func (tm *kvTaskManager) RetryFailedTasks(ctx context.Context, maxTry, limit uin now := uint64(time.Now().Unix()) for _, task := range shouldRetry { task.ErrorReason = "" + task.Output = nil task.StartedAt = 0 task.FinishedAt = 0 task.UpdatedAt = now diff --git a/damocles-manager/modules/impl/sectors/state_mgr.go b/damocles-manager/modules/impl/sectors/state_mgr.go index 17824c923..3ca921888 100644 --- a/damocles-manager/modules/impl/sectors/state_mgr.go +++ b/damocles-manager/modules/impl/sectors/state_mgr.go @@ -209,8 +209,8 @@ func (sm *StateManager) InitWith(ctx context.Context, sectors []*core.AllocatedS return fmt.Errorf("init: %w", err) } - kvExtend := kvstore.NewExtendKV(kv) - err = kvExtend.MustNoConflict(func() error { + kvExt := kvstore.NewKVExt(kv) + err = kvExt.MustNoConflict(func() error { return kv.Update(ctx, func(txn kvstore.Txn) error { for _, sector := range sectors { state := core.SectorState{ diff --git a/damocles-manager/pkg/kvstore/kvstore.go b/damocles-manager/pkg/kvstore/kvstore.go index 1fa32ea85..882874e01 100644 --- a/damocles-manager/pkg/kvstore/kvstore.go +++ b/damocles-manager/pkg/kvstore/kvstore.go @@ -29,23 +29,23 @@ type ( Txn = pluginkvstore.Txn ) -var LoadJson = func(target any) func(Val) error { +var LoadJSON = func(target any) func(Val) error { return func(data Val) error { return json.Unmarshal(data, target) } } -func NewExtendKV(kvStore KVStore) *ExtendKV { - return &ExtendKV{ +func NewKVExt(kvStore KVStore) *KVExt { + return &KVExt{ KVStore: kvStore, } } -type ExtendKV struct { +type KVExt struct { KVStore } -func (kv *ExtendKV) MustNoConflict(f func() error) error { +func (kv *KVExt) MustNoConflict(f func() error) error { if kv.NeedRetryTransactions() { for { err := f() @@ -58,27 +58,27 @@ func (kv *ExtendKV) MustNoConflict(f func() error) error { } } -func (kv *ExtendKV) UpdateMustNoConflict(ctx context.Context, f func(txn ExtendTxn) error) error { +func (kv *KVExt) UpdateMustNoConflict(ctx context.Context, f func(txn TxnExt) error) error { return kv.MustNoConflict(func() error { return kv.Update(ctx, func(t Txn) error { - return f(ExtendTxn{Txn: t}) + return f(TxnExt{Txn: t}) }) }) } -func (kv *ExtendKV) ViewMustNoConflict(ctx context.Context, f func(txn ExtendTxn) error) error { +func (kv *KVExt) ViewMustNoConflict(ctx context.Context, f func(txn TxnExt) error) error { return kv.MustNoConflict(func() error { return kv.View(ctx, func(t Txn) error { - return 
f(ExtendTxn{Txn: t}) + return f(TxnExt{Txn: t}) }) }) } -type ExtendTxn struct { +type TxnExt struct { Txn } -func (et ExtendTxn) PeekAny(f func(Val) error, keys ...Key) error { +func (et TxnExt) PeekAny(f func(Val) error, keys ...Key) error { for _, k := range keys { err := et.Peek(k, f) if errors.Is(err, ErrKeyNotFound) { @@ -89,7 +89,7 @@ func (et ExtendTxn) PeekAny(f func(Val) error, keys ...Key) error { return ErrKeyNotFound } -func (et ExtendTxn) PutJson(k Key, v any) error { +func (et TxnExt) PutJson(k Key, v any) error { b, err := json.Marshal(v) if err != nil { return err From b2f1d3d511341a868516152785761e3e986dee9b Mon Sep 17 00:00:00 2001 From: 0x5459 <0x5459@protonmail.com> Date: Mon, 3 Jul 2023 14:07:38 +0800 Subject: [PATCH 03/18] feat: wdpost rpc --- .../cmd/damocles-manager/internal/global.go | 3 +- .../internal/util_sealer_actor.go | 2 +- .../internal/util_sealer_proving.go | 6 +- .../internal/util_sealer_sectors.go | 44 +-- .../internal/util_sealer_snap.go | 6 +- .../damocles-manager/internal/util_storage.go | 4 +- .../damocles-manager/internal/util_worker.go | 8 +- .../cmd/damocles-manager/server.go | 47 ++- damocles-manager/core/api.go | 31 +- damocles-manager/core/client.go | 217 +------------ damocles-manager/core/client_gen.go | 291 ++++++++++++++++++ damocles-manager/core/gen.go | 244 +++++++++++++++ damocles-manager/core/types_wdpost.go | 60 ++++ damocles-manager/dep/sealer.go | 12 +- damocles-manager/dep/sealer_constructor.go | 50 +-- .../modules/impl/prover/worker/prover.go | 15 +- .../modules/impl/prover/worker/rpc.go | 38 +++ .../modules/impl/prover/worker/task_mgr.go | 68 ---- .../modules/impl/prover/worker/task_mgr_kv.go | 151 ++++++--- .../impl/prover/worker/task_mgr_kv_test.go | 18 ++ .../modules/impl/sectors/indexer_proxy.go | 6 +- damocles-manager/pkg/kvstore/kvstore.go | 6 +- 22 files changed, 913 insertions(+), 414 deletions(-) create mode 100644 damocles-manager/core/client_gen.go create mode 100644 damocles-manager/core/gen.go create mode 100644 damocles-manager/core/types_wdpost.go create mode 100644 damocles-manager/modules/impl/prover/worker/rpc.go delete mode 100644 damocles-manager/modules/impl/prover/worker/task_mgr.go create mode 100644 damocles-manager/modules/impl/prover/worker/task_mgr_kv_test.go diff --git a/damocles-manager/cmd/damocles-manager/internal/global.go b/damocles-manager/cmd/damocles-manager/internal/global.go index d6d9b7935..f7ac1064e 100644 --- a/damocles-manager/cmd/damocles-manager/internal/global.go +++ b/damocles-manager/cmd/damocles-manager/internal/global.go @@ -117,8 +117,7 @@ type APIClient struct { Chain chain.API Messager messager.API Market market.API - Sealer core.SealerCliClient - Miner core.MinerAPIClient + Damocles *core.APIClient } func extractAPI(cctx *cli.Context, target ...interface{}) (*APIClient, context.Context, stopper, error) { diff --git a/damocles-manager/cmd/damocles-manager/internal/util_sealer_actor.go b/damocles-manager/cmd/damocles-manager/internal/util_sealer_actor.go index dc9fb3ab0..c0f60b795 100644 --- a/damocles-manager/cmd/damocles-manager/internal/util_sealer_actor.go +++ b/damocles-manager/cmd/damocles-manager/internal/util_sealer_actor.go @@ -393,7 +393,7 @@ var utilSealerActorControlList = &cli.Command{ if err != nil { return fmt.Errorf("invalid miner addr '%s': %w", maddr, err) } - minerConfig, err := api.Miner.GetMinerConfig(ctx, abi.ActorID(mid)) + minerConfig, err := api.Damocles.GetMinerConfig(ctx, abi.ActorID(mid)) if err != nil { return fmt.Errorf("get miner config: %w", err) } 
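
This patch replaces the separate Sealer/Miner RPC clients with a single Damocles *core.APIClient (see the global.go hunk above) and introduces the WorkerWdPoStAPI surface (core/api.go further below) through which damocles-worker fetches, heartbeats, and completes window PoSt tasks. A minimal sketch of the intended worker-side loop follows; it is illustrative only: the ID/Input fields on WdPoStAllocatedTask are assumed to mirror AllocatedTask from patch 01 (core/types_wdpost.go is not shown here), and generateWindowPoSt is a hypothetical stand-in for the local proof computation.

package example // illustrative sketch only, not part of this patch

import (
	"context"

	"github.com/ipfs-force-community/damocles/damocles-manager/core"
	"github.com/ipfs-force-community/damocles/damocles-manager/pkg/extproc/stage"
)

// runOnce asks the manager for one wdPoSt task, keeps it alive with a heartbeat,
// and reports the result (or the failure reason) back via WdPoStFinishTask.
func runOnce(ctx context.Context, api core.WorkerWdPoStAPI, workerName string) error {
	tasks, err := api.WdPoStAllocateTasks(ctx, 1, workerName)
	if err != nil || len(tasks) == 0 {
		return err
	}
	task := tasks[0]

	// A real worker would heartbeat periodically while the proof is running.
	if err := api.WdPoStHeartbeatTask(ctx, []string{task.ID}, workerName); err != nil {
		return err
	}

	output, proveErr := generateWindowPoSt(ctx, task.Input)
	errorReason := ""
	if proveErr != nil {
		errorReason = proveErr.Error()
	}
	return api.WdPoStFinishTask(ctx, task.ID, output, errorReason)
}

// generateWindowPoSt is a placeholder for the worker's local PoSt computation.
func generateWindowPoSt(ctx context.Context, input stage.WindowPoSt) (*stage.WindowPoStOutput, error) {
	return &stage.WindowPoStOutput{}, nil
}
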
diff --git a/damocles-manager/cmd/damocles-manager/internal/util_sealer_proving.go b/damocles-manager/cmd/damocles-manager/internal/util_sealer_proving.go index e2baaecf7..53430f975 100644 --- a/damocles-manager/cmd/damocles-manager/internal/util_sealer_proving.go +++ b/damocles-manager/cmd/damocles-manager/internal/util_sealer_proving.go @@ -627,7 +627,7 @@ var utilSealerProvingCheckProvableCmd = &cli.Command{ return fmt.Errorf("invalid seal proof type %d: %w", tocheck[0].SealProof, err) } - bad, err := api.Sealer.CheckProvable(ctx, abi.ActorID(mid), postProofType, tocheck, slow, stateCheck) + bad, err := api.Damocles.CheckProvable(ctx, abi.ActorID(mid), postProofType, tocheck, slow, stateCheck) if err != nil { return err } @@ -752,7 +752,7 @@ var utilSealerProvingSimulateWdPoStCmd = &cli.Command{ return fmt.Errorf("convert to winning post proof: %w", err) } - err = api.Sealer.SimulateWdPoSt(ctx, maddr, ppt, proofSectors, rand) + err = api.Damocles.SimulateWdPoSt(ctx, maddr, ppt, proofSectors, rand) if err != nil { return err } @@ -792,7 +792,7 @@ var utilSealerProvingSectorInfoCmd = &cli.Command{ slog := mlog.With("num", num) - info, err := api.Sealer.ProvingSectorInfo(actx, abi.SectorID{ + info, err := api.Damocles.ProvingSectorInfo(actx, abi.SectorID{ Miner: mid, Number: abi.SectorNumber(num), }) diff --git a/damocles-manager/cmd/damocles-manager/internal/util_sealer_sectors.go b/damocles-manager/cmd/damocles-manager/internal/util_sealer_sectors.go index 10ca04440..36b95e2d1 100644 --- a/damocles-manager/cmd/damocles-manager/internal/util_sealer_sectors.go +++ b/damocles-manager/cmd/damocles-manager/internal/util_sealer_sectors.go @@ -168,7 +168,7 @@ var utilSealerSectorsListCmd = &cli.Command{ defer stop() - states, err := cli.Sealer.ListSectors(gctx, extractListWorkerState(cctx), core.SectorWorkerJobAll) + states, err := cli.Damocles.ListSectors(gctx, extractListWorkerState(cctx), core.SectorWorkerJobAll) if err != nil { return err } @@ -298,7 +298,7 @@ var utilSealerSectorsRestoreCmd = &cli.Command{ defer stop() - _, err = cli.Sealer.RestoreSector(gctx, abi.SectorID{ + _, err = cli.Damocles.RestoreSector(gctx, abi.SectorID{ Miner: miner, Number: abi.SectorNumber(sectorNum), }, cctx.Bool("force")) @@ -462,7 +462,7 @@ var utilSealerSectorsExpiredCmd = &cli.Command{ toCheck := bitfield.New() toCheckSectors := make(map[abi.SectorNumber]*core.SectorState) { - sectors, err := extAPI.Sealer.ListSectors(ctx, core.WorkerOffline, core.SectorWorkerJobAll) + sectors, err := extAPI.Damocles.ListSectors(ctx, core.WorkerOffline, core.SectorWorkerJobAll) if err != nil { return fmt.Errorf("getting sector list: %w", err) } @@ -577,7 +577,7 @@ var utilSealerSectorsExpiredCmd = &cli.Command{ for _, number := range toRemove { fmt.Printf("Removing sector\t%s:\t", color.YellowString("%d", number)) - err = extAPI.Sealer.RemoveSector(ctx, abi.SectorID{Miner: abi.ActorID(actor), Number: number}) + err = extAPI.Damocles.RemoveSector(ctx, abi.SectorID{Miner: abi.ActorID(actor), Number: number}) if err != nil { color.Red("ERROR: %s\n", err.Error()) } else { @@ -1198,7 +1198,7 @@ var utilSealerSectorsTerminateCmd = &cli.Command{ } actor := cctx.Uint64("actor") - resp, err := cli.Sealer.TerminateSector(gctx, abi.SectorID{Miner: abi.ActorID(actor), Number: abi.SectorNumber(id)}) + resp, err := cli.Damocles.TerminateSector(gctx, abi.SectorID{Miner: abi.ActorID(actor), Number: abi.SectorNumber(id)}) if err != nil { return err } @@ -1230,7 +1230,7 @@ var utilSealerSectorsTerminateQueryCmd = &cli.Command{ } actor := 
cctx.Uint64("actor") - resp, err := cli.Sealer.PollTerminateSectorState(gctx, abi.SectorID{Miner: abi.ActorID(actor), Number: abi.SectorNumber(id)}) + resp, err := cli.Damocles.PollTerminateSectorState(gctx, abi.SectorID{Miner: abi.ActorID(actor), Number: abi.SectorNumber(id)}) if err != nil { return err } @@ -1278,7 +1278,7 @@ var utilSealerSectorsRemoveCmd = &cli.Command{ } actor := cctx.Uint64("actor") - err = cli.Sealer.RemoveSector(gctx, abi.SectorID{Miner: abi.ActorID(actor), Number: abi.SectorNumber(id)}) + err = cli.Damocles.RemoveSector(gctx, abi.SectorID{Miner: abi.ActorID(actor), Number: abi.SectorNumber(id)}) if err != nil { return err } @@ -1321,7 +1321,7 @@ var utilSealerSectorsFinalizeCmd = &cli.Command{ } actor := cctx.Uint64("actor") - err = cli.Sealer.FinalizeSector(gctx, abi.SectorID{Miner: abi.ActorID(actor), Number: abi.SectorNumber(id)}) + err = cli.Damocles.FinalizeSector(gctx, abi.SectorID{Miner: abi.ActorID(actor), Number: abi.SectorNumber(id)}) if err != nil { return err } @@ -1369,7 +1369,7 @@ var utilSealerSectorsStateCmd = &cli.Command{ Number: sectorNumber, } - state, err := cli.Sealer.FindSectorInAllStates(gctx, sid) + state, err := cli.Damocles.FindSectorInAllStates(gctx, sid) if err != nil { return RPCCallError("FindSectorInAllStates", err) } @@ -1498,7 +1498,7 @@ var utilSealerSectorsFindDealCmd = &cli.Command{ defer stop() - sectors, err := cli.Sealer.FindSectorsWithDeal(gctx, extractListWorkerState(cctx), abi.DealID(dealID)) + sectors, err := cli.Damocles.FindSectorsWithDeal(gctx, extractListWorkerState(cctx), abi.DealID(dealID)) if err != nil { return RPCCallError("FindSectorsWithDeal", err) } @@ -1551,7 +1551,7 @@ var utilSealerSectorsResendPreCommitCmd = &cli.Command{ Number: sectorNumber, } - state, err := cli.Sealer.FindSector(gctx, core.WorkerOnline, sid) + state, err := cli.Damocles.FindSector(gctx, core.WorkerOnline, sid) if err != nil { return RPCCallError("FindSector", err) } @@ -1573,7 +1573,7 @@ var utilSealerSectorsResendPreCommitCmd = &cli.Command{ return fmt.Errorf("convert to pre commit on chain info: %w", err) } - resp, err := cli.Sealer.SubmitPreCommit(gctx, core.AllocatedSector{ + resp, err := cli.Damocles.SubmitPreCommit(gctx, core.AllocatedSector{ ID: sid, ProofType: state.SectorType, }, onChainInfo, true) @@ -1626,7 +1626,7 @@ var utilSealerSectorsResendProveCommitCmd = &cli.Command{ Number: sectorNumber, } - state, err := cli.Sealer.FindSector(gctx, core.WorkerOnline, sid) + state, err := cli.Damocles.FindSector(gctx, core.WorkerOnline, sid) if err != nil { return RPCCallError("FindSector", err) } @@ -1639,7 +1639,7 @@ var utilSealerSectorsResendProveCommitCmd = &cli.Command{ return fmt.Errorf("sector is still being marked as 'Need To Be Send' in the state machine") } - resp, err := cli.Sealer.SubmitProof(gctx, sid, *state.Proof, true) + resp, err := cli.Damocles.SubmitProof(gctx, sid, *state.Proof, true) if err != nil { return RPCCallError("SubmitProof", err) @@ -1739,7 +1739,7 @@ var utilSealerSectorsImportCmd = &cli.Command{ continue } - imported, err := cli.Sealer.ImportSector(gctx, core.WorkerOffline, state, override) + imported, err := cli.Damocles.ImportSector(gctx, core.WorkerOffline, state, override) if err != nil { slog.Errorf("import failed: %s", err) continue @@ -1820,7 +1820,7 @@ var utilSealerSectorsExportMetadataCmd = &cli.Command{ } defer stop() - states, err := cli.Sealer.ListSectors(gctx, core.WorkerOffline, core.SectorWorkerJobAll) + states, err := cli.Damocles.ListSectors(gctx, core.WorkerOffline, 
core.SectorWorkerJobAll) if err != nil { return err } @@ -1972,7 +1972,7 @@ var utilSealerSectorsExportFilesCmd = &cli.Command{ } defer stop() - states, err := cli.Sealer.ListSectors(gctx, core.WorkerOffline, core.SectorWorkerJobAll) + states, err := cli.Damocles.ListSectors(gctx, core.WorkerOffline, core.SectorWorkerJobAll) if err != nil { return err } @@ -1991,7 +1991,7 @@ var utilSealerSectorsExportFilesCmd = &cli.Command{ continue } - loc, err := cli.Sealer.ProvingSectorInfo(ctx, state.ID) + loc, err := cli.Damocles.ProvingSectorInfo(ctx, state.ID) if err != nil { fmt.Fprintf(os.Stdout, "find sector %v location: %s\n", state.ID.Number, err) failCounts++ @@ -2355,7 +2355,7 @@ var utilSealerSectorsRebuildCmd = &cli.Command{ defer stop() - _, err = cli.Sealer.SectorSetForRebuild(gctx, abi.SectorID{ + _, err = cli.Damocles.SectorSetForRebuild(gctx, abi.SectorID{ Miner: miner, Number: abi.SectorNumber(sectorNum), }, core.RebuildOptions{ @@ -2449,14 +2449,14 @@ var utilSealerSectorsUnsealCmd = &cli.Command{ Miner: miner, Number: sector, } - sectorState, err = cli.Sealer.FindSectorInAllStates(gctx, sectorID) + sectorState, err = cli.Damocles.FindSectorInAllStates(gctx, sectorID) if err != nil { return fmt.Errorf("get sector info failed: %w", err) } } else if cctx.IsSet("actor") || cctx.IsSet("sector") { return fmt.Errorf("flag \"--actor\" and \"--sector\" must be set together") } else { - sector, err := cli.Sealer.FindSectorWithPiece(gctx, core.WorkerOffline, pieceCid) + sector, err := cli.Damocles.FindSectorWithPiece(gctx, core.WorkerOffline, pieceCid) if err != nil { return fmt.Errorf("find sector with piece: %w", err) } @@ -2564,7 +2564,7 @@ var utilSealerSectorsUnsealCmd = &cli.Command{ } } else { - stream, err := cli.Sealer.UnsealPiece(gctx, sectorID, pieceCid, types.UnpaddedByteIndex(offset), size, dest) + stream, err := cli.Damocles.UnsealPiece(gctx, sectorID, pieceCid, types.UnpaddedByteIndex(offset), size, dest) if err != nil { return fmt.Errorf("set task for unseal failed: %w", err) } diff --git a/damocles-manager/cmd/damocles-manager/internal/util_sealer_snap.go b/damocles-manager/cmd/damocles-manager/internal/util_sealer_snap.go index a77995872..1885c15ce 100644 --- a/damocles-manager/cmd/damocles-manager/internal/util_sealer_snap.go +++ b/damocles-manager/cmd/damocles-manager/internal/util_sealer_snap.go @@ -48,7 +48,7 @@ var utilSealerSnapFetchCmd = &cli.Command{ defer stop() - res, err := api.Sealer.SnapUpPreFetch(gctx, mid, &deadidx) + res, err := api.Damocles.SnapUpPreFetch(gctx, mid, &deadidx) if err != nil { return RPCCallError("SnapPreFetch", err) } @@ -88,7 +88,7 @@ var utilSealerSnapCandidatesCmd = &cli.Command{ defer stop() - candidates, err := api.Sealer.SnapUpCandidates(gctx, mid) + candidates, err := api.Damocles.SnapUpCandidates(gctx, mid) if err != nil { return RPCCallError("SnapPreFetch", err) } @@ -150,7 +150,7 @@ var utilSealerSnapCancelCommitmentCmd = &cli.Command{ defer stop() - err = api.Sealer.SnapUpCancelCommitment(gctx, abi.SectorID{ + err = api.Damocles.SnapUpCancelCommitment(gctx, abi.SectorID{ Miner: mid, Number: num, }) diff --git a/damocles-manager/cmd/damocles-manager/internal/util_storage.go b/damocles-manager/cmd/damocles-manager/internal/util_storage.go index 636441f6d..764aa3e03 100644 --- a/damocles-manager/cmd/damocles-manager/internal/util_storage.go +++ b/damocles-manager/cmd/damocles-manager/internal/util_storage.go @@ -426,7 +426,7 @@ var utilStorageListCmd = &cli.Command{ } defer astop() - details, err := api.Sealer.StoreList(actx) + 
details, err := api.Damocles.StoreList(actx) if err != nil { return RPCCallError("StoreList", err) } @@ -491,7 +491,7 @@ var utilStorageReleaseReservedCmd = &cli.Command{ Miner: minerID, Number: num, } - done, err := api.Sealer.StoreReleaseReserved(actx, sid) + done, err := api.Damocles.StoreReleaseReserved(actx, sid) if err != nil { return RPCCallError("StoreReleaseReserved", err) } diff --git a/damocles-manager/cmd/damocles-manager/internal/util_worker.go b/damocles-manager/cmd/damocles-manager/internal/util_worker.go index 639be445f..ebb1bbcfc 100644 --- a/damocles-manager/cmd/damocles-manager/internal/util_worker.go +++ b/damocles-manager/cmd/damocles-manager/internal/util_worker.go @@ -45,7 +45,7 @@ var utilWorkerListCmd = &cli.Command{ } defer stopper() - pinfos, err := a.Sealer.WorkerPingInfoList(actx) + pinfos, err := a.Damocles.WorkerPingInfoList(actx) if err != nil { return RPCCallError("WorkerPingInfoList", err) } @@ -97,7 +97,7 @@ var utilWorkerRemoveCmd = &cli.Command{ } defer stopper() - workerInfo, err := a.Sealer.WorkerGetPingInfo(actx, name) + workerInfo, err := a.Damocles.WorkerGetPingInfo(actx, name) if err != nil { return RPCCallError("WorkerGetPingInfo", err) } @@ -106,7 +106,7 @@ var utilWorkerRemoveCmd = &cli.Command{ return fmt.Errorf("worker info not found. please make sure the instance name is correct: %s", name) } - if err = a.Sealer.WorkerPingInfoRemove(actx, name); err != nil { + if err = a.Damocles.WorkerPingInfoRemove(actx, name); err != nil { return err } fmt.Printf("'%s' removed\n", name) @@ -273,7 +273,7 @@ func resolveWorkerDest(ctx context.Context, a *APIClient, name string) (string, var info *core.WorkerPingInfo var err error if a != nil { - info, err = a.Sealer.WorkerGetPingInfo(ctx, name) + info, err = a.Damocles.WorkerGetPingInfo(ctx, name) if err != nil { return "", RPCCallError("WorkerGetPingInfo", err) } diff --git a/damocles-manager/cmd/damocles-manager/server.go b/damocles-manager/cmd/damocles-manager/server.go index 6b2ab8d8c..53de27ea0 100644 --- a/damocles-manager/cmd/damocles-manager/server.go +++ b/damocles-manager/cmd/damocles-manager/server.go @@ -15,11 +15,31 @@ import ( managerplugin "github.com/ipfs-force-community/damocles/manager-plugin" ) -func NewAPIService(sealerAPI core.SealerAPI, minerAPI core.MinerAPI, plugins *managerplugin.LoadedPlugins) *APIService { +func NewAPIService( + sealerAPI core.SealerAPI, + sealerCliAPI core.SealerCliAPI, + randomnessAPI core.RandomnessAPI, + minerAPI core.MinerAPI, + workerWdPoStAPI core.WorkerWdPoStAPI, + plugins *managerplugin.LoadedPlugins, +) *APIService { + type coreAPI struct { + core.SealerAPI + core.SealerCliAPI + core.RandomnessAPI + core.MinerAPI + core.WorkerWdPoStAPI + } + return &APIService{ - sealerAPI: sealerAPI, - minerAPI: minerAPI, - plugins: plugins, + coreAPI: &coreAPI{ + SealerAPI: sealerAPI, + SealerCliAPI: sealerCliAPI, + RandomnessAPI: randomnessAPI, + MinerAPI: minerAPI, + WorkerWdPoStAPI: workerWdPoStAPI, + }, + plugins: plugins, } } @@ -29,23 +49,18 @@ type handler struct { } type APIService struct { - sealerAPI core.SealerAPI - minerAPI core.MinerAPI - plugins *managerplugin.LoadedPlugins + coreAPI core.API + plugins *managerplugin.LoadedPlugins } -func (api *APIService) handlers() []handler { +func (s *APIService) handlers() []handler { handlers := make([]handler, 0, 2) handlers = append(handlers, handler{ - namespace: core.SealerAPINamespace, - hdl: api.sealerAPI, - }) - handlers = append(handlers, handler{ - namespace: core.MinerAPINamespace, - hdl: api.minerAPI, + 
namespace: core.APINamespace, + hdl: s.coreAPI, }) - if api.plugins != nil { - _ = api.plugins.Foreach(managerplugin.RegisterJsonRpc, func(plugin *managerplugin.Plugin) error { + if s.plugins != nil { + _ = s.plugins.Foreach(managerplugin.RegisterJsonRpc, func(plugin *managerplugin.Plugin) error { m := managerplugin.DeclareRegisterJsonRpcManifest(plugin.Manifest) namespace, hdl := m.Handler() log.Infof("register json rpc handler by plugin(%s). namespace: '%s'", plugin.Name, namespace) diff --git a/damocles-manager/core/api.go b/damocles-manager/core/api.go index 44b7cb800..3f57a86d6 100644 --- a/damocles-manager/core/api.go +++ b/damocles-manager/core/api.go @@ -11,20 +11,30 @@ import ( "github.com/filecoin-project/venus/venus-shared/actors/builtin" "github.com/filecoin-project/venus/venus-shared/types" "github.com/ipfs-force-community/damocles/damocles-manager/modules" + "github.com/ipfs-force-community/damocles/damocles-manager/pkg/extproc/stage" ) +//go:generate go run gen.go -interface=SealerAPI,SealerCliAPI,RandomnessAPI,MinerAPI,WorkerWdPoStAPI + const ( - // TODO: The sealerAPI namespace is Venus due to historical reasons, + // TODO: The APINamespace is Venus due to historical reasons, // and we should consider changing it to a more appropriate name in future versions - SealerAPINamespace = "Venus" - MinerAPINamespace = "Damocles.miner" - MajorVersion = 0 + APINamespace = "Venus" + MajorVersion = 0 ) var Empty Meta type Meta *struct{} +type API interface { + SealerAPI + SealerCliAPI + RandomnessAPI + MinerAPI + WorkerWdPoStAPI +} + type SealerAPI interface { AllocateSector(context.Context, AllocateSectorSpec) (*AllocatedSector, error) @@ -73,9 +83,6 @@ type SealerAPI interface { AllocateUnsealSector(ctx context.Context, spec AllocateSectorSpec) (*SectorUnsealInfo, error) AchieveUnsealSector(ctx context.Context, sid abi.SectorID, pieceCid cid.Cid, errInfo string) (Meta, error) AcquireUnsealDest(ctx context.Context, sid abi.SectorID, pieceCid cid.Cid) ([]string, error) - - // utils - SealerCliAPI } type SealerCliAPI interface { @@ -109,6 +116,8 @@ type SealerCliAPI interface { WorkerPingInfoList(ctx context.Context) ([]WorkerPingInfo, error) + WorkerPingInfoRemove(ctx context.Context, name string) error + SectorIndexerFind(ctx context.Context, indexType SectorIndexType, sid abi.SectorID) (SectorIndexLocation, error) TerminateSector(context.Context, abi.SectorID) (SubmitTerminateResp, error) @@ -142,3 +151,11 @@ type MinerAPI interface { GetInfo(context.Context, abi.ActorID) (*MinerInfo, error) GetMinerConfig(context.Context, abi.ActorID) (*modules.MinerConfig, error) } + +type WorkerWdPoStAPI interface { + WdPoStHeartbeatTask(ctx context.Context, runningTaskIDs []string, workerName string) error + WdPoStAllocateTasks(ctx context.Context, num uint32, workName string) (allocatedTasks []WdPoStAllocatedTask, err error) + WdPoStFinishTask(ctx context.Context, taskID string, output *stage.WindowPoStOutput, errorReason string) error + WdPoStResetTask(ctx context.Context, taskID string) error + WdPoStAllTasks(ctx context.Context) ([]*WdPoStTask, error) +} diff --git a/damocles-manager/core/client.go b/damocles-manager/core/client.go index c5acad9cf..6e40652ce 100644 --- a/damocles-manager/core/client.go +++ b/damocles-manager/core/client.go @@ -1,210 +1,17 @@ package core -import ( - "context" - - "github.com/filecoin-project/go-address" - "github.com/filecoin-project/go-bitfield" - "github.com/filecoin-project/go-state-types/abi" - 
"github.com/ipfs-force-community/damocles/damocles-manager/modules" - "github.com/ipfs/go-cid" - - "github.com/filecoin-project/venus/venus-shared/actors/builtin" - "github.com/filecoin-project/venus/venus-shared/types" -) - -var UnavailableSealerCliClient = SealerCliClient{ - ListSectors: func(context.Context, SectorWorkerState, SectorWorkerJob) ([]*SectorState, error) { - panic("sealer client unavailable") - }, - - FindSector: func(ctx context.Context, state SectorWorkerState, sid abi.SectorID) (*SectorState, error) { - panic("sealer client unavailable") - }, - - FindSectorInAllStates: func(ctx context.Context, sid abi.SectorID) (*SectorState, error) { - panic("sealer client unavailable") - }, - - FindSectorsWithDeal: func(ctx context.Context, state SectorWorkerState, dealID abi.DealID) ([]*SectorState, error) { - panic("sealer client unavailable") - }, - - FindSectorWithPiece: func(ctx context.Context, state SectorWorkerState, pieceCid cid.Cid) (*SectorState, error) { - panic("sealer client unavailable") - }, - - ImportSector: func(ctx context.Context, ws SectorWorkerState, state *SectorState, override bool) (bool, error) { - panic("sealer client unavailable") - }, - - RestoreSector: func(ctx context.Context, sid abi.SectorID, forced bool) (Meta, error) { - panic("sealer client unavailable") - }, - - ReportFinalized: func(context.Context, abi.SectorID) (Meta, error) { panic("sealer client unavailable") }, - - ReportAborted: func(context.Context, abi.SectorID, string) (Meta, error) { panic("sealer client unavailable") }, - - CheckProvable: func(ctx context.Context, mid abi.ActorID, postProofType abi.RegisteredPoStProof, sectors []builtin.ExtendedSectorInfo, strict, stateCheck bool) (map[abi.SectorNumber]string, error) { - panic("sealer client unavailable") - }, - - SimulateWdPoSt: func(context.Context, address.Address, abi.RegisteredPoStProof, []builtin.ExtendedSectorInfo, abi.PoStRandomness) error { - panic("sealer client unavailable") - }, - - SnapUpPreFetch: func(ctx context.Context, mid abi.ActorID, dlindex *uint64) (*SnapUpFetchResult, error) { - panic("sealer client unavailable") - }, - - SnapUpCandidates: func(ctx context.Context, mid abi.ActorID) ([]*bitfield.BitField, error) { - panic("sealer client unavailable") - }, - - SnapUpCancelCommitment: func(ctx context.Context, sid abi.SectorID) error { - panic("sealer client unavailable") - }, - - ProvingSectorInfo: func(ctx context.Context, sid abi.SectorID) (ProvingSectorInfo, error) { - panic("sealer client unavailable") - }, - - WorkerGetPingInfo: func(ctx context.Context, name string) (*WorkerPingInfo, error) { - panic("sealer client unavailable") - }, - - WorkerPingInfoList: func(ctx context.Context) ([]WorkerPingInfo, error) { - panic("sealer client unavailable") - }, - - WorkerPingInfoRemove: func(ctx context.Context, name string) error { - panic("sealer client unavailable") - }, - - SectorIndexerFind: func(ctx context.Context, indexType SectorIndexType, sid abi.SectorID) (SectorIndexLocation, error) { - panic("sealer client unavailable") - }, - - TerminateSector: func(context.Context, abi.SectorID) (SubmitTerminateResp, error) { - panic("sealer client unavailable") - }, - - PollTerminateSectorState: func(context.Context, abi.SectorID) (TerminateInfo, error) { - panic("sealer client unavailable") - }, - - RemoveSector: func(context.Context, abi.SectorID) error { - panic("sealer client unavailable") - }, - - FinalizeSector: func(context.Context, abi.SectorID) error { - panic("sealer client unavailable") - }, - - 
StoreReleaseReserved: func(ctx context.Context, sid abi.SectorID) (bool, error) { - panic("sealer client unavailable") - }, - - StoreList: func(ctx context.Context) ([]StoreDetailedInfo, error) { - panic("sealer client unavailable") - }, - - SectorSetForRebuild: func(ctx context.Context, sid abi.SectorID, opt RebuildOptions) (bool, error) { - panic("sealer client unavailable") - }, - - // not listed in SealerCliAPI, but required in cli commands - SubmitPreCommit: func(context.Context, AllocatedSector, PreCommitOnChainInfo, bool) (SubmitPreCommitResp, error) { - panic("sealer client unavailable") - }, - - SubmitProof: func(context.Context, abi.SectorID, ProofOnChainInfo, bool) (SubmitProofResp, error) { - panic("sealer client unavailable") - }, - - UnsealPiece: func(ctx context.Context, sid abi.SectorID, pieceCid cid.Cid, offset types.UnpaddedByteIndex, size abi.UnpaddedPieceSize, dest string) (<-chan []byte, error) { - panic("sealer client unavailable") - }, - - Version: func(context.Context) (string, error) { - panic("sealer client unavailable") - }, -} - -type SealerCliClient struct { - ListSectors func(context.Context, SectorWorkerState, SectorWorkerJob) ([]*SectorState, error) - - FindSector func(ctx context.Context, state SectorWorkerState, sid abi.SectorID) (*SectorState, error) - - FindSectorInAllStates func(ctx context.Context, sid abi.SectorID) (*SectorState, error) - - FindSectorsWithDeal func(ctx context.Context, state SectorWorkerState, dealID abi.DealID) ([]*SectorState, error) - - FindSectorWithPiece func(ctx context.Context, state SectorWorkerState, pieceCid cid.Cid) (*SectorState, error) - - ImportSector func(ctx context.Context, ws SectorWorkerState, state *SectorState, override bool) (bool, error) - - RestoreSector func(ctx context.Context, sid abi.SectorID, forced bool) (Meta, error) - - ReportFinalized func(context.Context, abi.SectorID) (Meta, error) - - ReportAborted func(context.Context, abi.SectorID, string) (Meta, error) - - CheckProvable func(ctx context.Context, mid abi.ActorID, postProofType abi.RegisteredPoStProof, sectors []builtin.ExtendedSectorInfo, strict, stateCheck bool) (map[abi.SectorNumber]string, error) - - SimulateWdPoSt func(context.Context, address.Address, abi.RegisteredPoStProof, []builtin.ExtendedSectorInfo, abi.PoStRandomness) error - - SnapUpPreFetch func(ctx context.Context, mid abi.ActorID, dlindex *uint64) (*SnapUpFetchResult, error) - - SnapUpCandidates func(ctx context.Context, mid abi.ActorID) ([]*bitfield.BitField, error) - - SnapUpCancelCommitment func(ctx context.Context, sid abi.SectorID) error - - ProvingSectorInfo func(ctx context.Context, sid abi.SectorID) (ProvingSectorInfo, error) - - WorkerGetPingInfo func(ctx context.Context, name string) (*WorkerPingInfo, error) - - WorkerPingInfoList func(ctx context.Context) ([]WorkerPingInfo, error) - - WorkerPingInfoRemove func(ctx context.Context, name string) error - - SectorIndexerFind func(ctx context.Context, indexType SectorIndexType, sid abi.SectorID) (SectorIndexLocation, error) - - TerminateSector func(context.Context, abi.SectorID) (SubmitTerminateResp, error) - - PollTerminateSectorState func(context.Context, abi.SectorID) (TerminateInfo, error) - - RemoveSector func(context.Context, abi.SectorID) error - - FinalizeSector func(context.Context, abi.SectorID) error - - StoreReleaseReserved func(ctx context.Context, sid abi.SectorID) (bool, error) - - StoreList func(ctx context.Context) ([]StoreDetailedInfo, error) - - SectorSetForRebuild func(ctx context.Context, sid 
abi.SectorID, opt RebuildOptions) (bool, error) - - // not listed in SealerCliAPI, but required in cli commands - SubmitPreCommit func(context.Context, AllocatedSector, PreCommitOnChainInfo, bool) (SubmitPreCommitResp, error) - - SubmitProof func(context.Context, abi.SectorID, ProofOnChainInfo, bool) (SubmitProofResp, error) - - UnsealPiece func(ctx context.Context, sid abi.SectorID, pieceCid cid.Cid, offset types.UnpaddedByteIndex, size abi.UnpaddedPieceSize, dest string) (<-chan []byte, error) - - Version func(context.Context) (string, error) -} - -var UnavailableMinerAPIClient = MinerAPIClient{ - GetInfo: func(context.Context, abi.ActorID) (*MinerInfo, error) { - panic("damocles miner client unavailable") - }, - GetMinerConfig: func(context.Context, abi.ActorID) (*modules.MinerConfig, error) { - panic("damocles miner client unavailable") - }, +var UnavailableAPIClient = APIClient{ + SealerAPIClient: UnavailableSealerAPIClient, + SealerCliAPIClient: UnavailableSealerCliAPIClient, + RandomnessAPIClient: UnavailableRandomnessAPIClient, + MinerAPIClient: UnavailableMinerAPIClient, + WorkerWdPoStAPIClient: UnavailableWorkerWdPoStAPIClient, } -type MinerAPIClient struct { - GetInfo func(context.Context, abi.ActorID) (*MinerInfo, error) - GetMinerConfig func(context.Context, abi.ActorID) (*modules.MinerConfig, error) +type APIClient struct { + SealerAPIClient + SealerCliAPIClient + RandomnessAPIClient + MinerAPIClient + WorkerWdPoStAPIClient } diff --git a/damocles-manager/core/client_gen.go b/damocles-manager/core/client_gen.go new file mode 100644 index 000000000..c29808b2f --- /dev/null +++ b/damocles-manager/core/client_gen.go @@ -0,0 +1,291 @@ +// Generated by gen.go. Do not edit. + +package core + +import ( + "context" + "github.com/filecoin-project/go-address" + "github.com/filecoin-project/go-bitfield" + "github.com/filecoin-project/go-state-types/abi" + "github.com/filecoin-project/venus/venus-shared/actors/builtin" + "github.com/filecoin-project/venus/venus-shared/types" + "github.com/ipfs-force-community/damocles/damocles-manager/modules" + "github.com/ipfs-force-community/damocles/damocles-manager/pkg/extproc/stage" + "github.com/ipfs/go-cid" +) + +// SealerAPIClient is generated client for SealerAPI interface. 
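+// Each field is a plain func value with the same signature as the matching
+// SealerAPI method; buildDamoclesAPIClient fills these fields in, and when the
+// RPC client cannot be built, MaybeAPIClient falls back to
+// core.UnavailableAPIClient, whose fields all panic. Illustrative call only
+// (cli, ctx and spec are placeholders, not part of the generated code):
+// `sector, err := cli.AllocateSector(ctx, spec)`.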
+type SealerAPIClient struct { + AllocateSector func(context.Context, AllocateSectorSpec) (*AllocatedSector, error) + AcquireDeals func(context.Context, abi.SectorID, AcquireDealsSpec) (Deals, error) + AssignTicket func(context.Context, abi.SectorID) (Ticket, error) + SubmitPreCommit func(context.Context, AllocatedSector, PreCommitOnChainInfo, bool) (SubmitPreCommitResp, error) + PollPreCommitState func(context.Context, abi.SectorID) (PollPreCommitStateResp, error) + SubmitPersisted func(context.Context, abi.SectorID, string) (bool, error) + SubmitPersistedEx func(ctx context.Context, sid abi.SectorID, instanceName string, isUpgrade bool) (bool, error) + WaitSeed func(context.Context, abi.SectorID) (WaitSeedResp, error) + SubmitProof func(context.Context, abi.SectorID, ProofOnChainInfo, bool) (SubmitProofResp, error) + PollProofState func(context.Context, abi.SectorID) (PollProofStateResp, error) + ReportState func(context.Context, abi.SectorID, ReportStateReq) (Meta, error) + ReportFinalized func(context.Context, abi.SectorID) (Meta, error) + ReportAborted func(context.Context, abi.SectorID, string) (Meta, error) + AllocateSanpUpSector func(ctx context.Context, spec AllocateSnapUpSpec) (*AllocatedSnapUpSector, error) + SubmitSnapUpProof func(ctx context.Context, sid abi.SectorID, snapupInfo SnapUpOnChainInfo) (SubmitSnapUpProofResp, error) + AllocateRebuildSector func(ctx context.Context, spec AllocateSectorSpec) (*SectorRebuildInfo, error) + WorkerPing func(ctx context.Context, winfo WorkerInfo) (Meta, error) + StoreReserveSpace func(ctx context.Context, sid abi.SectorID, size uint64, candidates []string) (*StoreBasicInfo, error) + StoreBasicInfo func(ctx context.Context, instanceName string) (*StoreBasicInfo, error) + AllocateUnsealSector func(ctx context.Context, spec AllocateSectorSpec) (*SectorUnsealInfo, error) + AchieveUnsealSector func(ctx context.Context, sid abi.SectorID, pieceCid cid.Cid, errInfo string) (Meta, error) + AcquireUnsealDest func(ctx context.Context, sid abi.SectorID, pieceCid cid.Cid) ([]string, error) +} + +var UnavailableSealerAPIClient = SealerAPIClient{ + + AllocateSector: func(context.Context, AllocateSectorSpec) (*AllocatedSector, error) { + panic("SealerAPI client unavailable") + }, + AcquireDeals: func(context.Context, abi.SectorID, AcquireDealsSpec) (Deals, error) { + panic("SealerAPI client unavailable") + }, + AssignTicket: func(context.Context, abi.SectorID) (Ticket, error) { + panic("SealerAPI client unavailable") + }, + SubmitPreCommit: func(context.Context, AllocatedSector, PreCommitOnChainInfo, bool) (SubmitPreCommitResp, error) { + panic("SealerAPI client unavailable") + }, + PollPreCommitState: func(context.Context, abi.SectorID) (PollPreCommitStateResp, error) { + panic("SealerAPI client unavailable") + }, + SubmitPersisted: func(context.Context, abi.SectorID, string) (bool, error) { + panic("SealerAPI client unavailable") + }, + SubmitPersistedEx: func(ctx context.Context, sid abi.SectorID, instanceName string, isUpgrade bool) (bool, error) { + panic("SealerAPI client unavailable") + }, + WaitSeed: func(context.Context, abi.SectorID) (WaitSeedResp, error) { + panic("SealerAPI client unavailable") + }, + SubmitProof: func(context.Context, abi.SectorID, ProofOnChainInfo, bool) (SubmitProofResp, error) { + panic("SealerAPI client unavailable") + }, + PollProofState: func(context.Context, abi.SectorID) (PollProofStateResp, error) { + panic("SealerAPI client unavailable") + }, + ReportState: func(context.Context, abi.SectorID, ReportStateReq) (Meta, 
error) { + panic("SealerAPI client unavailable") + }, + ReportFinalized: func(context.Context, abi.SectorID) (Meta, error) { + panic("SealerAPI client unavailable") + }, + ReportAborted: func(context.Context, abi.SectorID, string) (Meta, error) { + panic("SealerAPI client unavailable") + }, + AllocateSanpUpSector: func(ctx context.Context, spec AllocateSnapUpSpec) (*AllocatedSnapUpSector, error) { + panic("SealerAPI client unavailable") + }, + SubmitSnapUpProof: func(ctx context.Context, sid abi.SectorID, snapupInfo SnapUpOnChainInfo) (SubmitSnapUpProofResp, error) { + panic("SealerAPI client unavailable") + }, + AllocateRebuildSector: func(ctx context.Context, spec AllocateSectorSpec) (*SectorRebuildInfo, error) { + panic("SealerAPI client unavailable") + }, + WorkerPing: func(ctx context.Context, winfo WorkerInfo) (Meta, error) { + panic("SealerAPI client unavailable") + }, + StoreReserveSpace: func(ctx context.Context, sid abi.SectorID, size uint64, candidates []string) (*StoreBasicInfo, error) { + panic("SealerAPI client unavailable") + }, + StoreBasicInfo: func(ctx context.Context, instanceName string) (*StoreBasicInfo, error) { + panic("SealerAPI client unavailable") + }, + AllocateUnsealSector: func(ctx context.Context, spec AllocateSectorSpec) (*SectorUnsealInfo, error) { + panic("SealerAPI client unavailable") + }, + AchieveUnsealSector: func(ctx context.Context, sid abi.SectorID, pieceCid cid.Cid, errInfo string) (Meta, error) { + panic("SealerAPI client unavailable") + }, + AcquireUnsealDest: func(ctx context.Context, sid abi.SectorID, pieceCid cid.Cid) ([]string, error) { + panic("SealerAPI client unavailable") + }, +} + +// SealerCliAPIClient is generated client for SealerCliAPI interface. +type SealerCliAPIClient struct { + ListSectors func(context.Context, SectorWorkerState, SectorWorkerJob) ([]*SectorState, error) + FindSector func(ctx context.Context, state SectorWorkerState, sid abi.SectorID) (*SectorState, error) + FindSectorInAllStates func(ctx context.Context, sid abi.SectorID) (*SectorState, error) + FindSectorsWithDeal func(ctx context.Context, state SectorWorkerState, dealID abi.DealID) ([]*SectorState, error) + FindSectorWithPiece func(ctx context.Context, state SectorWorkerState, pieceCid cid.Cid) (*SectorState, error) + ImportSector func(ctx context.Context, ws SectorWorkerState, state *SectorState, override bool) (bool, error) + RestoreSector func(ctx context.Context, sid abi.SectorID, forced bool) (Meta, error) + CheckProvable func(ctx context.Context, mid abi.ActorID, postProofType abi.RegisteredPoStProof, sectors []builtin.ExtendedSectorInfo, strict, stateCheck bool) (map[abi.SectorNumber]string, error) + SimulateWdPoSt func(context.Context, address.Address, abi.RegisteredPoStProof, []builtin.ExtendedSectorInfo, abi.PoStRandomness) error + SnapUpPreFetch func(ctx context.Context, mid abi.ActorID, dlindex *uint64) (*SnapUpFetchResult, error) + SnapUpCandidates func(ctx context.Context, mid abi.ActorID) ([]*bitfield.BitField, error) + SnapUpCancelCommitment func(ctx context.Context, sid abi.SectorID) error + ProvingSectorInfo func(ctx context.Context, sid abi.SectorID) (ProvingSectorInfo, error) + WorkerGetPingInfo func(ctx context.Context, name string) (*WorkerPingInfo, error) + WorkerPingInfoList func(ctx context.Context) ([]WorkerPingInfo, error) + WorkerPingInfoRemove func(ctx context.Context, name string) error + SectorIndexerFind func(ctx context.Context, indexType SectorIndexType, sid abi.SectorID) (SectorIndexLocation, error) + TerminateSector 
func(context.Context, abi.SectorID) (SubmitTerminateResp, error) + PollTerminateSectorState func(context.Context, abi.SectorID) (TerminateInfo, error) + RemoveSector func(context.Context, abi.SectorID) error + FinalizeSector func(context.Context, abi.SectorID) error + StoreReleaseReserved func(ctx context.Context, sid abi.SectorID) (bool, error) + StoreList func(ctx context.Context) ([]StoreDetailedInfo, error) + SectorSetForRebuild func(ctx context.Context, sid abi.SectorID, opt RebuildOptions) (bool, error) + UnsealPiece func(ctx context.Context, sid abi.SectorID, pieceCid cid.Cid, offset types.UnpaddedByteIndex, size abi.UnpaddedPieceSize, dest string) (<-chan []byte, error) + Version func(ctx context.Context) (string, error) +} + +var UnavailableSealerCliAPIClient = SealerCliAPIClient{ + + ListSectors: func(context.Context, SectorWorkerState, SectorWorkerJob) ([]*SectorState, error) { + panic("SealerCliAPI client unavailable") + }, + FindSector: func(ctx context.Context, state SectorWorkerState, sid abi.SectorID) (*SectorState, error) { + panic("SealerCliAPI client unavailable") + }, + FindSectorInAllStates: func(ctx context.Context, sid abi.SectorID) (*SectorState, error) { + panic("SealerCliAPI client unavailable") + }, + FindSectorsWithDeal: func(ctx context.Context, state SectorWorkerState, dealID abi.DealID) ([]*SectorState, error) { + panic("SealerCliAPI client unavailable") + }, + FindSectorWithPiece: func(ctx context.Context, state SectorWorkerState, pieceCid cid.Cid) (*SectorState, error) { + panic("SealerCliAPI client unavailable") + }, + ImportSector: func(ctx context.Context, ws SectorWorkerState, state *SectorState, override bool) (bool, error) { + panic("SealerCliAPI client unavailable") + }, + RestoreSector: func(ctx context.Context, sid abi.SectorID, forced bool) (Meta, error) { + panic("SealerCliAPI client unavailable") + }, + CheckProvable: func(ctx context.Context, mid abi.ActorID, postProofType abi.RegisteredPoStProof, sectors []builtin.ExtendedSectorInfo, strict, stateCheck bool) (map[abi.SectorNumber]string, error) { + panic("SealerCliAPI client unavailable") + }, + SimulateWdPoSt: func(context.Context, address.Address, abi.RegisteredPoStProof, []builtin.ExtendedSectorInfo, abi.PoStRandomness) error { + panic("SealerCliAPI client unavailable") + }, + SnapUpPreFetch: func(ctx context.Context, mid abi.ActorID, dlindex *uint64) (*SnapUpFetchResult, error) { + panic("SealerCliAPI client unavailable") + }, + SnapUpCandidates: func(ctx context.Context, mid abi.ActorID) ([]*bitfield.BitField, error) { + panic("SealerCliAPI client unavailable") + }, + SnapUpCancelCommitment: func(ctx context.Context, sid abi.SectorID) error { + panic("SealerCliAPI client unavailable") + }, + ProvingSectorInfo: func(ctx context.Context, sid abi.SectorID) (ProvingSectorInfo, error) { + panic("SealerCliAPI client unavailable") + }, + WorkerGetPingInfo: func(ctx context.Context, name string) (*WorkerPingInfo, error) { + panic("SealerCliAPI client unavailable") + }, + WorkerPingInfoList: func(ctx context.Context) ([]WorkerPingInfo, error) { + panic("SealerCliAPI client unavailable") + }, + WorkerPingInfoRemove: func(ctx context.Context, name string) error { + panic("SealerCliAPI client unavailable") + }, + SectorIndexerFind: func(ctx context.Context, indexType SectorIndexType, sid abi.SectorID) (SectorIndexLocation, error) { + panic("SealerCliAPI client unavailable") + }, + TerminateSector: func(context.Context, abi.SectorID) (SubmitTerminateResp, error) { + panic("SealerCliAPI client 
unavailable") + }, + PollTerminateSectorState: func(context.Context, abi.SectorID) (TerminateInfo, error) { + panic("SealerCliAPI client unavailable") + }, + RemoveSector: func(context.Context, abi.SectorID) error { + panic("SealerCliAPI client unavailable") + }, + FinalizeSector: func(context.Context, abi.SectorID) error { + panic("SealerCliAPI client unavailable") + }, + StoreReleaseReserved: func(ctx context.Context, sid abi.SectorID) (bool, error) { + panic("SealerCliAPI client unavailable") + }, + StoreList: func(ctx context.Context) ([]StoreDetailedInfo, error) { + panic("SealerCliAPI client unavailable") + }, + SectorSetForRebuild: func(ctx context.Context, sid abi.SectorID, opt RebuildOptions) (bool, error) { + panic("SealerCliAPI client unavailable") + }, + UnsealPiece: func(ctx context.Context, sid abi.SectorID, pieceCid cid.Cid, offset types.UnpaddedByteIndex, size abi.UnpaddedPieceSize, dest string) (<-chan []byte, error) { + panic("SealerCliAPI client unavailable") + }, + Version: func(ctx context.Context) (string, error) { + panic("SealerCliAPI client unavailable") + }, +} + +// RandomnessAPIClient is generated client for RandomnessAPI interface. +type RandomnessAPIClient struct { + GetTicket func(context.Context, types.TipSetKey, abi.ChainEpoch, abi.ActorID) (Ticket, error) + GetSeed func(context.Context, types.TipSetKey, abi.ChainEpoch, abi.ActorID) (Seed, error) + GetWindowPoStChanlleengeRand func(context.Context, types.TipSetKey, abi.ChainEpoch, abi.ActorID) (WindowPoStRandomness, error) + GetWindowPoStCommitRand func(context.Context, types.TipSetKey, abi.ChainEpoch) (WindowPoStRandomness, error) +} + +var UnavailableRandomnessAPIClient = RandomnessAPIClient{ + + GetTicket: func(context.Context, types.TipSetKey, abi.ChainEpoch, abi.ActorID) (Ticket, error) { + panic("RandomnessAPI client unavailable") + }, + GetSeed: func(context.Context, types.TipSetKey, abi.ChainEpoch, abi.ActorID) (Seed, error) { + panic("RandomnessAPI client unavailable") + }, + GetWindowPoStChanlleengeRand: func(context.Context, types.TipSetKey, abi.ChainEpoch, abi.ActorID) (WindowPoStRandomness, error) { + panic("RandomnessAPI client unavailable") + }, + GetWindowPoStCommitRand: func(context.Context, types.TipSetKey, abi.ChainEpoch) (WindowPoStRandomness, error) { + panic("RandomnessAPI client unavailable") + }, +} + +// MinerAPIClient is generated client for MinerAPI interface. +type MinerAPIClient struct { + GetInfo func(context.Context, abi.ActorID) (*MinerInfo, error) + GetMinerConfig func(context.Context, abi.ActorID) (*modules.MinerConfig, error) +} + +var UnavailableMinerAPIClient = MinerAPIClient{ + + GetInfo: func(context.Context, abi.ActorID) (*MinerInfo, error) { + panic("MinerAPI client unavailable") + }, + GetMinerConfig: func(context.Context, abi.ActorID) (*modules.MinerConfig, error) { + panic("MinerAPI client unavailable") + }, +} + +// WorkerWdPoStAPIClient is generated client for WorkerWdPoStAPI interface. 
+type WorkerWdPoStAPIClient struct { + WdPoStHeartbeatTask func(ctx context.Context, runningTaskIDs []string, workerName string) error + WdPoStAllocateTasks func(ctx context.Context, num uint32, workName string) (allocatedTasks []WdPoStAllocatedTask, err error) + WdPoStFinishTask func(ctx context.Context, taskID string, output *stage.WindowPoStOutput, errorReason string) error + WdPoStResetTask func(ctx context.Context, taskID string) error + WdPoStAllTasks func(ctx context.Context) ([]*WdPoStTask, error) +} + +var UnavailableWorkerWdPoStAPIClient = WorkerWdPoStAPIClient{ + + WdPoStHeartbeatTask: func(ctx context.Context, runningTaskIDs []string, workerName string) error { + panic("WorkerWdPoStAPI client unavailable") + }, + WdPoStAllocateTasks: func(ctx context.Context, num uint32, workName string) (allocatedTasks []WdPoStAllocatedTask, err error) { + panic("WorkerWdPoStAPI client unavailable") + }, + WdPoStFinishTask: func(ctx context.Context, taskID string, output *stage.WindowPoStOutput, errorReason string) error { + panic("WorkerWdPoStAPI client unavailable") + }, + WdPoStResetTask: func(ctx context.Context, taskID string) error { + panic("WorkerWdPoStAPI client unavailable") + }, + WdPoStAllTasks: func(ctx context.Context) ([]*WdPoStTask, error) { + panic("WorkerWdPoStAPI client unavailable") + }, +} diff --git a/damocles-manager/core/gen.go b/damocles-manager/core/gen.go new file mode 100644 index 000000000..6899ceff9 --- /dev/null +++ b/damocles-manager/core/gen.go @@ -0,0 +1,244 @@ +//go:build ignore +// +build ignore + +package main + +import ( + "bytes" + "flag" + "fmt" + "go/ast" + "go/format" + "go/parser" + "go/printer" + "go/token" + "log" + "os" + "strings" + "text/template" + + "golang.org/x/exp/slices" +) + +const apiTemplate = ` +// Generated by gen.go. Do not edit. + +package {{.Package}} + +import ( +{{range $key, $value := .Imports}} {{$value}} {{$key}} +{{end}}) + +{{range .APIs}} +// {{.Type}}Client is generated client for {{.Type}} interface. 
+type {{.Type}}Client struct { +{{range .Methods}} +{{.Name}} func({{.Parameters | functionargs}}) ({{.Results | functionargs}}{{if .Results}}, {{end}}){{end}} +} + +{{$type := .Type}} +var Unavailable{{.Type}}Client = {{.Type}}Client{ +{{range .Methods}} +{{.Name}}: func({{.Parameters | functionargs}}) ({{.Results | functionargs}}{{if .Results}}, {{end}}) { + panic("{{$type}} client unavailable") +},{{end}} +} + +{{end}} +` + +var ( + interfaceNames = flag.String("interface", "", "comma-separated list of interface names; must be set") + output = flag.String("output", "", "output file name; default srcdir/client.go") +) + +func main() { + log.SetFlags(0) + log.SetPrefix("gen: ") + flag.Parse() + + if len(*interfaceNames) == 0 { + flag.Usage() + os.Exit(2) + } + srcFile := os.Getenv("GOFILE") + allowTypes := strings.Split(*interfaceNames, ",") + + fileset := token.NewFileSet() + f, err := parser.ParseFile(fileset, srcFile, nil, 0) + if err != nil { + log.Fatalf("failed to parse %s: %s", srcFile, err) + } + g := &Generator{ + Package: f.Name.Name, + allowTypes: allowTypes, + fileset: fileset, + Imports: make(map[string]string), + } + ast.Walk(g, f) + + funcs := map[string]interface{}{ + "functionargs": func(fields []*Type) string { return FieldList(fields, "", ", ", true) }, + } + t, err := template.New("client").Funcs(funcs).Parse(apiTemplate) + if err != nil { + log.Fatalf("failed to parse template: %s", err) + } + buf := bytes.Buffer{} + err = t.Execute(&buf, g) + if err != nil { + log.Fatalf("failed to execute template: %s", err) + } + + // Write to file. + outputName := *output + if outputName == "" { + outputName = "client_gen.go" + } + dst, err := format.Source(buf.Bytes()) + if err != nil { + log.Fatalf("failed to format code: %s", err) + } + err = os.WriteFile(outputName, dst, 0644) + if err != nil { + log.Fatalf("writing output: %s", err) + } +} + +// isDirectory reports whether the named file is a directory. 
+func isDirectory(name string) bool { + info, err := os.Stat(name) + if err != nil { + log.Fatal(err) + } + return info.IsDir() +} + +func FieldList(fields []*Type, prefix string, delim string, withTypes bool) string { + var out []string + for _, p := range fields { + suffix := "" + if withTypes { + suffix = " " + p.Type + } + names := p.Names + var field []string + for _, n := range names { + field = append(field, prefix+n) + } + out = append(out, strings.Join(field, ", ")+suffix) + } + return strings.Join(out, delim) +} + +type Type struct { + Names []string + Type string +} + +func (t *Type) NamesString() string { + return strings.Join(t.Names, ", ") +} + +type Method struct { + Name string + Parameters []*Type + Results []*Type +} + +type API struct { + Type string + Methods []*Method +} + +type Generator struct { + fileset *token.FileSet + allowTypes []string + + Package string + Imports map[string]string + APIs []API +} + +func (g *Generator) Visit(node ast.Node) (w ast.Visitor) { + switch n := node.(type) { + case *ast.ImportSpec: + if n.Name != nil { + g.Imports[n.Path.Value] = n.Name.Name + } else { + g.Imports[n.Path.Value] = "" + } + case *ast.TypeSpec: + name := n.Name.Name + if slices.Contains(g.allowTypes, name) { + return &InterfaceGen{Generator: g, Name: name} + } + } + return g +} + +type InterfaceGen struct { + *Generator + Name string +} + +func (ig *InterfaceGen) VisitMethodList(n *ast.InterfaceType) { + client := API{ + Type: ig.Name, + Methods: []*Method{}, + } + for _, m := range n.Methods.List { + switch t := m.Type.(type) { + case *ast.FuncType: + method := &Method{ + Name: m.Names[0].Name, + Parameters: make([]*Type, 0), + Results: make([]*Type, 0), + } + for _, v := range t.Params.List { + method.Parameters = append(method.Parameters, ig.formatType(ig.fileset, v)) + } + hasError := false + if t.Results != nil { + for _, v := range t.Results.List { + result := ig.formatType(ig.fileset, v) + if result.Type == "error" { + hasError = true + } + method.Results = append(method.Results, result) + } + } + if !hasError { + fatalNode(ig.fileset, m, "method %s must have error as last return value", method.Name) + } + client.Methods = append(client.Methods, method) + case *ast.Ident: + // Embedded interface + ig.VisitMethodList(t.Obj.Decl.(*ast.TypeSpec).Type.(*ast.InterfaceType)) + } + } + ig.APIs = append(ig.APIs, client) +} + +func (ig *InterfaceGen) Visit(node ast.Node) (w ast.Visitor) { + switch n := node.(type) { + case *ast.InterfaceType: + ig.VisitMethodList(n) + } + return ig.Generator +} + +func (ig *InterfaceGen) formatType(fileset *token.FileSet, field *ast.Field) *Type { + var typeBuf bytes.Buffer + _ = printer.Fprint(&typeBuf, fileset, field.Type) + t := &Type{Type: typeBuf.String()} + for _, n := range field.Names { + t.Names = append(t.Names, n.Name) + } + return t +} + +func fatalNode(fileset *token.FileSet, node ast.Node, format string, args ...interface{}) { + log.Fatalf("%s: error: %s: %s\n", os.Args[0], fileset.Position(node.Pos()).String(), fmt.Sprintf(format, args...)) + os.Exit(1) +} diff --git a/damocles-manager/core/types_wdpost.go b/damocles-manager/core/types_wdpost.go new file mode 100644 index 000000000..1107c391b --- /dev/null +++ b/damocles-manager/core/types_wdpost.go @@ -0,0 +1,60 @@ +package core + +import ( + "context" + "time" + + "github.com/ipfs-force-community/damocles/damocles-manager/pkg/extproc/stage" +) + +type WdPoStTaskState string + +const ( + WdPoStTaskReadyToRun WdPoStTaskState = "ready2run" + WdPoStTaskRunning WdPoStTaskState 
= "running" + WdPoStTaskFinished WdPoStTaskState = "finished" +) + +type WdPoStTask struct { + ID string + Input stage.WindowPoSt + Output *stage.WindowPoStOutput + TryNum uint32 + ErrorReason string + WorkerName string + StartedAt uint64 + HeartbeatAt uint64 + FinishedAt uint64 + CreatedAt uint64 + UpdatedAt uint64 +} + +func (t *WdPoStTask) Finished(maxTry uint32) bool { + if t.FinishedAt == 0 { + return false + } + + if t.ErrorReason != "" && t.TryNum < maxTry { + return false + } + + return true +} + +type WdPoStAllocatedTask struct { + ID string + Input stage.WindowPoSt +} + +type WorkerWdPoStTaskManager interface { + All(ctx context.Context, filter func(*WdPoStTask) bool) ([]*WdPoStTask, error) + ListByTaskIDs(ctx context.Context, state WdPoStTaskState, taskIDs ...string) ([]*WdPoStTask, error) + Create(ctx context.Context, input stage.WindowPoSt) (*WdPoStTask, error) + AllocateTasks(ctx context.Context, num uint32, workName string) (allocatedTasks []WdPoStAllocatedTask, err error) + Heartbeat(ctx context.Context, taskIDs []string, workerName string) error + Finish(ctx context.Context, taskID string, output *stage.WindowPoStOutput, errorReason string) error + MakeTasksDie(ctx context.Context, shouldDeadDur time.Duration, limit uint32) error + CleanupExpiredTasks(ctx context.Context, taskLifetime time.Duration, limit uint32) error + RetryFailedTasks(ctx context.Context, maxTry, limit uint32) error + Reset(ctx context.Context, taskID string) error +} diff --git a/damocles-manager/dep/sealer.go b/damocles-manager/dep/sealer.go index 50c9ef1e2..029bf19a1 100644 --- a/damocles-manager/dep/sealer.go +++ b/damocles-manager/dep/sealer.go @@ -11,6 +11,7 @@ import ( "github.com/ipfs-force-community/damocles/damocles-manager/modules" "github.com/ipfs-force-community/damocles/damocles-manager/modules/impl/mock" "github.com/ipfs-force-community/damocles/damocles-manager/modules/impl/prover" + proverworker "github.com/ipfs-force-community/damocles/damocles-manager/modules/impl/prover/worker" "github.com/ipfs-force-community/damocles/damocles-manager/modules/impl/randomness" "github.com/ipfs-force-community/damocles/damocles-manager/modules/sealer" "github.com/ipfs-force-community/damocles/damocles-manager/pkg/chain" @@ -79,6 +80,10 @@ func Product() dix.Option { dix.Override(new(OfflineMetaStore), BuildOfflineMetaStore), dix.Override(new(WorkerMetaStore), BuildWorkerMetaStore), dix.Override(new(CommonMetaStore), BuildCommonMetaStore), + dix.Override(new(WorkerProverStore), BuildWorkerProverStore), + + dix.Override(new(core.WorkerWdPoStTaskManager), BuildWorkerWdPoStTaskManager), + dix.Override(new(core.WorkerWdPoStAPI), proverworker.NewWdPoStAPIImpl), ) } @@ -89,7 +94,8 @@ type ProxyOptions struct { func Proxy(dest string, opt ProxyOptions) dix.Option { return dix.Options( dix.Override(new(ProxyAddress), ProxyAddress(dest)), - dix.Override(new(core.SealerCliClient), BuildSealerProxyClient), + dix.Override(new(core.APIClient), BuildAPIProxyClient), + dix.Override(new(core.SealerCliAPIClient), BuildSealerCliAPIClient), dix.If(opt.EnableSectorIndexer, dix.Override(new(core.SectorIndexer), BuildProxiedSectorIndex), ), @@ -114,10 +120,10 @@ func APIClient(target ...interface{}) dix.Option { dix.Override(new(*modules.Config), ProvideConfig), dix.Override(new(*modules.SafeConfig), ProvideSafeConfig), dix.Override(new(chain.API), BuildChainClient), - dix.Override(new(core.MinerAPIClient), MaybeMinerAPIClient), dix.Override(new(messager.API), BuildMessagerClient), dix.Override(new(market.API), 
BuildMarketAPI), - dix.Override(new(core.SealerCliClient), MaybeSealerCliClient), + dix.Override(new(core.APIClient), MaybeAPIClient), + dix.Override(new(core.SealerCliAPIClient), BuildSealerCliAPIClient), dix.If(len(target) > 0, dix.Populate(InvokePopulate, target...)), ) } diff --git a/damocles-manager/dep/sealer_constructor.go b/damocles-manager/dep/sealer_constructor.go index 38ea035dc..099cac31d 100644 --- a/damocles-manager/dep/sealer_constructor.go +++ b/damocles-manager/dep/sealer_constructor.go @@ -19,6 +19,7 @@ import ( "github.com/ipfs-force-community/damocles/damocles-manager/modules/impl/commitmgr" "github.com/ipfs-force-community/damocles/damocles-manager/modules/impl/dealmgr" "github.com/ipfs-force-community/damocles/damocles-manager/modules/impl/mock" + proverworker "github.com/ipfs-force-community/damocles/damocles-manager/modules/impl/prover/worker" "github.com/ipfs-force-community/damocles/damocles-manager/modules/impl/sectors" "github.com/ipfs-force-community/damocles/damocles-manager/modules/impl/worker" "github.com/ipfs-force-community/damocles/damocles-manager/modules/policy" @@ -46,6 +47,7 @@ type ( WorkerMetaStore kvstore.KVStore ConfDirPath string CommonMetaStore kvstore.KVStore + WorkerProverStore kvstore.KVStore ) func BuildLocalSectorManager(scfg *modules.SafeConfig, mapi core.MinerAPI, numAlloc core.SectorNumberAllocator) (core.SectorManager, error) { @@ -292,34 +294,26 @@ func BuildMessagerClient(gctx GlobalContext, lc fx.Lifecycle, scfg *modules.Conf return mcli, nil } -// used for cli commands -func MaybeSealerCliClient(gctx GlobalContext, lc fx.Lifecycle, listen ListenAddress) core.SealerCliClient { - var cli core.SealerCliClient - err := buildDamoclesAPIClient(gctx, lc, core.SealerAPINamespace, &cli, string(listen), false) +func MaybeAPIClient(gctx GlobalContext, lc fx.Lifecycle, listen ListenAddress) *core.APIClient { + var client core.APIClient + err := buildDamoclesAPIClient(gctx, lc, core.APINamespace, &client, string(listen), false) if err != nil { - log.Errorf("failed to build sealer cli client. err: %s", err) - cli = core.UnavailableSealerCliClient + log.Errorf("failed to build api client. err: %s", err) + client = core.UnavailableAPIClient } - return cli + return &client } -// used for cli commands -func MaybeMinerAPIClient(gctx GlobalContext, lc fx.Lifecycle, listen ListenAddress) core.MinerAPIClient { - var c core.MinerAPIClient - err := buildDamoclesAPIClient(gctx, lc, core.MinerAPINamespace, &c, string(listen), false) - if err != nil { - log.Errorf("failed to build miner api client. 
err: %s", err) - c = core.UnavailableMinerAPIClient - } - return c +// used for proxy +func BuildAPIProxyClient(gctx GlobalContext, lc fx.Lifecycle, proxy ProxyAddress) (*core.APIClient, error) { + var proxyClient core.APIClient + err := buildDamoclesAPIClient(gctx, lc, core.APINamespace, &proxyClient, string(proxy), true) + return &proxyClient, err } -// used for proxy -func BuildSealerProxyClient(gctx GlobalContext, lc fx.Lifecycle, proxy ProxyAddress) (core.SealerCliClient, error) { - var cli core.SealerCliClient - err := buildDamoclesAPIClient(gctx, lc, core.SealerAPINamespace, &cli, string(proxy), true) - return cli, err +func BuildSealerCliAPIClient(client *core.APIClient) *core.SealerCliAPIClient { + return &client.SealerCliAPIClient } func buildDamoclesAPIClient(gctx GlobalContext, lc fx.Lifecycle, namespace string, out interface{}, serverAddr string, useHTTP bool) error { @@ -677,7 +671,7 @@ func BuildWorkerManager(meta WorkerMetaStore) (core.WorkerManager, error) { return worker.NewManager(meta) } -func BuildProxiedSectorIndex(client core.SealerCliClient, storeMgr PersistedObjectStoreManager) (core.SectorIndexer, error) { +func BuildProxiedSectorIndex(client *core.SealerCliAPIClient, storeMgr PersistedObjectStoreManager) (core.SectorIndexer, error) { log.Debug("build proxied sector indexer") return sectors.NewProxiedIndexer(client, storeMgr) } @@ -703,3 +697,15 @@ func BuildUnsealManager( } return mgr, nil } + +func BuildWorkerProverStore(gctx GlobalContext, db UnderlyingDB) (WorkerProverStore, error) { + return db.OpenCollection(gctx, "prover") +} + +func BuildWorkerWdPoStTaskManager(kv WorkerProverStore) (core.WorkerWdPoStTaskManager, error) { + wdpostKV, err := kvstore.NewWrappedKVStore([]byte("wdpost-"), kv) + if err != nil { + return nil, err + } + return proverworker.NewKVTaskManager(*kvstore.NewKVExt(wdpostKV)), nil +} diff --git a/damocles-manager/modules/impl/prover/worker/prover.go b/damocles-manager/modules/impl/prover/worker/prover.go index 694830398..415f1b6df 100644 --- a/damocles-manager/modules/impl/prover/worker/prover.go +++ b/damocles-manager/modules/impl/prover/worker/prover.go @@ -2,10 +2,13 @@ package worker import ( "context" + "encoding/base64" + "encoding/binary" "fmt" "sync" "time" + "github.com/cespare/xxhash" "github.com/filecoin-project/go-state-types/abi" "github.com/filecoin-project/lotus/chain/actors/builtin" "github.com/ipfs-force-community/damocles/damocles-manager/core" @@ -16,8 +19,14 @@ import ( var log = logging.New("worker prover") +func GenTaskID(rawInput []byte) string { + b := make([]byte, 8) + binary.LittleEndian.PutUint64(b, xxhash.Sum64(rawInput)) + return base64.URLEncoding.EncodeToString(b) +} + type workerProver struct { - taskMgr TaskManager + taskMgr core.WorkerWdPoStTaskManager inflightTasks map[string][]chan<- struct { output *stage.WindowPoStOutput @@ -33,7 +42,7 @@ type workerProver struct { taskLifetime time.Duration } -func NewProver(taskMgr TaskManager) core.Prover { +func NewProver(taskMgr core.WorkerWdPoStTaskManager) core.Prover { return &workerProver{ taskMgr: taskMgr, inflightTasks: make(map[string][]chan<- struct { @@ -71,7 +80,7 @@ func (p *workerProver) runNotifyTaskDoneJob(ctx context.Context) { } p.inflightTasksLock.Unlock() - finishedTasks, err := p.taskMgr.ListByTaskIDs(ctx, TaskFinished, inflightTaskIDs...) + finishedTasks, err := p.taskMgr.ListByTaskIDs(ctx, core.WdPoStTaskFinished, inflightTaskIDs...) 
if err != nil { log.Errorf("failed to list tasks: %s", err) } diff --git a/damocles-manager/modules/impl/prover/worker/rpc.go b/damocles-manager/modules/impl/prover/worker/rpc.go new file mode 100644 index 000000000..3a5b09d42 --- /dev/null +++ b/damocles-manager/modules/impl/prover/worker/rpc.go @@ -0,0 +1,38 @@ +package worker + +import ( + "context" + + "github.com/ipfs-force-community/damocles/damocles-manager/core" + "github.com/ipfs-force-community/damocles/damocles-manager/pkg/extproc/stage" +) + +func NewWdPoStAPIImpl(taskMgr core.WorkerWdPoStTaskManager) core.WorkerWdPoStAPI { + return &WdPoStAPIImpl{ + taskMgr: taskMgr, + } +} + +type WdPoStAPIImpl struct { + taskMgr core.WorkerWdPoStTaskManager +} + +func (api WdPoStAPIImpl) WdPoStHeartbeatTask(ctx context.Context, runningTaskIDs []string, workerName string) error { + return api.taskMgr.Heartbeat(ctx, runningTaskIDs, workerName) +} + +func (api WdPoStAPIImpl) WdPoStAllocateTasks(ctx context.Context, num uint32, workName string) (allocatedTasks []core.WdPoStAllocatedTask, err error) { + return api.taskMgr.AllocateTasks(ctx, num, workName) +} + +func (api WdPoStAPIImpl) WdPoStFinishTask(ctx context.Context, taskID string, output *stage.WindowPoStOutput, errorReason string) error { + return api.taskMgr.Finish(ctx, taskID, output, errorReason) +} + +func (api WdPoStAPIImpl) WdPoStResetTask(ctx context.Context, taskID string) error { + return api.taskMgr.Reset(ctx, taskID) +} + +func (api WdPoStAPIImpl) WdPoStAllTasks(ctx context.Context) ([]*core.WdPoStTask, error) { + return api.taskMgr.All(ctx, func(_ *core.WdPoStTask) bool { return true }) +} diff --git a/damocles-manager/modules/impl/prover/worker/task_mgr.go b/damocles-manager/modules/impl/prover/worker/task_mgr.go deleted file mode 100644 index 9a089a9a0..000000000 --- a/damocles-manager/modules/impl/prover/worker/task_mgr.go +++ /dev/null @@ -1,68 +0,0 @@ -package worker - -import ( - "context" - "encoding/base64" - "encoding/binary" - "time" - - "github.com/cespare/xxhash/v2" - "github.com/ipfs-force-community/damocles/damocles-manager/pkg/extproc/stage" -) - -type TaskState string - -const ( - TaskReadyToRun TaskState = "ready2run" - TaskRunning TaskState = "running" - TaskFinished TaskState = "finished" -) - -type Task struct { - ID string - Input stage.WindowPoSt - Output *stage.WindowPoStOutput - tryNum uint32 - ErrorReason string - WorkerName string - StartedAt uint64 - HeartbeatAt uint64 - FinishedAt uint64 - CreatedAt uint64 - UpdatedAt uint64 -} - -func (t *Task) Finished(maxTry uint32) bool { - if t.FinishedAt == 0 { - return false - } - - if t.ErrorReason != "" && t.tryNum < maxTry { - return false - } - - return true -} - -type AllocatedTask struct { - ID string - Input stage.WindowPoSt -} - -type TaskManager interface { - All(ctx context.Context, state TaskState, limit uint32, filter func(*Task) bool) ([]*Task, error) - ListByTaskIDs(ctx context.Context, state TaskState, taskIDs ...string) ([]*Task, error) - Create(ctx context.Context, input stage.WindowPoSt) (*Task, error) - AllocateTasks(ctx context.Context, n uint32, workName string) (allocatedTasks []AllocatedTask, err error) - Heartbeat(ctx context.Context, taskID []string, workerName string) error - Finish(ctx context.Context, taskID string, output *stage.WindowPoStOutput, errorReason string) error - MakeTasksDie(ctx context.Context, shouldDeadDur time.Duration, limit uint32) error - CleanupExpiredTasks(ctx context.Context, taskLifetime time.Duration, limit uint32) error - RetryFailedTasks(ctx 
context.Context, maxTry, limit uint32) error -} - -func GenTaskID(rawInput []byte) string { - b := make([]byte, 8) - binary.LittleEndian.PutUint64(b, xxhash.Sum64(rawInput)) - return base64.URLEncoding.EncodeToString(b) -} diff --git a/damocles-manager/modules/impl/prover/worker/task_mgr_kv.go b/damocles-manager/modules/impl/prover/worker/task_mgr_kv.go index 439d06a31..98480ba3e 100644 --- a/damocles-manager/modules/impl/prover/worker/task_mgr_kv.go +++ b/damocles-manager/modules/impl/prover/worker/task_mgr_kv.go @@ -5,13 +5,16 @@ import ( "encoding/json" "errors" "fmt" + "math" + "strings" "time" + "github.com/ipfs-force-community/damocles/damocles-manager/core" "github.com/ipfs-force-community/damocles/damocles-manager/pkg/extproc/stage" "github.com/ipfs-force-community/damocles/damocles-manager/pkg/kvstore" ) -func NewKVTaskStore(kv kvstore.KVExt) TaskManager { +func NewKVTaskManager(kv kvstore.KVExt) core.WorkerWdPoStTaskManager { return &kvTaskManager{ kv: kv, } @@ -22,7 +25,7 @@ type kvTaskManager struct { } // TODO(0x5459): Consider putting `txn` into context? -func (tm *kvTaskManager) filter(ctx context.Context, txn kvstore.TxnExt, state TaskState, limit uint32, f func(*Task) bool) (tasks []*Task, err error) { +func (tm *kvTaskManager) filter(ctx context.Context, txn kvstore.TxnExt, state core.WdPoStTaskState, limit uint32, f func(*core.WdPoStTask) bool) (tasks []*core.WdPoStTask, err error) { var it kvstore.Iter it, err = txn.Scan([]byte(makeWdPoStPrefix(state))) if err != nil { @@ -30,7 +33,7 @@ func (tm *kvTaskManager) filter(ctx context.Context, txn kvstore.TxnExt, state T } defer it.Close() for it.Next() && len(tasks) <= int(limit) { - var task Task + var task core.WdPoStTask if err = it.View(ctx, kvstore.LoadJSON(&task)); err != nil { return } @@ -41,19 +44,25 @@ func (tm *kvTaskManager) filter(ctx context.Context, txn kvstore.TxnExt, state T return } -func (tm *kvTaskManager) All(ctx context.Context, state TaskState, limit uint32, filter func(*Task) bool) (tasks []*Task, err error) { +func (tm *kvTaskManager) All(ctx context.Context, filter func(*core.WdPoStTask) bool) (tasks []*core.WdPoStTask, err error) { err = tm.kv.ViewMustNoConflict(ctx, func(txn kvstore.TxnExt) error { - tasks, err = tm.filter(ctx, txn, state, limit, filter) + for _, state := range []core.WdPoStTaskState{core.WdPoStTaskReadyToRun, core.WdPoStTaskRunning, core.WdPoStTaskFinished} { + ts, err := tm.filter(ctx, txn, state, math.MaxUint32, filter) + if err != nil { + return err + } + tasks = append(tasks, ts...) 
+ } return err }) return } -func (tm *kvTaskManager) ListByTaskIDs(ctx context.Context, state TaskState, taskIDs ...string) ([]*Task, error) { - tasks := make([]*Task, 0, len(taskIDs)) +func (tm *kvTaskManager) ListByTaskIDs(ctx context.Context, state core.WdPoStTaskState, taskIDs ...string) ([]*core.WdPoStTask, error) { + tasks := make([]*core.WdPoStTask, 0, len(taskIDs)) err := tm.kv.ViewMustNoConflict(ctx, func(txn kvstore.TxnExt) error { for _, taskID := range taskIDs { - var task Task + var task core.WdPoStTask err := txn.Peek(kvstore.Key(makeWdPoStKey(state, taskID)), kvstore.LoadJSON(&task)) if errors.Is(err, kvstore.ErrKeyNotFound) { continue @@ -68,10 +77,10 @@ func (tm *kvTaskManager) ListByTaskIDs(ctx context.Context, state TaskState, tas return tasks, err } -func (tm *kvTaskManager) Create(ctx context.Context, input stage.WindowPoSt) (*Task, error) { +func (tm *kvTaskManager) Create(ctx context.Context, input stage.WindowPoSt) (*core.WdPoStTask, error) { var ( taskID string - task *Task + task *core.WdPoStTask ) err := tm.kv.UpdateMustNoConflict(ctx, func(txn kvstore.TxnExt) error { rawInput, err := json.Marshal(input) @@ -80,11 +89,11 @@ func (tm *kvTaskManager) Create(ctx context.Context, input stage.WindowPoSt) (*T } taskID = GenTaskID(rawInput) // check if task exists - err = txn.PeekAny( + _, err = txn.PeekAny( kvstore.LoadJSON(task), - kvstore.Key(makeWdPoStKey(TaskReadyToRun, taskID)), - kvstore.Key(makeWdPoStKey(TaskRunning, taskID)), - kvstore.Key(makeWdPoStKey(TaskFinished, taskID)), + kvstore.Key(makeWdPoStKey(core.WdPoStTaskReadyToRun, taskID)), + kvstore.Key(makeWdPoStKey(core.WdPoStTaskRunning, taskID)), + kvstore.Key(makeWdPoStKey(core.WdPoStTaskFinished, taskID)), ) if err == nil { // return if it is exists @@ -95,11 +104,11 @@ func (tm *kvTaskManager) Create(ctx context.Context, input stage.WindowPoSt) (*T } now := time.Now().Unix() - task = &Task{ + task = &core.WdPoStTask{ ID: taskID, Input: input, Output: nil, - tryNum: 0, + TryNum: 0, ErrorReason: "", WorkerName: "", StartedAt: 0, @@ -108,7 +117,7 @@ func (tm *kvTaskManager) Create(ctx context.Context, input stage.WindowPoSt) (*T CreatedAt: uint64(now), UpdatedAt: uint64(now), } - return txn.PutJson([]byte(makeWdPoStKey(TaskReadyToRun, taskID)), task) + return txn.PutJson([]byte(makeWdPoStKey(core.WdPoStTaskReadyToRun, taskID)), task) }) if err == nil { @@ -117,28 +126,28 @@ func (tm *kvTaskManager) Create(ctx context.Context, input stage.WindowPoSt) (*T return task, err } -func (tm *kvTaskManager) AllocateTasks(ctx context.Context, n uint32, workName string) (allocatedTasks []AllocatedTask, err error) { - var readyToRun []*Task +func (tm *kvTaskManager) AllocateTasks(ctx context.Context, n uint32, workName string) (allocatedTasks []core.WdPoStAllocatedTask, err error) { + var readyToRun []*core.WdPoStTask err = tm.kv.UpdateMustNoConflict(ctx, func(txn kvstore.TxnExt) error { - readyToRun, err = tm.filter(ctx, txn, TaskReadyToRun, n, func(t *Task) bool { return true }) + readyToRun, err = tm.filter(ctx, txn, core.WdPoStTaskReadyToRun, n, func(_ *core.WdPoStTask) bool { return true }) if err != nil { return err } now := uint64(time.Now().Unix()) for _, task := range readyToRun { - task.tryNum++ + task.TryNum++ task.StartedAt = now task.WorkerName = workName task.HeartbeatAt = now task.UpdatedAt = now // Moving ready to run tasks to running tasks - if err := txn.Del([]byte(makeWdPoStKey(TaskReadyToRun, task.ID))); err != nil { + if err := txn.Del([]byte(makeWdPoStKey(core.WdPoStTaskReadyToRun, task.ID))); err 
!= nil { return err } - if err := txn.PutJson([]byte(makeWdPoStKey(TaskRunning, task.ID)), task); err != nil { + if err := txn.PutJson([]byte(makeWdPoStKey(core.WdPoStTaskRunning, task.ID)), task); err != nil { return err } - allocatedTasks = append(allocatedTasks, AllocatedTask{ + allocatedTasks = append(allocatedTasks, core.WdPoStAllocatedTask{ ID: task.ID, Input: task.Input, }) @@ -148,7 +157,7 @@ func (tm *kvTaskManager) AllocateTasks(ctx context.Context, n uint32, workName s if err == nil { for _, task := range readyToRun { - log.Infof("allocated wdPoSt task: %s; try_num: %d", task.ID, task.tryNum) + log.Infof("allocated wdPoSt task: %s; try_num: %d", task.ID, task.TryNum) } } return @@ -157,15 +166,15 @@ func (tm *kvTaskManager) AllocateTasks(ctx context.Context, n uint32, workName s func (tm *kvTaskManager) Heartbeat(ctx context.Context, taskIDs []string, workerName string) error { err := tm.kv.UpdateMustNoConflict(ctx, func(txn kvstore.TxnExt) error { for _, taskID := range taskIDs { - var task Task - if err := txn.Peek([]byte(makeWdPoStKey(TaskRunning, taskID)), kvstore.LoadJSON(&task)); err != nil { + var task core.WdPoStTask + if err := txn.Peek([]byte(makeWdPoStKey(core.WdPoStTaskRunning, taskID)), kvstore.LoadJSON(&task)); err != nil { return err } now := uint64(time.Now().Unix()) task.HeartbeatAt = now task.WorkerName = workerName task.UpdatedAt = now - if err := txn.PutJson([]byte(makeWdPoStKey(TaskRunning, taskID)), &task); err != nil { + if err := txn.PutJson([]byte(makeWdPoStKey(core.WdPoStTaskRunning, taskID)), &task); err != nil { return err } } @@ -179,8 +188,8 @@ func (tm *kvTaskManager) Heartbeat(ctx context.Context, taskIDs []string, worker func (tm *kvTaskManager) Finish(ctx context.Context, taskID string, output *stage.WindowPoStOutput, errorReason string) error { err := tm.kv.UpdateMustNoConflict(ctx, func(txn kvstore.TxnExt) error { - runningKey := []byte(makeWdPoStKey(TaskRunning, taskID)) - var task Task + runningKey := []byte(makeWdPoStKey(core.WdPoStTaskRunning, taskID)) + var task core.WdPoStTask if err := txn.Peek(runningKey, kvstore.LoadJSON(&task)); err != nil { return err } @@ -192,7 +201,7 @@ func (tm *kvTaskManager) Finish(ctx context.Context, taskID string, output *stag task.ErrorReason = errorReason task.FinishedAt = now task.UpdatedAt = now - return txn.PutJson([]byte(makeWdPoStKey(TaskFinished, taskID)), &task) + return txn.PutJson([]byte(makeWdPoStKey(core.WdPoStTaskFinished, taskID)), &task) }) if err == nil { @@ -206,12 +215,12 @@ func (tm *kvTaskManager) Finish(ctx context.Context, taskID string, output *stag } func (tm *kvTaskManager) MakeTasksDie(ctx context.Context, heartbeatTimeout time.Duration, limit uint32) error { - var shouldDead []*Task + var shouldDead []*core.WdPoStTask shouldDeadTime := time.Now().Add(-heartbeatTimeout) err := tm.kv.UpdateMustNoConflict(ctx, func(txn kvstore.TxnExt) error { var err error - shouldDead, err = tm.filter(ctx, txn, TaskRunning, limit, func(t *Task) bool { + shouldDead, err = tm.filter(ctx, txn, core.WdPoStTaskRunning, limit, func(t *core.WdPoStTask) bool { return t.HeartbeatAt > 0 && time.Unix(int64(t.HeartbeatAt), 0).Before(shouldDeadTime) }) if err != nil { @@ -219,14 +228,14 @@ func (tm *kvTaskManager) MakeTasksDie(ctx context.Context, heartbeatTimeout time } now := uint64(time.Now().Unix()) for _, task := range shouldDead { - if err := txn.Del([]byte(makeWdPoStKey(TaskRunning, task.ID))); err != nil { + if err := txn.Del([]byte(makeWdPoStKey(core.WdPoStTaskRunning, task.ID))); err != nil { return 
err } task.FinishedAt = now task.Output = nil task.ErrorReason = "heartbeat timeout" task.UpdatedAt = now - if err := txn.PutJson([]byte(makeWdPoStKey(TaskFinished, task.ID)), task); err != nil { + if err := txn.PutJson([]byte(makeWdPoStKey(core.WdPoStTaskFinished, task.ID)), task); err != nil { return err } } @@ -237,19 +246,19 @@ func (tm *kvTaskManager) MakeTasksDie(ctx context.Context, heartbeatTimeout time } func (tm *kvTaskManager) CleanupExpiredTasks(ctx context.Context, taskLifetime time.Duration, limit uint32) error { - var shouldClean []*Task + var shouldClean []*core.WdPoStTask shouldCleanTime := time.Now().Add(-taskLifetime) err := tm.kv.UpdateMustNoConflict(ctx, func(txn kvstore.TxnExt) error { var err error - shouldClean, err = tm.filter(ctx, txn, TaskFinished, limit, func(t *Task) bool { + shouldClean, err = tm.filter(ctx, txn, core.WdPoStTaskFinished, limit, func(t *core.WdPoStTask) bool { return time.Unix(int64(t.CreatedAt), 0).Before(shouldCleanTime) }) if err != nil { return err } for _, task := range shouldClean { - if err := txn.Del([]byte(makeWdPoStKey(TaskFinished, task.ID))); err != nil { + if err := txn.Del([]byte(makeWdPoStKey(core.WdPoStTaskFinished, task.ID))); err != nil { return err } } @@ -265,11 +274,11 @@ func (tm *kvTaskManager) CleanupExpiredTasks(ctx context.Context, taskLifetime t } func (tm *kvTaskManager) RetryFailedTasks(ctx context.Context, maxTry, limit uint32) error { - var shouldRetry []*Task + var shouldRetry []*core.WdPoStTask err := tm.kv.UpdateMustNoConflict(ctx, func(txn kvstore.TxnExt) error { var err error - shouldRetry, err = tm.filter(ctx, txn, TaskFinished, limit, func(t *Task) bool { - return len(t.ErrorReason) != 0 && t.tryNum > maxTry + shouldRetry, err = tm.filter(ctx, txn, core.WdPoStTaskFinished, limit, func(t *core.WdPoStTask) bool { + return len(t.ErrorReason) != 0 && t.TryNum > maxTry }) if err != nil { return err @@ -281,7 +290,7 @@ func (tm *kvTaskManager) RetryFailedTasks(ctx context.Context, maxTry, limit uin task.StartedAt = 0 task.FinishedAt = 0 task.UpdatedAt = now - if err := txn.PutJson([]byte(makeWdPoStKey(TaskFinished, task.ID)), task); err != nil { + if err := txn.PutJson([]byte(makeWdPoStKey(core.WdPoStTaskFinished, task.ID)), task); err != nil { return err } } @@ -290,17 +299,65 @@ func (tm *kvTaskManager) RetryFailedTasks(ctx context.Context, maxTry, limit uin if err == nil { for _, task := range shouldRetry { - log.Debugf("retry wdPoSt task: %d; try_num: %d, error_reason: %s", task.ID, task.tryNum) + log.Debugf("retry wdPoSt task: %d; try_num: %d, error_reason: %s", task.ID, task.TryNum) + } + } + + return err +} + +func (tm *kvTaskManager) Reset(ctx context.Context, taskID string) error { + var task core.WdPoStTask + now := uint64(time.Now().Unix()) + + err := tm.kv.UpdateMustNoConflict(ctx, func(txn kvstore.TxnExt) error { + key, err := txn.PeekAny( + kvstore.LoadJSON(&task), + kvstore.Key(makeWdPoStKey(core.WdPoStTaskReadyToRun, taskID)), + kvstore.Key(makeWdPoStKey(core.WdPoStTaskRunning, taskID)), + kvstore.Key(makeWdPoStKey(core.WdPoStTaskFinished, taskID)), + ) + if err != nil { + return fmt.Errorf("load task from db: %w. 
taskID: %s", err, taskID) } + + task.CreatedAt = now + task.StartedAt = 0 + task.TryNum = 0 + task.Output = nil + task.ErrorReason = "" + task.FinishedAt = 0 + task.HeartbeatAt = 0 + task.WorkerName = "" + task.UpdatedAt = now + + if err := txn.Del(key); err != nil { + return err + } + return txn.PutJson([]byte(makeWdPoStKey(core.WdPoStTaskReadyToRun, taskID)), &task) + }) + + if err == nil { + log.Infof("task is reset: %s", taskID) } return err } -func makeWdPoStPrefix(state TaskState) string { - return fmt.Sprintf("wdpost-%s-", state) +const ( + prefixTaskIDdelimiter = ":" +) + +func makeWdPoStPrefix(state core.WdPoStTaskState) string { + return string(state) +} + +func makeWdPoStKey(state core.WdPoStTaskState, taskID string) string { + return fmt.Sprintf("%s%s%s", makeWdPoStPrefix(state), prefixTaskIDdelimiter, taskID) } -func makeWdPoStKey(state TaskState, taskID string) string { - return fmt.Sprintf("%s%s", makeWdPoStPrefix(state), taskID) +//lint:ignore U1000 Ignore unused function +func splitKey(key string) (state core.WdPoStTaskState, taskID string) { + x := strings.SplitN(key, prefixTaskIDdelimiter, 2) + return core.WdPoStTaskState(x[0]), x[1] } diff --git a/damocles-manager/modules/impl/prover/worker/task_mgr_kv_test.go b/damocles-manager/modules/impl/prover/worker/task_mgr_kv_test.go new file mode 100644 index 000000000..efdbaaa43 --- /dev/null +++ b/damocles-manager/modules/impl/prover/worker/task_mgr_kv_test.go @@ -0,0 +1,18 @@ +package worker + +import ( + "testing" + + "github.com/ipfs-force-community/damocles/damocles-manager/core" + "github.com/stretchr/testify/require" +) + +func TestSplitKey(t *testing.T) { + for _, taskID := range []string{"normal123", "with-", "-", "-with", "wi-th", "with:xxx", ":xxx", ":"} { + for _, state := range []core.WdPoStTaskState{core.WdPoStTaskReadyToRun, core.WdPoStTaskRunning, core.WdPoStTaskFinished} { + actualState, actualTaskID := splitKey(makeWdPoStKey(state, taskID)) + require.Equalf(t, state, actualState, "test state for \"state: `%s`; taskID: `%s`\"", state, taskID) + require.Equalf(t, taskID, actualTaskID, "test taskID for \"state: `%s`; taskID: `%s`\"", state, taskID) + } + } +} diff --git a/damocles-manager/modules/impl/sectors/indexer_proxy.go b/damocles-manager/modules/impl/sectors/indexer_proxy.go index 221d6d234..fb70297ad 100644 --- a/damocles-manager/modules/impl/sectors/indexer_proxy.go +++ b/damocles-manager/modules/impl/sectors/indexer_proxy.go @@ -15,7 +15,7 @@ var _ core.SectorTypedIndexer = (*proxiedTypeIndexer)(nil) type proxiedTypeIndexer struct { indexType core.SectorIndexType - client core.SealerCliClient + client *core.SealerCliAPIClient } func (p *proxiedTypeIndexer) Find(ctx context.Context, sid abi.SectorID) (core.SectorAccessStores, bool, error) { @@ -31,7 +31,7 @@ func (p *proxiedTypeIndexer) Update(ctx context.Context, sid abi.SectorID, store return ErrProxiedTypedIndexerUnableForUpdating } -func NewProxiedIndexer(client core.SealerCliClient, storeMgr objstore.Manager) (core.SectorIndexer, error) { +func NewProxiedIndexer(client *core.SealerCliAPIClient, storeMgr objstore.Manager) (core.SectorIndexer, error) { return &proxiedIndexer{ client: client, storeMgr: storeMgr, @@ -39,7 +39,7 @@ func NewProxiedIndexer(client core.SealerCliClient, storeMgr objstore.Manager) ( } type proxiedIndexer struct { - client core.SealerCliClient + client *core.SealerCliAPIClient storeMgr objstore.Manager } diff --git a/damocles-manager/pkg/kvstore/kvstore.go b/damocles-manager/pkg/kvstore/kvstore.go index 882874e01..4ed46bb57 
100644 --- a/damocles-manager/pkg/kvstore/kvstore.go +++ b/damocles-manager/pkg/kvstore/kvstore.go @@ -78,15 +78,15 @@ type TxnExt struct { Txn } -func (et TxnExt) PeekAny(f func(Val) error, keys ...Key) error { +func (et TxnExt) PeekAny(f func(Val) error, keys ...Key) (Key, error) { for _, k := range keys { err := et.Peek(k, f) if errors.Is(err, ErrKeyNotFound) { continue } - return err + return k, err } - return ErrKeyNotFound + return []byte{}, ErrKeyNotFound } func (et TxnExt) PutJson(k Key, v any) error { From f20e98b13e3e14bc3f8fcae7184a8260dcf800f3 Mon Sep 17 00:00:00 2001 From: tanlang Date: Tue, 4 Jul 2023 16:25:34 +0800 Subject: [PATCH 04/18] feat: add wdpost planner --- damocles-worker/src/config.rs | 3 + damocles-worker/src/rpc/sealer/mod.rs | 50 +++- damocles-worker/src/run.rs | 3 + damocles-worker/src/sealing/processor/mod.rs | 7 +- .../src/sealing/sealing_thread/task/event.rs | 8 +- .../sealing_thread/task/planner/mod.rs | 2 + .../sealing_thread/task/planner/wdpost.rs | 271 ++++++++++++++++++ .../src/sealing/sealing_thread/task/sector.rs | 8 +- damocles-worker/src/watchdog.rs | 3 +- 9 files changed, 346 insertions(+), 9 deletions(-) create mode 100644 damocles-worker/src/sealing/sealing_thread/task/planner/wdpost.rs diff --git a/damocles-worker/src/config.rs b/damocles-worker/src/config.rs index 6e9a288a7..655544a54 100644 --- a/damocles-worker/src/config.rs +++ b/damocles-worker/src/config.rs @@ -209,6 +209,9 @@ pub struct Processors { /// section for unseal processor pub unseal: Option>, + + /// section for fetch processor + pub wdpost: Option>, } impl Processors { diff --git a/damocles-worker/src/rpc/sealer/mod.rs b/damocles-worker/src/rpc/sealer/mod.rs index 1f60149a0..6dd0c84b3 100644 --- a/damocles-worker/src/rpc/sealer/mod.rs +++ b/damocles-worker/src/rpc/sealer/mod.rs @@ -1,6 +1,8 @@ use std::collections::HashMap; use std::path::PathBuf; +use super::super::types::SealProof; +use crate::sealing::processor::ChallengeSeed; use fil_clock::ChainEpoch; use fil_types::{ActorID, PaddedPieceSize, SectorNumber}; use forest_cid::json::CidJson; @@ -9,9 +11,7 @@ use jsonrpc_derive::rpc; use serde::{Deserialize, Serialize}; use serde_repr::{Deserialize_repr, Serialize_repr}; use vc_processors::b64serde::{BytesArray32, BytesVec}; -use vc_processors::fil_proofs::PaddedBytesAmount; - -use super::super::types::SealProof; +use vc_processors::fil_proofs::{Commitment, PaddedBytesAmount, RegisteredPoStProof, SectorId, SnarkProof}; /// type alias for BytesArray32 pub type Randomness = BytesArray32; @@ -377,6 +377,44 @@ pub struct SectorUnsealInfo { pub private_info: SectorPrivateInfo, } +#[derive(Deserialize, Serialize)] +#[serde(rename_all = "PascalCase")] +pub struct PoStSectorInfo { + pub sector_id: SectorId, + pub comm_r: Commitment, + pub access_instance: String, +} + +#[derive(Deserialize, Serialize)] +#[serde(rename_all = "PascalCase")] +pub struct WdPostTaskInfo { + pub miner_id: ActorID, + pub deadline_id: u64, + pub sectors: Vec, + pub seed: ChallengeSeed, + pub proof_type: RegisteredPoStProof, + pub instance: String, +} + +#[derive(Deserialize, Clone, Serialize)] +pub enum WdpostState { + Assigned, + Generating, + Generated, + Failed, + Done, + Error, +} + +#[derive(Deserialize, Clone, Serialize)] +#[serde(rename_all = "PascalCase")] +pub struct WdPoStResult { + pub state: WdpostState, + pub error: Option, + pub proofs: Option>, + pub faults: Option>, +} + /// defines the SealerRpc service #[rpc] pub trait Sealer { @@ -467,4 +505,10 @@ pub trait Sealer { #[rpc(name = 
"Venus.AcquireUnsealDest")] fn acquire_unseal_dest(&self, id: SectorID, piece_cid: CidJson) -> Result>; + + #[rpc(name = "Venus.WdPoStAllocateTasks")] + fn allocate_wd_post_task(&self, spec: AllocateSectorSpec) -> Result>; + + #[rpc(name = "Venus.WdPoStHeartbeatTask")] + fn wd_post_heartbeat(&self, miner_id: ActorID, deadline_id: u64, result: WdPoStResult) -> Result<()>; } diff --git a/damocles-worker/src/run.rs b/damocles-worker/src/run.rs index 86f741661..bcaeb416c 100644 --- a/damocles-worker/src/run.rs +++ b/damocles-worker/src/run.rs @@ -313,6 +313,8 @@ fn start_processors(cfg: &config::Config, locks: &Arc) -> Result let unseal: processor::ArcUnsealProcessor = construct_sub_processor!(unseal, cfg, locks); + let wdpost: processor::ArcWdPostProcessor = construct_sub_processor!(wdpost, cfg, locks); + Ok(GlobalProcessors { add_pieces, tree_d, @@ -323,6 +325,7 @@ fn start_processors(cfg: &config::Config, locks: &Arc) -> Result snap_prove, transfer, unseal, + wdpost, }) } diff --git a/damocles-worker/src/sealing/processor/mod.rs b/damocles-worker/src/sealing/processor/mod.rs index e8fd79110..58397e886 100644 --- a/damocles-worker/src/sealing/processor/mod.rs +++ b/damocles-worker/src/sealing/processor/mod.rs @@ -5,9 +5,9 @@ use std::sync::Arc; pub use vc_processors::{ builtin::tasks::{ AddPieces as AddPiecesInput, SnapEncode as SnapEncodeInput, SnapProve as SnapProveInput, Transfer as TransferInput, TransferItem, - TransferOption, TransferRoute, TransferStoreInfo, TreeD as TreeDInput, Unseal as UnsealInput, C2 as C2Input, PC1 as PC1Input, - PC2 as PC2Input, STAGE_NAME_C1, STAGE_NAME_C2, STAGE_NAME_PC1, STAGE_NAME_PC2, STAGE_NAME_SNAP_ENCODE, STAGE_NAME_SNAP_PROVE, - STAGE_NAME_TRANSFER, STAGE_NAME_TREED, + TransferOption, TransferRoute, TransferStoreInfo, TreeD as TreeDInput, Unseal as UnsealInput, WindowPoSt, C2 as C2Input, + PC1 as PC1Input, PC2 as PC2Input, STAGE_NAME_C1, STAGE_NAME_C2, STAGE_NAME_PC1, STAGE_NAME_PC2, STAGE_NAME_SNAP_ENCODE, + STAGE_NAME_SNAP_PROVE, STAGE_NAME_TRANSFER, STAGE_NAME_TREED, }, core::{Processor, Task as Input}, }; @@ -26,3 +26,4 @@ pub type ArcSnapEncodeProcessor = ArcProcessor; pub type ArcSnapProveProcessor = ArcProcessor; pub type ArcTransferProcessor = ArcProcessor; pub type ArcUnsealProcessor = ArcProcessor; +pub type ArcWdPostProcessor = ArcProcessor; diff --git a/damocles-worker/src/sealing/sealing_thread/task/event.rs b/damocles-worker/src/sealing/sealing_thread/task/event.rs index 5eefaa175..84ac9da6b 100644 --- a/damocles-worker/src/sealing/sealing_thread/task/event.rs +++ b/damocles-worker/src/sealing/sealing_thread/task/event.rs @@ -12,10 +12,12 @@ use crate::sealing::processor::{ }; use crate::{logging::trace, metadb::MaybeDirty}; use crate::{ - rpc::sealer::{AllocatedSector, Deals, SectorRebuildInfo, SectorUnsealInfo, Seed, Ticket}, + rpc::sealer::{AllocatedSector, Deals, SectorRebuildInfo, SectorUnsealInfo, Seed, Ticket, WdPostTaskInfo}, sealing::sealing_thread::task::sector::UnsealInput, }; +use vc_processors::builtin::tasks::WindowPoStOutput; + pub enum Event { SetState(State), @@ -314,6 +316,10 @@ impl Event { ); } + Self::WdPostGenerated(out) => { + replace!(s.phases.wd_post_out, out); + } + _ => {} }; } diff --git a/damocles-worker/src/sealing/sealing_thread/task/planner/mod.rs b/damocles-worker/src/sealing/sealing_thread/task/planner/mod.rs index 63618c8e7..d9b84c5db 100644 --- a/damocles-worker/src/sealing/sealing_thread/task/planner/mod.rs +++ b/damocles-worker/src/sealing/sealing_thread/task/planner/mod.rs @@ -17,6 +17,8 @@ mod 
common; mod unseal; +mod wdpost; + type ExecResult = Result; macro_rules! plan { diff --git a/damocles-worker/src/sealing/sealing_thread/task/planner/wdpost.rs b/damocles-worker/src/sealing/sealing_thread/task/planner/wdpost.rs new file mode 100644 index 000000000..42929dfeb --- /dev/null +++ b/damocles-worker/src/sealing/sealing_thread/task/planner/wdpost.rs @@ -0,0 +1,271 @@ +use super::super::{call_rpc, Event, State, Task}; +use super::{plan, ExecResult, Planner}; +use crate::logging::{error, warn}; +use crate::rpc::sealer::{AllocateSectorSpec, SectorID, WdPoStResult, WdpostState}; +use crate::sealing::failure::MapErrToFailure; +use crate::sealing::failure::{Failure, IntoFailure}; +use anyhow::{anyhow, Context, Result}; +use std::time::Duration; +use tracing::debug; +use vc_processors::builtin::tasks::{PoStReplicaInfo, WindowPoSt}; + +pub struct WdPostPlanner; + +impl Planner for WdPostPlanner { + fn plan(&self, evt: &Event, st: &State) -> Result { + let next = plan! { + evt, + st, + + State::Empty => { + // alloc wdpost task + Event::AcquireWdPostTask(_) => State::Allocated, + }, + State::Allocated => { + // gen prove and report persistent + Event::WdPostGenerated(_) => State::WdPostGenerated, + }, + State::WdPostGenerated => { + // verify prove + Event::Finish => State::Finished, + }, + }; + + Ok(next) + } + + fn exec(&self, task: &mut Task<'_>) -> Result, Failure> { + let state = task.sector.state; + let inner = WdPost { task }; + + match state { + State::Empty => inner.acquire(), + State::Allocated => inner.generate(), + State::WdPostGenerated => inner.upload(), + other => Err(anyhow!("unexpected state: {:?} in window post planner", other).abort()), + } + .map(From::from) + } +} + +struct WdPost<'c, 't> { + task: &'t mut Task<'c>, +} + +impl WdPost<'_, '_> { + fn acquire(&self) -> ExecResult { + let maybe_res = call_rpc!( + self.task.ctx.global.rpc, + allocate_wd_post_task, + AllocateSectorSpec { + allowed_miners: Some(self.task.sealing_config.allowed_miners.clone()), + allowed_proof_types: Some(self.task.sealing_config.allowed_proof_types.clone()), + }, + ); + + let maybe_allocated = match maybe_res { + Ok(a) => a, + Err(e) => { + warn!( + "window PoST task is not allocated yet, so we can retry even though we got the err {:?}", + e + ); + return Ok(Event::Idle); + } + }; + + let allocated = match maybe_allocated { + Some(a) => a, + None => return Ok(Event::Idle), + }; + + Ok(Event::AcquireWdPostTask(allocated)) + } + + fn upload(&self) -> ExecResult { + let out = self + .task + .sector + .phases + .wd_post_out + .clone() + .context("wdpost out info not found") + .abort()?; + + let wdpost_res = WdPoStResult { + state: WdpostState::Done, + proofs: Some(out.proofs), + faults: Some(out.faults), + error: None, + }; + + self.report(wdpost_res); + + Ok(Event::Finish) + } + + fn generate(&self) -> ExecResult { + let task_info = self + .task + .sector + .phases + .wd_post_in + .as_ref() + .context("wdpost info not found") + .abort()?; + + let instance_name = &task_info.instance; + debug!("find access store named {}", instance_name); + let instance = self + .task + .ctx + .global + .attached + .get(instance_name) + .with_context(|| format!("get access store instance named {}", instance_name)) + .perm()?; + + // get sealed path and cache path + let replica = task_info + .sectors + .iter() + .map(|sector| { + let sector_id = &SectorID { + miner: task_info.miner_id, + number: sector.sector_id.into(), + }; + + let sealed_temp = self.task.sealed_file(sector_id); + let sealed_rel = 
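// sealed_file()/cache_dir() give the store-relative layout paths for this sector;
// the named `instance` (the attached store looked up above) then turns those
// relative paths into URIs that the window-PoSt processor is expected to open.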
sealed_temp.rel(); + + let cache_temp = self.task.cache_dir(sector_id); + let cache_rel = cache_temp.rel(); + + let sealed_path = instance + .uri(sealed_rel) + .with_context(|| format!("get uri for sealed file {:?} in {}", sealed_rel, instance_name))?; + let cache_path = instance + .uri(cache_rel) + .with_context(|| format!("get uri for cache file {:?} in {}", cache_rel, instance_name))?; + + let sector_id = sector.sector_id; + let replica = PoStReplicaInfo { + sector_id, + comm_r: sector.comm_r, + cache_dir: cache_path, + sealed_file: sealed_path, + }; + Ok(replica) + }) + .collect::>>() + .perm()?; + + let post_in = WindowPoSt { + miner_id: task_info.miner_id, + proof_type: task_info.proof_type, + replicas: replica, + seed: task_info.seed, + }; + + let rt = tokio::runtime::Runtime::new().unwrap(); + let (tx_res, mut rx_res) = tokio::sync::oneshot::channel::>(); + let (tx_sync, rx_sync) = tokio::sync::oneshot::channel(); + + let rpc = self.task.ctx.global.rpc.clone(); + let miner_id = task_info.miner_id; + let deadline_id = task_info.deadline_id; + + rt.spawn(async move { + let mut interval = tokio::time::interval(Duration::from_secs(20)); + + let mut rep = WdPoStResult { + state: WdpostState::Generating, + proofs: None, + faults: None, + error: None, + }; + + let report = |rep: WdPoStResult| { + if let Err(e) = call_rpc!(rpc, wd_post_heartbeat, miner_id, deadline_id, rep,) { + error!("report wdpost result failed: {:?}", e); + } + }; + + loop { + tokio::select! { + res = &mut rx_res => { + match res { + Ok(Ok(_)) => { + rep.state = WdpostState::Generated; + report(rep) + } + Ok(Err(e)) => { + rep.state = WdpostState::Failed; + rep.error = Some(format!("{:?}", e)); + report(rep) + } + Err(_) => { + error!("receive finish signal failed"); + } + } + break; + } + _ = interval.tick() => { + report(rep.clone()); + } + } + } + tx_sync.send(()).unwrap(); + }); + + let _rt_guard = rt.enter(); + + let out_maybe = self + .task + .ctx + .global + .processors + .wdpost + .process(post_in) + .context("generate window post"); + + // notify crond + match &out_maybe { + Ok(_) => { + if tx_res.send(Ok(())).is_err() { + warn!("send finish signal failed"); + } + } + Err(e) => { + if tx_res.send(Err(anyhow!("generate window post failed: {:?}", e))).is_err() { + warn!("send finish signal failed"); + } + warn!("generate window post failed: {:?}", e); + } + }; + + // wait for crond to finish + rx_sync.blocking_recv().unwrap(); + + let out = out_maybe.context("generate window post").temp()?; + + Ok(Event::WdPostGenerated(out)) + } + + fn report(&self, res: WdPoStResult) { + if let Some(task_info) = self.task.sector.phases.wd_post_in.as_ref() { + let resp = call_rpc!( + self.task.ctx.global.rpc, + wd_post_heartbeat, + task_info.miner_id, + task_info.deadline_id, + res, + ); + if let Err(e) = resp { + warn!("report wdpost result failed: {:?}", e); + } + } else { + warn!("wdpost info not found"); + } + } +} diff --git a/damocles-worker/src/sealing/sealing_thread/task/sector.rs b/damocles-worker/src/sealing/sealing_thread/task/sector.rs index 9d5470dee..e603fb850 100644 --- a/damocles-worker/src/sealing/sealing_thread/task/sector.rs +++ b/damocles-worker/src/sealing/sealing_thread/task/sector.rs @@ -6,11 +6,12 @@ use serde_repr::{Deserialize_repr, Serialize_repr}; pub use fil_clock::ChainEpoch; pub use fil_types::{InteractiveSealRandomness, PieceInfo as DealInfo, Randomness}; -use crate::rpc::sealer::{AllocatedSector, Deals, SectorPrivateInfo, SectorPublicInfo, Seed, Ticket}; +use 
crate::rpc::sealer::{AllocatedSector, Deals, SectorPrivateInfo, SectorPublicInfo, Seed, Ticket, WdPostTaskInfo}; use crate::sealing::processor::{ PieceInfo, ProverId, SealCommitPhase1Output, SealCommitPhase2Output, SealPreCommitPhase1Output, SealPreCommitPhase2Output, SectorId, SnapEncodeOutput, }; +use vc_processors::builtin::tasks::WindowPoStOutput; const CURRENT_SECTOR_VERSION: u32 = 1; @@ -139,6 +140,11 @@ pub struct Phases { // unseal pub unseal_in: Option, + + // window PoST + pub wd_post_in: Option, + + pub wd_post_out: Option, } #[derive(Debug, Deserialize, Serialize)] diff --git a/damocles-worker/src/watchdog.rs b/damocles-worker/src/watchdog.rs index 3e826d123..3a6e09b48 100644 --- a/damocles-worker/src/watchdog.rs +++ b/damocles-worker/src/watchdog.rs @@ -15,7 +15,7 @@ use crate::{ sealing::{ processor::{ ArcAddPiecesProcessor, ArcC2Processor, ArcPC1Processor, ArcPC2Processor, ArcSnapEncodeProcessor, ArcSnapProveProcessor, - ArcTransferProcessor, ArcTreeDProcessor, ArcUnsealProcessor, + ArcTransferProcessor, ArcTreeDProcessor, ArcUnsealProcessor, ArcWdPostProcessor, }, resource::Pool, }, @@ -61,6 +61,7 @@ pub struct GlobalProcessors { pub snap_prove: ArcSnapProveProcessor, pub transfer: ArcTransferProcessor, pub unseal: ArcUnsealProcessor, + pub wdpost: ArcWdPostProcessor, } impl Module for Box { From e1cd635c5c6acfc7f553a4b5a7ff988ddb73f278 Mon Sep 17 00:00:00 2001 From: tanlang Date: Tue, 4 Jul 2023 17:22:39 +0800 Subject: [PATCH 05/18] chore: update go mod --- damocles-manager/go.mod | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/damocles-manager/go.mod b/damocles-manager/go.mod index 7809a6ae8..debde467d 100644 --- a/damocles-manager/go.mod +++ b/damocles-manager/go.mod @@ -5,7 +5,7 @@ go 1.18 require ( contrib.go.opencensus.io/exporter/prometheus v0.4.0 github.com/BurntSushi/toml v1.2.1 - github.com/cespare/xxhash/v2 v2.2.0 + github.com/cespare/xxhash v1.1.0 github.com/dgraph-io/badger/v2 v2.2007.3 github.com/docker/go-units v0.5.0 github.com/dtynn/dix v0.1.2 @@ -64,7 +64,7 @@ require ( github.com/benbjohnson/clock v1.3.0 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/bluele/gcache v0.0.0-20190518031135-bc40bd653833 // indirect - github.com/cespare/xxhash v1.1.0 // indirect + github.com/cespare/xxhash/v2 v2.2.0 // indirect github.com/cilium/ebpf v0.4.0 // indirect github.com/containerd/cgroups v1.0.4 // indirect github.com/coreos/go-systemd/v22 v22.4.0 // indirect From 25ec02b954dc6debde52160da6d539b31f1f5781 Mon Sep 17 00:00:00 2001 From: 0x5459 <0x5459@protonmail.com> Date: Thu, 6 Jul 2023 03:08:40 +0800 Subject: [PATCH 06/18] fix: badger iter does not close the txn --- damocles-manager/pkg/kvstore/badger.go | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/damocles-manager/pkg/kvstore/badger.go b/damocles-manager/pkg/kvstore/badger.go index 6ddf9f428..020b64ce5 100644 --- a/damocles-manager/pkg/kvstore/badger.go +++ b/damocles-manager/pkg/kvstore/badger.go @@ -113,13 +113,23 @@ func (b *BadgerKVStore) Scan(ctx context.Context, prefix Prefix) (it Iter, err e txn := b.db.NewTransaction(false) iter := txn.NewIterator(badger.DefaultIteratorOptions) - return &BadgerIter{ - txn: txn, - iter: iter, - seeked: false, - valid: false, - prefix: prefix, - }, nil + return &BadgerIterWithoutTrans{ + BadgerIter: &BadgerIter{ + txn: txn, + iter: iter, + seeked: false, + valid: false, + prefix: prefix, + }}, nil +} + +type BadgerIterWithoutTrans struct { + *BadgerIter +} + +func (bi 
*BadgerIterWithoutTrans) Close() { + bi.BadgerIter.Close() + bi.txn.Discard() } var _ Txn = (*BadgerTxn)(nil) @@ -236,7 +246,6 @@ func (bi *BadgerIter) View(ctx context.Context, f func(Val) error) error { func (bi *BadgerIter) Close() { bi.iter.Close() - bi.txn.Discard() } var _ DB = (*badgerDB)(nil) From 38ae7c01ea3007058f3364ac1a64ec2060f58d94 Mon Sep 17 00:00:00 2001 From: 0x5459 <0x5459@protonmail.com> Date: Thu, 6 Jul 2023 09:59:19 +0800 Subject: [PATCH 07/18] feat: minor refactor wdpost --- .../cmd/damocles-manager/daemon.go | 13 +- .../internal/util_sealer_proving.go | 18 +-- damocles-manager/core/api.go | 4 +- damocles-manager/core/client_gen.go | 9 +- damocles-manager/core/prover.go | 4 +- damocles-manager/core/types_wdpost.go | 28 ++++- damocles-manager/dep/prover.go | 37 +++++- damocles-manager/dep/sealer.go | 33 +++-- damocles-manager/dep/sealer_constructor.go | 18 --- .../modules/impl/prover/ext/prover.go | 119 +++++++++++++----- .../modules/impl/prover/prover.go | 87 ------------- .../modules/impl/prover/prover_fake.go | 35 +++--- .../modules/impl/prover/prover_prod.go | 53 +++++--- .../modules/impl/prover/worker/prover.go | 107 +++++++++++----- .../modules/impl/prover/worker/rpc.go | 28 ++++- .../modules/impl/prover/worker/task_mgr_kv.go | 21 +++- damocles-manager/modules/miner/proof_event.go | 17 ++- damocles-manager/modules/poster/runner.go | 6 +- damocles-manager/modules/sealer/sealer_cli.go | 11 +- damocles-manager/ver/ver.go | 4 + 20 files changed, 393 insertions(+), 259 deletions(-) delete mode 100644 damocles-manager/modules/impl/prover/prover.go diff --git a/damocles-manager/cmd/damocles-manager/daemon.go b/damocles-manager/cmd/damocles-manager/daemon.go index 632999620..1154e8af5 100644 --- a/damocles-manager/cmd/damocles-manager/daemon.go +++ b/damocles-manager/cmd/damocles-manager/daemon.go @@ -90,6 +90,11 @@ var daemonRunCmd = &cli.Command{ Value: false, Usage: "enable external prover", }, + &cli.BoolFlag{ + Name: "worker-prover", + Value: false, + Usage: "enable worker prover", + }, daemonRunProxyFlag, daemonRunProxySectorIndexerOffFlag, }, @@ -101,6 +106,10 @@ var daemonRunCmd = &cli.Command{ proxyOpt := dep.ProxyOptions{ EnableSectorIndexer: !cctx.Bool(daemonRunProxySectorIndexerOffFlag.Name), } + extProver, workerProver := cctx.Bool("ext-prover"), cctx.Bool("worker-prover") + if extProver && workerProver { + return fmt.Errorf("ext-prover and worker-prover are mutually exclusive") + } var apiService *APIService stopper, err := dix.New( @@ -119,7 +128,9 @@ var daemonRunCmd = &cli.Command{ dep.Miner(), ), dep.Gateway(), - dix.If(cctx.Bool("ext-prover"), dep.ExtProver()), + dix.If(extProver, dep.ExtProver()), + dix.If(workerProver, dep.WorkerProver()), + dix.If(!workerProver, dep.DisableWorkerProver()), dep.Sealer(), dix.Override(new(*APIService), NewAPIService), dix.Populate(dep.InvokePopulate, &apiService), diff --git a/damocles-manager/cmd/damocles-manager/internal/util_sealer_proving.go b/damocles-manager/cmd/damocles-manager/internal/util_sealer_proving.go index 53430f975..7df771831 100644 --- a/damocles-manager/cmd/damocles-manager/internal/util_sealer_proving.go +++ b/damocles-manager/cmd/damocles-manager/internal/util_sealer_proving.go @@ -29,8 +29,8 @@ import ( "github.com/filecoin-project/venus/venus-shared/actors/builtin/miner" "github.com/filecoin-project/venus/venus-shared/types" + ffi "github.com/filecoin-project/filecoin-ffi" "github.com/ipfs-force-community/damocles/damocles-manager/core" - 
"github.com/ipfs-force-community/damocles/damocles-manager/modules/impl/prover" "github.com/ipfs-force-community/damocles/damocles-manager/modules/policy" "github.com/ipfs-force-community/damocles/damocles-manager/modules/util" chainAPI "github.com/ipfs-force-community/damocles/damocles-manager/pkg/chain" @@ -685,8 +685,8 @@ var utilSealerProvingSimulateWdPoStCmd = &cli.Command{ if err != nil { return err } - - partitions, err := api.Chain.StateMinerPartitions(ctx, maddr, cctx.Uint64("ddl-idx"), ts.Key()) + ddlIdx := cctx.Uint64("ddl-idx") + partitions, err := api.Chain.StateMinerPartitions(ctx, maddr, ddlIdx, ts.Key()) if err != nil { return fmt.Errorf("get parttion info failed: %w", err) } @@ -752,7 +752,7 @@ var utilSealerProvingSimulateWdPoStCmd = &cli.Command{ return fmt.Errorf("convert to winning post proof: %w", err) } - err = api.Damocles.SimulateWdPoSt(ctx, maddr, ppt, proofSectors, rand) + err = api.Damocles.SimulateWdPoSt(ctx, ddlIdx, maddr, ppt, proofSectors, rand) if err != nil { return err } @@ -877,7 +877,8 @@ var utilSealerProvingWinningVanillaCmd = &cli.Command{ slog.Infof("commR: %v", commR) randomness := make(abi.PoStRandomness, abi.RandomnessLength) - challenges, err := prover.Prover.GeneratePoStFallbackSectorChallenges(actx, abi.RegisteredPoStProof_StackedDrgWinning32GiBV1, sectorID.Miner, randomness, []abi.SectorNumber{sectorID.Number}) + + challenges, err := ffi.GeneratePoStFallbackSectorChallenges(abi.RegisteredPoStProof_StackedDrgWinning32GiBV1, sectorID.Miner, randomness, []abi.SectorNumber{sectorID.Number}) if err != nil { return fmt.Errorf("generate challenge for sector %s: %w", sealedFileName, err) } @@ -889,7 +890,7 @@ var utilSealerProvingWinningVanillaCmd = &cli.Command{ slog.Infof("%d challenge generated", len(challenge)) - vannilla, err := prover.Prover.GenerateSingleVanillaProof(actx, core.FFIPrivateSectorInfo{ + vannilla, err := ffi.GenerateSingleVanillaProof(core.FFIPrivateSectorInfo{ SectorInfo: sectorInfo, PoStProofType: abi.RegisteredPoStProof_StackedDrgWinning32GiBV1, CacheDirPath: cacheDirPath, @@ -901,14 +902,15 @@ var utilSealerProvingWinningVanillaCmd = &cli.Command{ slog.Infof("vannilla generated with %d bytes", len(vannilla)) - proofs, err := prover.Prover.GenerateWinningPoStWithVanilla(actx, abi.RegisteredPoStProof_StackedDrgWinning32GiBV1, sectorID.Miner, randomness, [][]byte{vannilla}) + proofs, err := ffi.GenerateWinningPoStWithVanilla(abi.RegisteredPoStProof_StackedDrgWinning32GiBV1, sectorID.Miner, randomness, [][]byte{vannilla}) if err != nil { return fmt.Errorf("generate winning post with vannilla for %s: %w", sealedFileName, err) } slog.Infof("proof generated with %d bytes", len(proofs[0].ProofBytes)) - verified, err := prover.Verifier.VerifyWinningPoSt(actx, core.WinningPoStVerifyInfo{ + randomness[31] &= 0x3f + verified, err := ffi.VerifyWinningPoSt(core.WinningPoStVerifyInfo{ Randomness: randomness, Proofs: proofs, ChallengedSectors: []core.SectorInfo{sectorInfo}, diff --git a/damocles-manager/core/api.go b/damocles-manager/core/api.go index 3f57a86d6..f0c26aa04 100644 --- a/damocles-manager/core/api.go +++ b/damocles-manager/core/api.go @@ -102,7 +102,7 @@ type SealerCliAPI interface { CheckProvable(ctx context.Context, mid abi.ActorID, postProofType abi.RegisteredPoStProof, sectors []builtin.ExtendedSectorInfo, strict, stateCheck bool) (map[abi.SectorNumber]string, error) - SimulateWdPoSt(context.Context, address.Address, abi.RegisteredPoStProof, []builtin.ExtendedSectorInfo, abi.PoStRandomness) error + 
SimulateWdPoSt(context.Context, uint64, address.Address, abi.RegisteredPoStProof, []builtin.ExtendedSectorInfo, abi.PoStRandomness) error SnapUpPreFetch(ctx context.Context, mid abi.ActorID, dlindex *uint64) (*SnapUpFetchResult, error) @@ -154,7 +154,7 @@ type MinerAPI interface { type WorkerWdPoStAPI interface { WdPoStHeartbeatTask(ctx context.Context, runningTaskIDs []string, workerName string) error - WdPoStAllocateTasks(ctx context.Context, num uint32, workName string) (allocatedTasks []WdPoStAllocatedTask, err error) + WdPoStAllocateTasks(ctx context.Context, spec AllocateWdPoStTaskSpec, num uint32, workerName string) (allocatedTasks []*WdPoStAllocatedTask, err error) WdPoStFinishTask(ctx context.Context, taskID string, output *stage.WindowPoStOutput, errorReason string) error WdPoStResetTask(ctx context.Context, taskID string) error WdPoStAllTasks(ctx context.Context) ([]*WdPoStTask, error) diff --git a/damocles-manager/core/client_gen.go b/damocles-manager/core/client_gen.go index c29808b2f..ca9e7ba9e 100644 --- a/damocles-manager/core/client_gen.go +++ b/damocles-manager/core/client_gen.go @@ -4,6 +4,7 @@ package core import ( "context" + "github.com/filecoin-project/go-address" "github.com/filecoin-project/go-bitfield" "github.com/filecoin-project/go-state-types/abi" @@ -120,7 +121,7 @@ type SealerCliAPIClient struct { ImportSector func(ctx context.Context, ws SectorWorkerState, state *SectorState, override bool) (bool, error) RestoreSector func(ctx context.Context, sid abi.SectorID, forced bool) (Meta, error) CheckProvable func(ctx context.Context, mid abi.ActorID, postProofType abi.RegisteredPoStProof, sectors []builtin.ExtendedSectorInfo, strict, stateCheck bool) (map[abi.SectorNumber]string, error) - SimulateWdPoSt func(context.Context, address.Address, abi.RegisteredPoStProof, []builtin.ExtendedSectorInfo, abi.PoStRandomness) error + SimulateWdPoSt func(context.Context, uint64, address.Address, abi.RegisteredPoStProof, []builtin.ExtendedSectorInfo, abi.PoStRandomness) error SnapUpPreFetch func(ctx context.Context, mid abi.ActorID, dlindex *uint64) (*SnapUpFetchResult, error) SnapUpCandidates func(ctx context.Context, mid abi.ActorID) ([]*bitfield.BitField, error) SnapUpCancelCommitment func(ctx context.Context, sid abi.SectorID) error @@ -166,7 +167,7 @@ var UnavailableSealerCliAPIClient = SealerCliAPIClient{ CheckProvable: func(ctx context.Context, mid abi.ActorID, postProofType abi.RegisteredPoStProof, sectors []builtin.ExtendedSectorInfo, strict, stateCheck bool) (map[abi.SectorNumber]string, error) { panic("SealerCliAPI client unavailable") }, - SimulateWdPoSt: func(context.Context, address.Address, abi.RegisteredPoStProof, []builtin.ExtendedSectorInfo, abi.PoStRandomness) error { + SimulateWdPoSt: func(context.Context, uint64, address.Address, abi.RegisteredPoStProof, []builtin.ExtendedSectorInfo, abi.PoStRandomness) error { panic("SealerCliAPI client unavailable") }, SnapUpPreFetch: func(ctx context.Context, mid abi.ActorID, dlindex *uint64) (*SnapUpFetchResult, error) { @@ -265,7 +266,7 @@ var UnavailableMinerAPIClient = MinerAPIClient{ // WorkerWdPoStAPIClient is generated client for WorkerWdPoStAPI interface. 
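// Each field is a plain function value mirroring the corresponding method of the
// WorkerWdPoStAPI interface; UnavailableWorkerWdPoStAPIClient further down provides
// panicking stubs for use when no real implementation has been wired in.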
type WorkerWdPoStAPIClient struct { WdPoStHeartbeatTask func(ctx context.Context, runningTaskIDs []string, workerName string) error - WdPoStAllocateTasks func(ctx context.Context, num uint32, workName string) (allocatedTasks []WdPoStAllocatedTask, err error) + WdPoStAllocateTasks func(ctx context.Context, num uint32, workerName string) (allocatedTasks []WdPoStAllocatedTask, err error) WdPoStFinishTask func(ctx context.Context, taskID string, output *stage.WindowPoStOutput, errorReason string) error WdPoStResetTask func(ctx context.Context, taskID string) error WdPoStAllTasks func(ctx context.Context) ([]*WdPoStTask, error) @@ -276,7 +277,7 @@ var UnavailableWorkerWdPoStAPIClient = WorkerWdPoStAPIClient{ WdPoStHeartbeatTask: func(ctx context.Context, runningTaskIDs []string, workerName string) error { panic("WorkerWdPoStAPI client unavailable") }, - WdPoStAllocateTasks: func(ctx context.Context, num uint32, workName string) (allocatedTasks []WdPoStAllocatedTask, err error) { + WdPoStAllocateTasks: func(ctx context.Context, num uint32, workerName string) (allocatedTasks []WdPoStAllocatedTask, err error) { panic("WorkerWdPoStAPI client unavailable") }, WdPoStFinishTask: func(ctx context.Context, taskID string, output *stage.WindowPoStOutput, errorReason string) error { diff --git a/damocles-manager/core/prover.go b/damocles-manager/core/prover.go index ad8e3c065..605d94933 100644 --- a/damocles-manager/core/prover.go +++ b/damocles-manager/core/prover.go @@ -57,8 +57,8 @@ type Verifier interface { type Prover interface { AggregateSealProofs(ctx context.Context, aggregateInfo AggregateSealVerifyProofAndInfos, proofs [][]byte) ([]byte, error) - GenerateWindowPoSt(ctx context.Context, minerID abi.ActorID, sectors SortedPrivateSectorInfo, randomness abi.PoStRandomness) (proof []builtin.PoStProof, skipped []abi.SectorID, err error) - GenerateWinningPoSt(ctx context.Context, minerID abi.ActorID, sectors SortedPrivateSectorInfo, randomness abi.PoStRandomness) ([]builtin.PoStProof, error) + GenerateWindowPoSt(ctx context.Context, deadlineIdx uint64, minerID abi.ActorID, proofType abi.RegisteredPoStProof, sectors []builtin.ExtendedSectorInfo, randomness abi.PoStRandomness) (proof []builtin.PoStProof, skipped []abi.SectorID, err error) + GenerateWinningPoSt(ctx context.Context, minerID abi.ActorID, ppt abi.RegisteredPoStProof, sectors []builtin.ExtendedSectorInfo, randomness abi.PoStRandomness) ([]builtin.PoStProof, error) GeneratePoStFallbackSectorChallenges(ctx context.Context, proofType abi.RegisteredPoStProof, minerID abi.ActorID, randomness abi.PoStRandomness, sectorIds []abi.SectorNumber) (*FallbackChallenges, error) GenerateSingleVanillaProof(ctx context.Context, replica FFIPrivateSectorInfo, challenges []uint64) ([]byte, error) diff --git a/damocles-manager/core/types_wdpost.go b/damocles-manager/core/types_wdpost.go index 1107c391b..0238d8165 100644 --- a/damocles-manager/core/types_wdpost.go +++ b/damocles-manager/core/types_wdpost.go @@ -4,9 +4,24 @@ import ( "context" "time" + "github.com/filecoin-project/go-state-types/abi" "github.com/ipfs-force-community/damocles/damocles-manager/pkg/extproc/stage" ) +type WdPoStSectorInfo struct { + SectorID abi.SectorNumber + CommR [32]byte + Upgrade bool // is upgrade sector + Accesses SectorAccessStores +} + +type WdPoStInput struct { + Sectors []WdPoStSectorInfo + MinerID abi.ActorID + ProofType abi.RegisteredPoStProof + Seed [32]byte +} + type WdPoStTaskState string const ( @@ -17,7 +32,7 @@ const ( type WdPoStTask struct { ID string - Input 
stage.WindowPoSt + Input WdPoStInput Output *stage.WindowPoStOutput TryNum uint32 ErrorReason string @@ -43,14 +58,19 @@ func (t *WdPoStTask) Finished(maxTry uint32) bool { type WdPoStAllocatedTask struct { ID string - Input stage.WindowPoSt + Input WdPoStInput +} + +type AllocateWdPoStTaskSpec struct { + AllowedMiners []abi.ActorID + AllowedProofTypes []abi.RegisteredPoStProof } type WorkerWdPoStTaskManager interface { All(ctx context.Context, filter func(*WdPoStTask) bool) ([]*WdPoStTask, error) ListByTaskIDs(ctx context.Context, state WdPoStTaskState, taskIDs ...string) ([]*WdPoStTask, error) - Create(ctx context.Context, input stage.WindowPoSt) (*WdPoStTask, error) - AllocateTasks(ctx context.Context, num uint32, workName string) (allocatedTasks []WdPoStAllocatedTask, err error) + Create(ctx context.Context, input WdPoStInput) (*WdPoStTask, error) + AllocateTasks(ctx context.Context, spec AllocateWdPoStTaskSpec, num uint32, workerName string) (allocatedTasks []*WdPoStAllocatedTask, err error) Heartbeat(ctx context.Context, taskIDs []string, workerName string) error Finish(ctx context.Context, taskID string, output *stage.WindowPoStOutput, errorReason string) error MakeTasksDie(ctx context.Context, shouldDeadDur time.Duration, limit uint32) error diff --git a/damocles-manager/dep/prover.go b/damocles-manager/dep/prover.go index 98abe4ddb..2ab73b497 100644 --- a/damocles-manager/dep/prover.go +++ b/damocles-manager/dep/prover.go @@ -12,7 +12,13 @@ import ( "github.com/ipfs-force-community/damocles/damocles-manager/core" "github.com/ipfs-force-community/damocles/damocles-manager/modules" "github.com/ipfs-force-community/damocles/damocles-manager/modules/impl/prover/ext" + proverworker "github.com/ipfs-force-community/damocles/damocles-manager/modules/impl/prover/worker" "github.com/ipfs-force-community/damocles/damocles-manager/pkg/confmgr" + "github.com/ipfs-force-community/damocles/damocles-manager/pkg/kvstore" +) + +type ( + WorkerProverStore kvstore.KVStore ) func ExtProver() dix.Option { @@ -23,8 +29,23 @@ func ExtProver() dix.Option { ) } -func BuildExtProver(gctx GlobalContext, lc fx.Lifecycle, cfg *modules.ProcessorConfig) (*ext.Prover, error) { - p, err := ext.New(gctx, cfg.WdPost, cfg.WinPost) +func WorkerProver() dix.Option { + return dix.Options( + dix.Override(new(WorkerProverStore), BuildWorkerProverStore), + dix.Override(new(core.WorkerWdPoStTaskManager), BuildWorkerWdPoStTaskManager), + dix.Override(new(core.WorkerWdPoStAPI), proverworker.NewWdPoStAPIImpl), + dix.Override(new(core.Prover), proverworker.NewProver), + ) +} + +func DisableWorkerProver() dix.Option { + return dix.Options( + dix.Override(new(core.WorkerWdPoStAPI), &proverworker.UnavailableWdPoStAPIImpl{}), + ) +} + +func BuildExtProver(gctx GlobalContext, lc fx.Lifecycle, sectorTracker core.SectorTracker, cfg *modules.ProcessorConfig) (*ext.Prover, error) { + p, err := ext.New(gctx, sectorTracker, cfg.WdPost, cfg.WinPost) if err != nil { return nil, fmt.Errorf("construct ext prover: %w", err) } @@ -71,3 +92,15 @@ func ProvideExtProverConfig(gctx GlobalContext, lc fx.Lifecycle, cfgmgr confmgr. 
return &cfg, nil } + +func BuildWorkerProverStore(gctx GlobalContext, db UnderlyingDB) (WorkerProverStore, error) { + return db.OpenCollection(gctx, "prover") +} + +func BuildWorkerWdPoStTaskManager(kv WorkerProverStore) (core.WorkerWdPoStTaskManager, error) { + wdpostKV, err := kvstore.NewWrappedKVStore([]byte("wdpost-"), kv) + if err != nil { + return nil, err + } + return proverworker.NewKVTaskManager(*kvstore.NewKVExt(wdpostKV)), nil +} diff --git a/damocles-manager/dep/sealer.go b/damocles-manager/dep/sealer.go index 029bf19a1..8ca2efc47 100644 --- a/damocles-manager/dep/sealer.go +++ b/damocles-manager/dep/sealer.go @@ -11,13 +11,13 @@ import ( "github.com/ipfs-force-community/damocles/damocles-manager/modules" "github.com/ipfs-force-community/damocles/damocles-manager/modules/impl/mock" "github.com/ipfs-force-community/damocles/damocles-manager/modules/impl/prover" - proverworker "github.com/ipfs-force-community/damocles/damocles-manager/modules/impl/prover/worker" "github.com/ipfs-force-community/damocles/damocles-manager/modules/impl/randomness" "github.com/ipfs-force-community/damocles/damocles-manager/modules/sealer" "github.com/ipfs-force-community/damocles/damocles-manager/pkg/chain" "github.com/ipfs-force-community/damocles/damocles-manager/pkg/confmgr" "github.com/ipfs-force-community/damocles/damocles-manager/pkg/market" messager "github.com/ipfs-force-community/damocles/damocles-manager/pkg/messager" + "github.com/ipfs-force-community/damocles/damocles-manager/ver" ) type GlobalContext context.Context @@ -36,6 +36,7 @@ func MockSealer(s ...interface{}) dix.Option { return dix.Options( dix.Override(new(*mock.Sealer), mock.NewSealer), dix.Override(new(core.SealerAPI), dix.From(new(*mock.Sealer))), + dix.Override(new(core.SealerCliAPI), dix.From(new(*mock.Sealer))), dix.Populate(InvokePopulate, s...), ) } @@ -56,9 +57,8 @@ func Product() dix.Option { dix.Override(new(core.SectorNumberAllocator), BuildSectorNumberAllocator), dix.Override(new(core.RandomnessAPI), randomness.New), dix.Override(new(core.SectorTracker), BuildSectorTracker), - dix.Override(new(core.Prover), prover.Prover), - dix.Override(new(core.Verifier), prover.Verifier), - dix.Override(new(core.MinerAPI), BuildMinerAPI), + dix.If(ver.ProverIsProd(), prodProver()), + dix.If(!ver.ProverIsProd(), fakerProver()), dix.Override(new(core.CommitmentManager), BuildCommitmentManager), dix.Override(new(messager.API), BuildMessagerClient), @@ -80,10 +80,20 @@ func Product() dix.Option { dix.Override(new(OfflineMetaStore), BuildOfflineMetaStore), dix.Override(new(WorkerMetaStore), BuildWorkerMetaStore), dix.Override(new(CommonMetaStore), BuildCommonMetaStore), - dix.Override(new(WorkerProverStore), BuildWorkerProverStore), + ) +} - dix.Override(new(core.WorkerWdPoStTaskManager), BuildWorkerWdPoStTaskManager), - dix.Override(new(core.WorkerWdPoStAPI), proverworker.NewWdPoStAPIImpl), +func fakerProver() dix.Option { + return dix.Options( + dix.Override(new(core.Prover), prover.NewFakeProver), + dix.Override(new(core.Verifier), prover.NewFakeVerifier), + ) +} + +func prodProver() dix.Option { + return dix.Options( + dix.Override(new(core.Prover), prover.NewProdProver), + dix.Override(new(core.Verifier), prover.NewProdVerifier), ) } @@ -94,8 +104,8 @@ type ProxyOptions struct { func Proxy(dest string, opt ProxyOptions) dix.Option { return dix.Options( dix.Override(new(ProxyAddress), ProxyAddress(dest)), - dix.Override(new(core.APIClient), BuildAPIProxyClient), - dix.Override(new(core.SealerCliAPIClient), 
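// The proxy wiring below switches to pointer types and derives
// *core.SealerCliAPIClient from *core.APIClient via dix.From instead of using the
// now-removed BuildSealerCliAPIClient helper; this presumably relies on the
// SealerCliAPIClient field embedded in core.APIClient.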
BuildSealerCliAPIClient), + dix.Override(new(*core.APIClient), BuildAPIProxyClient), + dix.Override(new(*core.SealerCliAPIClient), dix.From(new(*core.APIClient))), dix.If(opt.EnableSectorIndexer, dix.Override(new(core.SectorIndexer), BuildProxiedSectorIndex), ), @@ -106,6 +116,7 @@ func Sealer(target ...interface{}) dix.Option { return dix.Options( dix.Override(new(*sealer.Sealer), sealer.New), dix.Override(new(core.SealerAPI), dix.From(new(*sealer.Sealer))), + dix.Override(new(core.SealerCliAPI), dix.From(new(*sealer.Sealer))), dix.If(len(target) > 0, dix.Populate(InvokePopulate, target...)), ) } @@ -122,8 +133,8 @@ func APIClient(target ...interface{}) dix.Option { dix.Override(new(chain.API), BuildChainClient), dix.Override(new(messager.API), BuildMessagerClient), dix.Override(new(market.API), BuildMarketAPI), - dix.Override(new(core.APIClient), MaybeAPIClient), - dix.Override(new(core.SealerCliAPIClient), BuildSealerCliAPIClient), + dix.Override(new(*core.APIClient), MaybeAPIClient), + dix.Override(new(*core.SealerCliAPIClient), dix.From(new(*core.APIClient))), dix.If(len(target) > 0, dix.Populate(InvokePopulate, target...)), ) } diff --git a/damocles-manager/dep/sealer_constructor.go b/damocles-manager/dep/sealer_constructor.go index 099cac31d..93caf207f 100644 --- a/damocles-manager/dep/sealer_constructor.go +++ b/damocles-manager/dep/sealer_constructor.go @@ -19,7 +19,6 @@ import ( "github.com/ipfs-force-community/damocles/damocles-manager/modules/impl/commitmgr" "github.com/ipfs-force-community/damocles/damocles-manager/modules/impl/dealmgr" "github.com/ipfs-force-community/damocles/damocles-manager/modules/impl/mock" - proverworker "github.com/ipfs-force-community/damocles/damocles-manager/modules/impl/prover/worker" "github.com/ipfs-force-community/damocles/damocles-manager/modules/impl/sectors" "github.com/ipfs-force-community/damocles/damocles-manager/modules/impl/worker" "github.com/ipfs-force-community/damocles/damocles-manager/modules/policy" @@ -47,7 +46,6 @@ type ( WorkerMetaStore kvstore.KVStore ConfDirPath string CommonMetaStore kvstore.KVStore - WorkerProverStore kvstore.KVStore ) func BuildLocalSectorManager(scfg *modules.SafeConfig, mapi core.MinerAPI, numAlloc core.SectorNumberAllocator) (core.SectorManager, error) { @@ -312,10 +310,6 @@ func BuildAPIProxyClient(gctx GlobalContext, lc fx.Lifecycle, proxy ProxyAddress return &proxyClient, err } -func BuildSealerCliAPIClient(client *core.APIClient) *core.SealerCliAPIClient { - return &client.SealerCliAPIClient -} - func buildDamoclesAPIClient(gctx GlobalContext, lc fx.Lifecycle, namespace string, out interface{}, serverAddr string, useHTTP bool) error { addr, err := net.ResolveTCPAddr("tcp", serverAddr) if err != nil { @@ -697,15 +691,3 @@ func BuildUnsealManager( } return mgr, nil } - -func BuildWorkerProverStore(gctx GlobalContext, db UnderlyingDB) (WorkerProverStore, error) { - return db.OpenCollection(gctx, "prover") -} - -func BuildWorkerWdPoStTaskManager(kv WorkerProverStore) (core.WorkerWdPoStTaskManager, error) { - wdpostKV, err := kvstore.NewWrappedKVStore([]byte("wdpost-"), kv) - if err != nil { - return nil, err - } - return proverworker.NewKVTaskManager(*kvstore.NewKVExt(wdpostKV)), nil -} diff --git a/damocles-manager/modules/impl/prover/ext/prover.go b/damocles-manager/modules/impl/prover/ext/prover.go index 28d6f11de..7a1b26529 100644 --- a/damocles-manager/modules/impl/prover/ext/prover.go +++ b/damocles-manager/modules/impl/prover/ext/prover.go @@ -19,7 +19,7 @@ var log = logging.New("ext-prover") var 
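// compile-time check that *Prover still satisfies core.Prover after the refactor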
_ core.Prover = (*Prover)(nil) -func New(ctx context.Context, windowCfgs []extproc.ExtProcessorConfig, winningCfgs []extproc.ExtProcessorConfig) (*Prover, error) { +func New(ctx context.Context, sectorTracker core.SectorTracker, windowCfgs []extproc.ExtProcessorConfig, winningCfgs []extproc.ExtProcessorConfig) (*Prover, error) { var windowProc, winningPorc *extproc.Processor var err error if len(windowCfgs) > 0 { @@ -45,14 +45,18 @@ func New(ctx context.Context, windowCfgs []extproc.ExtProcessorConfig, winningCf } return &Prover{ - windowProc: windowProc, - winningPorc: winningPorc, + sectorTracker: sectorTracker, + localProver: prover.NewProdProver(sectorTracker), + windowProc: windowProc, + winningPorc: winningPorc, }, nil } type Prover struct { - windowProc *extproc.Processor - winningPorc *extproc.Processor + sectorTracker core.SectorTracker + localProver core.Prover + windowProc *extproc.Processor + winningPorc *extproc.Processor } func (p *Prover) Run() { @@ -75,47 +79,102 @@ func (p *Prover) Close() { } } -func (*Prover) AggregateSealProofs(ctx context.Context, aggregateInfo core.AggregateSealVerifyProofAndInfos, proofs [][]byte) ([]byte, error) { - return prover.Prover.AggregateSealProofs(ctx, aggregateInfo, proofs) +func (p *Prover) AggregateSealProofs(ctx context.Context, aggregateInfo core.AggregateSealVerifyProofAndInfos, proofs [][]byte) ([]byte, error) { + return p.localProver.AggregateSealProofs(ctx, aggregateInfo, proofs) } -func (p *Prover) GenerateWindowPoSt(ctx context.Context, minerID abi.ActorID, sectors prover.SortedPrivateSectorInfo, randomness abi.PoStRandomness) ([]builtin.PoStProof, []abi.SectorID, error) { +func (p *Prover) GenerateWindowPoSt(ctx context.Context, deadlineIdx uint64, minerID abi.ActorID, proofType abi.RegisteredPoStProof, sectors []builtin.ExtendedSectorInfo, randomness abi.PoStRandomness) ([]builtin.PoStProof, []abi.SectorID, error) { if p.windowProc == nil { - return prover.Prover.GenerateWindowPoSt(ctx, minerID, sectors, randomness) + return p.localProver.GenerateWindowPoSt(ctx, deadlineIdx, minerID, proofType, sectors, randomness) } - return prover.ExtGenerateWindowPoSt(minerID, sectors, randomness)(func(data stage.WindowPoSt) (stage.WindowPoStOutput, error) { - var res stage.WindowPoStOutput - err := p.windowProc.Process(ctx, data, &res) + if len(sectors) == 0 { + return nil, nil, nil + } + privSectors, err := p.sectorTracker.PubToPrivate(ctx, minerID, proofType, sectors) + if err != nil { + return nil, nil, fmt.Errorf("turn public sector infos into private: %w", err) + } + + data := stage.WindowPoSt{ + MinerID: minerID, + ProofType: stage.ProofType2String(proofType), + } + copy(data.Seed[:], randomness[:]) + + for i := range privSectors { + inner := privSectors[i] + + if pt := inner.PoStProofType; pt != proofType { + return nil, nil, fmt.Errorf("proof type not match for sector %d of miner %d: want %s, got %s", inner.SectorNumber, minerID, stage.ProofType2String(proofType), stage.ProofType2String(pt)) + } + + commR, err := util.CID2ReplicaCommitment(inner.SealedCID) if err != nil { - return res, fmt.Errorf("WindowPoStProcessor.Process: %w", err) + return nil, nil, fmt.Errorf("invalid selaed cid %s for sector %d of miner %d: %w", inner.SealedCID, inner.SectorNumber, minerID, err) + } + + data.Replicas = append(data.Replicas, stage.PoStReplicaInfo{ + SectorID: inner.SectorNumber, + CommR: commR, + CacheDir: inner.CacheDirPath, + SealedFile: inner.SealedSectorPath, + }) + } + + var res stage.WindowPoStOutput + + err = p.windowProc.Process(ctx, 
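// Hand the assembled stage.WindowPoSt (replica paths plus seed) to the external
// window-PoSt processor; its output carries the snark proofs and any faulty sector
// numbers, which are mapped back to abi.SectorID values just below.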
data, &res) + if err != nil { + return nil, nil, fmt.Errorf("WindowPoStProcessor.Process: %w", err) + } + + if faultCount := len(res.Faults); faultCount != 0 { + faults := make([]abi.SectorID, faultCount) + for fi := range res.Faults { + faults[fi] = abi.SectorID{ + Miner: minerID, + Number: res.Faults[fi], + } } - return res, nil - }) + return nil, faults, fmt.Errorf("got %d fault sectors", faultCount) + } + + proofs := make([]builtin.PoStProof, len(res.Proofs)) + for pi := range res.Proofs { + proofs[pi] = builtin.PoStProof{ + PoStProof: proofType, + ProofBytes: res.Proofs[pi], + } + } + return proofs, nil, nil } -func (p *Prover) GenerateWinningPoSt(ctx context.Context, minerID abi.ActorID, sectors prover.SortedPrivateSectorInfo, randomness abi.PoStRandomness) ([]builtin.PoStProof, error) { +func (p *Prover) GenerateWinningPoSt(ctx context.Context, minerID abi.ActorID, proofType abi.RegisteredPoStProof, sectors []builtin.ExtendedSectorInfo, randomness abi.PoStRandomness) ([]builtin.PoStProof, error) { randomness[31] &= 0x3f if p.winningPorc == nil { - return prover.Prover.GenerateWinningPoSt(ctx, minerID, sectors, randomness) + return p.localProver.GenerateWinningPoSt(ctx, minerID, proofType, sectors, randomness) } - sectorInners := sectors.Values() - if len(sectorInners) == 0 { + if len(sectors) == 0 { return nil, nil } - proofType := sectorInners[0].PoStProofType data := stage.WinningPost{ MinerID: minerID, ProofType: stage.ProofType2String(proofType), } copy(data.Seed[:], randomness[:]) - for i := range sectorInners { - inner := sectorInners[i] + privSectors, err := p.sectorTracker.PubToPrivate(ctx, minerID, proofType, sectors) + if err != nil { + return nil, fmt.Errorf("turn public sector infos into private: %w", err) + } + + for i := range privSectors { + inner := privSectors[i] if pt := inner.PoStProofType; pt != proofType { return nil, fmt.Errorf("proof type not match for sector %d of miner %d: want %s, got %s", inner.SectorNumber, minerID, stage.ProofType2String(proofType), stage.ProofType2String(pt)) @@ -123,7 +182,7 @@ func (p *Prover) GenerateWinningPoSt(ctx context.Context, minerID abi.ActorID, s commR, err := util.CID2ReplicaCommitment(inner.SealedCID) if err != nil { - return nil, fmt.Errorf("invalid selaed cid %s for sector %d of miner %d: %w", inner.SealedCID, inner.SectorNumber, minerID, err) + return nil, fmt.Errorf("invalid sealed cid %s for sector %d of miner %d: %w", inner.SealedCID, inner.SectorNumber, minerID, err) } data.Replicas = append(data.Replicas, stage.PoStReplicaInfo{ @@ -136,7 +195,7 @@ func (p *Prover) GenerateWinningPoSt(ctx context.Context, minerID abi.ActorID, s var res stage.WinningPoStOutput - err := p.winningPorc.Process(ctx, data, &res) + err = p.winningPorc.Process(ctx, data, &res) if err != nil { return nil, fmt.Errorf("WinningPoStProcessor.Process: %w", err) } @@ -152,16 +211,16 @@ func (p *Prover) GenerateWinningPoSt(ctx context.Context, minerID abi.ActorID, s return proofs, nil } -func (*Prover) GeneratePoStFallbackSectorChallenges(ctx context.Context, proofType abi.RegisteredPoStProof, minerID abi.ActorID, randomness abi.PoStRandomness, sectorIds []abi.SectorNumber) (*core.FallbackChallenges, error) { +func (p *Prover) GeneratePoStFallbackSectorChallenges(ctx context.Context, proofType abi.RegisteredPoStProof, minerID abi.ActorID, randomness abi.PoStRandomness, sectorIds []abi.SectorNumber) (*core.FallbackChallenges, error) { randomness[31] &= 0x3f - return prover.Prover.GeneratePoStFallbackSectorChallenges(ctx, proofType, minerID, 
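// The vanilla-proof helpers (fallback challenges, single vanilla proofs and winning
// PoSt from vanilla proofs) are not offloaded to an external processor; they are
// delegated to the embedded local FFI-backed prover. As elsewhere in this file, the
// last byte of the randomness is masked with 0x3f first, presumably to keep it a
// valid field element for the proofs library.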
randomness, sectorIds) + return p.localProver.GeneratePoStFallbackSectorChallenges(ctx, proofType, minerID, randomness, sectorIds) } -func (*Prover) GenerateSingleVanillaProof(ctx context.Context, replica core.FFIPrivateSectorInfo, challenges []uint64) ([]byte, error) { - return prover.Prover.GenerateSingleVanillaProof(ctx, replica, challenges) +func (p *Prover) GenerateSingleVanillaProof(ctx context.Context, replica core.FFIPrivateSectorInfo, challenges []uint64) ([]byte, error) { + return p.localProver.GenerateSingleVanillaProof(ctx, replica, challenges) } -func (*Prover) GenerateWinningPoStWithVanilla(ctx context.Context, proofType abi.RegisteredPoStProof, minerID abi.ActorID, randomness abi.PoStRandomness, proofs [][]byte) ([]core.PoStProof, error) { +func (p *Prover) GenerateWinningPoStWithVanilla(ctx context.Context, proofType abi.RegisteredPoStProof, minerID abi.ActorID, randomness abi.PoStRandomness, proofs [][]byte) ([]core.PoStProof, error) { randomness[31] &= 0x3f - return prover.Prover.GenerateWinningPoStWithVanilla(ctx, proofType, minerID, randomness, proofs) + return p.localProver.GenerateWinningPoStWithVanilla(ctx, proofType, minerID, randomness, proofs) } diff --git a/damocles-manager/modules/impl/prover/prover.go b/damocles-manager/modules/impl/prover/prover.go deleted file mode 100644 index 09ced102e..000000000 --- a/damocles-manager/modules/impl/prover/prover.go +++ /dev/null @@ -1,87 +0,0 @@ -package prover - -import ( - "fmt" - - "github.com/filecoin-project/go-state-types/abi" - "github.com/filecoin-project/venus/venus-shared/actors/builtin" - "github.com/ipfs-force-community/damocles/damocles-manager/core" - "github.com/ipfs-force-community/damocles/damocles-manager/modules/util" - "github.com/ipfs-force-community/damocles/damocles-manager/pkg/extproc/stage" -) - -var _ core.Prover = Prover -var _ core.Verifier = Verifier - -var Verifier verifier -var Prover prover - -type ( - SortedPrivateSectorInfo = core.SortedPrivateSectorInfo -) - -type ExtDoWindowPoStFunc func(stage.WindowPoSt) (stage.WindowPoStOutput, error) - -func ExtGenerateWindowPoSt(minerID abi.ActorID, sectors SortedPrivateSectorInfo, randomness abi.PoStRandomness) func(ExtDoWindowPoStFunc) ([]builtin.PoStProof, []abi.SectorID, error) { - randomness[31] &= 0x3f - return func(doWork ExtDoWindowPoStFunc) ([]builtin.PoStProof, []abi.SectorID, error) { - sectorInners := sectors.Values() - if len(sectorInners) == 0 { - return nil, nil, nil - } - - // build stage.WindowPoSt - proofType := sectorInners[0].PoStProofType - data := stage.WindowPoSt{ - MinerID: minerID, - ProofType: stage.ProofType2String(proofType), - } - copy(data.Seed[:], randomness[:]) - - for i := range sectorInners { - inner := sectorInners[i] - - if pt := inner.PoStProofType; pt != proofType { - return nil, nil, fmt.Errorf("proof type not match for sector %d of miner %d: want %s, got %s", inner.SectorNumber, minerID, stage.ProofType2String(proofType), stage.ProofType2String(pt)) - } - - commR, err := util.CID2ReplicaCommitment(inner.SealedCID) - if err != nil { - return nil, nil, fmt.Errorf("invalid selaed cid %s for sector %d of miner %d: %w", inner.SealedCID, inner.SectorNumber, minerID, err) - } - - data.Replicas = append(data.Replicas, stage.PoStReplicaInfo{ - SectorID: inner.SectorNumber, - CommR: commR, - CacheDir: inner.CacheDirPath, - SealedFile: inner.SealedSectorPath, - }) - } - - output, err := doWork(data) - if err != nil { - return nil, nil, err - } - - if faultCount := len(output.Faults); faultCount != 0 { - faults := 
make([]abi.SectorID, faultCount) - for fi := range output.Faults { - faults[fi] = abi.SectorID{ - Miner: minerID, - Number: output.Faults[fi], - } - } - - return nil, faults, fmt.Errorf("got %d fault sectors", faultCount) - } - - proofs := make([]builtin.PoStProof, len(output.Proofs)) - for pi := range output.Proofs { - proofs[pi] = builtin.PoStProof{ - PoStProof: proofType, - ProofBytes: output.Proofs[pi], - } - } - return proofs, nil, nil - } -} diff --git a/damocles-manager/modules/impl/prover/prover_fake.go b/damocles-manager/modules/impl/prover/prover_fake.go index ff8e170e3..308fcf2ad 100644 --- a/damocles-manager/modules/impl/prover/prover_fake.go +++ b/damocles-manager/modules/impl/prover/prover_fake.go @@ -1,6 +1,3 @@ -//go:build !prod -// +build !prod - package prover import ( @@ -13,48 +10,56 @@ import ( "github.com/ipfs-force-community/damocles/damocles-manager/core" ) -type verifier struct { +func NewFakeVerifier() core.Verifier { + return &fakeVerifier{} +} + +type fakeVerifier struct { } -func (verifier) VerifySeal(context.Context, core.SealVerifyInfo) (bool, error) { +func (fakeVerifier) VerifySeal(context.Context, core.SealVerifyInfo) (bool, error) { return false, nil } -func (verifier) VerifyAggregateSeals(context.Context, core.AggregateSealVerifyProofAndInfos) (bool, error) { +func (fakeVerifier) VerifyAggregateSeals(context.Context, core.AggregateSealVerifyProofAndInfos) (bool, error) { return false, nil } -func (verifier) VerifyWindowPoSt(ctx context.Context, info core.WindowPoStVerifyInfo) (bool, error) { +func (fakeVerifier) VerifyWindowPoSt(ctx context.Context, info core.WindowPoStVerifyInfo) (bool, error) { return false, nil } -func (verifier) VerifyWinningPoSt(ctx context.Context, info core.WinningPoStVerifyInfo) (bool, error) { +func (fakeVerifier) VerifyWinningPoSt(ctx context.Context, info core.WinningPoStVerifyInfo) (bool, error) { return false, nil } -type prover struct { +func NewFakeProver() core.Prover { + return &fakeProver{} +} + +type fakeProver struct { } -func (prover) AggregateSealProofs(ctx context.Context, aggregateInfo core.AggregateSealVerifyProofAndInfos, proofs [][]byte) ([]byte, error) { +func (fakeProver) AggregateSealProofs(ctx context.Context, aggregateInfo core.AggregateSealVerifyProofAndInfos, proofs [][]byte) ([]byte, error) { return make([]byte, 32), nil } -func (prover) GenerateWindowPoSt(ctx context.Context, minerID abi.ActorID, sectors SortedPrivateSectorInfo, randomness abi.PoStRandomness) (proof []builtin.PoStProof, skipped []abi.SectorID, err error) { +func (fakeProver) GenerateWindowPoSt(ctx context.Context, deadlineIdx uint64, minerID abi.ActorID, proofType abi.RegisteredPoStProof, sectors []builtin.ExtendedSectorInfo, randomness abi.PoStRandomness) (proof []builtin.PoStProof, skipped []abi.SectorID, err error) { return nil, nil, nil } -func (prover) GenerateWinningPoSt(ctx context.Context, minerID abi.ActorID, sectors SortedPrivateSectorInfo, randomness abi.PoStRandomness) ([]builtin.PoStProof, error) { +func (fakeProver) GenerateWinningPoSt(ctx context.Context, minerID abi.ActorID, proofType abi.RegisteredPoStProof, sectors []builtin.ExtendedSectorInfo, randomness abi.PoStRandomness) ([]builtin.PoStProof, error) { return nil, nil } -func (prover) GeneratePoStFallbackSectorChallenges(ctx context.Context, proofType abi.RegisteredPoStProof, minerID abi.ActorID, randomness abi.PoStRandomness, sectorIds []abi.SectorNumber) (*core.FallbackChallenges, error) { +func (fakeProver) GeneratePoStFallbackSectorChallenges(ctx 
context.Context, proofType abi.RegisteredPoStProof, minerID abi.ActorID, randomness abi.PoStRandomness, sectorIds []abi.SectorNumber) (*core.FallbackChallenges, error) { return nil, nil } -func (prover) GenerateSingleVanillaProof(ctx context.Context, replica core.FFIPrivateSectorInfo, challenges []uint64) ([]byte, error) { +func (fakeProver) GenerateSingleVanillaProof(ctx context.Context, replica core.FFIPrivateSectorInfo, challenges []uint64) ([]byte, error) { return nil, nil } -func (prover) GenerateWinningPoStWithVanilla(ctx context.Context, proofType abi.RegisteredPoStProof, minerID abi.ActorID, randomness abi.PoStRandomness, proofs [][]byte) ([]core.PoStProof, error) { +func (fakeProver) GenerateWinningPoStWithVanilla(ctx context.Context, proofType abi.RegisteredPoStProof, minerID abi.ActorID, randomness abi.PoStRandomness, proofs [][]byte) ([]core.PoStProof, error) { return nil, nil } diff --git a/damocles-manager/modules/impl/prover/prover_prod.go b/damocles-manager/modules/impl/prover/prover_prod.go index 742f70326..8d79c1c4e 100644 --- a/damocles-manager/modules/impl/prover/prover_prod.go +++ b/damocles-manager/modules/impl/prover/prover_prod.go @@ -1,6 +1,3 @@ -//go:build prod -// +build prod - package prover import ( @@ -19,37 +16,54 @@ import ( var log = logging.New("prover") -type verifier struct { +func NewProdVerifier() core.Verifier { + return &prodVerifier{} +} + +type prodVerifier struct { } -func (verifier) VerifySeal(ctx context.Context, svi core.SealVerifyInfo) (bool, error) { +func (prodVerifier) VerifySeal(ctx context.Context, svi core.SealVerifyInfo) (bool, error) { return ffi.VerifySeal(svi) } -func (verifier) VerifyAggregateSeals(ctx context.Context, aggregate core.AggregateSealVerifyProofAndInfos) (bool, error) { +func (prodVerifier) VerifyAggregateSeals(ctx context.Context, aggregate core.AggregateSealVerifyProofAndInfos) (bool, error) { return ffi.VerifyAggregateSeals(aggregate) } -func (verifier) VerifyWindowPoSt(ctx context.Context, info core.WindowPoStVerifyInfo) (bool, error) { +func (prodVerifier) VerifyWindowPoSt(ctx context.Context, info core.WindowPoStVerifyInfo) (bool, error) { info.Randomness[31] &= 0x3f return ffi.VerifyWindowPoSt(info) } -func (verifier) VerifyWinningPoSt(ctx context.Context, info core.WinningPoStVerifyInfo) (bool, error) { +func (prodVerifier) VerifyWinningPoSt(ctx context.Context, info core.WinningPoStVerifyInfo) (bool, error) { info.Randomness[31] &= 0x3f return ffi.VerifyWinningPoSt(info) } -type prover struct { +func NewProdProver(sectorTracker core.SectorTracker) core.Prover { + return &prodProver{ + sectorTracker: sectorTracker, + } +} + +type prodProver struct { + sectorTracker core.SectorTracker } -func (prover) AggregateSealProofs(ctx context.Context, aggregateInfo core.AggregateSealVerifyProofAndInfos, proofs [][]byte) ([]byte, error) { +func (prodProver) AggregateSealProofs(ctx context.Context, aggregateInfo core.AggregateSealVerifyProofAndInfos, proofs [][]byte) ([]byte, error) { return ffi.AggregateSealProofs(aggregateInfo, proofs) } -func (prover) GenerateWindowPoSt(ctx context.Context, minerID abi.ActorID, sectors SortedPrivateSectorInfo, randomness abi.PoStRandomness) (proof []builtin.PoStProof, skipped []abi.SectorID, err error) { +func (p prodProver) GenerateWindowPoSt(ctx context.Context, deadlineIdx uint64, minerID abi.ActorID, ppt abi.RegisteredPoStProof, sectors []builtin.ExtendedSectorInfo, randomness abi.PoStRandomness) (proof []builtin.PoStProof, skipped []abi.SectorID, err error) { randomness[31] &= 0x3f - 
proof, faulty, err := ffi.GenerateWindowPoSt(minerID, sectors, randomness) + + privSectors, err := p.sectorTracker.PubToPrivate(ctx, minerID, ppt, sectors) + if err != nil { + return nil, nil, fmt.Errorf("turn public sector infos into private: %w", err) + } + + proof, faulty, err := ffi.GenerateWindowPoSt(minerID, core.NewSortedPrivateSectorInfo(privSectors...), randomness) var faultyIDs []abi.SectorID for _, f := range faulty { @@ -62,18 +76,23 @@ func (prover) GenerateWindowPoSt(ctx context.Context, minerID abi.ActorID, secto return proof, faultyIDs, err } -func (prover) GenerateWinningPoSt(ctx context.Context, minerID abi.ActorID, sectors SortedPrivateSectorInfo, randomness abi.PoStRandomness) ([]builtin.PoStProof, error) { +func (p prodProver) GenerateWinningPoSt(ctx context.Context, minerID abi.ActorID, ppt abi.RegisteredPoStProof, sectors []builtin.ExtendedSectorInfo, randomness abi.PoStRandomness) ([]builtin.PoStProof, error) { randomness[31] &= 0x3f - return ffi.GenerateWinningPoSt(minerID, sectors, randomness) + privSectors, err := p.sectorTracker.PubToPrivate(ctx, minerID, ppt, sectors) + if err != nil { + return nil, fmt.Errorf("turn public sector infos into private: %w", err) + } + + return ffi.GenerateWinningPoSt(minerID, core.NewSortedPrivateSectorInfo(privSectors...), randomness) } -func (prover) GeneratePoStFallbackSectorChallenges(ctx context.Context, proofType abi.RegisteredPoStProof, minerID abi.ActorID, randomness abi.PoStRandomness, sectorIds []abi.SectorNumber) (*core.FallbackChallenges, error) { +func (prodProver) GeneratePoStFallbackSectorChallenges(ctx context.Context, proofType abi.RegisteredPoStProof, minerID abi.ActorID, randomness abi.PoStRandomness, sectorIds []abi.SectorNumber) (*core.FallbackChallenges, error) { randomness[31] &= 0x3f return ffi.GeneratePoStFallbackSectorChallenges(proofType, minerID, randomness, sectorIds) } -func (prover) GenerateSingleVanillaProof(ctx context.Context, replica core.FFIPrivateSectorInfo, challenges []uint64) ([]byte, error) { +func (prodProver) GenerateSingleVanillaProof(ctx context.Context, replica core.FFIPrivateSectorInfo, challenges []uint64) ([]byte, error) { start := time.Now() resCh := make(chan core.Result[[]byte], 1) @@ -92,7 +111,7 @@ func (prover) GenerateSingleVanillaProof(ctx context.Context, replica core.FFIPr } } -func (prover) GenerateWinningPoStWithVanilla(ctx context.Context, proofType abi.RegisteredPoStProof, minerID abi.ActorID, randomness abi.PoStRandomness, proofs [][]byte) ([]core.PoStProof, error) { +func (prodProver) GenerateWinningPoStWithVanilla(ctx context.Context, proofType abi.RegisteredPoStProof, minerID abi.ActorID, randomness abi.PoStRandomness, proofs [][]byte) ([]core.PoStProof, error) { randomness[31] &= 0x3f return ffi.GenerateWinningPoStWithVanilla(proofType, minerID, randomness, proofs) } diff --git a/damocles-manager/modules/impl/prover/worker/prover.go b/damocles-manager/modules/impl/prover/worker/prover.go index 415f1b6df..f1f01b9b0 100644 --- a/damocles-manager/modules/impl/prover/worker/prover.go +++ b/damocles-manager/modules/impl/prover/worker/prover.go @@ -8,11 +8,12 @@ import ( "sync" "time" - "github.com/cespare/xxhash" + "github.com/cespare/xxhash/v2" "github.com/filecoin-project/go-state-types/abi" "github.com/filecoin-project/lotus/chain/actors/builtin" "github.com/ipfs-force-community/damocles/damocles-manager/core" "github.com/ipfs-force-community/damocles/damocles-manager/modules/impl/prover" + "github.com/ipfs-force-community/damocles/damocles-manager/modules/util" 
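For readers tracking the signature change above: the prover entry point now takes the deadline index, the registered PoSt proof type and the on-chain extended sector infos, and resolves the private (sealed/cache) sector files itself via the sector tracker. A minimal caller-side sketch under those assumptions; the example package, the proveDeadline helper and its argument names are illustrative, only the core.Prover method and types come from this patch:

package example

import (
	"context"

	"github.com/filecoin-project/go-state-types/abi"
	"github.com/filecoin-project/lotus/chain/actors/builtin"

	"github.com/ipfs-force-community/damocles/damocles-manager/core"
)

// proveDeadline drives the reworked prover interface: callers no longer build a
// SortedPrivateSectorInfo themselves, they pass the chain-level sector infos and
// the prover looks up sealed-file and cache paths internally.
func proveDeadline(ctx context.Context, p core.Prover, deadlineIdx uint64, miner abi.ActorID, ppt abi.RegisteredPoStProof, sectors []builtin.ExtendedSectorInfo, rand abi.PoStRandomness) ([]builtin.PoStProof, []abi.SectorID, error) {
	return p.GenerateWindowPoSt(ctx, deadlineIdx, miner, ppt, sectors, rand)
}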
"github.com/ipfs-force-community/damocles/damocles-manager/pkg/extproc/stage" "github.com/ipfs-force-community/damocles/damocles-manager/pkg/logging" ) @@ -26,7 +27,9 @@ func GenTaskID(rawInput []byte) string { } type workerProver struct { - taskMgr core.WorkerWdPoStTaskManager + taskMgr core.WorkerWdPoStTaskManager + sectorTracker core.SectorTracker + localProver core.Prover inflightTasks map[string][]chan<- struct { output *stage.WindowPoStOutput @@ -42,9 +45,11 @@ type workerProver struct { taskLifetime time.Duration } -func NewProver(taskMgr core.WorkerWdPoStTaskManager) core.Prover { +func NewProver(taskMgr core.WorkerWdPoStTaskManager, sectorTracker core.SectorTracker) core.Prover { return &workerProver{ - taskMgr: taskMgr, + taskMgr: taskMgr, + sectorTracker: sectorTracker, + localProver: prover.NewProdProver(sectorTracker), inflightTasks: make(map[string][]chan<- struct { output *stage.WindowPoStOutput err string @@ -141,49 +146,95 @@ func (p *workerProver) runCleanupExpiredTasksJob(ctx context.Context) { } func (p *workerProver) AggregateSealProofs(ctx context.Context, aggregateInfo core.AggregateSealVerifyProofAndInfos, proofs [][]byte) ([]byte, error) { - return prover.Prover.AggregateSealProofs(ctx, aggregateInfo, proofs) + return p.localProver.AggregateSealProofs(ctx, aggregateInfo, proofs) } -func (p *workerProver) GenerateWindowPoSt(ctx context.Context, minerID abi.ActorID, sectors core.SortedPrivateSectorInfo, randomness abi.PoStRandomness) (proof []builtin.PoStProof, skipped []abi.SectorID, err error) { +func (p *workerProver) GenerateWindowPoSt(ctx context.Context, deadlineIdx uint64, minerID abi.ActorID, proofType abi.RegisteredPoStProof, sectors []builtin.ExtendedSectorInfo, randomness abi.PoStRandomness) (proof []builtin.PoStProof, skipped []abi.SectorID, err error) { - return prover.ExtGenerateWindowPoSt(minerID, sectors, randomness)(func(input stage.WindowPoSt) (stage.WindowPoStOutput, error) { - task, err := p.taskMgr.Create(ctx, input) + sis := make([]core.WdPoStSectorInfo, len(sectors)) + for i, s := range sectors { + privInfo, err := p.sectorTracker.SinglePubToPrivateInfo(ctx, minerID, s, nil) if err != nil { - return stage.WindowPoStOutput{}, fmt.Errorf("create wdPoSt task: %w", err) + return nil, nil, fmt.Errorf("construct private info for %d: %w", s.SectorNumber, err) + } + commR, err := util.CID2ReplicaCommitment(s.SealedCID) + if err != nil { + return nil, nil, fmt.Errorf("invalid sealed cid %s for sector %d of miner %d: %w", s.SealedCID, s.SectorNumber, minerID, err) } - ch := make(chan struct { - output *stage.WindowPoStOutput - err string - }, 1) + sis[i] = core.WdPoStSectorInfo{ + SectorID: s.SectorNumber, + CommR: commR, + Upgrade: s.SectorKey != nil, + Accesses: privInfo.Accesses, + } + } - p.inflightTasksLock.Lock() - p.inflightTasks[task.ID] = append(p.inflightTasks[task.ID], ch) - p.inflightTasksLock.Unlock() + input := core.WdPoStInput{ + MinerID: minerID, + ProofType: proofType, + Sectors: sis, + } + copy(input.Seed[:], randomness[:]) + + task, err := p.taskMgr.Create(ctx, input) + if err != nil { + return nil, nil, fmt.Errorf("create wdPoSt task: %w", err) + } + + ch := make(chan struct { + output *stage.WindowPoStOutput + err string + }, 1) - result, ok := <-ch - if !ok { - return stage.WindowPoStOutput{}, fmt.Errorf("wdPoSt result channel was closed unexpectedly") + p.inflightTasksLock.Lock() + p.inflightTasks[task.ID] = append(p.inflightTasks[task.ID], ch) + p.inflightTasksLock.Unlock() + + result, ok := <-ch + + if !ok { + return nil, nil, 
fmt.Errorf("wdPoSt result channel was closed unexpectedly") + } + if result.err != "" { + return nil, nil, fmt.Errorf("error from worker: %s", result.err) + } + + if faultCount := len(result.output.Faults); faultCount != 0 { + faults := make([]abi.SectorID, faultCount) + for fi := range result.output.Faults { + faults[fi] = abi.SectorID{ + Miner: minerID, + Number: result.output.Faults[fi], + } } - if result.err != "" { - return stage.WindowPoStOutput{}, fmt.Errorf("error from worker: %s", result.err) + + return nil, faults, fmt.Errorf("got %d fault sectors", faultCount) + } + + proofs := make([]builtin.PoStProof, len(result.output.Proofs)) + for pi := range result.output.Proofs { + proofs[pi] = builtin.PoStProof{ + PoStProof: proofType, + ProofBytes: result.output.Proofs[pi], } - return *result.output, nil - }) + } + + return proofs, nil, nil } -func (p *workerProver) GenerateWinningPoSt(ctx context.Context, minerID abi.ActorID, sectors core.SortedPrivateSectorInfo, randomness abi.PoStRandomness) ([]builtin.PoStProof, error) { - return prover.Prover.GenerateWinningPoSt(ctx, minerID, sectors, randomness) +func (p *workerProver) GenerateWinningPoSt(ctx context.Context, minerID abi.ActorID, proofType abi.RegisteredPoStProof, sectors []builtin.ExtendedSectorInfo, randomness abi.PoStRandomness) ([]builtin.PoStProof, error) { + return p.localProver.GenerateWinningPoSt(ctx, minerID, proofType, sectors, randomness) } func (p *workerProver) GeneratePoStFallbackSectorChallenges(ctx context.Context, proofType abi.RegisteredPoStProof, minerID abi.ActorID, randomness abi.PoStRandomness, sectorIds []abi.SectorNumber) (*core.FallbackChallenges, error) { - return prover.Prover.GeneratePoStFallbackSectorChallenges(ctx, proofType, minerID, randomness, sectorIds) + return p.localProver.GeneratePoStFallbackSectorChallenges(ctx, proofType, minerID, randomness, sectorIds) } func (p *workerProver) GenerateSingleVanillaProof(ctx context.Context, replica core.FFIPrivateSectorInfo, challenges []uint64) ([]byte, error) { - return prover.Prover.GenerateSingleVanillaProof(ctx, replica, challenges) + return p.localProver.GenerateSingleVanillaProof(ctx, replica, challenges) } func (p *workerProver) GenerateWinningPoStWithVanilla(ctx context.Context, proofType abi.RegisteredPoStProof, minerID abi.ActorID, randomness abi.PoStRandomness, proofs [][]byte) ([]core.PoStProof, error) { - return prover.Prover.GenerateWinningPoStWithVanilla(ctx, proofType, minerID, randomness, proofs) + return p.localProver.GenerateWinningPoStWithVanilla(ctx, proofType, minerID, randomness, proofs) } diff --git a/damocles-manager/modules/impl/prover/worker/rpc.go b/damocles-manager/modules/impl/prover/worker/rpc.go index 3a5b09d42..e70d618e9 100644 --- a/damocles-manager/modules/impl/prover/worker/rpc.go +++ b/damocles-manager/modules/impl/prover/worker/rpc.go @@ -2,6 +2,7 @@ package worker import ( "context" + "fmt" "github.com/ipfs-force-community/damocles/damocles-manager/core" "github.com/ipfs-force-community/damocles/damocles-manager/pkg/extproc/stage" @@ -21,8 +22,8 @@ func (api WdPoStAPIImpl) WdPoStHeartbeatTask(ctx context.Context, runningTaskIDs return api.taskMgr.Heartbeat(ctx, runningTaskIDs, workerName) } -func (api WdPoStAPIImpl) WdPoStAllocateTasks(ctx context.Context, num uint32, workName string) (allocatedTasks []core.WdPoStAllocatedTask, err error) { - return api.taskMgr.AllocateTasks(ctx, num, workName) +func (api WdPoStAPIImpl) WdPoStAllocateTasks(ctx context.Context, spec core.AllocateWdPoStTaskSpec, num uint32, workerName 
string) (allocatedTasks []*core.WdPoStAllocatedTask, err error) { + return api.taskMgr.AllocateTasks(ctx, spec, num, workerName) } func (api WdPoStAPIImpl) WdPoStFinishTask(ctx context.Context, taskID string, output *stage.WindowPoStOutput, errorReason string) error { @@ -36,3 +37,26 @@ func (api WdPoStAPIImpl) WdPoStResetTask(ctx context.Context, taskID string) err func (api WdPoStAPIImpl) WdPoStAllTasks(ctx context.Context) ([]*core.WdPoStTask, error) { return api.taskMgr.All(ctx, func(_ *core.WdPoStTask) bool { return true }) } + +// TODO(0x5459): UnavailableWdPoStAPIImpl should be automatically generated +type UnavailableWdPoStAPIImpl struct{} + +func (UnavailableWdPoStAPIImpl) WdPoStHeartbeatTask(ctx context.Context, runningTaskIDs []string, workerName string) error { + return fmt.Errorf("WdPoStAPI unavailable") +} + +func (UnavailableWdPoStAPIImpl) WdPoStAllocateTasks(ctx context.Context, spec core.AllocateWdPoStTaskSpec, num uint32, workerName string) (allocatedTasks []core.WdPoStAllocatedTask, err error) { + return nil, fmt.Errorf("WdPoStAPI unavailable") +} + +func (UnavailableWdPoStAPIImpl) WdPoStFinishTask(ctx context.Context, taskID string, output *stage.WindowPoStOutput, errorReason string) error { + return fmt.Errorf("WdPoStAPI unavailable") +} + +func (UnavailableWdPoStAPIImpl) WdPoStResetTask(ctx context.Context, taskID string) error { + return fmt.Errorf("WdPoStAPI unavailable") +} + +func (UnavailableWdPoStAPIImpl) WdPoStAllTasks(ctx context.Context) ([]*core.WdPoStTask, error) { + return nil, fmt.Errorf("WdPoStAPI unavailable") +} diff --git a/damocles-manager/modules/impl/prover/worker/task_mgr_kv.go b/damocles-manager/modules/impl/prover/worker/task_mgr_kv.go index 98480ba3e..1810e8cd2 100644 --- a/damocles-manager/modules/impl/prover/worker/task_mgr_kv.go +++ b/damocles-manager/modules/impl/prover/worker/task_mgr_kv.go @@ -12,6 +12,7 @@ import ( "github.com/ipfs-force-community/damocles/damocles-manager/core" "github.com/ipfs-force-community/damocles/damocles-manager/pkg/extproc/stage" "github.com/ipfs-force-community/damocles/damocles-manager/pkg/kvstore" + "golang.org/x/exp/slices" ) func NewKVTaskManager(kv kvstore.KVExt) core.WorkerWdPoStTaskManager { @@ -45,6 +46,7 @@ func (tm *kvTaskManager) filter(ctx context.Context, txn kvstore.TxnExt, state c } func (tm *kvTaskManager) All(ctx context.Context, filter func(*core.WdPoStTask) bool) (tasks []*core.WdPoStTask, err error) { + tasks = make([]*core.WdPoStTask, 0) err = tm.kv.ViewMustNoConflict(ctx, func(txn kvstore.TxnExt) error { for _, state := range []core.WdPoStTaskState{core.WdPoStTaskReadyToRun, core.WdPoStTaskRunning, core.WdPoStTaskFinished} { ts, err := tm.filter(ctx, txn, state, math.MaxUint32, filter) @@ -77,7 +79,7 @@ func (tm *kvTaskManager) ListByTaskIDs(ctx context.Context, state core.WdPoStTas return tasks, err } -func (tm *kvTaskManager) Create(ctx context.Context, input stage.WindowPoSt) (*core.WdPoStTask, error) { +func (tm *kvTaskManager) Create(ctx context.Context, input core.WdPoStInput) (*core.WdPoStTask, error) { var ( taskID string task *core.WdPoStTask @@ -126,10 +128,19 @@ func (tm *kvTaskManager) Create(ctx context.Context, input stage.WindowPoSt) (*c return task, err } -func (tm *kvTaskManager) AllocateTasks(ctx context.Context, n uint32, workName string) (allocatedTasks []core.WdPoStAllocatedTask, err error) { +func (tm *kvTaskManager) AllocateTasks(ctx context.Context, spec core.AllocateWdPoStTaskSpec, n uint32, workerName string) (allocatedTasks []*core.WdPoStAllocatedTask, err error) 
{ var readyToRun []*core.WdPoStTask + allocatedTasks = make([]*core.WdPoStAllocatedTask, 0) err = tm.kv.UpdateMustNoConflict(ctx, func(txn kvstore.TxnExt) error { - readyToRun, err = tm.filter(ctx, txn, core.WdPoStTaskReadyToRun, n, func(_ *core.WdPoStTask) bool { return true }) + readyToRun, err = tm.filter(ctx, txn, core.WdPoStTaskReadyToRun, n, func(t *core.WdPoStTask) bool { + if len(spec.AllowedMiners) > 0 && !slices.Contains(spec.AllowedMiners, t.Input.MinerID) { + return false + } + if len(spec.AllowedProofTypes) > 0 && !slices.Contains(spec.AllowedProofTypes, t.Input.ProofType) { + return false + } + return true + }) if err != nil { return err } @@ -137,7 +148,7 @@ for _, task := range readyToRun { task.TryNum++ task.StartedAt = now - task.WorkerName = workName + task.WorkerName = workerName task.HeartbeatAt = now task.UpdatedAt = now // Moving ready to run tasks to running tasks @@ -147,7 +158,7 @@ if err := txn.PutJson([]byte(makeWdPoStKey(core.WdPoStTaskRunning, task.ID)), task); err != nil { return err } - allocatedTasks = append(allocatedTasks, core.WdPoStAllocatedTask{ + allocatedTasks = append(allocatedTasks, &core.WdPoStAllocatedTask{ ID: task.ID, Input: task.Input, }) diff --git a/damocles-manager/modules/miner/proof_event.go b/damocles-manager/modules/miner/proof_event.go index 068825d87..b68126e85 100644 --- a/damocles-manager/modules/miner/proof_event.go +++ b/damocles-manager/modules/miner/proof_event.go @@ -6,6 +6,7 @@ import ( "fmt" "time" + "github.com/filecoin-project/go-state-types/abi" "github.com/filecoin-project/venus/venus-shared/actors/builtin" v2 "github.com/filecoin-project/venus/venus-shared/api/gateway/v2" vtypes "github.com/filecoin-project/venus/venus-shared/types" @@ -98,7 +99,7 @@ func (pe *ProofEvent) listenProofRequestOnce(ctx context.Context) error { // context.Context, []builtin.ExtendedSectorInfo, abi.PoStRandomness, abi.ChainEpoch, network.Version func (pe *ProofEvent) processComputeProof(ctx context.Context, reqID vtypes.UUID, req gtypes.ComputeProofRequest) { - privSectors, err := pe.sectorsPubToPrivate(ctx, req.SectorInfos) + ppt, err := pe.postProofType(req.SectorInfos) if err != nil { _ = pe.client.ResponseProofEvent(ctx, &gtypes.ResponseEvent{ ID: reqID, @@ -108,7 +109,7 @@ func (pe *ProofEvent) processComputeProof(ctx context.Context, reqID vtypes.UUID return } - proof, err := pe.prover.GenerateWinningPoSt(ctx, pe.actor.ID, privSectors, req.Rand) + proof, err := pe.prover.GenerateWinningPoSt(ctx, pe.actor.ID, ppt, req.SectorInfos, req.Rand) if err != nil { _ = pe.client.ResponseProofEvent(ctx, &gtypes.ResponseEvent{ ID: reqID, @@ -138,18 +139,14 @@ func (pe *ProofEvent) processComputeProof(ctx context.Context, reqID vtypes.UUID } } -func (pe *ProofEvent) sectorsPubToPrivate(ctx context.Context, sectorInfo []builtin.ExtendedSectorInfo) (core.SortedPrivateSectorInfo, error) { +func (pe *ProofEvent) postProofType(sectorInfo []builtin.ExtendedSectorInfo) (abi.RegisteredPoStProof, error) { if len(sectorInfo) == 0 { - return core.SortedPrivateSectorInfo{}, fmt.Errorf("must provide sectors for winning post") + return 0, fmt.Errorf("must provide sectors for winning post") } ppt, err := sectorInfo[0].SealProof.RegisteredWinningPoStProof() if err != nil { - return core.SortedPrivateSectorInfo{}, fmt.Errorf("failed to convert to winning post proof: %w", err) - } - out, err := 
pe.tracker.PubToPrivate(ctx, pe.actor.ID, ppt, sectorInfo) - if err != nil { - return core.SortedPrivateSectorInfo{}, fmt.Errorf("convert to private infos: %w", err) + return 0, fmt.Errorf("failed to convert to winning post proof: %w", err) } - return core.NewSortedPrivateSectorInfo(out...), nil + return ppt, nil } diff --git a/damocles-manager/modules/poster/runner.go b/damocles-manager/modules/poster/runner.go index 38c4ad5cc..1c7c5d27e 100644 --- a/damocles-manager/modules/poster/runner.go +++ b/damocles-manager/modules/poster/runner.go @@ -323,12 +323,8 @@ func (pr *postRunner) generatePoStForPartitionBatch(glog *logging.ZapLogger, ran if err != nil { return false, fmt.Errorf("convert to v1_1 post proof: %w", err) } - privSectors, err := pr.deps.sectorTracker.PubToPrivate(pr.ctx, pr.mid, pp, xsinfos) - if err != nil { - return true, fmt.Errorf("turn public sector infos into private: %w", err) - } - postOut, ps, err := pr.deps.prover.GenerateWindowPoSt(pr.ctx, pr.mid, core.NewSortedPrivateSectorInfo(privSectors...), append(abi.PoStRandomness{}, rand.Rand...)) + postOut, ps, err := pr.deps.prover.GenerateWindowPoSt(pr.ctx, pr.dinfo.Index, pr.mid, pp, xsinfos, append(abi.PoStRandomness{}, rand.Rand...)) alog.Infow("computing window post", "elapsed", time.Since(tsStart)) diff --git a/damocles-manager/modules/sealer/sealer_cli.go b/damocles-manager/modules/sealer/sealer_cli.go index c3d888403..880cbfb47 100644 --- a/damocles-manager/modules/sealer/sealer_cli.go +++ b/damocles-manager/modules/sealer/sealer_cli.go @@ -60,18 +60,13 @@ func (s *Sealer) CheckProvable(ctx context.Context, mid abi.ActorID, postProofTy return s.sectorTracker.Provable(ctx, mid, postProofType, sectors, strict, stateCheck) } -func (s *Sealer) SimulateWdPoSt(ctx context.Context, maddr address.Address, postProofType abi.RegisteredPoStProof, sis []builtin.ExtendedSectorInfo, rand abi.PoStRandomness) error { +func (s *Sealer) SimulateWdPoSt(ctx context.Context, ddlIndex uint64, maddr address.Address, postProofType abi.RegisteredPoStProof, sis []builtin.ExtendedSectorInfo, rand abi.PoStRandomness) error { mid, err := address.IDFromAddress(maddr) if err != nil { return err } - privSectors, err := s.sectorTracker.PubToPrivate(ctx, abi.ActorID(mid), postProofType, sis) - if err != nil { - return fmt.Errorf("turn public sector infos into private: %w", err) - } - - slog := log.With("miner", mid, "sectors", len(privSectors)) + slog := log.With("miner", mid, "sectors", len(sis)) go func() { tCtx := context.TODO() @@ -79,7 +74,7 @@ func (s *Sealer) SimulateWdPoSt(ctx context.Context, maddr address.Address, post tsStart := clock.NewSystemClock().Now() slog.Info("mock generate window post start") - proof, skipped, err := s.prover.GenerateWindowPoSt(tCtx, abi.ActorID(mid), core.NewSortedPrivateSectorInfo(privSectors...), append(abi.PoStRandomness{}, rand...)) + proof, skipped, err := s.prover.GenerateWindowPoSt(tCtx, ddlIndex, abi.ActorID(mid), postProofType, sis, append(abi.PoStRandomness{}, rand...)) if err != nil { slog.Warnf("generate window post failed: %v", err.Error()) return diff --git a/damocles-manager/ver/ver.go b/damocles-manager/ver/ver.go index e0d30f57a..8c14ac07a 100644 --- a/damocles-manager/ver/ver.go +++ b/damocles-manager/ver/ver.go @@ -9,3 +9,7 @@ var Commit string func VersionStr() string { return fmt.Sprintf("v%s-%s-%s", Version, Prover, Commit) } + +func ProverIsProd() bool { + return Prover == "prod" +} From db9b26468823aeb292d79ed5c99d695956874bcc Mon Sep 17 00:00:00 2001 From: tanlang Date: Fri, 7 Jul 2023 
14:46:37 +0800 Subject: [PATCH 08/18] feat: add cli to manage wdpost tasks --- .../damocles-manager/internal/util_worker.go | 99 +++++++++++++++++++ damocles-manager/go.mod | 6 +- 2 files changed, 102 insertions(+), 3 deletions(-) diff --git a/damocles-manager/cmd/damocles-manager/internal/util_worker.go b/damocles-manager/cmd/damocles-manager/internal/util_worker.go index ebb1bbcfc..4d32f065f 100644 --- a/damocles-manager/cmd/damocles-manager/internal/util_worker.go +++ b/damocles-manager/cmd/damocles-manager/internal/util_worker.go @@ -26,6 +26,7 @@ var utilWorkerCmd = &cli.Command{ utilWorkerInfoCmd, utilWorkerPauseCmd, utilWorkerResumeCmd, + utilWdPostCmd, }, } @@ -302,3 +303,101 @@ func resolveWorkerDest(ctx context.Context, a *APIClient, name string) (string, return addr.String(), nil } + +var utilWdPostCmd = &cli.Command{ + Name: "wdpost", + Usage: "manage wdpost tasks handled by workers", + Subcommands: []*cli.Command{ + utilWdPostListCmd, + utilWdPostResetCmd, + }, +} + +var utilWdPostListCmd = &cli.Command{ + Name: "list", + Usage: "list all wdpost tasks", + Flags: []cli.Flag{ + &cli.BoolFlag{ + Name: "all", + Usage: "list all wdpost task, include the task that has been succeed", + }, + }, + Action: func(cctx *cli.Context) error { + a, actx, stopper, err := extractAPI(cctx) + if err != nil { + return fmt.Errorf("get api: %w", err) + } + defer stopper() + + var tasks []*core.WdPoStTask + tasks, err = a.Damocles.WdPoStAllTasks(actx) + if err != nil { + return fmt.Errorf("get wdpost tasks: %w", err) + } + + w := tabwriter.NewWriter(os.Stdout, 2, 4, 2, ' ', 0) + _, err = w.Write([]byte("ID\tMinerID\tWorker\tState\tCreateAt\tStartedAt\tHeartbeatAt\tFinishedAt\tError\n")) + if err != nil { + return err + } + for _, task := range tasks { + + state := "ReadyToRun" + if task.StartedAt != 0 { + state = "Running" + } + if task.FinishedAt != 0 { + if task.ErrorReason != "" { + state = "Failed" + } else { + state = "Succeed" + } + } + + if !cctx.Bool("all") && state == "Succeed" { + continue + } + + fmt.Fprintf(w, "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", + task.ID, + task.Input.MinerID, + task.WorkerName, + state, + time.Unix(int64(task.CreatedAt), 0), + time.Unix(int64(task.StartedAt), 0), + time.Unix(int64(task.HeartbeatAt), 0), + time.Unix(int64(task.FinishedAt), 0), + task.ErrorReason, + ) + } + + w.Flush() + return nil + }, +} + +var utilWdPostResetCmd = &cli.Command{ + Name: "reset", + Usage: "reset wdpost task", + ArgsUsage: "", + Action: func(cctx *cli.Context) error { + args := cctx.Args() + if args.Len() < 1 { + return cli.ShowSubcommandHelp(cctx) + } + + id := args.First() + a, actx, stopper, err := extractAPI(cctx) + if err != nil { + return fmt.Errorf("get api: %w", err) + } + defer stopper() + + err = a.Damocles.WdPoStResetTask(actx, id) + if err != nil { + return fmt.Errorf("reset wdpost task: %w", err) + } + + return nil + }, +} diff --git a/damocles-manager/go.mod b/damocles-manager/go.mod index debde467d..bb9659518 100644 --- a/damocles-manager/go.mod +++ b/damocles-manager/go.mod @@ -5,7 +5,7 @@ go 1.18 require ( contrib.go.opencensus.io/exporter/prometheus v0.4.0 github.com/BurntSushi/toml v1.2.1 - github.com/cespare/xxhash v1.1.0 + github.com/cespare/xxhash/v2 v2.2.0 github.com/dgraph-io/badger/v2 v2.2007.3 github.com/docker/go-units v0.5.0 github.com/dtynn/dix v0.1.2 @@ -52,6 +52,7 @@ require ( go.opencensus.io v0.24.0 go.uber.org/fx v1.15.0 go.uber.org/zap v1.23.0 + golang.org/x/exp v0.0.0-20230224173230-c95f2b4c22f2 ) require ( @@ -64,7 +65,7 @@ 
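Once this patch is applied, the new subcommands hang off the existing worker command group. Assuming the usual damocles-manager util worker invocation path (the parent command names are defined outside this patch, so treat the prefix as illustrative), usage looks roughly like:

    # list wdpost tasks; succeeded tasks are hidden unless --all is given
    damocles-manager util worker wdpost list --all

    # reset a task so that it can be picked up and retried by a worker
    damocles-manager util worker wdpost reset <task-id>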
github.com/benbjohnson/clock v1.3.0 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/bluele/gcache v0.0.0-20190518031135-bc40bd653833 // indirect - github.com/cespare/xxhash/v2 v2.2.0 // indirect + github.com/cespare/xxhash v1.1.0 // indirect github.com/cilium/ebpf v0.4.0 // indirect github.com/containerd/cgroups v1.0.4 // indirect github.com/coreos/go-systemd/v22 v22.4.0 // indirect @@ -214,7 +215,6 @@ require ( go.uber.org/multierr v1.8.0 // indirect go4.org v0.0.0-20200411211856-f5505b9728dd // indirect golang.org/x/crypto v0.6.0 // indirect - golang.org/x/exp v0.0.0-20230224173230-c95f2b4c22f2 // indirect golang.org/x/lint v0.0.0-20210508222113-6edffad5e616 // indirect golang.org/x/mod v0.8.0 // indirect golang.org/x/net v0.7.0 // indirect From dae0505dcab67fa009ebc4c5e3b16e4c1ce1dfb6 Mon Sep 17 00:00:00 2001 From: 0x5459 <0x5459@protonmail.com> Date: Fri, 7 Jul 2023 15:31:00 +0800 Subject: [PATCH 09/18] feat: split sector tracker --- .../internal/util_sealer_proving.go | 16 +- damocles-manager/core/ifaces.go | 6 +- damocles-manager/dep/poster.go | 4 +- damocles-manager/dep/sealer.go | 4 +- damocles-manager/dep/sealer_constructor.go | 4 +- .../modules/impl/sectors/proving.go | 224 ++++++++++++++++++ .../modules/impl/sectors/tracker.go | 196 +-------------- damocles-manager/modules/poster/poster.go | 14 +- damocles-manager/modules/poster/runner.go | 2 +- damocles-manager/modules/sealer/sealer.go | 8 +- damocles-manager/modules/sealer/sealer_cli.go | 4 +- 11 files changed, 260 insertions(+), 222 deletions(-) create mode 100644 damocles-manager/modules/impl/sectors/proving.go diff --git a/damocles-manager/cmd/damocles-manager/internal/util_sealer_proving.go b/damocles-manager/cmd/damocles-manager/internal/util_sealer_proving.go index 7df771831..74b26e7f7 100644 --- a/damocles-manager/cmd/damocles-manager/internal/util_sealer_proving.go +++ b/damocles-manager/cmd/damocles-manager/internal/util_sealer_proving.go @@ -29,7 +29,6 @@ import ( "github.com/filecoin-project/venus/venus-shared/actors/builtin/miner" "github.com/filecoin-project/venus/venus-shared/types" - ffi "github.com/filecoin-project/filecoin-ffi" "github.com/ipfs-force-community/damocles/damocles-manager/core" "github.com/ipfs-force-community/damocles/damocles-manager/modules/policy" "github.com/ipfs-force-community/damocles/damocles-manager/modules/util" @@ -830,7 +829,11 @@ var utilSealerProvingWinningVanillaCmd = &cli.Command{ }, }, Action: func(cctx *cli.Context) error { - api, actx, astop, err := extractAPI(cctx) + var ( + prover core.Prover + verifier core.Verifier + ) + api, actx, astop, err := extractAPI(cctx, &prover, &verifier) if err != nil { return err } @@ -878,7 +881,7 @@ var utilSealerProvingWinningVanillaCmd = &cli.Command{ randomness := make(abi.PoStRandomness, abi.RandomnessLength) - challenges, err := ffi.GeneratePoStFallbackSectorChallenges(abi.RegisteredPoStProof_StackedDrgWinning32GiBV1, sectorID.Miner, randomness, []abi.SectorNumber{sectorID.Number}) + challenges, err := prover.GeneratePoStFallbackSectorChallenges(actx, abi.RegisteredPoStProof_StackedDrgWinning32GiBV1, sectorID.Miner, randomness, []abi.SectorNumber{sectorID.Number}) if err != nil { return fmt.Errorf("generate challenge for sector %s: %w", sealedFileName, err) } @@ -890,7 +893,7 @@ var utilSealerProvingWinningVanillaCmd = &cli.Command{ slog.Infof("%d challenge generated", len(challenge)) - vannilla, err := ffi.GenerateSingleVanillaProof(core.FFIPrivateSectorInfo{ + vannilla, err := prover.GenerateSingleVanillaProof(actx, 
core.FFIPrivateSectorInfo{ SectorInfo: sectorInfo, PoStProofType: abi.RegisteredPoStProof_StackedDrgWinning32GiBV1, CacheDirPath: cacheDirPath, @@ -902,15 +905,14 @@ var utilSealerProvingWinningVanillaCmd = &cli.Command{ slog.Infof("vannilla generated with %d bytes", len(vannilla)) - proofs, err := ffi.GenerateWinningPoStWithVanilla(abi.RegisteredPoStProof_StackedDrgWinning32GiBV1, sectorID.Miner, randomness, [][]byte{vannilla}) + proofs, err := prover.GenerateWinningPoStWithVanilla(actx, abi.RegisteredPoStProof_StackedDrgWinning32GiBV1, sectorID.Miner, randomness, [][]byte{vannilla}) if err != nil { return fmt.Errorf("generate winning post with vannilla for %s: %w", sealedFileName, err) } slog.Infof("proof generated with %d bytes", len(proofs[0].ProofBytes)) - randomness[31] &= 0x3f - verified, err := ffi.VerifyWinningPoSt(core.WinningPoStVerifyInfo{ + verified, err := verifier.VerifyWinningPoSt(actx, core.WinningPoStVerifyInfo{ Randomness: randomness, Proofs: proofs, ChallengedSectors: []core.SectorInfo{sectorInfo}, diff --git a/damocles-manager/core/ifaces.go b/damocles-manager/core/ifaces.go index 325c1ba25..1a4a53f64 100644 --- a/damocles-manager/core/ifaces.go +++ b/damocles-manager/core/ifaces.go @@ -63,9 +63,13 @@ type SectorIndexer interface { type SectorTracker interface { SinglePubToPrivateInfo(ctx context.Context, mid abi.ActorID, sectorInfo builtin.ExtendedSectorInfo, locator SectorLocator) (PrivateSectorInfo, error) SinglePrivateInfo(ctx context.Context, sref SectorRef, upgrade bool, locator SectorLocator) (PrivateSectorInfo, error) + PubToPrivate(ctx context.Context, mid abi.ActorID, postProofType abi.RegisteredPoStProof, sectorInfo []builtin.ExtendedSectorInfo) ([]FFIPrivateSectorInfo, error) +} + +type SectorProving interface { SingleProvable(ctx context.Context, postProofType abi.RegisteredPoStProof, sref SectorRef, upgrade bool, locator SectorLocator, strict, stateCheck bool) error Provable(ctx context.Context, mid abi.ActorID, postProofType abi.RegisteredPoStProof, sectors []builtin.ExtendedSectorInfo, strict, stateCheck bool) (map[abi.SectorNumber]string, error) - PubToPrivate(ctx context.Context, mid abi.ActorID, postProofType abi.RegisteredPoStProof, sectorInfo []builtin.ExtendedSectorInfo) ([]FFIPrivateSectorInfo, error) + SectorTracker } type SnapUpSectorManager interface { diff --git a/damocles-manager/dep/poster.go b/damocles-manager/dep/poster.go index c1706a345..423c3b5a2 100644 --- a/damocles-manager/dep/poster.go +++ b/damocles-manager/dep/poster.go @@ -25,13 +25,13 @@ func RunPoSter( scfg *modules.SafeConfig, verifier core.Verifier, prover core.Prover, - sectorTracker core.SectorTracker, + sectorProving core.SectorProving, capi chain.API, rapi core.RandomnessAPI, mapi messager.API, minerAPI core.MinerAPI, ) error { - p, err := poster.NewPoSter(scfg, capi, mapi, rapi, minerAPI, prover, verifier, sectorTracker) + p, err := poster.NewPoSter(scfg, capi, mapi, rapi, minerAPI, prover, verifier, sectorProving) if err != nil { return err } diff --git a/damocles-manager/dep/sealer.go b/damocles-manager/dep/sealer.go index 8ca2efc47..dd7a1b65f 100644 --- a/damocles-manager/dep/sealer.go +++ b/damocles-manager/dep/sealer.go @@ -12,6 +12,7 @@ import ( "github.com/ipfs-force-community/damocles/damocles-manager/modules/impl/mock" "github.com/ipfs-force-community/damocles/damocles-manager/modules/impl/prover" "github.com/ipfs-force-community/damocles/damocles-manager/modules/impl/randomness" + "github.com/ipfs-force-community/damocles/damocles-manager/modules/impl/sectors" 
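The interface split above separates path resolution (SectorTracker) from provability checking (SectorProving) while letting SectorProving embed SectorTracker, so a consumer that needs both keeps a single dependency. A rough consumer-side sketch; the example package and deadlineChecker type are illustrative, only the core interfaces and method signatures come from this patch:

package example

import (
	"context"

	"github.com/filecoin-project/go-state-types/abi"
	"github.com/filecoin-project/venus/venus-shared/actors/builtin"

	"github.com/ipfs-force-community/damocles/damocles-manager/core"
)

// deadlineChecker only calls Provable here, but because core.SectorProving embeds
// core.SectorTracker the same field could also resolve private sector info.
type deadlineChecker struct {
	proving core.SectorProving
}

func (c *deadlineChecker) badSectors(ctx context.Context, mid abi.ActorID, ppt abi.RegisteredPoStProof, sectors []builtin.ExtendedSectorInfo) (map[abi.SectorNumber]string, error) {
	// The two trailing flags are strict and stateCheck; both are relaxed here.
	return c.proving.Provable(ctx, mid, ppt, sectors, false, false)
}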
"github.com/ipfs-force-community/damocles/damocles-manager/modules/sealer" "github.com/ipfs-force-community/damocles/damocles-manager/pkg/chain" "github.com/ipfs-force-community/damocles/damocles-manager/pkg/confmgr" @@ -56,7 +57,8 @@ func Product() dix.Option { dix.Override(new(core.SectorStateManager), BuildLocalSectorStateManager), dix.Override(new(core.SectorNumberAllocator), BuildSectorNumberAllocator), dix.Override(new(core.RandomnessAPI), randomness.New), - dix.Override(new(core.SectorTracker), BuildSectorTracker), + dix.Override(new(core.SectorTracker), sectors.NewTracker), + dix.Override(new(core.SectorProving), BuildSectorProving), dix.If(ver.ProverIsProd(), prodProver()), dix.If(!ver.ProverIsProd(), fakerProver()), diff --git a/damocles-manager/dep/sealer_constructor.go b/damocles-manager/dep/sealer_constructor.go index 93caf207f..c6d139324 100644 --- a/damocles-manager/dep/sealer_constructor.go +++ b/damocles-manager/dep/sealer_constructor.go @@ -491,8 +491,8 @@ func BuildSectorIndexer(storeMgr PersistedObjectStoreManager, kv SectorIndexMeta return sectors.NewIndexer(storeMgr, kv, upgrade) } -func BuildSectorTracker(indexer core.SectorIndexer, state core.SectorStateManager, prover core.Prover, capi chain.API, scfg *modules.SafeConfig) (core.SectorTracker, error) { - return sectors.NewTracker(indexer, state, prover, capi, scfg.MustCommonConfig().Proving) +func BuildSectorProving(tracker core.SectorTracker, state core.SectorStateManager, prover core.Prover, capi chain.API, scfg *modules.SafeConfig) (core.SectorProving, error) { + return sectors.NewProving(tracker, state, prover, capi, scfg.MustCommonConfig().Proving) } type MarketAPIRelatedComponents struct { diff --git a/damocles-manager/modules/impl/sectors/proving.go b/damocles-manager/modules/impl/sectors/proving.go new file mode 100644 index 000000000..7cca6f4c0 --- /dev/null +++ b/damocles-manager/modules/impl/sectors/proving.go @@ -0,0 +1,224 @@ +package sectors + +import ( + "context" + "fmt" + "math/rand" + "sync" + "time" + + "github.com/filecoin-project/go-address" + "github.com/filecoin-project/go-state-types/abi" + "github.com/filecoin-project/venus/venus-shared/actors/builtin" + "github.com/filecoin-project/venus/venus-shared/types" + "github.com/ipfs-force-community/damocles/damocles-manager/core" + "github.com/ipfs-force-community/damocles/damocles-manager/modules" + chainAPI "github.com/ipfs-force-community/damocles/damocles-manager/pkg/chain" + "github.com/ipfs-force-community/damocles/damocles-manager/pkg/objstore" + "github.com/ipfs/go-cid" +) + +func NewProving(sectorTracker core.SectorTracker, state core.SectorStateManager, prover core.Prover, capi chainAPI.API, stCfg modules.ProvingConfig) (core.SectorProving, error) { + return &Proving{ + SectorTracker: sectorTracker, + state: state, + prover: prover, + capi: capi, + + parallelCheckLimit: stCfg.ParallelCheckLimit, + singleCheckTimeout: time.Duration(stCfg.SingleCheckTimeout), + partitionCheckTimeout: time.Duration(stCfg.PartitionCheckTimeout), + }, nil +} + +type Proving struct { + core.SectorTracker + state core.SectorStateManager + storeMgr objstore.Manager + prover core.Prover + capi chainAPI.API + + parallelCheckLimit int + singleCheckTimeout time.Duration + partitionCheckTimeout time.Duration +} + +func (p *Proving) SingleProvable(ctx context.Context, postProofType abi.RegisteredPoStProof, sref core.SectorRef, upgrade bool, locator core.SectorLocator, strict, stateCheck bool) error { + ssize, err := sref.ProofType.SectorSize() + if err != nil { + 
return fmt.Errorf("get sector size: %w", err) + } + + privateInfo, err := p.SectorTracker.SinglePrivateInfo(ctx, sref, upgrade, locator) + if err != nil { + return fmt.Errorf("get private info: %w", err) + } + sealedFileIns, err := p.storeMgr.GetInstance(ctx, privateInfo.Accesses.SealedFile) + if err != nil { + return fmt.Errorf("get objstore instance %s for sealed file: %w", privateInfo.Accesses.SealedFile, err) + } + + cacheDirIns, err := p.storeMgr.GetInstance(ctx, privateInfo.Accesses.CacheDir) + if err != nil { + return fmt.Errorf("get objstore instance %s for cache dir: %w", privateInfo.Accesses.CacheDir, err) + } + + targetsInCacheDir := map[string]int64{} + addCachePathsForSectorSize(targetsInCacheDir, privateInfo.CacheDirURI, ssize) + + checks := []struct { + title string + store objstore.Store + targets map[string]int64 + }{ + { + title: "sealed file", + store: sealedFileIns, + targets: map[string]int64{ + privateInfo.SealedSectorURI: 1, + }, + }, + { + title: "cache dir", + store: cacheDirIns, + targets: targetsInCacheDir, + }, + } + + for _, check := range checks { + for p, sz := range check.targets { + st, err := check.store.Stat(ctx, p) + if err != nil { + return fmt.Errorf("stat object %s for %s: %w", p, check.title, err) + } + + if sz != 0 { + if st.Size != int64(ssize)*sz { + return fmt.Errorf("%s for %s with wrong size (got %d, expect %d)", p, check.title, st.Size, int64(ssize)*sz) + } + } + } + } + + if !strict { + return nil + } + + addr, err := address.NewIDAddress(uint64(sref.ID.Miner)) + if err != nil { + return err + } + sinfo, err := p.capi.StateSectorGetInfo(ctx, addr, sref.ID.Number, types.EmptyTSK) + if err != nil { + return err + } + + if stateCheck { + // local and chain consistency check + ss, err := p.state.Load(ctx, sref.ID, core.WorkerOffline) + if err != nil { + return fmt.Errorf("not exist in Offline, maybe in Online: %w", err) + } + // for snap: onChain.SealedCID == local.UpgradedInfo.SealedCID, onChain.SectorKeyCID == ss.Pre.CommR, for other(CC/DC): onChain.SealedCID == onChain.SealedCID + if !upgrade { + if !ss.Pre.CommR.Equals(sinfo.SealedCID) { + return fmt.Errorf("the SealedCID on the local and the chain is inconsistent") + } + } else { + if !sinfo.SectorKeyCID.Equals(ss.Pre.CommR) { + return fmt.Errorf("the SectorKeyCID on the local and the chain is inconsistent") + } + + // 从 lotus 导入的扇区 UpgradedInfo 是空值,见代码: damocles-manager/cmd/damocles-manager/internal/util_sealer_sectors.go#L1735 + if ss.UpgradedInfo.SealedCID != cid.Undef && !sinfo.SealedCID.Equals(ss.UpgradedInfo.SealedCID) { + return fmt.Errorf("the SealedCID on the local and the chain is inconsistent") + } + } + } + + replica := privateInfo.ToFFI(core.SectorInfo{ + SealProof: sref.ProofType, + SectorNumber: sref.ID.Number, + SealedCID: sinfo.SealedCID, + }, postProofType) + + scCtx := ctx + if p.singleCheckTimeout > 0 { + var scCancel context.CancelFunc + scCtx, scCancel = context.WithTimeout(ctx, p.singleCheckTimeout) + defer scCancel() + } + + // use randUint64 % nodeNums as challenge, notice nodeNums = ssize / 32B + _, err = p.prover.GenerateSingleVanillaProof(scCtx, replica, []uint64{rand.Uint64() % (uint64(ssize) / 32)}) + + if err != nil { + return fmt.Errorf("generate vanilla proof of %s failed: %w", sref.ID, err) + } + + return nil +} + +func (p *Proving) Provable(ctx context.Context, mid abi.ActorID, postProofType abi.RegisteredPoStProof, sectors []builtin.ExtendedSectorInfo, strict, stateCheck bool) (map[abi.SectorNumber]string, error) { + limit := p.parallelCheckLimit + if 
limit <= 0 { + limit = len(sectors) + } + throttle := make(chan struct{}, limit) + + if p.partitionCheckTimeout > 0 { + var pcCancel context.CancelFunc + ctx, pcCancel = context.WithTimeout(ctx, p.partitionCheckTimeout) + defer pcCancel() + } + + results := make([]string, len(sectors)) + var wg sync.WaitGroup + wg.Add(len(sectors)) + + for ti := range sectors { + select { + case throttle <- struct{}{}: + case <-ctx.Done(): + // After the overtime, walk through the cycle and do not turn on the thread check. + results[ti] = fmt.Sprintf("waiting for check worker: %s", ctx.Err()) + wg.Done() + continue + } + + go func(i int) { + defer wg.Done() + defer func() { + <-throttle + }() + + ctx, cancel := context.WithCancel(ctx) + defer cancel() + + sector := sectors[i] + + sref := core.SectorRef{ + ID: abi.SectorID{Miner: mid, Number: sector.SectorNumber}, + ProofType: sector.SealProof, + } + err := p.SingleProvable(ctx, postProofType, sref, sector.SectorKey != nil, nil, strict, stateCheck) + if err == nil { + return + } + + results[i] = err.Error() + + }(ti) + } + + wg.Wait() + + bad := map[abi.SectorNumber]string{} + for ri := range results { + if results[ri] != "" { + bad[sectors[ri].SectorNumber] = results[ri] + } + } + + return bad, nil +} diff --git a/damocles-manager/modules/impl/sectors/tracker.go b/damocles-manager/modules/impl/sectors/tracker.go index 3c7adfb47..19fc444fd 100644 --- a/damocles-manager/modules/impl/sectors/tracker.go +++ b/damocles-manager/modules/impl/sectors/tracker.go @@ -3,22 +3,13 @@ package sectors import ( "context" "fmt" - "math/rand" - "sync" - "time" - "github.com/ipfs/go-cid" - - "github.com/filecoin-project/go-address" "github.com/filecoin-project/go-state-types/abi" "github.com/filecoin-project/venus/venus-shared/actors/builtin" - "github.com/filecoin-project/venus/venus-shared/types" "github.com/ipfs-force-community/damocles/damocles-manager/core" - "github.com/ipfs-force-community/damocles/damocles-manager/modules" "github.com/ipfs-force-community/damocles/damocles-manager/modules/util" - chainAPI "github.com/ipfs-force-community/damocles/damocles-manager/pkg/chain" "github.com/ipfs-force-community/damocles/damocles-manager/pkg/objstore" ) @@ -30,28 +21,14 @@ type sectorStoreInstances struct { cacheDir objstore.Store } -func NewTracker(indexer core.SectorIndexer, state core.SectorStateManager, prover core.Prover, capi chainAPI.API, stCfg modules.ProvingConfig) (*Tracker, error) { +func NewTracker(indexer core.SectorIndexer) (*Tracker, error) { return &Tracker{ indexer: indexer, - state: state, - prover: prover, - capi: capi, - - parallelCheckLimit: stCfg.ParallelCheckLimit, - singleCheckTimeout: time.Duration(stCfg.SingleCheckTimeout), - partitionCheckTimeout: time.Duration(stCfg.PartitionCheckTimeout), }, nil } type Tracker struct { indexer core.SectorIndexer - state core.SectorStateManager - prover core.Prover - capi chainAPI.API - - parallelCheckLimit int - singleCheckTimeout time.Duration - partitionCheckTimeout time.Duration } func (t *Tracker) SinglePubToPrivateInfo(ctx context.Context, mid abi.ActorID, sector builtin.ExtendedSectorInfo, locator core.SectorLocator) (core.PrivateSectorInfo, error) { @@ -97,177 +74,6 @@ func (t *Tracker) SinglePrivateInfo(ctx context.Context, sref core.SectorRef, up return privateInfo, nil } -func (t *Tracker) SingleProvable(ctx context.Context, postProofType abi.RegisteredPoStProof, sref core.SectorRef, upgrade bool, locator core.SectorLocator, strict, stateCheck bool) error { - ssize, err := sref.ProofType.SectorSize() 
- if err != nil { - return fmt.Errorf("get sector size: %w", err) - } - - instances, privateInfo, err := t.getPrivateInfo(ctx, sref, upgrade, locator) - if err != nil { - return fmt.Errorf("get private info: %w", err) - } - - targetsInCacheDir := map[string]int64{} - addCachePathsForSectorSize(targetsInCacheDir, privateInfo.CacheDirURI, ssize) - - checks := []struct { - title string - store objstore.Store - targets map[string]int64 - }{ - { - title: "sealed file", - store: instances.sealedFile, - targets: map[string]int64{ - privateInfo.SealedSectorURI: 1, - }, - }, - { - title: "cache dir", - store: instances.cacheDir, - targets: targetsInCacheDir, - }, - } - - for _, check := range checks { - for p, sz := range check.targets { - st, err := check.store.Stat(ctx, p) - if err != nil { - return fmt.Errorf("stat object %s for %s: %w", p, check.title, err) - } - - if sz != 0 { - if st.Size != int64(ssize)*sz { - return fmt.Errorf("%s for %s with wrong size (got %d, expect %d)", p, check.title, st.Size, int64(ssize)*sz) - } - } - } - } - - if !strict { - return nil - } - - addr, err := address.NewIDAddress(uint64(sref.ID.Miner)) - if err != nil { - return err - } - sinfo, err := t.capi.StateSectorGetInfo(ctx, addr, sref.ID.Number, types.EmptyTSK) - if err != nil { - return err - } - - if stateCheck { - // local and chain consistency check - ss, err := t.state.Load(ctx, sref.ID, core.WorkerOffline) - if err != nil { - return fmt.Errorf("not exist in Offline, maybe in Online: %w", err) - } - // for snap: onChain.SealedCID == local.UpgradedInfo.SealedCID, onChain.SectorKeyCID == ss.Pre.CommR, for other(CC/DC): onChain.SealedCID == onChain.SealedCID - if !upgrade { - if !ss.Pre.CommR.Equals(sinfo.SealedCID) { - return fmt.Errorf("the SealedCID on the local and the chain is inconsistent") - } - } else { - if !sinfo.SectorKeyCID.Equals(ss.Pre.CommR) { - return fmt.Errorf("the SectorKeyCID on the local and the chain is inconsistent") - } - - // 从 lotus 导入的扇区 UpgradedInfo 是空值,见代码: damocles-manager/cmd/damocles-manager/internal/util_sealer_sectors.go#L1735 - if ss.UpgradedInfo.SealedCID != cid.Undef && !sinfo.SealedCID.Equals(ss.UpgradedInfo.SealedCID) { - return fmt.Errorf("the SealedCID on the local and the chain is inconsistent") - } - } - } - - replica := privateInfo.ToFFI(core.SectorInfo{ - SealProof: sref.ProofType, - SectorNumber: sref.ID.Number, - SealedCID: sinfo.SealedCID, - }, postProofType) - - scCtx := ctx - if t.singleCheckTimeout > 0 { - var scCancel context.CancelFunc - scCtx, scCancel = context.WithTimeout(ctx, t.singleCheckTimeout) - defer scCancel() - } - - // use randUint64 % nodeNums as challenge, notice nodeNums = ssize / 32B - _, err = t.prover.GenerateSingleVanillaProof(scCtx, replica, []uint64{rand.Uint64() % (uint64(ssize) / 32)}) - - if err != nil { - return fmt.Errorf("generate vanilla proof of %s failed: %w", sref.ID, err) - } - - return nil -} - -func (t *Tracker) Provable(ctx context.Context, mid abi.ActorID, postProofType abi.RegisteredPoStProof, sectors []builtin.ExtendedSectorInfo, strict, stateCheck bool) (map[abi.SectorNumber]string, error) { - limit := t.parallelCheckLimit - if limit <= 0 { - limit = len(sectors) - } - throttle := make(chan struct{}, limit) - - if t.partitionCheckTimeout > 0 { - var pcCancel context.CancelFunc - ctx, pcCancel = context.WithTimeout(ctx, t.partitionCheckTimeout) - defer pcCancel() - } - - results := make([]string, len(sectors)) - var wg sync.WaitGroup - wg.Add(len(sectors)) - - for ti := range sectors { - select { - case throttle <- 
struct{}{}: - case <-ctx.Done(): - // After the overtime, walk through the cycle and do not turn on the thread check. - results[ti] = fmt.Sprintf("waiting for check worker: %s", ctx.Err()) - wg.Done() - continue - } - - go func(i int) { - defer wg.Done() - defer func() { - <-throttle - }() - - ctx, cancel := context.WithCancel(ctx) - defer cancel() - - sector := sectors[i] - - sref := core.SectorRef{ - ID: abi.SectorID{Miner: mid, Number: sector.SectorNumber}, - ProofType: sector.SealProof, - } - err := t.SingleProvable(ctx, postProofType, sref, sector.SectorKey != nil, nil, strict, stateCheck) - if err == nil { - return - } - - results[i] = err.Error() - - }(ti) - } - - wg.Wait() - - bad := map[abi.SectorNumber]string{} - for ri := range results { - if results[ri] != "" { - bad[sectors[ri].SectorNumber] = results[ri] - } - } - - return bad, nil -} - func (t *Tracker) PubToPrivate(ctx context.Context, aid abi.ActorID, postProofType abi.RegisteredPoStProof, sectorInfo []builtin.ExtendedSectorInfo) ([]core.FFIPrivateSectorInfo, error) { if len(sectorInfo) == 0 { return []core.FFIPrivateSectorInfo{}, nil diff --git a/damocles-manager/modules/poster/poster.go b/damocles-manager/modules/poster/poster.go index 5b80c9814..e065917f1 100644 --- a/damocles-manager/modules/poster/poster.go +++ b/damocles-manager/modules/poster/poster.go @@ -34,7 +34,7 @@ func newPostDeps( minerAPI core.MinerAPI, prover core.Prover, verifier core.Verifier, - sectorTracker core.SectorTracker, + sectorProving core.SectorProving, ) postDeps { return postDeps{ chain: chain, @@ -44,7 +44,7 @@ func newPostDeps( clock: clock.NewSystemClock(), prover: prover, verifier: verifier, - sectorTracker: sectorTracker, + sectorProving: sectorProving, } } @@ -56,7 +56,7 @@ type postDeps struct { clock clock.Clock prover core.Prover verifier core.Verifier - sectorTracker core.SectorTracker + sectorProving core.SectorProving } func NewPoSter( @@ -67,7 +67,7 @@ func NewPoSter( minerAPI core.MinerAPI, prover core.Prover, verifier core.Verifier, - sectorTracker core.SectorTracker, + sectorProving core.SectorProving, ) (*PoSter, error) { return newPoSterWithRunnerConstructor( scfg, @@ -77,7 +77,7 @@ func NewPoSter( minerAPI, prover, verifier, - sectorTracker, + sectorProving, postRunnerConstructor, ) } @@ -90,12 +90,12 @@ func newPoSterWithRunnerConstructor( minerAPI core.MinerAPI, prover core.Prover, verifier core.Verifier, - sectorTracker core.SectorTracker, + sectorProving core.SectorProving, runnerCtor runnerConstructor, ) (*PoSter, error) { return &PoSter{ cfg: scfg, - deps: newPostDeps(chain, msg, rand, minerAPI, prover, verifier, sectorTracker), + deps: newPostDeps(chain, msg, rand, minerAPI, prover, verifier, sectorProving), schedulers: make(map[abi.ActorID]map[abi.ChainEpoch]*scheduler), runnerConstructor: runnerCtor, }, nil diff --git a/damocles-manager/modules/poster/runner.go b/damocles-manager/modules/poster/runner.go index 1c7c5d27e..46613e0cb 100644 --- a/damocles-manager/modules/poster/runner.go +++ b/damocles-manager/modules/poster/runner.go @@ -775,7 +775,7 @@ func (pr *postRunner) checkSectors(clog *logging.ZapLogger, check bitfield.BitFi } } - bad, err := pr.deps.sectorTracker.Provable(pr.ctx, pr.mid, pp, tocheck, pr.startCtx.pcfg.StrictCheck, false) + bad, err := pr.deps.sectorProving.Provable(pr.ctx, pr.mid, pp, tocheck, pr.startCtx.pcfg.StrictCheck, false) if err != nil { return bitfield.BitField{}, fmt.Errorf("checking provable sectors: %w", err) } diff --git a/damocles-manager/modules/sealer/sealer.go 
b/damocles-manager/modules/sealer/sealer.go index c795844ab..7b3b7b0fb 100644 --- a/damocles-manager/modules/sealer/sealer.go +++ b/damocles-manager/modules/sealer/sealer.go @@ -59,7 +59,7 @@ func New( deal core.DealManager, commit core.CommitmentManager, sectorIdxer core.SectorIndexer, - sectorTracker core.SectorTracker, + sectorProving core.SectorProving, prover core.Prover, pieceStore piecestore.PieceStore, snapup core.SnapUpSectorManager, @@ -82,7 +82,7 @@ func New( pieceStore: pieceStore, sectorIdxer: sectorIdxer, - sectorTracker: sectorTracker, + sectorProving: sectorProving, prover: prover, }, nil @@ -103,7 +103,7 @@ type Sealer struct { pieceStore piecestore.PieceStore sectorIdxer core.SectorIndexer - sectorTracker core.SectorTracker + sectorProving core.SectorProving prover core.Prover } @@ -635,7 +635,7 @@ func (s *Sealer) checkPersistedFiles(ctx context.Context, sid abi.SectorID, proo if err != nil { return false, fmt.Errorf("convert to v1_1 post proof: %w", err) } - err = s.sectorTracker.SingleProvable(ctx, ppt, core.SectorRef{ID: sid, ProofType: proofType}, upgrade, locator, false, false) + err = s.sectorProving.SingleProvable(ctx, ppt, core.SectorRef{ID: sid, ProofType: proofType}, upgrade, locator, false, false) if err != nil { if errors.Is(err, objstore.ErrObjectNotFound) { return false, nil diff --git a/damocles-manager/modules/sealer/sealer_cli.go b/damocles-manager/modules/sealer/sealer_cli.go index 880cbfb47..ecd2f77b0 100644 --- a/damocles-manager/modules/sealer/sealer_cli.go +++ b/damocles-manager/modules/sealer/sealer_cli.go @@ -57,7 +57,7 @@ func (s *Sealer) RestoreSector(ctx context.Context, sid abi.SectorID, forced boo } func (s *Sealer) CheckProvable(ctx context.Context, mid abi.ActorID, postProofType abi.RegisteredPoStProof, sectors []builtin.ExtendedSectorInfo, strict, stateCheck bool) (map[abi.SectorNumber]string, error) { - return s.sectorTracker.Provable(ctx, mid, postProofType, sectors, strict, stateCheck) + return s.sectorProving.Provable(ctx, mid, postProofType, sectors, strict, stateCheck) } func (s *Sealer) SimulateWdPoSt(ctx context.Context, ddlIndex uint64, maddr address.Address, postProofType abi.RegisteredPoStProof, sis []builtin.ExtendedSectorInfo, rand abi.PoStRandomness) error { @@ -119,7 +119,7 @@ func (s *Sealer) ProvingSectorInfo(ctx context.Context, sid abi.SectorID) (core. 
return core.ProvingSectorInfo{}, fmt.Errorf("get sector info: %w", err) } - private, err := s.sectorTracker.SinglePubToPrivateInfo(ctx, sid.Miner, util.SectorOnChainInfoToExtended(sinfo), nil) + private, err := s.sectorProving.SinglePubToPrivateInfo(ctx, sid.Miner, util.SectorOnChainInfoToExtended(sinfo), nil) if err != nil { return core.ProvingSectorInfo{}, fmt.Errorf("get private sector info: %w", err) } From 835890a6a641fcc12854fab6ece1659e3f51fccb Mon Sep 17 00:00:00 2001 From: 0x5459 <0x5459@protonmail.com> Date: Tue, 11 Jul 2023 11:17:37 +0800 Subject: [PATCH 10/18] feat(cli: wdpost: list): show deadline --- .../damocles-manager/internal/util_worker.go | 19 +++++++++++++------ damocles-manager/core/types_wdpost.go | 3 ++- damocles-manager/dep/sealer.go | 1 + damocles-manager/dep/sealer_constructor.go | 4 ++-- .../modules/impl/prover/worker/prover.go | 6 +++--- .../modules/impl/prover/worker/task_mgr_kv.go | 3 ++- .../modules/impl/sectors/proving.go | 3 ++- 7 files changed, 25 insertions(+), 14 deletions(-) diff --git a/damocles-manager/cmd/damocles-manager/internal/util_worker.go b/damocles-manager/cmd/damocles-manager/internal/util_worker.go index 4d32f065f..1fecc5696 100644 --- a/damocles-manager/cmd/damocles-manager/internal/util_worker.go +++ b/damocles-manager/cmd/damocles-manager/internal/util_worker.go @@ -336,10 +336,16 @@ var utilWdPostListCmd = &cli.Command{ } w := tabwriter.NewWriter(os.Stdout, 2, 4, 2, ' ', 0) - _, err = w.Write([]byte("ID\tMinerID\tWorker\tState\tCreateAt\tStartedAt\tHeartbeatAt\tFinishedAt\tError\n")) + _, err = w.Write([]byte("ID\tMinerID\tDeadline\tWorker\tState\tCreateAt\tStartedAt\tHeartbeatAt\tFinishedAt\tError\n")) if err != nil { return err } + formatDateTime := func(unix_secs uint64) string { + if unix_secs == 0 { + return "-" + } + return time.Unix(int64(unix_secs), 0).Format("01-02 15:04:05") + } for _, task := range tasks { state := "ReadyToRun" @@ -358,15 +364,16 @@ var utilWdPostListCmd = &cli.Command{ continue } - fmt.Fprintf(w, "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", + fmt.Fprintf(w, "%s\t%s\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", task.ID, task.Input.MinerID, + task.DeadlineIdx, task.WorkerName, state, - time.Unix(int64(task.CreatedAt), 0), - time.Unix(int64(task.StartedAt), 0), - time.Unix(int64(task.HeartbeatAt), 0), - time.Unix(int64(task.FinishedAt), 0), + formatDateTime(task.CreatedAt), + formatDateTime(task.StartedAt), + formatDateTime(task.HeartbeatAt), + formatDateTime(task.FinishedAt), task.ErrorReason, ) } diff --git a/damocles-manager/core/types_wdpost.go b/damocles-manager/core/types_wdpost.go index 0238d8165..2d45532ba 100644 --- a/damocles-manager/core/types_wdpost.go +++ b/damocles-manager/core/types_wdpost.go @@ -32,6 +32,7 @@ const ( type WdPoStTask struct { ID string + DeadlineIdx uint64 Input WdPoStInput Output *stage.WindowPoStOutput TryNum uint32 @@ -69,7 +70,7 @@ type AllocateWdPoStTaskSpec struct { type WorkerWdPoStTaskManager interface { All(ctx context.Context, filter func(*WdPoStTask) bool) ([]*WdPoStTask, error) ListByTaskIDs(ctx context.Context, state WdPoStTaskState, taskIDs ...string) ([]*WdPoStTask, error) - Create(ctx context.Context, input WdPoStInput) (*WdPoStTask, error) + Create(ctx context.Context, deadlineIdx uint64, input WdPoStInput) (*WdPoStTask, error) AllocateTasks(ctx context.Context, spec AllocateWdPoStTaskSpec, num uint32, workerName string) (allocatedTasks []*WdPoStAllocatedTask, err error) Heartbeat(ctx context.Context, taskIDs []string, workerName string) error Finish(ctx context.Context, 
taskID string, output *stage.WindowPoStOutput, errorReason string) error diff --git a/damocles-manager/dep/sealer.go b/damocles-manager/dep/sealer.go index dd7a1b65f..d75ebdf08 100644 --- a/damocles-manager/dep/sealer.go +++ b/damocles-manager/dep/sealer.go @@ -61,6 +61,7 @@ func Product() dix.Option { dix.Override(new(core.SectorProving), BuildSectorProving), dix.If(ver.ProverIsProd(), prodProver()), dix.If(!ver.ProverIsProd(), fakerProver()), + dix.Override(new(core.MinerAPI), BuildMinerAPI), dix.Override(new(core.CommitmentManager), BuildCommitmentManager), dix.Override(new(messager.API), BuildMessagerClient), diff --git a/damocles-manager/dep/sealer_constructor.go b/damocles-manager/dep/sealer_constructor.go index c6d139324..cdb78e902 100644 --- a/damocles-manager/dep/sealer_constructor.go +++ b/damocles-manager/dep/sealer_constructor.go @@ -491,8 +491,8 @@ func BuildSectorIndexer(storeMgr PersistedObjectStoreManager, kv SectorIndexMeta return sectors.NewIndexer(storeMgr, kv, upgrade) } -func BuildSectorProving(tracker core.SectorTracker, state core.SectorStateManager, prover core.Prover, capi chain.API, scfg *modules.SafeConfig) (core.SectorProving, error) { - return sectors.NewProving(tracker, state, prover, capi, scfg.MustCommonConfig().Proving) +func BuildSectorProving(tracker core.SectorTracker, state core.SectorStateManager, storeMgr PersistedObjectStoreManager, prover core.Prover, capi chain.API, scfg *modules.SafeConfig) (core.SectorProving, error) { + return sectors.NewProving(tracker, state, storeMgr, prover, capi, scfg.MustCommonConfig().Proving) } type MarketAPIRelatedComponents struct { diff --git a/damocles-manager/modules/impl/prover/worker/prover.go b/damocles-manager/modules/impl/prover/worker/prover.go index f1f01b9b0..d1f9bf9f8 100644 --- a/damocles-manager/modules/impl/prover/worker/prover.go +++ b/damocles-manager/modules/impl/prover/worker/prover.go @@ -2,7 +2,6 @@ package worker import ( "context" - "encoding/base64" "encoding/binary" "fmt" "sync" @@ -16,6 +15,7 @@ import ( "github.com/ipfs-force-community/damocles/damocles-manager/modules/util" "github.com/ipfs-force-community/damocles/damocles-manager/pkg/extproc/stage" "github.com/ipfs-force-community/damocles/damocles-manager/pkg/logging" + "github.com/mr-tron/base58/base58" ) var log = logging.New("worker prover") @@ -23,7 +23,7 @@ var log = logging.New("worker prover") func GenTaskID(rawInput []byte) string { b := make([]byte, 8) binary.LittleEndian.PutUint64(b, xxhash.Sum64(rawInput)) - return base64.URLEncoding.EncodeToString(b) + return base58.Encode(b) } type workerProver struct { @@ -177,7 +177,7 @@ func (p *workerProver) GenerateWindowPoSt(ctx context.Context, deadlineIdx uint6 } copy(input.Seed[:], randomness[:]) - task, err := p.taskMgr.Create(ctx, input) + task, err := p.taskMgr.Create(ctx, deadlineIdx, input) if err != nil { return nil, nil, fmt.Errorf("create wdPoSt task: %w", err) } diff --git a/damocles-manager/modules/impl/prover/worker/task_mgr_kv.go b/damocles-manager/modules/impl/prover/worker/task_mgr_kv.go index 1810e8cd2..5a3747f55 100644 --- a/damocles-manager/modules/impl/prover/worker/task_mgr_kv.go +++ b/damocles-manager/modules/impl/prover/worker/task_mgr_kv.go @@ -79,7 +79,7 @@ func (tm *kvTaskManager) ListByTaskIDs(ctx context.Context, state core.WdPoStTas return tasks, err } -func (tm *kvTaskManager) Create(ctx context.Context, input core.WdPoStInput) (*core.WdPoStTask, error) { +func (tm *kvTaskManager) Create(ctx context.Context, deadlineIdx uint64, input core.WdPoStInput) 
(*core.WdPoStTask, error) { var ( taskID string task *core.WdPoStTask @@ -108,6 +108,7 @@ func (tm *kvTaskManager) Create(ctx context.Context, input core.WdPoStInput) (*c now := time.Now().Unix() task = &core.WdPoStTask{ ID: taskID, + DeadlineIdx: deadlineIdx, Input: input, Output: nil, TryNum: 0, diff --git a/damocles-manager/modules/impl/sectors/proving.go b/damocles-manager/modules/impl/sectors/proving.go index 7cca6f4c0..3ab11531b 100644 --- a/damocles-manager/modules/impl/sectors/proving.go +++ b/damocles-manager/modules/impl/sectors/proving.go @@ -18,10 +18,11 @@ import ( "github.com/ipfs/go-cid" ) -func NewProving(sectorTracker core.SectorTracker, state core.SectorStateManager, prover core.Prover, capi chainAPI.API, stCfg modules.ProvingConfig) (core.SectorProving, error) { +func NewProving(sectorTracker core.SectorTracker, state core.SectorStateManager, storeMgr objstore.Manager, prover core.Prover, capi chainAPI.API, stCfg modules.ProvingConfig) (core.SectorProving, error) { return &Proving{ SectorTracker: sectorTracker, state: state, + storeMgr: storeMgr, prover: prover, capi: capi, From b2ff7ac1e5a968b781c11a51e4abd29d01be0c0e Mon Sep 17 00:00:00 2001 From: 0x5459 <0x5459@protonmail.com> Date: Fri, 14 Jul 2023 13:34:33 +0800 Subject: [PATCH 11/18] feat(cli: wdpost): improve cli --- .../damocles-manager/internal/util_worker.go | 144 +++++++++++++++--- damocles-manager/core/api.go | 7 +- damocles-manager/core/client_gen.go | 23 +-- damocles-manager/core/types_wdpost.go | 12 +- damocles-manager/core/worker.go | 18 +-- damocles-manager/dep/prover.go | 17 ++- damocles-manager/dep/sealer_constructor.go | 2 +- .../modules/impl/prover/worker/config.go | 21 +++ .../modules/impl/prover/worker/prover.go | 74 +++++---- .../modules/impl/prover/worker/rpc.go | 44 ++++-- .../modules/impl/prover/worker/task_mgr_kv.go | 66 ++++++-- damocles-manager/pkg/kvstore/kvstore.go | 4 + damocles-manager/pkg/logging/log.go | 1 + 13 files changed, 316 insertions(+), 117 deletions(-) create mode 100644 damocles-manager/modules/impl/prover/worker/config.go diff --git a/damocles-manager/cmd/damocles-manager/internal/util_worker.go b/damocles-manager/cmd/damocles-manager/internal/util_worker.go index 1fecc5696..2a9641ed3 100644 --- a/damocles-manager/cmd/damocles-manager/internal/util_worker.go +++ b/damocles-manager/cmd/damocles-manager/internal/util_worker.go @@ -12,7 +12,6 @@ import ( "github.com/urfave/cli/v2" "github.com/ipfs-force-community/damocles/damocles-manager/core" - "github.com/ipfs-force-community/damocles/damocles-manager/modules/util" "github.com/ipfs-force-community/damocles/damocles-manager/pkg/workercli" ) @@ -157,7 +156,7 @@ var utilWorkerInfoCmd = &cli.Command{ tw := tabwriter.NewWriter(os.Stdout, 2, 4, 2, ' ', 0) defer tw.Flush() - _, _ = fmt.Fprintln(tw, "Index\tLoc\tPlan\tSectorID\tPaused\tPausedElapsed\tState\tLastErr") + _, _ = fmt.Fprintln(tw, "Index\tLoc\tPlan\tJobID\tPaused\tPausedElapsed\tState\tLastErr") for _, detail := range details { _, _ = fmt.Fprintf( @@ -165,7 +164,7 @@ var utilWorkerInfoCmd = &cli.Command{ detail.Index, detail.Location, detail.Plan, - FormatOrNull(detail.SectorID, func() string { return util.FormatSectorID(*detail.SectorID) }), + FormatOrNull(detail.JobID, func() string { return *detail.JobID }), detail.Paused, FormatOrNull(detail.PausedElapsed, func() string { return (time.Duration(*detail.PausedElapsed) * time.Second).String() }), detail.State, @@ -310,6 +309,8 @@ var utilWdPostCmd = &cli.Command{ Subcommands: []*cli.Command{ utilWdPostListCmd, 
utilWdPostResetCmd, + utilWdPostRemoveCmd, + utilWdPostRemoveAllCmd, }, } @@ -321,6 +322,10 @@ var utilWdPostListCmd = &cli.Command{ Name: "all", Usage: "list all wdpost task, include the task that has been succeed", }, + &cli.BoolFlag{ + Name: "detail", + Usage: "show more detailed information", + }, }, Action: func(cctx *cli.Context) error { a, actx, stopper, err := extractAPI(cctx) @@ -335,8 +340,14 @@ var utilWdPostListCmd = &cli.Command{ return fmt.Errorf("get wdpost tasks: %w", err) } + detail := cctx.Bool("detail") + w := tabwriter.NewWriter(os.Stdout, 2, 4, 2, ' ', 0) - _, err = w.Write([]byte("ID\tMinerID\tDeadline\tWorker\tState\tCreateAt\tStartedAt\tHeartbeatAt\tFinishedAt\tError\n")) + if detail { + _, err = w.Write([]byte("ID\tPrefix\tMiner\tDDL\tWorker\tState\tTry\tCreateAt\tStartedAt\tHeartbeatAt\tFinishedAt\tUpdatedAt\tError\n")) + } else { + _, err = w.Write([]byte("ID\tMinerID\tDDL\tWorker\tState\tTry\tCreateAt\tElapsed\tError\n")) + } if err != nil { return err } @@ -347,7 +358,6 @@ var utilWdPostListCmd = &cli.Command{ return time.Unix(int64(unix_secs), 0).Format("01-02 15:04:05") } for _, task := range tasks { - state := "ReadyToRun" if task.StartedAt != 0 { state = "Running" @@ -363,19 +373,45 @@ var utilWdPostListCmd = &cli.Command{ if !cctx.Bool("all") && state == "Succeed" { continue } + if detail { + fmt.Fprintf(w, "%s\t%s\t%s\t%d\t%s\t%s\t%d\t%s\t%s\t%s\t%s\t%s\t%s\n", + task.ID, + task.State, + task.Input.MinerID, + task.DeadlineIdx, + task.WorkerName, + state, + task.TryNum, + formatDateTime(task.CreatedAt), + formatDateTime(task.StartedAt), + formatDateTime(task.HeartbeatAt), + formatDateTime(task.FinishedAt), + formatDateTime(task.UpdatedAt), + task.ErrorReason, + ) + } else { + var elapsed string + + if task.StartedAt == 0 { + elapsed = "-" + } else if task.FinishedAt == 0 { + elapsed = time.Since(time.Unix(int64(task.StartedAt), 0)).Truncate(time.Second).String() + } else { + elapsed = fmt.Sprintf("%s(done)", time.Duration(task.FinishedAt-task.StartedAt)*time.Second) + } - fmt.Fprintf(w, "%s\t%s\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", - task.ID, - task.Input.MinerID, - task.DeadlineIdx, - task.WorkerName, - state, - formatDateTime(task.CreatedAt), - formatDateTime(task.StartedAt), - formatDateTime(task.HeartbeatAt), - formatDateTime(task.FinishedAt), - task.ErrorReason, - ) + fmt.Fprintf(w, "%s\t%s\t%d\t%s\t%s\t%d\t%s\t%s\t%s\n", + task.ID, + task.Input.MinerID, + task.DeadlineIdx, + task.WorkerName, + state, + task.TryNum, + formatDateTime(task.CreatedAt), + elapsed, + task.ErrorReason, + ) + } } w.Flush() @@ -385,26 +421,90 @@ var utilWdPostListCmd = &cli.Command{ var utilWdPostResetCmd = &cli.Command{ Name: "reset", - Usage: "reset wdpost task", - ArgsUsage: "", + Usage: "reset the task status to allow new workers can pick it up", + ArgsUsage: "...", + Action: func(cctx *cli.Context) error { + args := cctx.Args() + if args.Len() < 1 { + return cli.ShowSubcommandHelp(cctx) + } + + a, actx, stopper, err := extractAPI(cctx) + if err != nil { + return fmt.Errorf("get api: %w", err) + } + defer stopper() + + for _, taskID := range args.Slice() { + _, err = a.Damocles.WdPoStResetTask(actx, taskID) + if err != nil { + return fmt.Errorf("reset wdpost task: %w", err) + } + } + + return nil + }, +} + +var utilWdPostRemoveCmd = &cli.Command{ + Name: "remove", + Usage: "remove wdpost task", + ArgsUsage: "...", Action: func(cctx *cli.Context) error { args := cctx.Args() if args.Len() < 1 { return cli.ShowSubcommandHelp(cctx) } - id := args.First() a, actx, stopper, err 
:= extractAPI(cctx) if err != nil { return fmt.Errorf("get api: %w", err) } defer stopper() - err = a.Damocles.WdPoStResetTask(actx, id) + for _, taskID := range args.Slice() { + _, err = a.Damocles.WdPoStRemoveTask(actx, taskID) + if err != nil { + return fmt.Errorf("remove wdpost task: %w", err) + } + } + return nil + }, +} + +var utilWdPostRemoveAllCmd = &cli.Command{ + Name: "remove-all", + Usage: "remove all wdpost tasks", + Flags: []cli.Flag{ + &cli.BoolFlag{ + Name: "really-do-it", + Usage: "Actually perform the action", + Value: false, + }, + }, + Action: func(cctx *cli.Context) error { + if !cctx.Bool("really-do-it") { + fmt.Println("Pass --really-do-it to actually execute this action") + return nil + } + + a, actx, stopper, err := extractAPI(cctx) if err != nil { - return fmt.Errorf("reset wdpost task: %w", err) + return fmt.Errorf("get api: %w", err) } + defer stopper() + tasks, err := a.Damocles.WdPoStAllTasks(actx) + if err != nil { + return err + } + for _, task := range tasks { + _, err = a.Damocles.WdPoStRemoveTask(actx, task.ID) + if err != nil { + return fmt.Errorf("remove wdpost task: %w", err) + } + fmt.Printf("wdpost task %s removed\n", task.ID) + } return nil }, } diff --git a/damocles-manager/core/api.go b/damocles-manager/core/api.go index f0c26aa04..011ef89d3 100644 --- a/damocles-manager/core/api.go +++ b/damocles-manager/core/api.go @@ -153,9 +153,10 @@ type MinerAPI interface { } type WorkerWdPoStAPI interface { - WdPoStHeartbeatTask(ctx context.Context, runningTaskIDs []string, workerName string) error + WdPoStHeartbeatTasks(ctx context.Context, runningTaskIDs []string, workerName string) (Meta, error) WdPoStAllocateTasks(ctx context.Context, spec AllocateWdPoStTaskSpec, num uint32, workerName string) (allocatedTasks []*WdPoStAllocatedTask, err error) - WdPoStFinishTask(ctx context.Context, taskID string, output *stage.WindowPoStOutput, errorReason string) error - WdPoStResetTask(ctx context.Context, taskID string) error + WdPoStFinishTask(ctx context.Context, taskID string, output *stage.WindowPoStOutput, errorReason string) (Meta, error) + WdPoStResetTask(ctx context.Context, taskID string) (Meta, error) + WdPoStRemoveTask(ctx context.Context, taskID string) (Meta, error) WdPoStAllTasks(ctx context.Context) ([]*WdPoStTask, error) } diff --git a/damocles-manager/core/client_gen.go b/damocles-manager/core/client_gen.go index ca9e7ba9e..24264dcf0 100644 --- a/damocles-manager/core/client_gen.go +++ b/damocles-manager/core/client_gen.go @@ -4,7 +4,6 @@ package core import ( "context" - "github.com/filecoin-project/go-address" "github.com/filecoin-project/go-bitfield" "github.com/filecoin-project/go-state-types/abi" @@ -265,25 +264,29 @@ var UnavailableMinerAPIClient = MinerAPIClient{ // WorkerWdPoStAPIClient is generated client for WorkerWdPoStAPI interface. 
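For reference, the wdpost subcommands added above are driven like this. The parent command path is not part of this hunk, so the prefix below is only an assumption (the group is expected to hang off the existing util worker command tree); the flag and argument names are the ones defined above:

damocles-manager util worker wdpost list --all --detail
damocles-manager util worker wdpost reset <taskID> [<taskID>...]
damocles-manager util worker wdpost remove <taskID> [<taskID>...]
damocles-manager util worker wdpost remove-all --really-do-it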
type WorkerWdPoStAPIClient struct { - WdPoStHeartbeatTask func(ctx context.Context, runningTaskIDs []string, workerName string) error - WdPoStAllocateTasks func(ctx context.Context, num uint32, workerName string) (allocatedTasks []WdPoStAllocatedTask, err error) - WdPoStFinishTask func(ctx context.Context, taskID string, output *stage.WindowPoStOutput, errorReason string) error - WdPoStResetTask func(ctx context.Context, taskID string) error - WdPoStAllTasks func(ctx context.Context) ([]*WdPoStTask, error) + WdPoStHeartbeatTasks func(ctx context.Context, runningTaskIDs []string, workerName string) (Meta, error) + WdPoStAllocateTasks func(ctx context.Context, spec AllocateWdPoStTaskSpec, num uint32, workerName string) (allocatedTasks []*WdPoStAllocatedTask, err error) + WdPoStFinishTask func(ctx context.Context, taskID string, output *stage.WindowPoStOutput, errorReason string) (Meta, error) + WdPoStResetTask func(ctx context.Context, taskID string) (Meta, error) + WdPoStRemoveTask func(ctx context.Context, taskID string) (Meta, error) + WdPoStAllTasks func(ctx context.Context) ([]*WdPoStTask, error) } var UnavailableWorkerWdPoStAPIClient = WorkerWdPoStAPIClient{ - WdPoStHeartbeatTask: func(ctx context.Context, runningTaskIDs []string, workerName string) error { + WdPoStHeartbeatTasks: func(ctx context.Context, runningTaskIDs []string, workerName string) (Meta, error) { + panic("WorkerWdPoStAPI client unavailable") + }, + WdPoStAllocateTasks: func(ctx context.Context, spec AllocateWdPoStTaskSpec, num uint32, workerName string) (allocatedTasks []*WdPoStAllocatedTask, err error) { panic("WorkerWdPoStAPI client unavailable") }, - WdPoStAllocateTasks: func(ctx context.Context, num uint32, workerName string) (allocatedTasks []WdPoStAllocatedTask, err error) { + WdPoStFinishTask: func(ctx context.Context, taskID string, output *stage.WindowPoStOutput, errorReason string) (Meta, error) { panic("WorkerWdPoStAPI client unavailable") }, - WdPoStFinishTask: func(ctx context.Context, taskID string, output *stage.WindowPoStOutput, errorReason string) error { + WdPoStResetTask: func(ctx context.Context, taskID string) (Meta, error) { panic("WorkerWdPoStAPI client unavailable") }, - WdPoStResetTask: func(ctx context.Context, taskID string) error { + WdPoStRemoveTask: func(ctx context.Context, taskID string) (Meta, error) { panic("WorkerWdPoStAPI client unavailable") }, WdPoStAllTasks: func(ctx context.Context) ([]*WdPoStTask, error) { diff --git a/damocles-manager/core/types_wdpost.go b/damocles-manager/core/types_wdpost.go index 2d45532ba..730c5d578 100644 --- a/damocles-manager/core/types_wdpost.go +++ b/damocles-manager/core/types_wdpost.go @@ -9,7 +9,7 @@ import ( ) type WdPoStSectorInfo struct { - SectorID abi.SectorNumber + SectorID abi.SectorNumber `json:"SectorId"` CommR [32]byte Upgrade bool // is upgrade sector Accesses SectorAccessStores @@ -17,8 +17,8 @@ type WdPoStSectorInfo struct { type WdPoStInput struct { Sectors []WdPoStSectorInfo - MinerID abi.ActorID - ProofType abi.RegisteredPoStProof + MinerID abi.ActorID `json:"MinerId"` + ProofType string Seed [32]byte } @@ -31,7 +31,8 @@ const ( ) type WdPoStTask struct { - ID string + ID string `json:"Id"` + State string DeadlineIdx uint64 Input WdPoStInput Output *stage.WindowPoStOutput @@ -58,7 +59,7 @@ func (t *WdPoStTask) Finished(maxTry uint32) bool { } type WdPoStAllocatedTask struct { - ID string + ID string `json:"Id"` Input WdPoStInput } @@ -78,4 +79,5 @@ type WorkerWdPoStTaskManager interface { CleanupExpiredTasks(ctx context.Context, 
taskLifetime time.Duration, limit uint32) error RetryFailedTasks(ctx context.Context, maxTry, limit uint32) error Reset(ctx context.Context, taskID string) error + Remove(ctx context.Context, taskID string) error } diff --git a/damocles-manager/core/worker.go b/damocles-manager/core/worker.go index 6cb73b644..11a828f05 100644 --- a/damocles-manager/core/worker.go +++ b/damocles-manager/core/worker.go @@ -1,18 +1,16 @@ package core -import "github.com/filecoin-project/go-state-types/abi" - const DefaultWorkerListenPort = 17890 type WorkerThreadInfo struct { - Index int `json:"index"` - Location string `json:"location"` - Plan string `json:"plan"` - SectorID *abi.SectorID `json:"sector_id"` - Paused bool `json:"paused"` - PausedElapsed *uint64 `json:"paused_elapsed"` - State string `json:"state"` - LastError *string `json:"last_error"` + Index int `json:"index"` + Location string `json:"location"` + Plan string `json:"plan"` + JobID *string `json:"job_id"` + Paused bool `json:"paused"` + PausedElapsed *uint64 `json:"paused_elapsed"` + State string `json:"state"` + LastError *string `json:"last_error"` } type WorkerInfoSummary struct { diff --git a/damocles-manager/dep/prover.go b/damocles-manager/dep/prover.go index 2ab73b497..9279e6455 100644 --- a/damocles-manager/dep/prover.go +++ b/damocles-manager/dep/prover.go @@ -32,15 +32,16 @@ func ExtProver() dix.Option { func WorkerProver() dix.Option { return dix.Options( dix.Override(new(WorkerProverStore), BuildWorkerProverStore), + dix.Override(new(*proverworker.Config), proverworker.DefaultConfig), dix.Override(new(core.WorkerWdPoStTaskManager), BuildWorkerWdPoStTaskManager), dix.Override(new(core.WorkerWdPoStAPI), proverworker.NewWdPoStAPIImpl), - dix.Override(new(core.Prover), proverworker.NewProver), + dix.Override(new(core.Prover), BuildWorkerProver), ) } func DisableWorkerProver() dix.Option { return dix.Options( - dix.Override(new(core.WorkerWdPoStAPI), &proverworker.UnavailableWdPoStAPIImpl{}), + dix.Override(new(core.WorkerWdPoStAPI), proverworker.NewUnavailableWdPoStAPIImpl), ) } @@ -97,6 +98,18 @@ func BuildWorkerProverStore(gctx GlobalContext, db UnderlyingDB) (WorkerProverSt return db.OpenCollection(gctx, "prover") } +func BuildWorkerProver(lc fx.Lifecycle, taskMgr core.WorkerWdPoStTaskManager, sectorTracker core.SectorTracker, config *proverworker.Config) (core.Prover, error) { + p := proverworker.NewProver(taskMgr, sectorTracker, config) + lc.Append(fx.Hook{ + OnStart: func(ctx context.Context) error { + p.StartJob(ctx) + return nil + }, + }) + + return p, nil +} + func BuildWorkerWdPoStTaskManager(kv WorkerProverStore) (core.WorkerWdPoStTaskManager, error) { wdpostKV, err := kvstore.NewWrappedKVStore([]byte("wdpost-"), kv) if err != nil { diff --git a/damocles-manager/dep/sealer_constructor.go b/damocles-manager/dep/sealer_constructor.go index cdb78e902..55a68b68e 100644 --- a/damocles-manager/dep/sealer_constructor.go +++ b/damocles-manager/dep/sealer_constructor.go @@ -296,7 +296,7 @@ func MaybeAPIClient(gctx GlobalContext, lc fx.Lifecycle, listen ListenAddress) * var client core.APIClient err := buildDamoclesAPIClient(gctx, lc, core.APINamespace, &client, string(listen), false) if err != nil { - log.Errorf("failed to build api client. err: %s", err) + log.Warnf("failed to build api client. 
err: %s", err) client = core.UnavailableAPIClient } diff --git a/damocles-manager/modules/impl/prover/worker/config.go b/damocles-manager/modules/impl/prover/worker/config.go new file mode 100644 index 000000000..c00a45176 --- /dev/null +++ b/damocles-manager/modules/impl/prover/worker/config.go @@ -0,0 +1,21 @@ +package worker + +import "time" + +type Config struct { + RetryFailedTasksInterval time.Duration + TaskMaxTry uint32 + HeartbeatTimeout time.Duration + CleanupExpiredTasksJobInterval time.Duration + TaskLifetime time.Duration +} + +func DefaultConfig() *Config { + return &Config{ + RetryFailedTasksInterval: 10 * time.Second, + TaskMaxTry: 2, + HeartbeatTimeout: 15 * time.Second, + CleanupExpiredTasksJobInterval: 30 * time.Minute, + TaskLifetime: 25 * time.Hour, + } +} diff --git a/damocles-manager/modules/impl/prover/worker/prover.go b/damocles-manager/modules/impl/prover/worker/prover.go index d1f9bf9f8..3e1c6539d 100644 --- a/damocles-manager/modules/impl/prover/worker/prover.go +++ b/damocles-manager/modules/impl/prover/worker/prover.go @@ -20,13 +20,15 @@ import ( var log = logging.New("worker prover") +var _ core.Prover = (*WorkerProver)(nil) + func GenTaskID(rawInput []byte) string { b := make([]byte, 8) binary.LittleEndian.PutUint64(b, xxhash.Sum64(rawInput)) return base58.Encode(b) } -type workerProver struct { +type WorkerProver struct { taskMgr core.WorkerWdPoStTaskManager sectorTracker core.SectorTracker localProver core.Prover @@ -36,17 +38,11 @@ type workerProver struct { err string } inflightTasksLock *sync.Mutex - - retryFailedTasksInterval time.Duration - taskMaxTry uint32 - heartbeatTimeout time.Duration - - cleanupExpiredTasksJobInterval time.Duration - taskLifetime time.Duration + config *Config } -func NewProver(taskMgr core.WorkerWdPoStTaskManager, sectorTracker core.SectorTracker) core.Prover { - return &workerProver{ +func NewProver(taskMgr core.WorkerWdPoStTaskManager, sectorTracker core.SectorTracker, config *Config) *WorkerProver { + return &WorkerProver{ taskMgr: taskMgr, sectorTracker: sectorTracker, localProver: prover.NewProdProver(sectorTracker), @@ -55,24 +51,19 @@ func NewProver(taskMgr core.WorkerWdPoStTaskManager, sectorTracker core.SectorTr err string }), inflightTasksLock: &sync.Mutex{}, - - // TODO(0x5459): make them configurable - retryFailedTasksInterval: 10 * time.Second, - taskMaxTry: 2, - heartbeatTimeout: 15 * time.Second, - cleanupExpiredTasksJobInterval: 30 * time.Minute, - taskLifetime: 25 * time.Hour, + config: config, } } -func (p *workerProver) StartJob(ctx context.Context) { +func (p *WorkerProver) StartJob(ctx context.Context) { go p.runNotifyTaskDoneJob(ctx) go p.runRetryFailedTasksJob(ctx) go p.runCleanupExpiredTasksJob(ctx) } -func (p *workerProver) runNotifyTaskDoneJob(ctx context.Context) { +func (p *WorkerProver) runNotifyTaskDoneJob(ctx context.Context) { ticker := time.NewTicker(3 * time.Second) + defer ticker.Stop() for { select { case <-ctx.Done(): @@ -96,7 +87,7 @@ func (p *workerProver) runNotifyTaskDoneJob(ctx context.Context) { if !ok { continue } - if !task.Finished(p.taskMaxTry) { + if !task.Finished(p.config.TaskMaxTry) { continue } for _, ch := range chs { @@ -114,42 +105,45 @@ func (p *workerProver) runNotifyTaskDoneJob(ctx context.Context) { } } -func (p *workerProver) runRetryFailedTasksJob(ctx context.Context) { - ticker := time.NewTicker(p.retryFailedTasksInterval) +func (p *WorkerProver) runRetryFailedTasksJob(ctx context.Context) { + ticker := time.NewTicker(p.config.RetryFailedTasksInterval) + defer 
ticker.Stop() for { + if err := p.taskMgr.MakeTasksDie(ctx, p.config.HeartbeatTimeout, 128); err != nil { + log.Errorf("failed to make tasks die: %s", err) + } + if err := p.taskMgr.RetryFailedTasks(ctx, p.config.TaskMaxTry, 128); err != nil { + log.Errorf("failed to retry failed tasks: %s", err) + } select { case <-ctx.Done(): return case <-ticker.C: - if err := p.taskMgr.MakeTasksDie(ctx, p.heartbeatTimeout, 128); err != nil { - log.Errorf("failed to make tasks die: %s", err) - } - if err := p.taskMgr.RetryFailedTasks(ctx, p.taskMaxTry, 128); err != nil { - log.Errorf("failed to retry failed tasks: %s", err) - } + continue } } } -func (p *workerProver) runCleanupExpiredTasksJob(ctx context.Context) { - ticker := time.NewTicker(p.cleanupExpiredTasksJobInterval) +func (p *WorkerProver) runCleanupExpiredTasksJob(ctx context.Context) { + ticker := time.NewTicker(p.config.CleanupExpiredTasksJobInterval) for { + if err := p.taskMgr.CleanupExpiredTasks(ctx, p.config.TaskLifetime, 128); err != nil { + log.Errorf("failed to cleanup expired tasks: %s", err) + } select { case <-ctx.Done(): return case <-ticker.C: - if err := p.taskMgr.CleanupExpiredTasks(ctx, p.taskLifetime, 128); err != nil { - log.Errorf("failed to cleanup expired tasks: %s", err) - } + continue } } } -func (p *workerProver) AggregateSealProofs(ctx context.Context, aggregateInfo core.AggregateSealVerifyProofAndInfos, proofs [][]byte) ([]byte, error) { +func (p *WorkerProver) AggregateSealProofs(ctx context.Context, aggregateInfo core.AggregateSealVerifyProofAndInfos, proofs [][]byte) ([]byte, error) { return p.localProver.AggregateSealProofs(ctx, aggregateInfo, proofs) } -func (p *workerProver) GenerateWindowPoSt(ctx context.Context, deadlineIdx uint64, minerID abi.ActorID, proofType abi.RegisteredPoStProof, sectors []builtin.ExtendedSectorInfo, randomness abi.PoStRandomness) (proof []builtin.PoStProof, skipped []abi.SectorID, err error) { +func (p *WorkerProver) GenerateWindowPoSt(ctx context.Context, deadlineIdx uint64, minerID abi.ActorID, proofType abi.RegisteredPoStProof, sectors []builtin.ExtendedSectorInfo, randomness abi.PoStRandomness) (proof []builtin.PoStProof, skipped []abi.SectorID, err error) { sis := make([]core.WdPoStSectorInfo, len(sectors)) for i, s := range sectors { @@ -172,7 +166,7 @@ func (p *workerProver) GenerateWindowPoSt(ctx context.Context, deadlineIdx uint6 input := core.WdPoStInput{ MinerID: minerID, - ProofType: proofType, + ProofType: stage.ProofType2String(proofType), Sectors: sis, } copy(input.Seed[:], randomness[:]) @@ -223,18 +217,18 @@ func (p *workerProver) GenerateWindowPoSt(ctx context.Context, deadlineIdx uint6 return proofs, nil, nil } -func (p *workerProver) GenerateWinningPoSt(ctx context.Context, minerID abi.ActorID, proofType abi.RegisteredPoStProof, sectors []builtin.ExtendedSectorInfo, randomness abi.PoStRandomness) ([]builtin.PoStProof, error) { +func (p *WorkerProver) GenerateWinningPoSt(ctx context.Context, minerID abi.ActorID, proofType abi.RegisteredPoStProof, sectors []builtin.ExtendedSectorInfo, randomness abi.PoStRandomness) ([]builtin.PoStProof, error) { return p.localProver.GenerateWinningPoSt(ctx, minerID, proofType, sectors, randomness) } -func (p *workerProver) GeneratePoStFallbackSectorChallenges(ctx context.Context, proofType abi.RegisteredPoStProof, minerID abi.ActorID, randomness abi.PoStRandomness, sectorIds []abi.SectorNumber) (*core.FallbackChallenges, error) { +func (p *WorkerProver) GeneratePoStFallbackSectorChallenges(ctx context.Context, proofType 
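The maintenance loops above share one shape: run the work once immediately, then repeat it on every tick, and exit when the context is cancelled. A minimal sketch of that shape; runEvery does not exist in the patch, and the closure stands in for the MakeTasksDie/RetryFailedTasks or CleanupExpiredTasks calls:

func runEvery(ctx context.Context, interval time.Duration, work func(context.Context)) {
	ticker := time.NewTicker(interval)
	defer ticker.Stop()
	for {
		work(ctx) // executed once before the first tick, exactly like the loops above
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
		}
	}
}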
abi.RegisteredPoStProof, minerID abi.ActorID, randomness abi.PoStRandomness, sectorIds []abi.SectorNumber) (*core.FallbackChallenges, error) { return p.localProver.GeneratePoStFallbackSectorChallenges(ctx, proofType, minerID, randomness, sectorIds) } -func (p *workerProver) GenerateSingleVanillaProof(ctx context.Context, replica core.FFIPrivateSectorInfo, challenges []uint64) ([]byte, error) { +func (p *WorkerProver) GenerateSingleVanillaProof(ctx context.Context, replica core.FFIPrivateSectorInfo, challenges []uint64) ([]byte, error) { return p.localProver.GenerateSingleVanillaProof(ctx, replica, challenges) } -func (p *workerProver) GenerateWinningPoStWithVanilla(ctx context.Context, proofType abi.RegisteredPoStProof, minerID abi.ActorID, randomness abi.PoStRandomness, proofs [][]byte) ([]core.PoStProof, error) { +func (p *WorkerProver) GenerateWinningPoStWithVanilla(ctx context.Context, proofType abi.RegisteredPoStProof, minerID abi.ActorID, randomness abi.PoStRandomness, proofs [][]byte) ([]core.PoStProof, error) { return p.localProver.GenerateWinningPoStWithVanilla(ctx, proofType, minerID, randomness, proofs) } diff --git a/damocles-manager/modules/impl/prover/worker/rpc.go b/damocles-manager/modules/impl/prover/worker/rpc.go index e70d618e9..d1ad27a18 100644 --- a/damocles-manager/modules/impl/prover/worker/rpc.go +++ b/damocles-manager/modules/impl/prover/worker/rpc.go @@ -8,53 +8,69 @@ import ( "github.com/ipfs-force-community/damocles/damocles-manager/pkg/extproc/stage" ) -func NewWdPoStAPIImpl(taskMgr core.WorkerWdPoStTaskManager) core.WorkerWdPoStAPI { +func NewWdPoStAPIImpl(taskMgr core.WorkerWdPoStTaskManager, config *Config) core.WorkerWdPoStAPI { return &WdPoStAPIImpl{ taskMgr: taskMgr, + config: config, } } type WdPoStAPIImpl struct { taskMgr core.WorkerWdPoStTaskManager + config *Config } -func (api WdPoStAPIImpl) WdPoStHeartbeatTask(ctx context.Context, runningTaskIDs []string, workerName string) error { - return api.taskMgr.Heartbeat(ctx, runningTaskIDs, workerName) +func (api WdPoStAPIImpl) WdPoStHeartbeatTasks(ctx context.Context, runningTaskIDs []string, workerName string) (core.Meta, error) { + return nil, api.taskMgr.Heartbeat(ctx, runningTaskIDs, workerName) } func (api WdPoStAPIImpl) WdPoStAllocateTasks(ctx context.Context, spec core.AllocateWdPoStTaskSpec, num uint32, workerName string) (allocatedTasks []*core.WdPoStAllocatedTask, err error) { return api.taskMgr.AllocateTasks(ctx, spec, num, workerName) } -func (api WdPoStAPIImpl) WdPoStFinishTask(ctx context.Context, taskID string, output *stage.WindowPoStOutput, errorReason string) error { - return api.taskMgr.Finish(ctx, taskID, output, errorReason) +func (api WdPoStAPIImpl) WdPoStFinishTask(ctx context.Context, taskID string, output *stage.WindowPoStOutput, errorReason string) (core.Meta, error) { + return nil, api.taskMgr.Finish(ctx, taskID, output, errorReason) } -func (api WdPoStAPIImpl) WdPoStResetTask(ctx context.Context, taskID string) error { - return api.taskMgr.Reset(ctx, taskID) +func (api WdPoStAPIImpl) WdPoStResetTask(ctx context.Context, taskID string) (core.Meta, error) { + // TODO(0x5459): return a friendlier error if taskID not exists + return nil, api.taskMgr.Reset(ctx, taskID) +} + +func (api WdPoStAPIImpl) WdPoStRemoveTask(ctx context.Context, taskID string) (core.Meta, error) { + // TODO(0x5459): return a friendlier error if taskID not exists + return nil, api.taskMgr.Remove(ctx, taskID) } func (api WdPoStAPIImpl) WdPoStAllTasks(ctx context.Context) ([]*core.WdPoStTask, error) { return 
api.taskMgr.All(ctx, func(_ *core.WdPoStTask) bool { return true }) } +func NewUnavailableWdPoStAPIImpl(taskMgr core.WorkerWdPoStTaskManager) core.WorkerWdPoStAPI { + return &UnavailableWdPoStAPIImpl{} +} + // TODO(0x5459): UnavailableWdPoStAPIImpl should be automatically generated type UnavailableWdPoStAPIImpl struct{} -func (UnavailableWdPoStAPIImpl) WdPoStHeartbeatTask(ctx context.Context, runningTaskIDs []string, workerName string) error { - return fmt.Errorf("WdPoStAPI unavailable") +func (UnavailableWdPoStAPIImpl) WdPoStHeartbeatTasks(ctx context.Context, runningTaskIDs []string, workerName string) (core.Meta, error) { + return nil, fmt.Errorf("WdPoStAPI unavailable") +} + +func (UnavailableWdPoStAPIImpl) WdPoStAllocateTasks(ctx context.Context, spec core.AllocateWdPoStTaskSpec, num uint32, workerName string) (allocatedTasks []*core.WdPoStAllocatedTask, err error) { + return nil, fmt.Errorf("WdPoStAPI unavailable") } -func (UnavailableWdPoStAPIImpl) WdPoStAllocateTasks(ctx context.Context, spec core.AllocateWdPoStTaskSpec, num uint32, workerName string) (allocatedTasks []core.WdPoStAllocatedTask, err error) { +func (UnavailableWdPoStAPIImpl) WdPoStFinishTask(ctx context.Context, taskID string, output *stage.WindowPoStOutput, errorReason string) (core.Meta, error) { return nil, fmt.Errorf("WdPoStAPI unavailable") } -func (UnavailableWdPoStAPIImpl) WdPoStFinishTask(ctx context.Context, taskID string, output *stage.WindowPoStOutput, errorReason string) error { - return fmt.Errorf("WdPoStAPI unavailable") +func (UnavailableWdPoStAPIImpl) WdPoStResetTask(ctx context.Context, taskID string) (core.Meta, error) { + return nil, fmt.Errorf("WdPoStAPI unavailable") } -func (UnavailableWdPoStAPIImpl) WdPoStResetTask(ctx context.Context, taskID string) error { - return fmt.Errorf("WdPoStAPI unavailable") +func (UnavailableWdPoStAPIImpl) WdPoStRemoveTask(ctx context.Context, taskID string) (core.Meta, error) { + return nil, fmt.Errorf("WdPoStAPI unavailable") } func (UnavailableWdPoStAPIImpl) WdPoStAllTasks(ctx context.Context) ([]*core.WdPoStTask, error) { diff --git a/damocles-manager/modules/impl/prover/worker/task_mgr_kv.go b/damocles-manager/modules/impl/prover/worker/task_mgr_kv.go index 5a3747f55..4d294c3c0 100644 --- a/damocles-manager/modules/impl/prover/worker/task_mgr_kv.go +++ b/damocles-manager/modules/impl/prover/worker/task_mgr_kv.go @@ -6,9 +6,11 @@ import ( "errors" "fmt" "math" + "sort" "strings" "time" + "github.com/filecoin-project/go-state-types/abi" "github.com/ipfs-force-community/damocles/damocles-manager/core" "github.com/ipfs-force-community/damocles/damocles-manager/pkg/extproc/stage" "github.com/ipfs-force-community/damocles/damocles-manager/pkg/kvstore" @@ -33,7 +35,7 @@ func (tm *kvTaskManager) filter(ctx context.Context, txn kvstore.TxnExt, state c return } defer it.Close() - for it.Next() && len(tasks) <= int(limit) { + for it.Next() && len(tasks) < int(limit) { var task core.WdPoStTask if err = it.View(ctx, kvstore.LoadJSON(&task)); err != nil { return @@ -57,6 +59,9 @@ func (tm *kvTaskManager) All(ctx context.Context, filter func(*core.WdPoStTask) } return err }) + sort.Slice(tasks, func(i, j int) bool { + return tasks[i].CreatedAt > tasks[j].CreatedAt + }) return } @@ -108,6 +113,7 @@ func (tm *kvTaskManager) Create(ctx context.Context, deadlineIdx uint64, input c now := time.Now().Unix() task = &core.WdPoStTask{ ID: taskID, + State: string(core.WdPoStTaskReadyToRun), DeadlineIdx: deadlineIdx, Input: input, Output: nil, @@ -137,7 +143,9 @@ func (tm 
*kvTaskManager) AllocateTasks(ctx context.Context, spec core.AllocateWd if len(spec.AllowedMiners) > 0 && !slices.Contains(spec.AllowedMiners, t.Input.MinerID) { return false } - if len(spec.AllowedProofTypes) > 0 && !slices.Contains(spec.AllowedProofTypes, t.Input.ProofType) { + if len(spec.AllowedProofTypes) > 0 && !slices.ContainsFunc(spec.AllowedProofTypes, func(allowed abi.RegisteredPoStProof) bool { + return stage.ProofType2String(allowed) == t.Input.ProofType + }) { return false } return true @@ -147,15 +155,16 @@ func (tm *kvTaskManager) AllocateTasks(ctx context.Context, spec core.AllocateWd } now := uint64(time.Now().Unix()) for _, task := range readyToRun { + // Moving ready to run tasks to running tasks + if err := txn.Del([]byte(makeWdPoStKey(core.WdPoStTaskReadyToRun, task.ID))); err != nil { + return err + } + task.State = string(core.WdPoStTaskRunning) task.TryNum++ task.StartedAt = now task.WorkerName = workerName task.HeartbeatAt = now task.UpdatedAt = now - // Moving ready to run tasks to running tasks - if err := txn.Del([]byte(makeWdPoStKey(core.WdPoStTaskReadyToRun, task.ID))); err != nil { - return err - } if err := txn.PutJson([]byte(makeWdPoStKey(core.WdPoStTaskRunning, task.ID)), task); err != nil { return err } @@ -176,13 +185,16 @@ func (tm *kvTaskManager) AllocateTasks(ctx context.Context, spec core.AllocateWd } func (tm *kvTaskManager) Heartbeat(ctx context.Context, taskIDs []string, workerName string) error { + now := uint64(time.Now().Unix()) err := tm.kv.UpdateMustNoConflict(ctx, func(txn kvstore.TxnExt) error { for _, taskID := range taskIDs { var task core.WdPoStTask if err := txn.Peek([]byte(makeWdPoStKey(core.WdPoStTaskRunning, taskID)), kvstore.LoadJSON(&task)); err != nil { return err } - now := uint64(time.Now().Unix()) + if task.StartedAt == 0 { + task.StartedAt = now + } task.HeartbeatAt = now task.WorkerName = workerName task.UpdatedAt = now @@ -209,6 +221,7 @@ func (tm *kvTaskManager) Finish(ctx context.Context, taskID string, output *stag return err } now := uint64(time.Now().Unix()) + task.State = string(core.WdPoStTaskFinished) task.Output = output task.ErrorReason = errorReason task.FinishedAt = now @@ -243,6 +256,7 @@ func (tm *kvTaskManager) MakeTasksDie(ctx context.Context, heartbeatTimeout time if err := txn.Del([]byte(makeWdPoStKey(core.WdPoStTaskRunning, task.ID))); err != nil { return err } + task.State = string(core.WdPoStTaskFinished) task.FinishedAt = now task.Output = nil task.ErrorReason = "heartbeat timeout" @@ -254,6 +268,12 @@ func (tm *kvTaskManager) MakeTasksDie(ctx context.Context, heartbeatTimeout time return nil }) + if err == nil { + for _, task := range shouldDead { + log.Infof("make wdPoSt task die: %s; heartbeat_at: %s", task.ID, time.Unix(int64(task.HeartbeatAt), 0).Format(time.RFC3339)) + } + } + return err } @@ -290,7 +310,7 @@ func (tm *kvTaskManager) RetryFailedTasks(ctx context.Context, maxTry, limit uin err := tm.kv.UpdateMustNoConflict(ctx, func(txn kvstore.TxnExt) error { var err error shouldRetry, err = tm.filter(ctx, txn, core.WdPoStTaskFinished, limit, func(t *core.WdPoStTask) bool { - return len(t.ErrorReason) != 0 && t.TryNum > maxTry + return len(t.ErrorReason) != 0 && t.TryNum < maxTry }) if err != nil { return err @@ -298,11 +318,12 @@ func (tm *kvTaskManager) RetryFailedTasks(ctx context.Context, maxTry, limit uin now := uint64(time.Now().Unix()) for _, task := range shouldRetry { task.ErrorReason = "" + task.State = string(core.WdPoStTaskReadyToRun) task.Output = nil task.StartedAt = 0 
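Every state change in this manager is a two-step move inside a single UpdateMustNoConflict transaction: delete the record under its old state prefix, then write it under the new one, so a task is never visible under two states at once. A sketch of that pattern; moveTask does not exist in the patch, it only names the two steps AllocateTasks performs above when moving a task from ready2run to running:

func moveTask(txn kvstore.TxnExt, from, to core.WdPoStTaskState, task *core.WdPoStTask) error {
	// drop the key under the old state prefix first ...
	if err := txn.Del([]byte(makeWdPoStKey(from, task.ID))); err != nil {
		return err
	}
	// ... then persist the task under the new state prefix
	task.State = string(to)
	return txn.PutJson([]byte(makeWdPoStKey(to, task.ID)), task)
}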
task.FinishedAt = 0 task.UpdatedAt = now - if err := txn.PutJson([]byte(makeWdPoStKey(core.WdPoStTaskFinished, task.ID)), task); err != nil { + if err := txn.PutJson([]byte(makeWdPoStKey(core.WdPoStTaskReadyToRun, task.ID)), task); err != nil { return err } } @@ -311,7 +332,7 @@ func (tm *kvTaskManager) RetryFailedTasks(ctx context.Context, maxTry, limit uin if err == nil { for _, task := range shouldRetry { - log.Debugf("retry wdPoSt task: %d; try_num: %d, error_reason: %s", task.ID, task.TryNum) + log.Debugf("retry wdPoSt task: %s; try_num: %d, error_reason: %s", task.ID, task.TryNum, task.ErrorReason) } } @@ -333,6 +354,7 @@ func (tm *kvTaskManager) Reset(ctx context.Context, taskID string) error { return fmt.Errorf("load task from db: %w. taskID: %s", err, taskID) } + task.State = string(core.WdPoStTaskReadyToRun) task.CreatedAt = now task.StartedAt = 0 task.TryNum = 0 @@ -356,6 +378,30 @@ func (tm *kvTaskManager) Reset(ctx context.Context, taskID string) error { return err } +func (tm *kvTaskManager) Remove(ctx context.Context, taskID string) error { + err := tm.kv.UpdateMustNoConflict(ctx, func(txn kvstore.TxnExt) error { + key, err := txn.PeekAny( + kvstore.NilF, + kvstore.Key(makeWdPoStKey(core.WdPoStTaskReadyToRun, taskID)), + kvstore.Key(makeWdPoStKey(core.WdPoStTaskRunning, taskID)), + kvstore.Key(makeWdPoStKey(core.WdPoStTaskFinished, taskID)), + ) + if errors.Is(err, kvstore.ErrKeyNotFound) { + return nil + } + if err != nil { + return fmt.Errorf("load task from db: %w. taskID: %s", err, taskID) + } + return txn.Del(key) + }) + + if err == nil { + log.Infof("task removed: %s", taskID) + } + + return err +} + const ( prefixTaskIDdelimiter = ":" ) diff --git a/damocles-manager/pkg/kvstore/kvstore.go b/damocles-manager/pkg/kvstore/kvstore.go index 4ed46bb57..f0ee199e1 100644 --- a/damocles-manager/pkg/kvstore/kvstore.go +++ b/damocles-manager/pkg/kvstore/kvstore.go @@ -35,6 +35,10 @@ var LoadJSON = func(target any) func(Val) error { } } +var NilF = func(Val) error { + return nil +} + func NewKVExt(kvStore KVStore) *KVExt { return &KVExt{ KVStore: kvStore, diff --git a/damocles-manager/pkg/logging/log.go b/damocles-manager/pkg/logging/log.go index ae0565ea4..429bde55e 100644 --- a/damocles-manager/pkg/logging/log.go +++ b/damocles-manager/pkg/logging/log.go @@ -21,6 +21,7 @@ func Setup() { _ = logging.SetLogLevel("*", "INFO") _ = logging.SetLogLevel("dix", "INFO") _ = logging.SetLogLevel("badger", "INFO") + _ = logging.SetLogLevel("kv", "INFO") _ = logging.SetLogLevel("rpc", "INFO") // copy from lotus From b75e0c041069ea6a87f6e3ff7d258391669f6316 Mon Sep 17 00:00:00 2001 From: 0x5459 <0x5459@protonmail.com> Date: Thu, 27 Jul 2023 11:31:27 +0800 Subject: [PATCH 12/18] feat: worker wdpost --- .../cmd/damocles-manager/daemon.go | 11 +- .../damocles-manager/internal/util_worker.go | 110 ++-- .../cmd/damocles-manager/server.go | 27 +- damocles-manager/core/api.go | 14 +- damocles-manager/core/client_gen.go | 24 +- damocles-manager/core/types_wdpost.go | 65 ++- damocles-manager/dep/prover.go | 18 +- .../modules/impl/prover/worker/config.go | 20 +- .../modules/impl/prover/worker/job_mgr_kv.go | 425 ++++++++++++++++ .../impl/prover/worker/job_mgr_kv_test.go | 18 + .../modules/impl/prover/worker/prover.go | 134 ++--- .../modules/impl/prover/worker/rpc.go | 77 +-- .../modules/impl/prover/worker/task_mgr_kv.go | 421 ---------------- .../impl/prover/worker/task_mgr_kv_test.go | 18 - damocles-worker/Cargo.lock | 21 +- damocles-worker/Cargo.toml | 11 +- .../src/bin/damocles-worker/worker/mod.rs 
| 6 +- damocles-worker/src/config.rs | 5 +- damocles-worker/src/metadb/mod.rs | 4 +- damocles-worker/src/rpc/sealer/mod.rs | 78 +-- damocles-worker/src/rpc/worker/mod.rs | 4 +- damocles-worker/src/sealing/config.rs | 3 +- damocles-worker/src/sealing/mod.rs | 1 + damocles-worker/src/sealing/paths.rs | 23 + .../src/sealing/sealing_thread/ctrl.rs | 15 +- .../sealing_thread/{task => }/entry.rs | 0 .../src/sealing/sealing_thread/mod.rs | 160 +++++- .../sealing/sealing_thread/planner/common.rs | 227 +++++++++ .../{task => planner/common}/event.rs | 35 +- .../common.rs => planner/common/sealing.rs} | 129 ++--- .../{task => planner/common}/sector.rs | 16 +- .../sealing_thread/planner/common/task.rs | 216 ++++++++ .../src/sealing/sealing_thread/planner/mod.rs | 73 +++ .../{task => }/planner/rebuild.rs | 64 ++- .../{task => }/planner/sealer.rs | 141 +++--- .../{task => }/planner/snapup.rs | 93 ++-- .../{task => }/planner/unseal.rs | 212 +++++--- .../sealing/sealing_thread/planner/wdpost.rs | 473 ++++++++++++++++++ .../src/sealing/sealing_thread/task/mod.rs | 421 ---------------- .../sealing_thread/task/planner/mod.rs | 76 --- .../sealing_thread/task/planner/wdpost.rs | 271 ---------- .../sealing/sealing_thread/{task => }/util.rs | 25 +- damocles-worker/src/sealing/service.rs | 15 +- damocles-worker/src/types.rs | 16 + 44 files changed, 2374 insertions(+), 1842 deletions(-) create mode 100644 damocles-manager/modules/impl/prover/worker/job_mgr_kv.go create mode 100644 damocles-manager/modules/impl/prover/worker/job_mgr_kv_test.go delete mode 100644 damocles-manager/modules/impl/prover/worker/task_mgr_kv.go delete mode 100644 damocles-manager/modules/impl/prover/worker/task_mgr_kv_test.go create mode 100644 damocles-worker/src/sealing/paths.rs rename damocles-worker/src/sealing/sealing_thread/{task => }/entry.rs (100%) create mode 100644 damocles-worker/src/sealing/sealing_thread/planner/common.rs rename damocles-worker/src/sealing/sealing_thread/{task => planner/common}/event.rs (91%) rename damocles-worker/src/sealing/sealing_thread/{task/planner/common.rs => planner/common/sealing.rs} (77%) rename damocles-worker/src/sealing/sealing_thread/{task => planner/common}/sector.rs (94%) create mode 100644 damocles-worker/src/sealing/sealing_thread/planner/common/task.rs create mode 100644 damocles-worker/src/sealing/sealing_thread/planner/mod.rs rename damocles-worker/src/sealing/sealing_thread/{task => }/planner/rebuild.rs (77%) rename damocles-worker/src/sealing/sealing_thread/{task => }/planner/sealer.rs (76%) rename damocles-worker/src/sealing/sealing_thread/{task => }/planner/snapup.rs (80%) rename damocles-worker/src/sealing/sealing_thread/{task => }/planner/unseal.rs (66%) create mode 100644 damocles-worker/src/sealing/sealing_thread/planner/wdpost.rs delete mode 100644 damocles-worker/src/sealing/sealing_thread/task/mod.rs delete mode 100644 damocles-worker/src/sealing/sealing_thread/task/planner/mod.rs delete mode 100644 damocles-worker/src/sealing/sealing_thread/task/planner/wdpost.rs rename damocles-worker/src/sealing/sealing_thread/{task => }/util.rs (82%) diff --git a/damocles-manager/cmd/damocles-manager/daemon.go b/damocles-manager/cmd/damocles-manager/daemon.go index 1154e8af5..6f9e58b48 100644 --- a/damocles-manager/cmd/damocles-manager/daemon.go +++ b/damocles-manager/cmd/damocles-manager/daemon.go @@ -128,11 +128,16 @@ var daemonRunCmd = &cli.Command{ dep.Miner(), ), dep.Gateway(), + dix.Override(new(*APIService), NewAPIServiceDisbaleWorkerWdPoSt), + dix.If(extProver, dep.ExtProver()), 
- dix.If(workerProver, dep.WorkerProver()), - dix.If(!workerProver, dep.DisableWorkerProver()), + dix.If( + workerProver, + dep.WorkerProver(), + dix.Override(new(*APIService), NewAPIService), + ), dep.Sealer(), - dix.Override(new(*APIService), NewAPIService), + dix.Populate(dep.InvokePopulate, &apiService), ) if err != nil { diff --git a/damocles-manager/cmd/damocles-manager/internal/util_worker.go b/damocles-manager/cmd/damocles-manager/internal/util_worker.go index 2a9641ed3..bd4586856 100644 --- a/damocles-manager/cmd/damocles-manager/internal/util_worker.go +++ b/damocles-manager/cmd/damocles-manager/internal/util_worker.go @@ -305,7 +305,7 @@ func resolveWorkerDest(ctx context.Context, a *APIClient, name string) (string, var utilWdPostCmd = &cli.Command{ Name: "wdpost", - Usage: "manager wdpost task when the task is handle by worker", + Usage: "manager wdpost jobs if the jobs is handle by worker", Subcommands: []*cli.Command{ utilWdPostListCmd, utilWdPostResetCmd, @@ -316,11 +316,11 @@ var utilWdPostCmd = &cli.Command{ var utilWdPostListCmd = &cli.Command{ Name: "list", - Usage: "list all wdpost task", + Usage: "list all wdpost job", Flags: []cli.Flag{ &cli.BoolFlag{ Name: "all", - Usage: "list all wdpost task, include the task that has been succeed", + Usage: "list all wdpost job, include the job that has been succeed", }, &cli.BoolFlag{ Name: "detail", @@ -334,10 +334,10 @@ var utilWdPostListCmd = &cli.Command{ } defer stopper() - var tasks []*core.WdPoStTask - tasks, err = a.Damocles.WdPoStAllTasks(actx) + var jobs []*core.WdPoStJob + jobs, err = a.Damocles.WdPoStAllJobs(actx) if err != nil { - return fmt.Errorf("get wdpost tasks: %w", err) + return fmt.Errorf("get wdpost jobs: %w", err) } detail := cctx.Bool("detail") @@ -357,59 +357,47 @@ var utilWdPostListCmd = &cli.Command{ } return time.Unix(int64(unix_secs), 0).Format("01-02 15:04:05") } - for _, task := range tasks { - state := "ReadyToRun" - if task.StartedAt != 0 { - state = "Running" - } - if task.FinishedAt != 0 { - if task.ErrorReason != "" { - state = "Failed" - } else { - state = "Succeed" - } - } - - if !cctx.Bool("all") && state == "Succeed" { + for _, job := range jobs { + if !cctx.Bool("all") && job.Succeed() { continue } if detail { fmt.Fprintf(w, "%s\t%s\t%s\t%d\t%s\t%s\t%d\t%s\t%s\t%s\t%s\t%s\t%s\n", - task.ID, - task.State, - task.Input.MinerID, - task.DeadlineIdx, - task.WorkerName, - state, - task.TryNum, - formatDateTime(task.CreatedAt), - formatDateTime(task.StartedAt), - formatDateTime(task.HeartbeatAt), - formatDateTime(task.FinishedAt), - formatDateTime(task.UpdatedAt), - task.ErrorReason, + job.ID, + job.State, + job.Input.MinerID, + job.DeadlineIdx, + job.WorkerName, + job.DisplayState(), + job.TryNum, + formatDateTime(job.CreatedAt), + formatDateTime(job.StartedAt), + formatDateTime(job.HeartbeatAt), + formatDateTime(job.FinishedAt), + formatDateTime(job.UpdatedAt), + job.ErrorReason, ) } else { var elapsed string - if task.StartedAt == 0 { + if job.StartedAt == 0 { elapsed = "-" - } else if task.FinishedAt == 0 { - elapsed = time.Since(time.Unix(int64(task.StartedAt), 0)).Truncate(time.Second).String() + } else if job.FinishedAt == 0 { + elapsed = time.Since(time.Unix(int64(job.StartedAt), 0)).Truncate(time.Second).String() } else { - elapsed = fmt.Sprintf("%s(done)", time.Duration(task.FinishedAt-task.StartedAt)*time.Second) + elapsed = fmt.Sprintf("%s(done)", time.Duration(job.FinishedAt-job.StartedAt)*time.Second) } fmt.Fprintf(w, "%s\t%s\t%d\t%s\t%s\t%d\t%s\t%s\t%s\n", - task.ID, - 
task.Input.MinerID, - task.DeadlineIdx, - task.WorkerName, - state, - task.TryNum, - formatDateTime(task.CreatedAt), + job.ID, + job.Input.MinerID, + job.DeadlineIdx, + job.WorkerName, + job.DisplayState(), + job.TryNum, + formatDateTime(job.CreatedAt), elapsed, - task.ErrorReason, + job.ErrorReason, ) } } @@ -421,8 +409,8 @@ var utilWdPostListCmd = &cli.Command{ var utilWdPostResetCmd = &cli.Command{ Name: "reset", - Usage: "reset the task status to allow new workers can pick it up", - ArgsUsage: "...", + Usage: "reset the job status to allow new workers can pick it up", + ArgsUsage: "...", Action: func(cctx *cli.Context) error { args := cctx.Args() if args.Len() < 1 { @@ -435,10 +423,10 @@ var utilWdPostResetCmd = &cli.Command{ } defer stopper() - for _, taskID := range args.Slice() { - _, err = a.Damocles.WdPoStResetTask(actx, taskID) + for _, jobID := range args.Slice() { + _, err = a.Damocles.WdPoStResetJob(actx, jobID) if err != nil { - return fmt.Errorf("reset wdpost task: %w", err) + return fmt.Errorf("reset wdpost job: %w", err) } } @@ -448,8 +436,8 @@ var utilWdPostResetCmd = &cli.Command{ var utilWdPostRemoveCmd = &cli.Command{ Name: "remove", - Usage: "remove wdpost task", - ArgsUsage: "...", + Usage: "remove wdpost job", + ArgsUsage: "...", Action: func(cctx *cli.Context) error { args := cctx.Args() if args.Len() < 1 { @@ -462,10 +450,10 @@ var utilWdPostRemoveCmd = &cli.Command{ } defer stopper() - for _, taskID := range args.Slice() { - _, err = a.Damocles.WdPoStRemoveTask(actx, taskID) + for _, jobID := range args.Slice() { + _, err = a.Damocles.WdPoStRemoveJob(actx, jobID) if err != nil { - return fmt.Errorf("remove wdpost task: %w", err) + return fmt.Errorf("remove wdpost job: %w", err) } } return nil @@ -474,7 +462,7 @@ var utilWdPostRemoveCmd = &cli.Command{ var utilWdPostRemoveAllCmd = &cli.Command{ Name: "remove-all", - Usage: "remove all wdpost tasks", + Usage: "remove all wdpost jobs", Flags: []cli.Flag{ &cli.BoolFlag{ Name: "really-do-it", @@ -494,16 +482,16 @@ var utilWdPostRemoveAllCmd = &cli.Command{ } defer stopper() - tasks, err := a.Damocles.WdPoStAllTasks(actx) + jobs, err := a.Damocles.WdPoStAllJobs(actx) if err != nil { return err } - for _, task := range tasks { - _, err = a.Damocles.WdPoStRemoveTask(actx, task.ID) + for _, job := range jobs { + _, err = a.Damocles.WdPoStRemoveJob(actx, job.ID) if err != nil { - return fmt.Errorf("remove wdpost task: %w", err) + return fmt.Errorf("remove wdpost job: %w", err) } - fmt.Printf("wdpost task %s removed\n", task.ID) + fmt.Printf("wdpost job %s removed\n", job.ID) } return nil }, diff --git a/damocles-manager/cmd/damocles-manager/server.go b/damocles-manager/cmd/damocles-manager/server.go index 53de27ea0..5e2f3fb0c 100644 --- a/damocles-manager/cmd/damocles-manager/server.go +++ b/damocles-manager/cmd/damocles-manager/server.go @@ -43,13 +43,38 @@ func NewAPIService( } } +func NewAPIServiceDisbaleWorkerWdPoSt( + sealerAPI core.SealerAPI, + sealerCliAPI core.SealerCliAPI, + randomnessAPI core.RandomnessAPI, + minerAPI core.MinerAPI, + plugins *managerplugin.LoadedPlugins, +) *APIService { + type coreAPI struct { + core.SealerAPI + core.SealerCliAPI + core.RandomnessAPI + core.MinerAPI + } + + return &APIService{ + coreAPI: &coreAPI{ + SealerAPI: sealerAPI, + SealerCliAPI: sealerCliAPI, + RandomnessAPI: randomnessAPI, + MinerAPI: minerAPI, + }, + plugins: plugins, + } +} + type handler struct { namespace string hdl interface{} } type APIService struct { - coreAPI core.API + coreAPI interface{} plugins 
*managerplugin.LoadedPlugins } diff --git a/damocles-manager/core/api.go b/damocles-manager/core/api.go index 011ef89d3..3c8dce840 100644 --- a/damocles-manager/core/api.go +++ b/damocles-manager/core/api.go @@ -27,7 +27,7 @@ var Empty Meta type Meta *struct{} -type API interface { +type APIFull interface { SealerAPI SealerCliAPI RandomnessAPI @@ -153,10 +153,10 @@ type MinerAPI interface { } type WorkerWdPoStAPI interface { - WdPoStHeartbeatTasks(ctx context.Context, runningTaskIDs []string, workerName string) (Meta, error) - WdPoStAllocateTasks(ctx context.Context, spec AllocateWdPoStTaskSpec, num uint32, workerName string) (allocatedTasks []*WdPoStAllocatedTask, err error) - WdPoStFinishTask(ctx context.Context, taskID string, output *stage.WindowPoStOutput, errorReason string) (Meta, error) - WdPoStResetTask(ctx context.Context, taskID string) (Meta, error) - WdPoStRemoveTask(ctx context.Context, taskID string) (Meta, error) - WdPoStAllTasks(ctx context.Context) ([]*WdPoStTask, error) + WdPoStHeartbeatJobs(ctx context.Context, runningJobIDs []string, workerName string) (Meta, error) + WdPoStAllocateJobs(ctx context.Context, spec AllocateWdPoStJobSpec, num uint32, workerName string) (allocatedJobs []*WdPoStAllocatedJob, err error) + WdPoStFinishJob(ctx context.Context, jobID string, output *stage.WindowPoStOutput, errorReason string) (Meta, error) + WdPoStResetJob(ctx context.Context, jobID string) (Meta, error) + WdPoStRemoveJob(ctx context.Context, jobID string) (Meta, error) + WdPoStAllJobs(ctx context.Context) ([]*WdPoStJob, error) } diff --git a/damocles-manager/core/client_gen.go b/damocles-manager/core/client_gen.go index 24264dcf0..2bac02098 100644 --- a/damocles-manager/core/client_gen.go +++ b/damocles-manager/core/client_gen.go @@ -264,32 +264,32 @@ var UnavailableMinerAPIClient = MinerAPIClient{ // WorkerWdPoStAPIClient is generated client for WorkerWdPoStAPI interface. 
type WorkerWdPoStAPIClient struct { - WdPoStHeartbeatTasks func(ctx context.Context, runningTaskIDs []string, workerName string) (Meta, error) - WdPoStAllocateTasks func(ctx context.Context, spec AllocateWdPoStTaskSpec, num uint32, workerName string) (allocatedTasks []*WdPoStAllocatedTask, err error) - WdPoStFinishTask func(ctx context.Context, taskID string, output *stage.WindowPoStOutput, errorReason string) (Meta, error) - WdPoStResetTask func(ctx context.Context, taskID string) (Meta, error) - WdPoStRemoveTask func(ctx context.Context, taskID string) (Meta, error) - WdPoStAllTasks func(ctx context.Context) ([]*WdPoStTask, error) + WdPoStHeartbeatJobs func(ctx context.Context, runningJobIDs []string, workerName string) (Meta, error) + WdPoStAllocateJobs func(ctx context.Context, spec AllocateWdPoStJobSpec, num uint32, workerName string) (allocatedJobs []*WdPoStAllocatedJob, err error) + WdPoStFinishJob func(ctx context.Context, jobID string, output *stage.WindowPoStOutput, errorReason string) (Meta, error) + WdPoStResetJob func(ctx context.Context, jobID string) (Meta, error) + WdPoStRemoveJob func(ctx context.Context, jobID string) (Meta, error) + WdPoStAllJobs func(ctx context.Context) ([]*WdPoStJob, error) } var UnavailableWorkerWdPoStAPIClient = WorkerWdPoStAPIClient{ - WdPoStHeartbeatTasks: func(ctx context.Context, runningTaskIDs []string, workerName string) (Meta, error) { + WdPoStHeartbeatJobs: func(ctx context.Context, runningJobIDs []string, workerName string) (Meta, error) { panic("WorkerWdPoStAPI client unavailable") }, - WdPoStAllocateTasks: func(ctx context.Context, spec AllocateWdPoStTaskSpec, num uint32, workerName string) (allocatedTasks []*WdPoStAllocatedTask, err error) { + WdPoStAllocateJobs: func(ctx context.Context, spec AllocateWdPoStJobSpec, num uint32, workerName string) (allocatedJobs []*WdPoStAllocatedJob, err error) { panic("WorkerWdPoStAPI client unavailable") }, - WdPoStFinishTask: func(ctx context.Context, taskID string, output *stage.WindowPoStOutput, errorReason string) (Meta, error) { + WdPoStFinishJob: func(ctx context.Context, jobID string, output *stage.WindowPoStOutput, errorReason string) (Meta, error) { panic("WorkerWdPoStAPI client unavailable") }, - WdPoStResetTask: func(ctx context.Context, taskID string) (Meta, error) { + WdPoStResetJob: func(ctx context.Context, jobID string) (Meta, error) { panic("WorkerWdPoStAPI client unavailable") }, - WdPoStRemoveTask: func(ctx context.Context, taskID string) (Meta, error) { + WdPoStRemoveJob: func(ctx context.Context, jobID string) (Meta, error) { panic("WorkerWdPoStAPI client unavailable") }, - WdPoStAllTasks: func(ctx context.Context) ([]*WdPoStTask, error) { + WdPoStAllJobs: func(ctx context.Context) ([]*WdPoStJob, error) { panic("WorkerWdPoStAPI client unavailable") }, } diff --git a/damocles-manager/core/types_wdpost.go b/damocles-manager/core/types_wdpost.go index 730c5d578..b89ccb0a7 100644 --- a/damocles-manager/core/types_wdpost.go +++ b/damocles-manager/core/types_wdpost.go @@ -22,15 +22,15 @@ type WdPoStInput struct { Seed [32]byte } -type WdPoStTaskState string +type WdPoStJobState string const ( - WdPoStTaskReadyToRun WdPoStTaskState = "ready2run" - WdPoStTaskRunning WdPoStTaskState = "running" - WdPoStTaskFinished WdPoStTaskState = "finished" + WdPoStJobReadyToRun WdPoStJobState = "ready2run" + WdPoStJobRunning WdPoStJobState = "running" + WdPoStJobFinished WdPoStJobState = "finished" ) -type WdPoStTask struct { +type WdPoStJob struct { ID string `json:"Id"` State string DeadlineIdx uint64 
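The renamed client surface above is what the remote side drives. The real consumer is the Rust damocles-worker, but the call order is easiest to see as a Go sketch: allocate, heartbeat while proving, then finish. Here prove is a hypothetical stand-in for the actual PoSt computation, and the empty AllocateWdPoStJobSpec means no miner or proof-type restriction:

func runOneJob(ctx context.Context, cli WorkerWdPoStAPIClient, workerName string) error {
	jobs, err := cli.WdPoStAllocateJobs(ctx, AllocateWdPoStJobSpec{}, 1, workerName)
	if err != nil || len(jobs) == 0 {
		return err
	}
	job := jobs[0]
	// periodic heartbeats keep the job in the running state; without them the
	// manager fails it through the heartbeat-timeout path
	if _, err := cli.WdPoStHeartbeatJobs(ctx, []string{job.ID}, workerName); err != nil {
		return err
	}
	output, proveErr := prove(job.Input) // hypothetical: returns (*stage.WindowPoStOutput, error)
	reason := ""
	if proveErr != nil {
		reason = proveErr.Error()
	}
	_, err = cli.WdPoStFinishJob(ctx, job.ID, output, reason)
	return err
}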
@@ -46,8 +46,8 @@ type WdPoStTask struct { UpdatedAt uint64 } -func (t *WdPoStTask) Finished(maxTry uint32) bool { - if t.FinishedAt == 0 { +func (t *WdPoStJob) Finished(maxTry uint32) bool { + if t.State != string(WdPoStJobFinished) { return false } @@ -58,26 +58,49 @@ func (t *WdPoStTask) Finished(maxTry uint32) bool { return true } -type WdPoStAllocatedTask struct { +func (t *WdPoStJob) Succeed() bool { + if t.State != string(WdPoStJobFinished) { + return false + } + return t.Output != nil +} + +func (t *WdPoStJob) DisplayState() string { + switch WdPoStJobState(t.State) { + case WdPoStJobReadyToRun: + return "ReadyToRun" + case WdPoStJobRunning: + return "Running" + case WdPoStJobFinished: + if t.Succeed() { + return "Succeed" + } else { + return "Failed" + } + } + return t.State +} + +type WdPoStAllocatedJob struct { ID string `json:"Id"` Input WdPoStInput } -type AllocateWdPoStTaskSpec struct { +type AllocateWdPoStJobSpec struct { AllowedMiners []abi.ActorID AllowedProofTypes []abi.RegisteredPoStProof } -type WorkerWdPoStTaskManager interface { - All(ctx context.Context, filter func(*WdPoStTask) bool) ([]*WdPoStTask, error) - ListByTaskIDs(ctx context.Context, state WdPoStTaskState, taskIDs ...string) ([]*WdPoStTask, error) - Create(ctx context.Context, deadlineIdx uint64, input WdPoStInput) (*WdPoStTask, error) - AllocateTasks(ctx context.Context, spec AllocateWdPoStTaskSpec, num uint32, workerName string) (allocatedTasks []*WdPoStAllocatedTask, err error) - Heartbeat(ctx context.Context, taskIDs []string, workerName string) error - Finish(ctx context.Context, taskID string, output *stage.WindowPoStOutput, errorReason string) error - MakeTasksDie(ctx context.Context, shouldDeadDur time.Duration, limit uint32) error - CleanupExpiredTasks(ctx context.Context, taskLifetime time.Duration, limit uint32) error - RetryFailedTasks(ctx context.Context, maxTry, limit uint32) error - Reset(ctx context.Context, taskID string) error - Remove(ctx context.Context, taskID string) error +type WorkerWdPoStJobManager interface { + All(ctx context.Context, filter func(*WdPoStJob) bool) ([]*WdPoStJob, error) + ListByJobIDs(ctx context.Context, state WdPoStJobState, jobIDs ...string) ([]*WdPoStJob, error) + Create(ctx context.Context, deadlineIdx uint64, input WdPoStInput) (*WdPoStJob, error) + AllocateJobs(ctx context.Context, spec AllocateWdPoStJobSpec, num uint32, workerName string) (allocatedJobs []*WdPoStAllocatedJob, err error) + Heartbeat(ctx context.Context, jobIDs []string, workerName string) error + Finish(ctx context.Context, jobID string, output *stage.WindowPoStOutput, errorReason string) error + MakeJobsDie(ctx context.Context, shouldDeadDur time.Duration, limit uint32) error + CleanupExpiredJobs(ctx context.Context, jobLifetime time.Duration, limit uint32) error + RetryFailedJobs(ctx context.Context, maxTry, limit uint32) error + Reset(ctx context.Context, jobID string) error + Remove(ctx context.Context, jobID string) error } diff --git a/damocles-manager/dep/prover.go b/damocles-manager/dep/prover.go index 9279e6455..6bbd700c6 100644 --- a/damocles-manager/dep/prover.go +++ b/damocles-manager/dep/prover.go @@ -33,18 +33,12 @@ func WorkerProver() dix.Option { return dix.Options( dix.Override(new(WorkerProverStore), BuildWorkerProverStore), dix.Override(new(*proverworker.Config), proverworker.DefaultConfig), - dix.Override(new(core.WorkerWdPoStTaskManager), BuildWorkerWdPoStTaskManager), + dix.Override(new(core.WorkerWdPoStJobManager), BuildWorkerWdPoStJobManager), 
dix.Override(new(core.WorkerWdPoStAPI), proverworker.NewWdPoStAPIImpl), dix.Override(new(core.Prover), BuildWorkerProver), ) } -func DisableWorkerProver() dix.Option { - return dix.Options( - dix.Override(new(core.WorkerWdPoStAPI), proverworker.NewUnavailableWdPoStAPIImpl), - ) -} - func BuildExtProver(gctx GlobalContext, lc fx.Lifecycle, sectorTracker core.SectorTracker, cfg *modules.ProcessorConfig) (*ext.Prover, error) { p, err := ext.New(gctx, sectorTracker, cfg.WdPost, cfg.WinPost) if err != nil { @@ -98,11 +92,11 @@ func BuildWorkerProverStore(gctx GlobalContext, db UnderlyingDB) (WorkerProverSt return db.OpenCollection(gctx, "prover") } -func BuildWorkerProver(lc fx.Lifecycle, taskMgr core.WorkerWdPoStTaskManager, sectorTracker core.SectorTracker, config *proverworker.Config) (core.Prover, error) { - p := proverworker.NewProver(taskMgr, sectorTracker, config) +func BuildWorkerProver(lc fx.Lifecycle, jobMgr core.WorkerWdPoStJobManager, sectorTracker core.SectorTracker, config *proverworker.Config) (core.Prover, error) { + p := proverworker.NewProver(jobMgr, sectorTracker, config) lc.Append(fx.Hook{ OnStart: func(ctx context.Context) error { - p.StartJob(ctx) + p.Start(ctx) return nil }, }) @@ -110,10 +104,10 @@ func BuildWorkerProver(lc fx.Lifecycle, taskMgr core.WorkerWdPoStTaskManager, se return p, nil } -func BuildWorkerWdPoStTaskManager(kv WorkerProverStore) (core.WorkerWdPoStTaskManager, error) { +func BuildWorkerWdPoStJobManager(kv WorkerProverStore) (core.WorkerWdPoStJobManager, error) { wdpostKV, err := kvstore.NewWrappedKVStore([]byte("wdpost-"), kv) if err != nil { return nil, err } - return proverworker.NewKVTaskManager(*kvstore.NewKVExt(wdpostKV)), nil + return proverworker.NewKVJobManager(*kvstore.NewKVExt(wdpostKV)), nil } diff --git a/damocles-manager/modules/impl/prover/worker/config.go b/damocles-manager/modules/impl/prover/worker/config.go index c00a45176..98c77614d 100644 --- a/damocles-manager/modules/impl/prover/worker/config.go +++ b/damocles-manager/modules/impl/prover/worker/config.go @@ -3,19 +3,19 @@ package worker import "time" type Config struct { - RetryFailedTasksInterval time.Duration - TaskMaxTry uint32 - HeartbeatTimeout time.Duration - CleanupExpiredTasksJobInterval time.Duration - TaskLifetime time.Duration + RetryFailedJobsInterval time.Duration + JobMaxTry uint32 + HeartbeatTimeout time.Duration + CleanupExpiredJobsInterval time.Duration + JobLifetime time.Duration } func DefaultConfig() *Config { return &Config{ - RetryFailedTasksInterval: 10 * time.Second, - TaskMaxTry: 2, - HeartbeatTimeout: 15 * time.Second, - CleanupExpiredTasksJobInterval: 30 * time.Minute, - TaskLifetime: 25 * time.Hour, + RetryFailedJobsInterval: 10 * time.Second, + JobMaxTry: 2, + HeartbeatTimeout: 15 * time.Second, + CleanupExpiredJobsInterval: 30 * time.Minute, + JobLifetime: 25 * time.Hour, } } diff --git a/damocles-manager/modules/impl/prover/worker/job_mgr_kv.go b/damocles-manager/modules/impl/prover/worker/job_mgr_kv.go new file mode 100644 index 000000000..9ffa39238 --- /dev/null +++ b/damocles-manager/modules/impl/prover/worker/job_mgr_kv.go @@ -0,0 +1,425 @@ +package worker + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "math" + "sort" + "strings" + "time" + + "github.com/filecoin-project/go-state-types/abi" + "github.com/ipfs-force-community/damocles/damocles-manager/core" + "github.com/ipfs-force-community/damocles/damocles-manager/pkg/extproc/stage" + "github.com/ipfs-force-community/damocles/damocles-manager/pkg/kvstore" + 
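For illustration, a sketch (not part of the patch) of how the pieces above wire together without the dix/fx plumbing, mirroring BuildWorkerWdPoStJobManager and BuildWorkerProver; the kvstore.KVStore parameter type, the function name, and the JobMaxTry override are assumptions:

package example

import (
	"context"

	"github.com/ipfs-force-community/damocles/damocles-manager/core"
	proverworker "github.com/ipfs-force-community/damocles/damocles-manager/modules/impl/prover/worker"
	"github.com/ipfs-force-community/damocles/damocles-manager/pkg/kvstore"
)

// buildWdPoStProver assembles the wdPoSt job manager and worker prover by hand.
// kv and sectorTracker are assumed to be provided by the caller (the dep layer
// passes a WorkerProverStore and the shared SectorTracker here).
func buildWdPoStProver(ctx context.Context, kv kvstore.KVStore, sectorTracker core.SectorTracker) (core.Prover, error) {
	wdpostKV, err := kvstore.NewWrappedKVStore([]byte("wdpost-"), kv)
	if err != nil {
		return nil, err
	}
	jobMgr := proverworker.NewKVJobManager(*kvstore.NewKVExt(wdpostKV))

	cfg := proverworker.DefaultConfig()
	cfg.JobMaxTry = 3 // hypothetical override; the default is 2

	p := proverworker.NewProver(jobMgr, sectorTracker, cfg)
	p.Start(ctx) // launches the notify-done / retry-failed / cleanup-expired loops
	return p, nil
}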
"golang.org/x/exp/slices" +) + +func NewKVJobManager(kv kvstore.KVExt) core.WorkerWdPoStJobManager { + return &kvJobManager{ + kv: kv, + } +} + +type kvJobManager struct { + kv kvstore.KVExt +} + +// TODO(0x5459): Consider putting `txn` into context? +func (tm *kvJobManager) filter(ctx context.Context, txn kvstore.TxnExt, state core.WdPoStJobState, limit uint32, f func(*core.WdPoStJob) bool) (jobs []*core.WdPoStJob, err error) { + var it kvstore.Iter + it, err = txn.Scan([]byte(makeWdPoStPrefix(state))) + if err != nil { + return + } + defer it.Close() + for it.Next() && len(jobs) < int(limit) { + var job core.WdPoStJob + if err = it.View(ctx, kvstore.LoadJSON(&job)); err != nil { + return + } + if f(&job) { + jobs = append(jobs, &job) + } + } + return +} + +func (tm *kvJobManager) All(ctx context.Context, filter func(*core.WdPoStJob) bool) (jobs []*core.WdPoStJob, err error) { + jobs = make([]*core.WdPoStJob, 0) + err = tm.kv.ViewMustNoConflict(ctx, func(txn kvstore.TxnExt) error { + for _, state := range []core.WdPoStJobState{core.WdPoStJobReadyToRun, core.WdPoStJobRunning, core.WdPoStJobFinished} { + ts, err := tm.filter(ctx, txn, state, math.MaxUint32, filter) + if err != nil { + return err + } + jobs = append(jobs, ts...) + } + return err + }) + sort.Slice(jobs, func(i, j int) bool { + return jobs[i].CreatedAt > jobs[j].CreatedAt + }) + return +} + +func (tm *kvJobManager) ListByJobIDs(ctx context.Context, state core.WdPoStJobState, jobIDs ...string) ([]*core.WdPoStJob, error) { + jobs := make([]*core.WdPoStJob, 0, len(jobIDs)) + err := tm.kv.ViewMustNoConflict(ctx, func(txn kvstore.TxnExt) error { + for _, jobID := range jobIDs { + var job core.WdPoStJob + err := txn.Peek(kvstore.Key(makeWdPoStKey(state, jobID)), kvstore.LoadJSON(&job)) + if errors.Is(err, kvstore.ErrKeyNotFound) { + continue + } + if err != nil { + return err + } + jobs = append(jobs, &job) + } + return nil + }) + return jobs, err +} + +func (tm *kvJobManager) Create(ctx context.Context, deadlineIdx uint64, input core.WdPoStInput) (*core.WdPoStJob, error) { + var ( + jobID string + job *core.WdPoStJob + ) + err := tm.kv.UpdateMustNoConflict(ctx, func(txn kvstore.TxnExt) error { + rawInput, err := json.Marshal(input) + if err != nil { + return err + } + jobID = GenJobID(rawInput) + // check if job exists + _, err = txn.PeekAny( + kvstore.LoadJSON(job), + kvstore.Key(makeWdPoStKey(core.WdPoStJobReadyToRun, jobID)), + kvstore.Key(makeWdPoStKey(core.WdPoStJobRunning, jobID)), + kvstore.Key(makeWdPoStKey(core.WdPoStJobFinished, jobID)), + ) + if err == nil { + // return if it is exists + return nil + } + if !errors.Is(err, kvstore.ErrKeyNotFound) { + return err + } + + now := time.Now().Unix() + job = &core.WdPoStJob{ + ID: jobID, + State: string(core.WdPoStJobReadyToRun), + DeadlineIdx: deadlineIdx, + Input: input, + Output: nil, + TryNum: 0, + ErrorReason: "", + WorkerName: "", + StartedAt: 0, + HeartbeatAt: 0, + FinishedAt: 0, + CreatedAt: uint64(now), + UpdatedAt: uint64(now), + } + return txn.PutJson([]byte(makeWdPoStKey(core.WdPoStJobReadyToRun, jobID)), job) + }) + + if err == nil { + log.Infof("wdPoSt job created: %s", jobID) + } + return job, err +} + +func (tm *kvJobManager) AllocateJobs(ctx context.Context, spec core.AllocateWdPoStJobSpec, n uint32, workerName string) (allocatedJobs []*core.WdPoStAllocatedJob, err error) { + var readyToRun []*core.WdPoStJob + allocatedJobs = make([]*core.WdPoStAllocatedJob, 0) + err = tm.kv.UpdateMustNoConflict(ctx, func(txn kvstore.TxnExt) error { + readyToRun, err = 
tm.filter(ctx, txn, core.WdPoStJobReadyToRun, n, func(t *core.WdPoStJob) bool { + if len(spec.AllowedMiners) > 0 && !slices.Contains(spec.AllowedMiners, t.Input.MinerID) { + return false + } + if len(spec.AllowedProofTypes) > 0 && !slices.ContainsFunc(spec.AllowedProofTypes, func(allowed abi.RegisteredPoStProof) bool { + return stage.ProofType2String(allowed) == t.Input.ProofType + }) { + return false + } + return true + }) + if err != nil { + return err + } + now := uint64(time.Now().Unix()) + for _, job := range readyToRun { + // Moving ready to run jobs to running jobs + if err := txn.Del([]byte(makeWdPoStKey(core.WdPoStJobReadyToRun, job.ID))); err != nil { + return err + } + job.State = string(core.WdPoStJobRunning) + job.TryNum++ + job.StartedAt = now + job.WorkerName = workerName + job.HeartbeatAt = now + job.UpdatedAt = now + if err := txn.PutJson([]byte(makeWdPoStKey(core.WdPoStJobRunning, job.ID)), job); err != nil { + return err + } + allocatedJobs = append(allocatedJobs, &core.WdPoStAllocatedJob{ + ID: job.ID, + Input: job.Input, + }) + } + return nil + }) + + if err == nil { + for _, job := range readyToRun { + log.Infof("allocated wdPoSt job: %s; try_num: %d", job.ID, job.TryNum) + } + } + return +} + +func (tm *kvJobManager) Heartbeat(ctx context.Context, jobIDs []string, workerName string) error { + now := uint64(time.Now().Unix()) + err := tm.kv.UpdateMustNoConflict(ctx, func(txn kvstore.TxnExt) error { + for _, jobID := range jobIDs { + var job core.WdPoStJob + if err := txn.Peek([]byte(makeWdPoStKey(core.WdPoStJobRunning, jobID)), kvstore.LoadJSON(&job)); err != nil { + return err + } + if job.StartedAt == 0 { + job.StartedAt = now + } + job.HeartbeatAt = now + job.WorkerName = workerName + job.UpdatedAt = now + if err := txn.PutJson([]byte(makeWdPoStKey(core.WdPoStJobRunning, jobID)), &job); err != nil { + return err + } + } + return nil + }) + if err == nil { + log.With("worker_name", workerName).Debug("wdPoSt jobs heartbeat", jobIDs) + } + return err +} + +func (tm *kvJobManager) Finish(ctx context.Context, jobID string, output *stage.WindowPoStOutput, errorReason string) error { + err := tm.kv.UpdateMustNoConflict(ctx, func(txn kvstore.TxnExt) error { + runningKey := []byte(makeWdPoStKey(core.WdPoStJobRunning, jobID)) + var job core.WdPoStJob + if err := txn.Peek(runningKey, kvstore.LoadJSON(&job)); err != nil { + return err + } + if err := txn.Del(runningKey); err != nil { + return err + } + now := uint64(time.Now().Unix()) + job.State = string(core.WdPoStJobFinished) + job.Output = output + job.ErrorReason = errorReason + job.FinishedAt = now + job.UpdatedAt = now + return txn.PutJson([]byte(makeWdPoStKey(core.WdPoStJobFinished, jobID)), &job) + }) + + if err == nil { + if len(errorReason) == 0 { + log.Infof("wdPoSt job succeeded: %s", jobID) + } else { + log.Warnf("wdPoSt job failed: %s; error_reason: %s", jobID, errorReason) + } + } + return err +} + +func (tm *kvJobManager) MakeJobsDie(ctx context.Context, heartbeatTimeout time.Duration, limit uint32) error { + var shouldDead []*core.WdPoStJob + shouldDeadTime := time.Now().Add(-heartbeatTimeout) + + err := tm.kv.UpdateMustNoConflict(ctx, func(txn kvstore.TxnExt) error { + var err error + shouldDead, err = tm.filter(ctx, txn, core.WdPoStJobRunning, limit, func(t *core.WdPoStJob) bool { + return t.HeartbeatAt > 0 && time.Unix(int64(t.HeartbeatAt), 0).Before(shouldDeadTime) + }) + if err != nil { + return err + } + now := uint64(time.Now().Unix()) + for _, job := range shouldDead { + if err := 
txn.Del([]byte(makeWdPoStKey(core.WdPoStJobRunning, job.ID))); err != nil { + return err + } + job.State = string(core.WdPoStJobFinished) + job.FinishedAt = now + job.Output = nil + job.ErrorReason = "heartbeat timeout" + job.UpdatedAt = now + if err := txn.PutJson([]byte(makeWdPoStKey(core.WdPoStJobFinished, job.ID)), job); err != nil { + return err + } + } + return nil + }) + + if err == nil { + for _, job := range shouldDead { + log.Infof("make wdPoSt job die: %s; heartbeat_at: %s", job.ID, time.Unix(int64(job.HeartbeatAt), 0).Format(time.RFC3339)) + } + } + + return err +} + +func (tm *kvJobManager) CleanupExpiredJobs(ctx context.Context, jobLifetime time.Duration, limit uint32) error { + var shouldClean []*core.WdPoStJob + shouldCleanTime := time.Now().Add(-jobLifetime) + + err := tm.kv.UpdateMustNoConflict(ctx, func(txn kvstore.TxnExt) error { + var err error + shouldClean, err = tm.filter(ctx, txn, core.WdPoStJobFinished, limit, func(t *core.WdPoStJob) bool { + return time.Unix(int64(t.CreatedAt), 0).Before(shouldCleanTime) + }) + if err != nil { + return err + } + for _, job := range shouldClean { + if err := txn.Del([]byte(makeWdPoStKey(core.WdPoStJobFinished, job.ID))); err != nil { + return err + } + } + return nil + }) + + if err == nil { + for _, job := range shouldClean { + log.Infof("cleanup expired wdPoSt job: %s; job: %#v", job.ID, job) + } + } + return err +} + +func (tm *kvJobManager) RetryFailedJobs(ctx context.Context, maxTry, limit uint32) error { + var shouldRetry []*core.WdPoStJob + err := tm.kv.UpdateMustNoConflict(ctx, func(txn kvstore.TxnExt) error { + var err error + shouldRetry, err = tm.filter(ctx, txn, core.WdPoStJobFinished, limit, func(t *core.WdPoStJob) bool { + return len(t.ErrorReason) != 0 && t.TryNum < maxTry + }) + if err != nil { + return err + } + now := uint64(time.Now().Unix()) + for _, job := range shouldRetry { + err := txn.Del([]byte(makeWdPoStKey(core.WdPoStJobFinished, job.ID))) + if err != nil { + return err + } + job.ErrorReason = "" + job.State = string(core.WdPoStJobReadyToRun) + job.Output = nil + job.StartedAt = 0 + job.FinishedAt = 0 + job.UpdatedAt = now + if err := txn.PutJson([]byte(makeWdPoStKey(core.WdPoStJobReadyToRun, job.ID)), job); err != nil { + return err + } + } + return nil + }) + + if err == nil { + for _, job := range shouldRetry { + log.Debugf("retry wdPoSt job: %s; try_num: %d, error_reason: %s", job.ID, job.TryNum, job.ErrorReason) + } + } + + return err +} + +func (tm *kvJobManager) Reset(ctx context.Context, jobID string) error { + var job core.WdPoStJob + now := uint64(time.Now().Unix()) + + err := tm.kv.UpdateMustNoConflict(ctx, func(txn kvstore.TxnExt) error { + key, err := txn.PeekAny( + kvstore.LoadJSON(&job), + kvstore.Key(makeWdPoStKey(core.WdPoStJobReadyToRun, jobID)), + kvstore.Key(makeWdPoStKey(core.WdPoStJobRunning, jobID)), + kvstore.Key(makeWdPoStKey(core.WdPoStJobFinished, jobID)), + ) + if err != nil { + return fmt.Errorf("load job from db: %w. 
jobID: %s", err, jobID) + } + + job.State = string(core.WdPoStJobReadyToRun) + job.CreatedAt = now + job.StartedAt = 0 + job.TryNum = 0 + job.Output = nil + job.ErrorReason = "" + job.FinishedAt = 0 + job.HeartbeatAt = 0 + job.WorkerName = "" + job.UpdatedAt = now + + if err := txn.Del(key); err != nil { + return err + } + return txn.PutJson([]byte(makeWdPoStKey(core.WdPoStJobReadyToRun, jobID)), &job) + }) + + if err == nil { + log.Infof("job is reset: %s", jobID) + } + + return err +} + +func (tm *kvJobManager) Remove(ctx context.Context, jobID string) error { + err := tm.kv.UpdateMustNoConflict(ctx, func(txn kvstore.TxnExt) error { + key, err := txn.PeekAny( + kvstore.NilF, + kvstore.Key(makeWdPoStKey(core.WdPoStJobReadyToRun, jobID)), + kvstore.Key(makeWdPoStKey(core.WdPoStJobRunning, jobID)), + kvstore.Key(makeWdPoStKey(core.WdPoStJobFinished, jobID)), + ) + if errors.Is(err, kvstore.ErrKeyNotFound) { + return nil + } + if err != nil { + return fmt.Errorf("load job from db: %w. jobID: %s", err, jobID) + } + return txn.Del(key) + }) + + if err == nil { + log.Infof("job removed: %s", jobID) + } + + return err +} + +const ( + prefixJobIDdelimiter = ":" +) + +func makeWdPoStPrefix(state core.WdPoStJobState) string { + return string(state) +} + +func makeWdPoStKey(state core.WdPoStJobState, jobID string) string { + return fmt.Sprintf("%s%s%s", makeWdPoStPrefix(state), prefixJobIDdelimiter, jobID) +} + +//lint:ignore U1000 Ignore unused function +func splitKey(key string) (state core.WdPoStJobState, jobID string) { + x := strings.SplitN(key, prefixJobIDdelimiter, 2) + return core.WdPoStJobState(x[0]), x[1] +} diff --git a/damocles-manager/modules/impl/prover/worker/job_mgr_kv_test.go b/damocles-manager/modules/impl/prover/worker/job_mgr_kv_test.go new file mode 100644 index 000000000..dff37b2b1 --- /dev/null +++ b/damocles-manager/modules/impl/prover/worker/job_mgr_kv_test.go @@ -0,0 +1,18 @@ +package worker + +import ( + "testing" + + "github.com/ipfs-force-community/damocles/damocles-manager/core" + "github.com/stretchr/testify/require" +) + +func TestSplitKey(t *testing.T) { + for _, jobID := range []string{"normal123", "with-", "-", "-with", "wi-th", "with:xxx", ":xxx", ":"} { + for _, state := range []core.WdPoStJobState{core.WdPoStJobReadyToRun, core.WdPoStJobRunning, core.WdPoStJobFinished} { + actualState, actualJobID := splitKey(makeWdPoStKey(state, jobID)) + require.Equalf(t, state, actualState, "test state for \"state: `%s`; jobID: `%s`\"", state, jobID) + require.Equalf(t, jobID, actualJobID, "test jobID for \"state: `%s`; jobID: `%s`\"", state, jobID) + } + } +} diff --git a/damocles-manager/modules/impl/prover/worker/prover.go b/damocles-manager/modules/impl/prover/worker/prover.go index 3e1c6539d..77099782a 100644 --- a/damocles-manager/modules/impl/prover/worker/prover.go +++ b/damocles-manager/modules/impl/prover/worker/prover.go @@ -22,101 +22,102 @@ var log = logging.New("worker prover") var _ core.Prover = (*WorkerProver)(nil) -func GenTaskID(rawInput []byte) string { +func GenJobID(rawInput []byte) string { b := make([]byte, 8) binary.LittleEndian.PutUint64(b, xxhash.Sum64(rawInput)) return base58.Encode(b) } +type R struct { + output *stage.WindowPoStOutput + err string +} + type WorkerProver struct { - taskMgr core.WorkerWdPoStTaskManager + jobMgr core.WorkerWdPoStJobManager sectorTracker core.SectorTracker localProver core.Prover - inflightTasks map[string][]chan<- struct { - output *stage.WindowPoStOutput - err string - } - inflightTasksLock *sync.Mutex - config 
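For illustration, a standalone sketch (not part of the patch) of the key scheme above, using local stand-ins for makeWdPoStKey and splitKey; SplitN with a limit of 2 is what lets job IDs that themselves contain ':' round-trip, which the new test exercises:

package main

import (
	"fmt"
	"strings"
)

// makeKey and splitKey mirror makeWdPoStKey / splitKey above.
func makeKey(state, jobID string) string { return state + ":" + jobID }

func splitKey(key string) (state, jobID string) {
	// Limit 2: only the first ':' separates state from job ID,
	// so IDs containing ':' survive the round trip.
	x := strings.SplitN(key, ":", 2)
	return x[0], x[1]
}

func main() {
	key := makeKey("running", "with:colon")
	state, id := splitKey(key)
	fmt.Println(key, state, id) // running:with:colon running with:colon
}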
*Config + inflightJobs map[string][]chan<- R + inflightJobsLock *sync.Mutex + config *Config } -func NewProver(taskMgr core.WorkerWdPoStTaskManager, sectorTracker core.SectorTracker, config *Config) *WorkerProver { +func NewProver(jobMgr core.WorkerWdPoStJobManager, sectorTracker core.SectorTracker, config *Config) *WorkerProver { return &WorkerProver{ - taskMgr: taskMgr, - sectorTracker: sectorTracker, - localProver: prover.NewProdProver(sectorTracker), - inflightTasks: make(map[string][]chan<- struct { - output *stage.WindowPoStOutput - err string - }), - inflightTasksLock: &sync.Mutex{}, - config: config, + jobMgr: jobMgr, + sectorTracker: sectorTracker, + localProver: prover.NewProdProver(sectorTracker), + inflightJobs: make(map[string][]chan<- R), + inflightJobsLock: &sync.Mutex{}, + config: config, } } -func (p *WorkerProver) StartJob(ctx context.Context) { - go p.runNotifyTaskDoneJob(ctx) - go p.runRetryFailedTasksJob(ctx) - go p.runCleanupExpiredTasksJob(ctx) +func (p *WorkerProver) Start(ctx context.Context) { + go p.runNotifyJobDone(ctx) + go p.runRetryFailedJobs(ctx) + go p.runCleanupExpiredJobs(ctx) } -func (p *WorkerProver) runNotifyTaskDoneJob(ctx context.Context) { +func (p *WorkerProver) runNotifyJobDone(ctx context.Context) { ticker := time.NewTicker(3 * time.Second) defer ticker.Stop() for { select { case <-ctx.Done(): + log.Info("stop notifyJobDone") return case <-ticker.C: - p.inflightTasksLock.Lock() - inflightTaskIDs := make([]string, 0, len(p.inflightTasks)) - for taskID := range p.inflightTasks { - inflightTaskIDs = append(inflightTaskIDs, taskID) + p.inflightJobsLock.Lock() + inflightJobIDs := make([]string, 0, len(p.inflightJobs)) + for jobID := range p.inflightJobs { + inflightJobIDs = append(inflightJobIDs, jobID) } - p.inflightTasksLock.Unlock() + p.inflightJobsLock.Unlock() - finishedTasks, err := p.taskMgr.ListByTaskIDs(ctx, core.WdPoStTaskFinished, inflightTaskIDs...) + finishedJobs, err := p.jobMgr.ListByJobIDs(ctx, core.WdPoStJobFinished, inflightJobIDs...) 
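For illustration, a self-contained sketch (not part of the patch) of the fan-out pattern behind inflightJobs: callers register a buffered channel under a job ID, the notifier delivers the finished result to every registered channel and closes them, and each caller selects on its channel against the context. All names here are local stand-ins; the real loop polls jobMgr.ListByJobIDs on a ticker instead of being told directly.

package main

import (
	"context"
	"fmt"
	"sync"
	"time"
)

// result mirrors the unexported R type above.
type result struct {
	output string // stand-in for *stage.WindowPoStOutput
	err    string
}

type notifier struct {
	mu       sync.Mutex
	inflight map[string][]chan<- result
}

// register adds a waiter for jobID; the channel is buffered so the
// notifier never blocks on a slow caller.
func (n *notifier) register(jobID string) <-chan result {
	ch := make(chan result, 1)
	n.mu.Lock()
	n.inflight[jobID] = append(n.inflight[jobID], ch)
	n.mu.Unlock()
	return ch
}

// finish delivers res to every waiter of jobID, closes their channels,
// and forgets the job, as runNotifyJobDone does for finished jobs.
func (n *notifier) finish(jobID string, res result) {
	n.mu.Lock()
	for _, ch := range n.inflight[jobID] {
		ch <- res
		close(ch)
	}
	delete(n.inflight, jobID)
	n.mu.Unlock()
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
	defer cancel()

	n := &notifier{inflight: make(map[string][]chan<- result)}
	ch := n.register("job-1")

	go n.finish("job-1", result{output: "proof"}) // pretend a worker reported back

	select {
	case <-ctx.Done():
		fmt.Println("cancelled:", ctx.Err())
	case res := <-ch:
		fmt.Println("got:", res.output)
	}
}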
if err != nil { - log.Errorf("failed to list tasks: %s", err) + log.Errorf("failed to list jobs: %s", err) } - p.inflightTasksLock.Lock() - for _, task := range finishedTasks { - chs, ok := p.inflightTasks[task.ID] + p.inflightJobsLock.Lock() + for _, job := range finishedJobs { + chs, ok := p.inflightJobs[job.ID] if !ok { continue } - if !task.Finished(p.config.TaskMaxTry) { + if !job.Finished(p.config.JobMaxTry) { continue } + delete(p.inflightJobs, job.ID) + for _, ch := range chs { - ch <- struct { - output *stage.WindowPoStOutput - err string - }{ - output: task.Output, - err: task.ErrorReason, + ch <- R{ + output: job.Output, + err: job.ErrorReason, } + close(ch) } } - p.inflightTasksLock.Unlock() + p.inflightJobsLock.Unlock() } } } -func (p *WorkerProver) runRetryFailedTasksJob(ctx context.Context) { - ticker := time.NewTicker(p.config.RetryFailedTasksInterval) +func (p *WorkerProver) runRetryFailedJobs(ctx context.Context) { + ticker := time.NewTicker(p.config.RetryFailedJobsInterval) defer ticker.Stop() for { - if err := p.taskMgr.MakeTasksDie(ctx, p.config.HeartbeatTimeout, 128); err != nil { - log.Errorf("failed to make tasks die: %s", err) + if err := p.jobMgr.MakeJobsDie(ctx, p.config.HeartbeatTimeout, 128); err != nil { + log.Errorf("failed to make jobs die: %s", err) } - if err := p.taskMgr.RetryFailedTasks(ctx, p.config.TaskMaxTry, 128); err != nil { - log.Errorf("failed to retry failed tasks: %s", err) + if err := p.jobMgr.RetryFailedJobs(ctx, p.config.JobMaxTry, 128); err != nil { + log.Errorf("failed to retry failed jobs: %s", err) } select { case <-ctx.Done(): + log.Info("stop retryFailedJobs") return case <-ticker.C: continue @@ -124,14 +125,15 @@ func (p *WorkerProver) runRetryFailedTasksJob(ctx context.Context) { } } -func (p *WorkerProver) runCleanupExpiredTasksJob(ctx context.Context) { - ticker := time.NewTicker(p.config.CleanupExpiredTasksJobInterval) +func (p *WorkerProver) runCleanupExpiredJobs(ctx context.Context) { + ticker := time.NewTicker(p.config.CleanupExpiredJobsInterval) for { - if err := p.taskMgr.CleanupExpiredTasks(ctx, p.config.TaskLifetime, 128); err != nil { - log.Errorf("failed to cleanup expired tasks: %s", err) + if err := p.jobMgr.CleanupExpiredJobs(ctx, p.config.JobLifetime, 128); err != nil { + log.Errorf("failed to cleanup expired jobs: %s", err) } select { case <-ctx.Done(): + log.Info("stop cleanupExpiredJobs") return case <-ticker.C: continue @@ -171,25 +173,29 @@ func (p *WorkerProver) GenerateWindowPoSt(ctx context.Context, deadlineIdx uint6 } copy(input.Seed[:], randomness[:]) - task, err := p.taskMgr.Create(ctx, deadlineIdx, input) + job, err := p.jobMgr.Create(ctx, deadlineIdx, input) if err != nil { - return nil, nil, fmt.Errorf("create wdPoSt task: %w", err) + return nil, nil, fmt.Errorf("create wdPoSt job: %w", err) } - ch := make(chan struct { - output *stage.WindowPoStOutput - err string - }, 1) + ch := make(chan R, 1) - p.inflightTasksLock.Lock() - p.inflightTasks[task.ID] = append(p.inflightTasks[task.ID], ch) - p.inflightTasksLock.Unlock() + p.inflightJobsLock.Lock() + p.inflightJobs[job.ID] = append(p.inflightJobs[job.ID], ch) + p.inflightJobsLock.Unlock() - result, ok := <-ch - - if !ok { - return nil, nil, fmt.Errorf("wdPoSt result channel was closed unexpectedly") + var result R + select { + case <-ctx.Done(): + err = fmt.Errorf("failed to generate window post before context cancellation: %w", ctx.Err()) + return + case res, ok := <-ch: + if !ok { + return nil, nil, fmt.Errorf("wdPoSt result channel was closed 
unexpectedly") + } + result = res } + if result.err != "" { return nil, nil, fmt.Errorf("error from worker: %s", result.err) } diff --git a/damocles-manager/modules/impl/prover/worker/rpc.go b/damocles-manager/modules/impl/prover/worker/rpc.go index d1ad27a18..0c59172aa 100644 --- a/damocles-manager/modules/impl/prover/worker/rpc.go +++ b/damocles-manager/modules/impl/prover/worker/rpc.go @@ -2,77 +2,54 @@ package worker import ( "context" + "errors" "fmt" "github.com/ipfs-force-community/damocles/damocles-manager/core" "github.com/ipfs-force-community/damocles/damocles-manager/pkg/extproc/stage" + "github.com/ipfs-force-community/damocles/damocles-manager/pkg/kvstore" ) -func NewWdPoStAPIImpl(taskMgr core.WorkerWdPoStTaskManager, config *Config) core.WorkerWdPoStAPI { +func NewWdPoStAPIImpl(jobMgr core.WorkerWdPoStJobManager, config *Config) core.WorkerWdPoStAPI { return &WdPoStAPIImpl{ - taskMgr: taskMgr, - config: config, + jobMgr: jobMgr, + config: config, } } type WdPoStAPIImpl struct { - taskMgr core.WorkerWdPoStTaskManager - config *Config + jobMgr core.WorkerWdPoStJobManager + config *Config } -func (api WdPoStAPIImpl) WdPoStHeartbeatTasks(ctx context.Context, runningTaskIDs []string, workerName string) (core.Meta, error) { - return nil, api.taskMgr.Heartbeat(ctx, runningTaskIDs, workerName) +func (api WdPoStAPIImpl) WdPoStHeartbeatJobs(ctx context.Context, runningJobIDs []string, workerName string) (core.Meta, error) { + return nil, api.jobMgr.Heartbeat(ctx, runningJobIDs, workerName) } -func (api WdPoStAPIImpl) WdPoStAllocateTasks(ctx context.Context, spec core.AllocateWdPoStTaskSpec, num uint32, workerName string) (allocatedTasks []*core.WdPoStAllocatedTask, err error) { - return api.taskMgr.AllocateTasks(ctx, spec, num, workerName) +func (api WdPoStAPIImpl) WdPoStAllocateJobs(ctx context.Context, spec core.AllocateWdPoStJobSpec, num uint32, workerName string) (allocatedJobs []*core.WdPoStAllocatedJob, err error) { + return api.jobMgr.AllocateJobs(ctx, spec, num, workerName) } -func (api WdPoStAPIImpl) WdPoStFinishTask(ctx context.Context, taskID string, output *stage.WindowPoStOutput, errorReason string) (core.Meta, error) { - return nil, api.taskMgr.Finish(ctx, taskID, output, errorReason) +func (api WdPoStAPIImpl) WdPoStFinishJob(ctx context.Context, jobID string, output *stage.WindowPoStOutput, errorReason string) (core.Meta, error) { + return nil, api.jobMgr.Finish(ctx, jobID, output, errorReason) } -func (api WdPoStAPIImpl) WdPoStResetTask(ctx context.Context, taskID string) (core.Meta, error) { - // TODO(0x5459): return a friendlier error if taskID not exists - return nil, api.taskMgr.Reset(ctx, taskID) -} - -func (api WdPoStAPIImpl) WdPoStRemoveTask(ctx context.Context, taskID string) (core.Meta, error) { - // TODO(0x5459): return a friendlier error if taskID not exists - return nil, api.taskMgr.Remove(ctx, taskID) -} - -func (api WdPoStAPIImpl) WdPoStAllTasks(ctx context.Context) ([]*core.WdPoStTask, error) { - return api.taskMgr.All(ctx, func(_ *core.WdPoStTask) bool { return true }) -} - -func NewUnavailableWdPoStAPIImpl(taskMgr core.WorkerWdPoStTaskManager) core.WorkerWdPoStAPI { - return &UnavailableWdPoStAPIImpl{} -} - -// TODO(0x5459): UnavailableWdPoStAPIImpl should be automatically generated -type UnavailableWdPoStAPIImpl struct{} - -func (UnavailableWdPoStAPIImpl) WdPoStHeartbeatTasks(ctx context.Context, runningTaskIDs []string, workerName string) (core.Meta, error) { - return nil, fmt.Errorf("WdPoStAPI unavailable") -} - -func (UnavailableWdPoStAPIImpl) 
WdPoStAllocateTasks(ctx context.Context, spec core.AllocateWdPoStTaskSpec, num uint32, workerName string) (allocatedTasks []*core.WdPoStAllocatedTask, err error) { - return nil, fmt.Errorf("WdPoStAPI unavailable") -} - -func (UnavailableWdPoStAPIImpl) WdPoStFinishTask(ctx context.Context, taskID string, output *stage.WindowPoStOutput, errorReason string) (core.Meta, error) { - return nil, fmt.Errorf("WdPoStAPI unavailable") -} - -func (UnavailableWdPoStAPIImpl) WdPoStResetTask(ctx context.Context, taskID string) (core.Meta, error) { - return nil, fmt.Errorf("WdPoStAPI unavailable") +func (api WdPoStAPIImpl) WdPoStResetJob(ctx context.Context, jobID string) (core.Meta, error) { + err := api.jobMgr.Reset(ctx, jobID) + if errors.Is(err, kvstore.ErrKeyNotFound) { + return nil, fmt.Errorf("job '%s' does not exist", jobID) + } + return nil, err } -func (UnavailableWdPoStAPIImpl) WdPoStRemoveTask(ctx context.Context, taskID string) (core.Meta, error) { - return nil, fmt.Errorf("WdPoStAPI unavailable") +func (api WdPoStAPIImpl) WdPoStRemoveJob(ctx context.Context, jobID string) (core.Meta, error) { + err := api.jobMgr.Remove(ctx, jobID) + if errors.Is(err, kvstore.ErrKeyNotFound) { + return nil, fmt.Errorf("job '%s' does not exist", jobID) + } + return nil, err } -func (UnavailableWdPoStAPIImpl) WdPoStAllTasks(ctx context.Context) ([]*core.WdPoStTask, error) { - return nil, fmt.Errorf("WdPoStAPI unavailable") +func (api WdPoStAPIImpl) WdPoStAllJobs(ctx context.Context) ([]*core.WdPoStJob, error) { + return api.jobMgr.All(ctx, func(_ *core.WdPoStJob) bool { return true }) } diff --git a/damocles-manager/modules/impl/prover/worker/task_mgr_kv.go b/damocles-manager/modules/impl/prover/worker/task_mgr_kv.go deleted file mode 100644 index 4d294c3c0..000000000 --- a/damocles-manager/modules/impl/prover/worker/task_mgr_kv.go +++ /dev/null @@ -1,421 +0,0 @@ -package worker - -import ( - "context" - "encoding/json" - "errors" - "fmt" - "math" - "sort" - "strings" - "time" - - "github.com/filecoin-project/go-state-types/abi" - "github.com/ipfs-force-community/damocles/damocles-manager/core" - "github.com/ipfs-force-community/damocles/damocles-manager/pkg/extproc/stage" - "github.com/ipfs-force-community/damocles/damocles-manager/pkg/kvstore" - "golang.org/x/exp/slices" -) - -func NewKVTaskManager(kv kvstore.KVExt) core.WorkerWdPoStTaskManager { - return &kvTaskManager{ - kv: kv, - } -} - -type kvTaskManager struct { - kv kvstore.KVExt -} - -// TODO(0x5459): Consider putting `txn` into context? -func (tm *kvTaskManager) filter(ctx context.Context, txn kvstore.TxnExt, state core.WdPoStTaskState, limit uint32, f func(*core.WdPoStTask) bool) (tasks []*core.WdPoStTask, err error) { - var it kvstore.Iter - it, err = txn.Scan([]byte(makeWdPoStPrefix(state))) - if err != nil { - return - } - defer it.Close() - for it.Next() && len(tasks) < int(limit) { - var task core.WdPoStTask - if err = it.View(ctx, kvstore.LoadJSON(&task)); err != nil { - return - } - if f(&task) { - tasks = append(tasks, &task) - } - } - return -} - -func (tm *kvTaskManager) All(ctx context.Context, filter func(*core.WdPoStTask) bool) (tasks []*core.WdPoStTask, err error) { - tasks = make([]*core.WdPoStTask, 0) - err = tm.kv.ViewMustNoConflict(ctx, func(txn kvstore.TxnExt) error { - for _, state := range []core.WdPoStTaskState{core.WdPoStTaskReadyToRun, core.WdPoStTaskRunning, core.WdPoStTaskFinished} { - ts, err := tm.filter(ctx, txn, state, math.MaxUint32, filter) - if err != nil { - return err - } - tasks = append(tasks, ts...) 
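For illustration, a standalone sketch (not part of the patch) of the error mapping used by WdPoStResetJob and WdPoStRemoveJob above: because the job manager wraps kvstore.ErrKeyNotFound with %w, errors.Is at the RPC layer can still detect it and return a friendlier message. All names here are local stand-ins.

package main

import (
	"errors"
	"fmt"
)

// errKeyNotFound stands in for kvstore.ErrKeyNotFound.
var errKeyNotFound = errors.New("key not found")

// reset stands in for jobMgr.Reset: it wraps the kvstore error with %w,
// which is what lets the caller still recognize the sentinel error.
func reset(jobID string) error {
	return fmt.Errorf("load job from db: %w. jobID: %s", errKeyNotFound, jobID)
}

func main() {
	err := reset("job-1")
	if errors.Is(err, errKeyNotFound) {
		fmt.Println("job 'job-1' does not exist") // friendlier RPC-facing message
	}
}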
- } - return err - }) - sort.Slice(tasks, func(i, j int) bool { - return tasks[i].CreatedAt > tasks[j].CreatedAt - }) - return -} - -func (tm *kvTaskManager) ListByTaskIDs(ctx context.Context, state core.WdPoStTaskState, taskIDs ...string) ([]*core.WdPoStTask, error) { - tasks := make([]*core.WdPoStTask, 0, len(taskIDs)) - err := tm.kv.ViewMustNoConflict(ctx, func(txn kvstore.TxnExt) error { - for _, taskID := range taskIDs { - var task core.WdPoStTask - err := txn.Peek(kvstore.Key(makeWdPoStKey(state, taskID)), kvstore.LoadJSON(&task)) - if errors.Is(err, kvstore.ErrKeyNotFound) { - continue - } - if err != nil { - return err - } - tasks = append(tasks, &task) - } - return nil - }) - return tasks, err -} - -func (tm *kvTaskManager) Create(ctx context.Context, deadlineIdx uint64, input core.WdPoStInput) (*core.WdPoStTask, error) { - var ( - taskID string - task *core.WdPoStTask - ) - err := tm.kv.UpdateMustNoConflict(ctx, func(txn kvstore.TxnExt) error { - rawInput, err := json.Marshal(input) - if err != nil { - return err - } - taskID = GenTaskID(rawInput) - // check if task exists - _, err = txn.PeekAny( - kvstore.LoadJSON(task), - kvstore.Key(makeWdPoStKey(core.WdPoStTaskReadyToRun, taskID)), - kvstore.Key(makeWdPoStKey(core.WdPoStTaskRunning, taskID)), - kvstore.Key(makeWdPoStKey(core.WdPoStTaskFinished, taskID)), - ) - if err == nil { - // return if it is exists - return nil - } - if !errors.Is(err, kvstore.ErrKeyNotFound) { - return err - } - - now := time.Now().Unix() - task = &core.WdPoStTask{ - ID: taskID, - State: string(core.WdPoStTaskReadyToRun), - DeadlineIdx: deadlineIdx, - Input: input, - Output: nil, - TryNum: 0, - ErrorReason: "", - WorkerName: "", - StartedAt: 0, - HeartbeatAt: 0, - FinishedAt: 0, - CreatedAt: uint64(now), - UpdatedAt: uint64(now), - } - return txn.PutJson([]byte(makeWdPoStKey(core.WdPoStTaskReadyToRun, taskID)), task) - }) - - if err == nil { - log.Infof("wdPoSt task created: %s", taskID) - } - return task, err -} - -func (tm *kvTaskManager) AllocateTasks(ctx context.Context, spec core.AllocateWdPoStTaskSpec, n uint32, workerName string) (allocatedTasks []*core.WdPoStAllocatedTask, err error) { - var readyToRun []*core.WdPoStTask - allocatedTasks = make([]*core.WdPoStAllocatedTask, 0) - err = tm.kv.UpdateMustNoConflict(ctx, func(txn kvstore.TxnExt) error { - readyToRun, err = tm.filter(ctx, txn, core.WdPoStTaskReadyToRun, n, func(t *core.WdPoStTask) bool { - if len(spec.AllowedMiners) > 0 && !slices.Contains(spec.AllowedMiners, t.Input.MinerID) { - return false - } - if len(spec.AllowedProofTypes) > 0 && !slices.ContainsFunc(spec.AllowedProofTypes, func(allowed abi.RegisteredPoStProof) bool { - return stage.ProofType2String(allowed) == t.Input.ProofType - }) { - return false - } - return true - }) - if err != nil { - return err - } - now := uint64(time.Now().Unix()) - for _, task := range readyToRun { - // Moving ready to run tasks to running tasks - if err := txn.Del([]byte(makeWdPoStKey(core.WdPoStTaskReadyToRun, task.ID))); err != nil { - return err - } - task.State = string(core.WdPoStTaskRunning) - task.TryNum++ - task.StartedAt = now - task.WorkerName = workerName - task.HeartbeatAt = now - task.UpdatedAt = now - if err := txn.PutJson([]byte(makeWdPoStKey(core.WdPoStTaskRunning, task.ID)), task); err != nil { - return err - } - allocatedTasks = append(allocatedTasks, &core.WdPoStAllocatedTask{ - ID: task.ID, - Input: task.Input, - }) - } - return nil - }) - - if err == nil { - for _, task := range readyToRun { - log.Infof("allocated wdPoSt task: 
%s; try_num: %d", task.ID, task.TryNum) - } - } - return -} - -func (tm *kvTaskManager) Heartbeat(ctx context.Context, taskIDs []string, workerName string) error { - now := uint64(time.Now().Unix()) - err := tm.kv.UpdateMustNoConflict(ctx, func(txn kvstore.TxnExt) error { - for _, taskID := range taskIDs { - var task core.WdPoStTask - if err := txn.Peek([]byte(makeWdPoStKey(core.WdPoStTaskRunning, taskID)), kvstore.LoadJSON(&task)); err != nil { - return err - } - if task.StartedAt == 0 { - task.StartedAt = now - } - task.HeartbeatAt = now - task.WorkerName = workerName - task.UpdatedAt = now - if err := txn.PutJson([]byte(makeWdPoStKey(core.WdPoStTaskRunning, taskID)), &task); err != nil { - return err - } - } - return nil - }) - if err == nil { - log.With("worker_name", workerName).Debug("wdPoSt tasks heartbeat", taskIDs) - } - return err -} - -func (tm *kvTaskManager) Finish(ctx context.Context, taskID string, output *stage.WindowPoStOutput, errorReason string) error { - err := tm.kv.UpdateMustNoConflict(ctx, func(txn kvstore.TxnExt) error { - runningKey := []byte(makeWdPoStKey(core.WdPoStTaskRunning, taskID)) - var task core.WdPoStTask - if err := txn.Peek(runningKey, kvstore.LoadJSON(&task)); err != nil { - return err - } - if err := txn.Del(runningKey); err != nil { - return err - } - now := uint64(time.Now().Unix()) - task.State = string(core.WdPoStTaskFinished) - task.Output = output - task.ErrorReason = errorReason - task.FinishedAt = now - task.UpdatedAt = now - return txn.PutJson([]byte(makeWdPoStKey(core.WdPoStTaskFinished, taskID)), &task) - }) - - if err == nil { - if len(errorReason) == 0 { - log.Infof("wdPoSt task succeeded: %s", taskID) - } else { - log.Warnf("wdPoSt task failed: %s; error_reason: %s", taskID, errorReason) - } - } - return err -} - -func (tm *kvTaskManager) MakeTasksDie(ctx context.Context, heartbeatTimeout time.Duration, limit uint32) error { - var shouldDead []*core.WdPoStTask - shouldDeadTime := time.Now().Add(-heartbeatTimeout) - - err := tm.kv.UpdateMustNoConflict(ctx, func(txn kvstore.TxnExt) error { - var err error - shouldDead, err = tm.filter(ctx, txn, core.WdPoStTaskRunning, limit, func(t *core.WdPoStTask) bool { - return t.HeartbeatAt > 0 && time.Unix(int64(t.HeartbeatAt), 0).Before(shouldDeadTime) - }) - if err != nil { - return err - } - now := uint64(time.Now().Unix()) - for _, task := range shouldDead { - if err := txn.Del([]byte(makeWdPoStKey(core.WdPoStTaskRunning, task.ID))); err != nil { - return err - } - task.State = string(core.WdPoStTaskFinished) - task.FinishedAt = now - task.Output = nil - task.ErrorReason = "heartbeat timeout" - task.UpdatedAt = now - if err := txn.PutJson([]byte(makeWdPoStKey(core.WdPoStTaskFinished, task.ID)), task); err != nil { - return err - } - } - return nil - }) - - if err == nil { - for _, task := range shouldDead { - log.Infof("make wdPoSt task die: %s; heartbeat_at: %s", task.ID, time.Unix(int64(task.HeartbeatAt), 0).Format(time.RFC3339)) - } - } - - return err -} - -func (tm *kvTaskManager) CleanupExpiredTasks(ctx context.Context, taskLifetime time.Duration, limit uint32) error { - var shouldClean []*core.WdPoStTask - shouldCleanTime := time.Now().Add(-taskLifetime) - - err := tm.kv.UpdateMustNoConflict(ctx, func(txn kvstore.TxnExt) error { - var err error - shouldClean, err = tm.filter(ctx, txn, core.WdPoStTaskFinished, limit, func(t *core.WdPoStTask) bool { - return time.Unix(int64(t.CreatedAt), 0).Before(shouldCleanTime) - }) - if err != nil { - return err - } - for _, task := range shouldClean { - 
if err := txn.Del([]byte(makeWdPoStKey(core.WdPoStTaskFinished, task.ID))); err != nil { - return err - } - } - return nil - }) - - if err == nil { - for _, task := range shouldClean { - log.Infof("cleanup expired wdPoSt task: %s; created_at: %s", task.ID, time.Unix(int64(task.CreatedAt), 0).Format(time.RFC3339)) - } - } - return err -} - -func (tm *kvTaskManager) RetryFailedTasks(ctx context.Context, maxTry, limit uint32) error { - var shouldRetry []*core.WdPoStTask - err := tm.kv.UpdateMustNoConflict(ctx, func(txn kvstore.TxnExt) error { - var err error - shouldRetry, err = tm.filter(ctx, txn, core.WdPoStTaskFinished, limit, func(t *core.WdPoStTask) bool { - return len(t.ErrorReason) != 0 && t.TryNum < maxTry - }) - if err != nil { - return err - } - now := uint64(time.Now().Unix()) - for _, task := range shouldRetry { - task.ErrorReason = "" - task.State = string(core.WdPoStTaskReadyToRun) - task.Output = nil - task.StartedAt = 0 - task.FinishedAt = 0 - task.UpdatedAt = now - if err := txn.PutJson([]byte(makeWdPoStKey(core.WdPoStTaskReadyToRun, task.ID)), task); err != nil { - return err - } - } - return nil - }) - - if err == nil { - for _, task := range shouldRetry { - log.Debugf("retry wdPoSt task: %s; try_num: %d, error_reason: %s", task.ID, task.TryNum, task.ErrorReason) - } - } - - return err -} - -func (tm *kvTaskManager) Reset(ctx context.Context, taskID string) error { - var task core.WdPoStTask - now := uint64(time.Now().Unix()) - - err := tm.kv.UpdateMustNoConflict(ctx, func(txn kvstore.TxnExt) error { - key, err := txn.PeekAny( - kvstore.LoadJSON(&task), - kvstore.Key(makeWdPoStKey(core.WdPoStTaskReadyToRun, taskID)), - kvstore.Key(makeWdPoStKey(core.WdPoStTaskRunning, taskID)), - kvstore.Key(makeWdPoStKey(core.WdPoStTaskFinished, taskID)), - ) - if err != nil { - return fmt.Errorf("load task from db: %w. taskID: %s", err, taskID) - } - - task.State = string(core.WdPoStTaskReadyToRun) - task.CreatedAt = now - task.StartedAt = 0 - task.TryNum = 0 - task.Output = nil - task.ErrorReason = "" - task.FinishedAt = 0 - task.HeartbeatAt = 0 - task.WorkerName = "" - task.UpdatedAt = now - - if err := txn.Del(key); err != nil { - return err - } - return txn.PutJson([]byte(makeWdPoStKey(core.WdPoStTaskReadyToRun, taskID)), &task) - }) - - if err == nil { - log.Infof("task is reset: %s", taskID) - } - - return err -} - -func (tm *kvTaskManager) Remove(ctx context.Context, taskID string) error { - err := tm.kv.UpdateMustNoConflict(ctx, func(txn kvstore.TxnExt) error { - key, err := txn.PeekAny( - kvstore.NilF, - kvstore.Key(makeWdPoStKey(core.WdPoStTaskReadyToRun, taskID)), - kvstore.Key(makeWdPoStKey(core.WdPoStTaskRunning, taskID)), - kvstore.Key(makeWdPoStKey(core.WdPoStTaskFinished, taskID)), - ) - if errors.Is(err, kvstore.ErrKeyNotFound) { - return nil - } - if err != nil { - return fmt.Errorf("load task from db: %w. 
taskID: %s", err, taskID) - } - return txn.Del(key) - }) - - if err == nil { - log.Infof("task removed: %s", taskID) - } - - return err -} - -const ( - prefixTaskIDdelimiter = ":" -) - -func makeWdPoStPrefix(state core.WdPoStTaskState) string { - return string(state) -} - -func makeWdPoStKey(state core.WdPoStTaskState, taskID string) string { - return fmt.Sprintf("%s%s%s", makeWdPoStPrefix(state), prefixTaskIDdelimiter, taskID) -} - -//lint:ignore U1000 Ignore unused function -func splitKey(key string) (state core.WdPoStTaskState, taskID string) { - x := strings.SplitN(key, prefixTaskIDdelimiter, 2) - return core.WdPoStTaskState(x[0]), x[1] -} diff --git a/damocles-manager/modules/impl/prover/worker/task_mgr_kv_test.go b/damocles-manager/modules/impl/prover/worker/task_mgr_kv_test.go deleted file mode 100644 index efdbaaa43..000000000 --- a/damocles-manager/modules/impl/prover/worker/task_mgr_kv_test.go +++ /dev/null @@ -1,18 +0,0 @@ -package worker - -import ( - "testing" - - "github.com/ipfs-force-community/damocles/damocles-manager/core" - "github.com/stretchr/testify/require" -) - -func TestSplitKey(t *testing.T) { - for _, taskID := range []string{"normal123", "with-", "-", "-with", "wi-th", "with:xxx", ":xxx", ":"} { - for _, state := range []core.WdPoStTaskState{core.WdPoStTaskReadyToRun, core.WdPoStTaskRunning, core.WdPoStTaskFinished} { - actualState, actualTaskID := splitKey(makeWdPoStKey(state, taskID)) - require.Equalf(t, state, actualState, "test state for \"state: `%s`; taskID: `%s`\"", state, taskID) - require.Equalf(t, taskID, actualTaskID, "test taskID for \"state: `%s`; taskID: `%s`\"", state, taskID) - } - } -} diff --git a/damocles-worker/Cargo.lock b/damocles-worker/Cargo.lock index 3e3527167..265ad517c 100644 --- a/damocles-worker/Cargo.lock +++ b/damocles-worker/Cargo.lock @@ -2066,8 +2066,7 @@ dependencies = [ [[package]] name = "jsonrpc-client-transports" version = "18.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2b99d4207e2a04fb4581746903c2bb7eb376f88de9c699d0f3e10feeac0cd3a" +source = "git+https://github.com/ipfs-force-community/jsonrpc.git?branch=feat/0x5459/v18-do-not-check-http-status#3b2b3be5d92ce6c23529fa5aadc4a716c8964ed7" dependencies = [ "derive_more", "futures 0.3.28", @@ -2085,8 +2084,7 @@ dependencies = [ [[package]] name = "jsonrpc-core" version = "18.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14f7f76aef2d054868398427f6c54943cf3d1caa9a7ec7d0c38d69df97a965eb" +source = "git+https://github.com/ipfs-force-community/jsonrpc.git?branch=feat/0x5459/v18-do-not-check-http-status#3b2b3be5d92ce6c23529fa5aadc4a716c8964ed7" dependencies = [ "futures 0.3.28", "futures-executor", @@ -2100,8 +2098,7 @@ dependencies = [ [[package]] name = "jsonrpc-core-client" version = "18.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b51da17abecbdab3e3d4f26b01c5ec075e88d3abe3ab3b05dc9aa69392764ec0" +source = "git+https://github.com/ipfs-force-community/jsonrpc.git?branch=feat/0x5459/v18-do-not-check-http-status#3b2b3be5d92ce6c23529fa5aadc4a716c8964ed7" dependencies = [ "futures 0.3.28", "jsonrpc-client-transports", @@ -2110,8 +2107,7 @@ dependencies = [ [[package]] name = "jsonrpc-derive" version = "18.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b939a78fa820cdfcb7ee7484466746a7377760970f6f9c6fe19f9edcc8a38d2" +source = 
"git+https://github.com/ipfs-force-community/jsonrpc.git?branch=feat/0x5459/v18-do-not-check-http-status#3b2b3be5d92ce6c23529fa5aadc4a716c8964ed7" dependencies = [ "proc-macro-crate 0.1.5", "proc-macro2", @@ -2122,8 +2118,7 @@ dependencies = [ [[package]] name = "jsonrpc-http-server" version = "18.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1dea6e07251d9ce6a552abfb5d7ad6bc290a4596c8dcc3d795fae2bbdc1f3ff" +source = "git+https://github.com/ipfs-force-community/jsonrpc.git?branch=feat/0x5459/v18-do-not-check-http-status#3b2b3be5d92ce6c23529fa5aadc4a716c8964ed7" dependencies = [ "futures 0.3.28", "hyper", @@ -2138,8 +2133,7 @@ dependencies = [ [[package]] name = "jsonrpc-pubsub" version = "18.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "240f87695e6c6f62fb37f05c02c04953cf68d6408b8c1c89de85c7a0125b1011" +source = "git+https://github.com/ipfs-force-community/jsonrpc.git?branch=feat/0x5459/v18-do-not-check-http-status#3b2b3be5d92ce6c23529fa5aadc4a716c8964ed7" dependencies = [ "futures 0.3.28", "jsonrpc-core", @@ -2153,8 +2147,7 @@ dependencies = [ [[package]] name = "jsonrpc-server-utils" version = "18.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa4fdea130485b572c39a460d50888beb00afb3e35de23ccd7fad8ff19f0e0d4" +source = "git+https://github.com/ipfs-force-community/jsonrpc.git?branch=feat/0x5459/v18-do-not-check-http-status#3b2b3be5d92ce6c23529fa5aadc4a716c8964ed7" dependencies = [ "bytes", "futures 0.3.28", diff --git a/damocles-worker/Cargo.toml b/damocles-worker/Cargo.toml index aa497f41b..11a923e74 100644 --- a/damocles-worker/Cargo.toml +++ b/damocles-worker/Cargo.toml @@ -32,9 +32,10 @@ crossbeam-utils = "0.8" tracing = "0.1" tracing-subscriber = { version = "0.3", features = ["env-filter", "time"] } crossterm = "0.26" -jsonrpc-core = "18" -jsonrpc-derive = "18" -jsonrpc-http-server = "18" +jsonrpc-core-client = { git = "https://github.com/ipfs-force-community/jsonrpc.git", branch = "feat/0x5459/v18-do-not-check-http-status", features = ["tls", "http"] } +jsonrpc-core = { git = "https://github.com/ipfs-force-community/jsonrpc.git", branch = "feat/0x5459/v18-do-not-check-http-status" } +jsonrpc-derive = { git = "https://github.com/ipfs-force-community/jsonrpc.git", branch = "feat/0x5459/v18-do-not-check-http-status" } +jsonrpc-http-server = { git = "https://github.com/ipfs-force-community/jsonrpc.git", branch = "feat/0x5459/v18-do-not-check-http-status" } toml = "0.7" humantime-serde = "1" clap = { version = "4.2", features = ["derive", "env"] } @@ -68,10 +69,6 @@ features = ["json"] [dependencies.rocksdb] version = "0.20" -[dependencies.jsonrpc-core-client] -version = "18" -features = ["tls", "http"] - [dependencies.byte-unit] version = "4" features = ["serde"] diff --git a/damocles-worker/src/bin/damocles-worker/worker/mod.rs b/damocles-worker/src/bin/damocles-worker/worker/mod.rs index fea4e2a47..f66292200 100644 --- a/damocles-worker/src/bin/damocles-worker/worker/mod.rs +++ b/damocles-worker/src/bin/damocles-worker/worker/mod.rs @@ -76,11 +76,11 @@ pub(crate) fn run(cmd: &WorkerCommand) -> Result<()> { for wi in infos { let _ = writeln!( &mut hdl, - "#{}: {:?}; plan={}, sector_id={:?}, paused={}, paused_elapsed={:?}, state={}, last_err={:?}", + "#{}: {}; plan={}, job_id={:?}, paused={}, paused_elapsed={:?}, state={}, last_err={:?}", wi.index, - wi.location, + wi.location.display(), wi.plan, - wi.sector_id, + wi.job_id, wi.paused, 
wi.paused_elapsed.map(Duration::from_secs), wi.state.as_str(), diff --git a/damocles-worker/src/config.rs b/damocles-worker/src/config.rs index 655544a54..6b1a9fc55 100644 --- a/damocles-worker/src/config.rs +++ b/damocles-worker/src/config.rs @@ -18,7 +18,7 @@ pub const DEFAULT_WORKER_SERVER_PORT: u16 = 17890; pub const DEFAULT_WORKER_SERVER_HOST: &str = "0.0.0.0"; /// The localhost addr pub const LOCAL_HOST: &str = "127.0.0.1"; -pub const DEFAULT_WORKER_PING_INTERVAL: Duration = Duration::from_secs(20); +pub const DEFAULT_WORKER_PING_INTERVAL: Duration = Duration::from_secs(30); /// configurations for sealing sectors #[derive(Debug, Clone, PartialEq, Eq)] @@ -139,7 +139,7 @@ pub struct Attached { #[derive(Debug, Default, Serialize, Deserialize)] pub struct SealingThread { /// store location - pub location: String, + pub location: Option, #[serde(flatten)] pub inner: SealingThreadInner, @@ -286,6 +286,7 @@ pub struct Config { pub attached: Option>, /// section for processors + #[serde(default)] pub processors: Processors, } diff --git a/damocles-worker/src/metadb/mod.rs b/damocles-worker/src/metadb/mod.rs index 3ac442980..21ea03198 100644 --- a/damocles-worker/src/metadb/mod.rs +++ b/damocles-worker/src/metadb/mod.rs @@ -208,8 +208,10 @@ where Ok(()) } - pub fn delete(self) -> anyhow::Result<()> { + pub fn delete(&mut self, mut default: impl FnMut() -> T) -> anyhow::Result<()> { self.db.remove(&self.key)?; + *self.data = default(); + self.data.sync(); Ok(()) } diff --git a/damocles-worker/src/rpc/sealer/mod.rs b/damocles-worker/src/rpc/sealer/mod.rs index 6dd0c84b3..a61d35d4e 100644 --- a/damocles-worker/src/rpc/sealer/mod.rs +++ b/damocles-worker/src/rpc/sealer/mod.rs @@ -1,4 +1,5 @@ use std::collections::HashMap; +use std::fmt::Display; use std::path::PathBuf; use super::super::types::SealProof; @@ -11,7 +12,8 @@ use jsonrpc_derive::rpc; use serde::{Deserialize, Serialize}; use serde_repr::{Deserialize_repr, Serialize_repr}; use vc_processors::b64serde::{BytesArray32, BytesVec}; -use vc_processors::fil_proofs::{Commitment, PaddedBytesAmount, RegisteredPoStProof, SectorId, SnarkProof}; +use vc_processors::builtin::tasks::WindowPoStOutput; +use vc_processors::fil_proofs::{Commitment, PaddedBytesAmount, RegisteredPoStProof, SectorId}; /// type alias for BytesArray32 pub type Randomness = BytesArray32; @@ -32,12 +34,18 @@ pub struct SectorID { pub number: SectorNumber, } -impl std::fmt::Debug for SectorID { +impl Display for SectorID { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "s-t0{}-{}", self.miner, self.number) } } +impl std::fmt::Debug for SectorID { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self) + } +} + /// rules for allocating sector bases #[derive(Serialize, Deserialize)] #[serde(rename_all = "PascalCase")] @@ -377,42 +385,47 @@ pub struct SectorUnsealInfo { pub private_info: SectorPrivateInfo, } -#[derive(Deserialize, Serialize)] +#[derive(Deserialize, Serialize, Debug, Clone)] #[serde(rename_all = "PascalCase")] -pub struct PoStSectorInfo { +pub struct SectorAccessStores { + pub sealed_file: String, // name for storage instance + pub cache_dir: String, +} + +/// rules for allocating sector bases +#[derive(Serialize, Deserialize)] +#[serde(rename_all = "PascalCase")] +pub struct AllocatePoStSpec { + /// specified miner actor ids + pub allowed_miners: Option>, + + /// specified seal proof types + pub allowed_proof_types: Option>, +} + +#[derive(Deserialize, Serialize, Debug, Clone)] 
+#[serde(rename_all = "PascalCase")] +pub struct WdPoStSectorInfo { pub sector_id: SectorId, pub comm_r: Commitment, - pub access_instance: String, + pub upgrade: bool, // is upgrade sector + pub accesses: SectorAccessStores, } -#[derive(Deserialize, Serialize)] +#[derive(Deserialize, Serialize, Debug, Clone)] #[serde(rename_all = "PascalCase")] -pub struct WdPostTaskInfo { +pub struct WdPoStInput { + pub sectors: Vec, pub miner_id: ActorID, - pub deadline_id: u64, - pub sectors: Vec, - pub seed: ChallengeSeed, pub proof_type: RegisteredPoStProof, - pub instance: String, -} - -#[derive(Deserialize, Clone, Serialize)] -pub enum WdpostState { - Assigned, - Generating, - Generated, - Failed, - Done, - Error, + pub seed: ChallengeSeed, } -#[derive(Deserialize, Clone, Serialize)] +#[derive(Deserialize, Serialize, Debug, Clone)] #[serde(rename_all = "PascalCase")] -pub struct WdPoStResult { - pub state: WdpostState, - pub error: Option, - pub proofs: Option>, - pub faults: Option>, +pub struct AllocatedWdPoStJob { + pub id: String, + pub input: WdPoStInput, } /// defines the SealerRpc service @@ -506,9 +519,12 @@ pub trait Sealer { #[rpc(name = "Venus.AcquireUnsealDest")] fn acquire_unseal_dest(&self, id: SectorID, piece_cid: CidJson) -> Result>; - #[rpc(name = "Venus.WdPoStAllocateTasks")] - fn allocate_wd_post_task(&self, spec: AllocateSectorSpec) -> Result>; + #[rpc(name = "Venus.WdPoStAllocateJobs")] + fn allocate_wdpost_job(&self, spec: AllocatePoStSpec, num: u32, worker_name: String) -> Result>; + + #[rpc(name = "Venus.WdPoStHeartbeatJobs")] + fn wdpost_heartbeat(&self, job_ids: Vec, worker_name: String) -> Result<()>; - #[rpc(name = "Venus.WdPoStHeartbeatTask")] - fn wd_post_heartbeat(&self, miner_id: ActorID, deadline_id: u64, result: WdPoStResult) -> Result<()>; + #[rpc(name = "Venus.WdPoStFinishJob")] + fn wdpost_finish(&self, job_id: String, output: Option, error_reason: String) -> Result<()>; } diff --git a/damocles-worker/src/rpc/worker/mod.rs b/damocles-worker/src/rpc/worker/mod.rs index 6d613b903..0bfcd4ffc 100644 --- a/damocles-worker/src/rpc/worker/mod.rs +++ b/damocles-worker/src/rpc/worker/mod.rs @@ -4,8 +4,6 @@ use jsonrpc_core::Result; use jsonrpc_derive::rpc; use serde::{Deserialize, Serialize}; -use crate::rpc::sealer::SectorID; - /// information about each worker thread #[derive(Clone, Debug, Serialize, Deserialize)] pub struct WorkerInfo { @@ -15,7 +13,7 @@ pub struct WorkerInfo { /// current plan of the worker pub plan: String, - pub sector_id: Option, + pub job_id: Option, /// index for other control operations pub index: usize, diff --git a/damocles-worker/src/sealing/config.rs b/damocles-worker/src/sealing/config.rs index a2b0f3cae..009420c76 100644 --- a/damocles-worker/src/sealing/config.rs +++ b/damocles-worker/src/sealing/config.rs @@ -7,11 +7,10 @@ use serde::Deserialize; use crate::{ config::{Sealing, SealingOptional, SealingThreadInner}, + sealing::sealing_thread::default_plan, SealProof, }; -use super::sealing_thread::default_plan; - /// The config of the sealing thread pub struct Config { /// allowed miners parsed from config diff --git a/damocles-worker/src/sealing/mod.rs b/damocles-worker/src/sealing/mod.rs index 52e73b1bd..ec7c7f7d2 100644 --- a/damocles-worker/src/sealing/mod.rs +++ b/damocles-worker/src/sealing/mod.rs @@ -1,6 +1,7 @@ //! 
sealing mod pub(self) mod failure; +mod paths; pub mod ping; pub mod processor; pub mod resource; diff --git a/damocles-worker/src/sealing/paths.rs b/damocles-worker/src/sealing/paths.rs new file mode 100644 index 000000000..f8593da4a --- /dev/null +++ b/damocles-worker/src/sealing/paths.rs @@ -0,0 +1,23 @@ +use std::path::PathBuf; + +use crate::rpc::sealer::SectorID; + +pub fn sector_path(sector_id: &SectorID) -> String { + format!("s-t0{}-{}", sector_id.miner, sector_id.number) +} + +pub fn sealed_file(sector_id: &SectorID) -> PathBuf { + PathBuf::from("sealed").join(sector_path(sector_id)) +} + +pub fn cache_dir(sector_id: &SectorID) -> PathBuf { + PathBuf::from("cache").join(sector_path(sector_id)) +} + +pub fn update_file(sector_id: &SectorID) -> PathBuf { + PathBuf::from("update").join(sector_path(sector_id)) +} + +pub fn update_cache_dir(sector_id: &SectorID) -> PathBuf { + PathBuf::from("update-cache").join(sector_path(sector_id)) +} diff --git a/damocles-worker/src/sealing/sealing_thread/ctrl.rs b/damocles-worker/src/sealing/sealing_thread/ctrl.rs index 0e39be944..b4ca34428 100644 --- a/damocles-worker/src/sealing/sealing_thread/ctrl.rs +++ b/damocles-worker/src/sealing/sealing_thread/ctrl.rs @@ -4,10 +4,9 @@ use std::time::Instant; use anyhow::{anyhow, Result}; use crossbeam_channel::{bounded, Receiver, Sender}; -use super::{super::store::Location, State}; -use crate::rpc::sealer::SectorID; +use super::super::store::Location; -pub fn new_ctrl(loc: Location) -> (Ctrl, CtrlCtx) { +pub fn new_ctrl(loc: Option) -> (Ctrl, CtrlCtx) { let (pause_tx, pause_rx) = bounded(1); let (resume_tx, resume_rx) = bounded(0); let state = Arc::new(RwLock::new(Default::default())); @@ -29,9 +28,9 @@ pub fn new_ctrl(loc: Location) -> (Ctrl, CtrlCtx) { #[derive(Default)] pub struct CtrlJobState { - pub id: Option, + pub id: Option, pub plan: String, - pub state: State, + pub state: Option, pub last_error: Option, } @@ -42,9 +41,9 @@ pub struct CtrlState { } pub struct Ctrl { - pub location: Location, + pub location: Option, pub pause_tx: Sender<()>, - pub resume_tx: Sender>, + pub resume_tx: Sender>, state: Arc>, } @@ -60,7 +59,7 @@ impl Ctrl { pub struct CtrlCtx { pub pause_rx: Receiver<()>, - pub resume_rx: Receiver>, + pub resume_rx: Receiver>, state: Arc>, } diff --git a/damocles-worker/src/sealing/sealing_thread/task/entry.rs b/damocles-worker/src/sealing/sealing_thread/entry.rs similarity index 100% rename from damocles-worker/src/sealing/sealing_thread/task/entry.rs rename to damocles-worker/src/sealing/sealing_thread/entry.rs diff --git a/damocles-worker/src/sealing/sealing_thread/mod.rs b/damocles-worker/src/sealing/sealing_thread/mod.rs index 0cb046a80..5177b65db 100644 --- a/damocles-worker/src/sealing/sealing_thread/mod.rs +++ b/damocles-worker/src/sealing/sealing_thread/mod.rs @@ -11,46 +11,141 @@ use crate::store::Location; use crate::watchdog::{Ctx, Module}; use super::config::{merge_sealing_fields, Config}; -use super::{failure::*, store::Store}; +use super::failure::*; -mod task; -pub use task::default_plan; -use task::{sector::State, Task}; +mod planner; +pub use planner::default_plan; +pub mod entry; +#[macro_use] +mod util; +use util::*; mod ctrl; pub use ctrl::Ctrl; use ctrl::*; +pub trait Sealer { + fn seal(&mut self, state: Option<&str>) -> Result; +} + +pub enum R { + SwitchPlanner(String), + #[allow(dead_code)] + Wait(Duration), + Done, +} + +#[derive(Clone)] +pub struct SealingCtrl<'a> { + ctx: &'a Ctx, + ctrl_ctx: &'a CtrlCtx, + sealing_config: &'a Config, +} + +impl<'a> 
SealingCtrl<'a> { + pub fn config(&self) -> &Config { + self.sealing_config + } + + pub fn ctx(&self) -> &Ctx { + self.ctx + } + + pub fn ctrl_ctx(&self) -> &CtrlCtx { + self.ctrl_ctx + } + + pub fn interrupted(&self) -> Result<(), Failure> { + select! { + recv(self.ctx.done) -> _done_res => { + Err(Interrupt.into()) + } + + recv(self.ctrl_ctx.pause_rx) -> pause_res => { + pause_res.context("pause signal channel closed unexpectedly").crit()?; + Err(Interrupt.into()) + } + + default => { + Ok(()) + } + } + } + + pub fn wait_or_interrupted(&self, duration: Duration) -> Result<(), Failure> { + select! { + recv(self.ctx.done) -> _done_res => { + Err(Interrupt.into()) + } + + recv(self.ctrl_ctx.pause_rx) -> pause_res => { + pause_res.context("pause signal channel closed unexpectedly").crit()?; + Err(Interrupt.into()) + } + + default(duration) => { + Ok(()) + } + } + } +} + pub struct SealingThread { idx: usize, /// the config of this SealingThread pub config: Config, - pub store: Store, + location: Option, ctrl_ctx: CtrlCtx, } impl SealingThread { - pub fn new(idx: usize, plan: Option, sealing_config: Sealing, location: Location) -> Result<(Self, Ctrl)> { - let store_path = location.to_pathbuf(); - let store = Store::open(store_path).with_context(|| format!("open store {}", location.as_ref().display()))?; + pub fn new(idx: usize, plan: Option, sealing_config: Sealing, location: Option) -> Result<(Self, Ctrl)> { let (ctrl, ctrl_ctx) = new_ctrl(location.clone()); Ok(( Self { - config: Config::new(sealing_config, plan, Some(location.hot_config_path()))?, - store, - ctrl_ctx, idx, + config: Config::new(sealing_config, plan, location.as_ref().map(|x| x.hot_config_path()))?, + location, + ctrl_ctx, }, ctrl, )) } - fn seal_one(&mut self, ctx: &Ctx, state: Option) -> Result<(), Failure> { - let task = Task::build(ctx, &self.ctrl_ctx, &mut self.config, &mut self.store)?; - task.exec(state) + fn seal_one(&mut self, ctx: &Ctx, state: Option) -> Result<(), Failure> { + self.config + .reload_if_needed(|_, _| Ok(true)) + .context("reload sealing thread hot config") + .crit()?; + let mut plan = self.config.plan().to_string(); + loop { + self.ctrl_ctx + .update_state(|cst| cst.job.plan = plan.clone()) + .context("update ctrl state") + .crit()?; + let mut sealer = planner::create_selaer(&plan, ctx, self).crit()?; + match sealer.seal(state.as_deref())? { + R::SwitchPlanner(new_plan) => { + tracing::info!(new_plan = new_plan, "switch planner"); + plan = new_plan; + } + R::Wait(dur) => self.sealing_ctrl(ctx).wait_or_interrupted(dur)?, + R::Done => return Ok(()), + } + } + } + + fn sealing_ctrl(&self, ctx: &Ctx) -> SealingCtrl<'static> { + unsafe { + SealingCtrl { + ctx: extend_lifetime(ctx), + ctrl_ctx: extend_lifetime(&self.ctrl_ctx), + sealing_config: extend_lifetime(&self.config), + } + } } } @@ -140,7 +235,7 @@ impl Module for SealingThread { self.ctrl_ctx.update_state(|cst| { cst.job.id.take(); - let _ = std::mem::replace(&mut cst.job.state, State::Empty); + cst.job.state = None; })?; select! 
{ @@ -167,17 +262,23 @@ pub(crate) fn build_sealing_threads( let sealing_config = customized_sealing_config(common, scfg.inner.sealing.as_ref()); let plan = scfg.inner.plan.as_ref().cloned(); - let store_path = PathBuf::from(&scfg.location) - .canonicalize() - .with_context(|| format!("canonicalize store path {}", &scfg.location))?; - if path_set.contains(&store_path) { - tracing::warn!(path = ?store_path, "store already loaded"); - continue; - } + let loc = match &scfg.location { + Some(loc) => { + let store_path = PathBuf::from(loc) + .canonicalize() + .with_context(|| format!("canonicalize store path {}", loc))?; + if path_set.contains(&store_path) { + tracing::warn!(path = ?store_path, "store already loaded"); + continue; + } + path_set.insert(store_path.clone()); + Some(Location::new(store_path)) + } + None => None, + }; - let (sealing_thread, ctrl) = SealingThread::new(idx, plan, sealing_config, Location::new(store_path.clone()))?; + let (sealing_thread, ctrl) = SealingThread::new(idx, plan, sealing_config, loc)?; sealing_threads.push((sealing_thread, (idx, ctrl))); - path_set.insert(store_path); } Ok(sealing_threads) @@ -192,3 +293,14 @@ fn customized_sealing_config(common: &SealingOptional, customized: Option<&Seali common_sealing } } + +/// The caller is responsible for ensuring lifetime validity +pub const unsafe fn extend_lifetime<'b, T>(inp: &T) -> &'b T { + std::mem::transmute(inp) +} + +/// The caller is responsible for ensuring lifetime validity +#[allow(dead_code)] +pub unsafe fn extend_lifetime_mut<'b, T>(inp: &mut T) -> &'b mut T { + std::mem::transmute(inp) +} diff --git a/damocles-worker/src/sealing/sealing_thread/planner/common.rs b/damocles-worker/src/sealing/sealing_thread/planner/common.rs new file mode 100644 index 000000000..60ccb47e2 --- /dev/null +++ b/damocles-worker/src/sealing/sealing_thread/planner/common.rs @@ -0,0 +1,227 @@ +//! this module provides some common handlers + +pub(crate) mod event; +pub(crate) mod sealing; +pub(crate) mod sector; +pub(crate) mod task; + +use std::{pin::Pin, str::FromStr}; + +use anyhow::{Context, Result}; +pub use sealing::*; + +use crate::{ + rpc::sealer::{SectorFailure, SectorStateChange}, + sealing::{ + failure::{Failure, FailureContext, IntoFailure, Level, MapErrToFailure}, + sealing_thread::{extend_lifetime, Sealer, SealingThread, R}, + }, + store::Store, + watchdog::Ctx, +}; + +use self::{event::Event, sector::State, task::Task}; + +use super::PlannerTrait; + +pub struct CommonSealer

<P> { + job: Task, + planner: P, + _store: Pin<Box<Store>>, +} + +impl<P> Sealer for CommonSealer<P>

+where + P: PlannerTrait, +{ + fn seal(&mut self, state: Option<&str>) -> Result { + let mut event = state.and_then(|s| State::from_str(s).ok()).map(Event::SetState); + if let (true, Some(s)) = (event.is_none(), state) { + tracing::error!("unknown state: {}", s); + } + + let mut task_idle_count = 0; + loop { + let span = tracing::error_span!( + "seal", + miner = ?self.job.sector.base.as_ref().map(|b| b.allocated.id.miner), + sector = ?self.job.sector.base.as_ref().map(|b| b.allocated.id.number), + ?event, + ); + + let _enter = span.enter(); + + let prev = self.job.sector.state; + let is_empty = match self.job.sector.base.as_ref() { + None => true, + Some(base) => { + self.job + .sealing_ctrl + .ctrl_ctx() + .update_state(|cst| { + cst.job.id.replace(base.allocated.id.to_string()); + }) + .crit()?; + false + } + }; + + if self.job.sector.plan() != self.planner.name() { + return Ok(R::SwitchPlanner(self.job.sector.plan().to_string())); + } + + self.job.sealing_ctrl.interrupted()?; + + let handle_res = self.handle(event.take()); + if is_empty { + if let Some(base) = self.job.sector.base.as_ref() { + self.job + .sealing_ctrl + .ctrl_ctx() + .update_state(|cst| { + cst.job.id.replace(base.allocated.id.to_string()); + }) + .crit()?; + } + } else if self.job.sector.base.is_none() { + self.job + .sealing_ctrl + .ctrl_ctx() + .update_state(|cst| { + cst.job.id.take(); + }) + .crit()?; + } + + let fail = if let Err(eref) = handle_res.as_ref() { + Some(SectorFailure { + level: format!("{:?}", eref.0), + desc: format!("{:?}", eref.1), + }) + } else { + None + }; + + if let Err(rerr) = self.job.report_state( + SectorStateChange { + prev: prev.as_str().to_owned(), + next: self.job.sector.state.as_str().to_owned(), + event: format!("{:?}", event), + }, + fail, + ) { + tracing::error!("report state failed: {:?}", rerr); + }; + + match handle_res { + Ok(Some(evt)) => { + if let Event::Idle = &evt { + task_idle_count += 1; + if task_idle_count > self.job.sealing_ctrl.config().request_task_max_retries { + tracing::info!( + "The task has returned `Event::Idle` for more than {} times. break the task", + self.job.sealing_ctrl.config().request_task_max_retries + ); + + // when the planner tries to request a task but fails(including no task) for more than + // `config::sealing::request_task_max_retries` times, this task is really considered idle, + // break this task loop. that we have a chance to reload `sealing_thread` hot config file, + // or do something else. + + if self.job.sealing_ctrl.config().check_modified() { + // cleanup sector if the hot config modified + self.job.finalize()?; + } + return Ok(R::Done); + } + } + event.replace(evt); + } + + Ok(None) => match self.job.report_finalized().context("report finalized") { + Ok(_) => { + self.job.finalize()?; + return Ok(R::Done); + } + Err(terr) => self.job.retry(terr.1)?, + }, + + Err(Failure(Level::Abort, aerr)) => { + if let Err(rerr) = self.job.report_aborted(aerr.to_string()) { + tracing::error!("report aborted sector failed: {:?}", rerr); + } + + tracing::warn!("cleanup aborted sector"); + self.job.finalize()?; + return Err(aerr.abort()); + } + + Err(Failure(Level::Temporary, terr)) => self.job.retry(terr)?, + + Err(f) => return Err(f), + } + } + } +} + +impl

<P> CommonSealer<P>

+where + P: PlannerTrait, +{ + pub fn new(ctx: &Ctx, st: &SealingThread) -> Result + where + Self: Sized, + { + let location = st.location.as_ref().expect("location must be set"); + let store_path = location.to_pathbuf(); + let store = Box::pin(Store::open(store_path).with_context(|| format!("open store {}", location.as_ref().display()))?); + + Ok(Self { + job: Task::build(st.sealing_ctrl(ctx), unsafe { extend_lifetime(&*store.as_ref()) }).context("build tesk")?, + planner: P::default(), + _store: store, + }) + } + + fn handle(&mut self, event: Option) -> Result, Failure> { + let prev = self.job.sector.state; + + if let Some(evt) = event { + match evt { + Event::Idle | Event::Retry => { + tracing::debug!( + prev = ?self.job.sector.state, + sleep = ?self.job.sealing_ctrl.config().recover_interval, + "Event::{:?} captured", evt + ); + + self.job + .sealing_ctrl + .wait_or_interrupted(self.job.sealing_ctrl.config().recover_interval)?; + } + + _ => { + let state = self.planner.plan(&evt, &self.job.sector.state).crit()?; + self.planner.apply(evt, state, &mut self.job).context("event apply").crit()?; + self.job.sector.sync().context("sync sector").crit()?; + } + }; + }; + + let span = tracing::warn_span!("handle", ?prev, current = ?self.job.sector.state); + + let _enter = span.enter(); + + self.job + .sealing_ctrl + .ctrl_ctx() + .update_state(|cst| { + let _ = cst.job.state.replace(self.job.sector.state.as_str().to_string()); + }) + .crit()?; + + tracing::debug!("handling"); + + self.planner.exec(&mut self.job) + } +} diff --git a/damocles-worker/src/sealing/sealing_thread/task/event.rs b/damocles-worker/src/sealing/sealing_thread/planner/common/event.rs similarity index 91% rename from damocles-worker/src/sealing/sealing_thread/task/event.rs rename to damocles-worker/src/sealing/sealing_thread/planner/common/event.rs index 84ac9da6b..ec22ac969 100644 --- a/damocles-worker/src/sealing/sealing_thread/task/event.rs +++ b/damocles-worker/src/sealing/sealing_thread/planner/common/event.rs @@ -2,21 +2,14 @@ use std::fmt::{self, Debug}; use anyhow::{anyhow, Result}; -use super::{ - sector::{Base, Finalized, Sector, State}, - Planner, -}; +use super::sector::{Base, Finalized, Sector, State, UnsealInput}; +use super::task::Task; +use crate::rpc::sealer::{AllocatedSector, Deals, SectorRebuildInfo, SectorUnsealInfo, Seed, Ticket}; use crate::sealing::processor::{ to_prover_id, PieceInfo, SealCommitPhase1Output, SealCommitPhase2Output, SealPreCommitPhase1Output, SealPreCommitPhase2Output, SectorId, SnapEncodeOutput, }; use crate::{logging::trace, metadb::MaybeDirty}; -use crate::{ - rpc::sealer::{AllocatedSector, Deals, SectorRebuildInfo, SectorUnsealInfo, Seed, Ticket, WdPostTaskInfo}, - sealing::sealing_thread::task::sector::UnsealInput, -}; - -use vc_processors::builtin::tasks::WindowPoStOutput; pub enum Event { SetState(State), @@ -87,6 +80,8 @@ pub enum Event { UnsealDone(u64), UploadPieceDone, + + UnsealReady, } impl Debug for Event { @@ -155,6 +150,8 @@ impl Debug for Event { Self::UnsealDone(_) => "Unsealed", Self::UploadPieceDone => "UploadPieceDone", + + Self::UnsealReady => "UnsealReady", }; f.write_str(name) @@ -176,19 +173,15 @@ macro_rules! mem_replace { } impl Event { - pub fn apply(self, p: &P, s: &mut MaybeDirty) -> Result<()> { - let next = if let Event::SetState(s) = self { - s - } else { - p.plan(&self, &s.state)? 
- }; + pub fn apply(self, state: State, task: &mut Task) -> Result<()> { + let next = if let Event::SetState(s) = self { s } else { state }; - if next == s.state { + if next == task.sector.state { return Err(anyhow!("state unchanged, may enter an infinite loop")); } - self.apply_changes(s); - s.update_state(next); + self.apply_changes(task.sector.inner_mut()); + task.sector.update_state(next); Ok(()) } @@ -316,10 +309,6 @@ impl Event { ); } - Self::WdPostGenerated(out) => { - replace!(s.phases.wd_post_out, out); - } - _ => {} }; } diff --git a/damocles-worker/src/sealing/sealing_thread/task/planner/common.rs b/damocles-worker/src/sealing/sealing_thread/planner/common/sealing.rs similarity index 77% rename from damocles-worker/src/sealing/sealing_thread/task/planner/common.rs rename to damocles-worker/src/sealing/sealing_thread/planner/common/sealing.rs index 159ab0861..9e6a809c4 100644 --- a/damocles-worker/src/sealing/sealing_thread/task/planner/common.rs +++ b/damocles-worker/src/sealing/sealing_thread/planner/common/sealing.rs @@ -1,27 +1,37 @@ -//! this module provides some common handlers - -use std::collections::HashMap; -use std::fs::{create_dir_all, remove_dir_all, remove_file}; -use std::os::unix::fs::symlink; -use std::path::Path; +use std::{ + collections::HashMap, + fs::{create_dir_all, remove_dir_all, remove_file}, + os::unix::fs::symlink, + path::Path, +}; use anyhow::{anyhow, Context, Result}; use vc_processors::builtin::tasks::{Piece, PieceFile, STAGE_NAME_ADD_PIECES, STAGE_NAME_TREED}; -use super::super::{call_rpc, cloned_required, field_required, Entry, Task}; -use crate::logging::debug; -use crate::rpc::sealer::{Deals, SectorID, Seed, Ticket}; -use crate::sealing::failure::*; -use crate::sealing::processor::{ - cached_filenames_for_sector, seal_commit_phase1, snap_generate_partition_proofs, snap_verify_sector_update_proof, tree_d_path_in_dir, - AddPiecesInput, PC1Input, PC2Input, PieceInfo, SealCommitPhase1Output, SealPreCommitPhase1Output, SealPreCommitPhase2Output, - SnapEncodeInput, SnapEncodeOutput, SnapProveInput, SnapProveOutput, TransferInput, TransferItem, TransferRoute, TransferStoreInfo, - TreeDInput, UnpaddedBytesAmount, STAGE_NAME_C1, STAGE_NAME_PC1, STAGE_NAME_PC2, STAGE_NAME_SNAP_ENCODE, STAGE_NAME_SNAP_PROVE, +use crate::{ + rpc::sealer::{Deals, SectorID, Seed, Ticket}, + sealing::{ + failure::{Failure, IntoFailure, MapErrToFailure, MapStdErrToFailure}, + processor::{ + cached_filenames_for_sector, seal_commit_phase1, snap_generate_partition_proofs, snap_verify_sector_update_proof, + tree_d_path_in_dir, AddPiecesInput, PC1Input, PC2Input, PieceInfo, SealCommitPhase1Output, SealPreCommitPhase1Output, + SealPreCommitPhase2Output, SnapEncodeInput, SnapEncodeOutput, SnapProveInput, SnapProveOutput, TransferInput, TransferItem, + TransferRoute, TransferStoreInfo, TreeDInput, UnpaddedBytesAmount, STAGE_NAME_C1, STAGE_NAME_PC1, STAGE_NAME_PC2, + STAGE_NAME_SNAP_ENCODE, STAGE_NAME_SNAP_PROVE, + }, + sealing_thread::{ + entry::Entry, + util::{call_rpc, cloned_required, field_required}, + }, + }, + types::SIZE_32G, + SealProof, }; -use crate::types::{SealProof, SIZE_32G}; -pub fn add_pieces(task: &Task<'_>, deals: &Deals) -> Result, Failure> { - let _token = task.ctx.global.limit.acquire(STAGE_NAME_ADD_PIECES).crit()?; +use super::task::Task; + +pub fn add_pieces(task: &Task, deals: &Deals) -> Result, Failure> { + let _token = task.sealing_ctrl.ctx().global.limit.acquire(STAGE_NAME_ADD_PIECES).crit()?; let seal_proof_type = task.sector_proof_type()?.into(); let 
staged_filepath = task.staged_file(task.sector_id()?); @@ -31,7 +41,7 @@ pub fn add_pieces(task: &Task<'_>, deals: &Deals) -> Result, Fail staged_filepath.prepare().context("prepare staged file").perm()?; } - let piece_store = task.ctx.global.piece_store.as_ref(); + let piece_store = task.sealing_ctrl.ctx().global.piece_store.as_ref(); let mut pieces = Vec::with_capacity(deals.len()); @@ -54,7 +64,8 @@ pub fn add_pieces(task: &Task<'_>, deals: &Deals) -> Result, Fail }) } - task.ctx + task.sealing_ctrl + .ctx() .global .processors .add_pieces @@ -68,11 +79,11 @@ pub fn add_pieces(task: &Task<'_>, deals: &Deals) -> Result, Fail } // build tree_d inside `prepare_dir` if necessary -pub fn build_tree_d(task: &'_ Task<'_>, allow_static: bool) -> Result<(), Failure> { +pub fn build_tree_d(task: &Task, allow_static: bool) -> Result<(), Failure> { let sector_id = task.sector_id()?; let proof_type = task.sector_proof_type()?; - let token = task.ctx.global.limit.acquire(STAGE_NAME_TREED).crit()?; + let token = task.sealing_ctrl.ctx().global.limit.acquire(STAGE_NAME_TREED).crit()?; let prepared_dir = task.prepared_dir(sector_id); prepared_dir.prepare().perm()?; @@ -86,7 +97,7 @@ pub fn build_tree_d(task: &'_ Task<'_>, allow_static: bool) -> Result<(), Failur // pledge sector if allow_static && task.sector.deals.as_ref().map(|d| d.len()).unwrap_or(0) == 0 { - if let Some(static_tree_path) = task.ctx.global.static_tree_d.get(&proof_type.sector_size()) { + if let Some(static_tree_path) = task.sealing_ctrl.ctx().global.static_tree_d.get(&proof_type.sector_size()) { symlink(static_tree_path, tree_d_path_in_dir(prepared_dir.as_ref())).crit()?; return Ok(()); } @@ -94,7 +105,8 @@ pub fn build_tree_d(task: &'_ Task<'_>, allow_static: bool) -> Result<(), Failur let staged_file = task.staged_file(sector_id); - task.ctx + task.sealing_ctrl + .ctx() .global .processors .tree_d @@ -116,16 +128,16 @@ fn cleanup_before_pc1(cache_dir: &Entry, sealed_file: &Entry) -> Result<()> { remove_dir_all(cache_dir)?; } create_dir_all(cache_dir_path)?; - debug!("init cache dir {:?} before pc1", cache_dir_path); + tracing::debug!("init cache dir {:?} before pc1", cache_dir_path); let _ = sealed_file.init_file()?; - debug!("truncate sealed file {:?} before pc1", sealed_file); + tracing::debug!("truncate sealed file {:?} before pc1", sealed_file); Ok(()) } -pub fn pre_commit1(task: &'_ Task<'_>) -> Result<(Ticket, SealPreCommitPhase1Output), Failure> { - let token = task.ctx.global.limit.acquire(STAGE_NAME_PC1).crit()?; +pub fn pre_commit1(task: &Task) -> Result<(Ticket, SealPreCommitPhase1Output), Failure> { + let token = task.sealing_ctrl.ctx().global.limit.acquire(STAGE_NAME_PC1).crit()?; let sector_id = task.sector_id()?; let proof_type = task.sector_proof_type()?; @@ -135,11 +147,9 @@ pub fn pre_commit1(task: &'_ Task<'_>) -> Result<(Ticket, SealPreCommitPhase1Out Some(ticket) => ticket.clone(), None => { let ticket = call_rpc! 
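// A note on the `call_rpc!` rewrites in the hunks below: the old invocation listed the
// client and method as comma-separated arguments, e.g.
//     call_rpc! { task.ctx.global.rpc, assign_ticket, sector_id.clone(), }
// while the new form names the client expression explicitly and reads like a call, e.g.
//     call_rpc! { task.rpc() => assign_ticket(sector_id.clone(),) }
// Both spellings are visible in the +/- lines of this hunk; the macro itself lives in
// sealing_thread/util.rs (imported above) and its expansion is not part of this patch.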
{ - task.ctx.global.rpc, - assign_ticket, - sector_id.clone(), + task.rpc() => assign_ticket(sector_id.clone(),) }?; - debug!(ticket = ?ticket.ticket.0, epoch = ticket.epoch, "ticket assigned from sector-manager"); + tracing::debug!(ticket = ?ticket.ticket.0, epoch = ticket.epoch, "ticket assigned from sector-manager"); ticket } }; @@ -163,7 +173,8 @@ pub fn pre_commit1(task: &'_ Task<'_>) -> Result<(Ticket, SealPreCommitPhase1Out } let out = task - .ctx + .sealing_ctrl + .ctx() .global .processors .pc1 @@ -196,7 +207,7 @@ fn cleanup_before_pc2(cache_dir: &Path) -> Result<()> { let p = entry.path(); remove_file(&p).with_context(|| format!("remove cached file {:?}", p))?; - debug!("remove cached file {:?} before pc2", p); + tracing::debug!("remove cached file {:?} before pc2", p); } } @@ -204,7 +215,7 @@ fn cleanup_before_pc2(cache_dir: &Path) -> Result<()> { } pub fn pre_commit2(task: &'_ Task) -> Result { - let token = task.ctx.global.limit.acquire(STAGE_NAME_PC2).crit()?; + let token = task.sealing_ctrl.ctx().global.limit.acquire(STAGE_NAME_PC2).crit()?; let sector_id = task.sector_id()?; @@ -219,7 +230,8 @@ pub fn pre_commit2(task: &'_ Task) -> Result cleanup_before_pc2(cache_dir.as_ref()).crit()?; let out = task - .ctx + .sealing_ctrl + .ctx() .global .processors .pc2 @@ -235,7 +247,7 @@ pub fn pre_commit2(task: &'_ Task) -> Result } pub fn commit1_with_seed(task: &Task, seed: Seed) -> Result { - let token = task.ctx.global.limit.acquire(STAGE_NAME_C1).crit()?; + let token = task.sealing_ctrl.ctx().global.limit.acquire(STAGE_NAME_C1).crit()?; let sector_id = task.sector_id()?; @@ -281,7 +293,7 @@ pub fn commit1_with_seed(task: &Task, seed: Seed) -> Result Result { - let _token = task.ctx.global.limit.acquire(STAGE_NAME_SNAP_ENCODE).crit()?; + let _token = task.sealing_ctrl.ctx().global.limit.acquire(STAGE_NAME_SNAP_ENCODE).crit()?; cloned_required!(piece_infos, task.sector.phases.pieces); @@ -291,18 +303,18 @@ pub fn snap_encode(task: &Task, sector_id: &SectorID, proof_type: &SealProof) -> // init update file let update_file = task.update_file(sector_id); - debug!(path=?update_file.full(), "trying to init update file"); + tracing::debug!(path=?update_file.full(), "trying to init update file"); { let file = update_file.init_file().perm()?; file.set_len(proof_type.sector_size()).context("fallocate for update file").perm()?; } let update_cache_dir = task.update_cache_dir(sector_id); - debug!(path=?update_cache_dir.full(), "trying to init update cache dir"); + tracing::debug!(path=?update_cache_dir.full(), "trying to init update cache dir"); update_cache_dir.prepare().context("prepare update cache dir").perm()?; // tree d - debug!("trying to prepare tree_d"); + tracing::debug!("trying to prepare tree_d"); let prepared_dir = task.prepared_dir(sector_id); symlink( tree_d_path_in_dir(prepared_dir.as_ref()), @@ -314,7 +326,8 @@ pub fn snap_encode(task: &Task, sector_id: &SectorID, proof_type: &SealProof) -> // staged file should be already exists, do nothing let staged_file = task.staged_file(sector_id); - task.ctx + task.sealing_ctrl + .ctx() .global .processors .snap_encode @@ -331,7 +344,7 @@ pub fn snap_encode(task: &Task, sector_id: &SectorID, proof_type: &SealProof) -> } pub fn snap_prove(task: &Task) -> Result { - let _token = task.ctx.global.limit.acquire(STAGE_NAME_SNAP_PROVE).crit()?; + let _token = task.sealing_ctrl.ctx().global.limit.acquire(STAGE_NAME_SNAP_PROVE).crit()?; let sector_id = task.sector_id()?; let proof_type = task.sector_proof_type()?; @@ -356,7 +369,8 @@ pub fn 
snap_prove(task: &Task) -> Result { .perm()?; let proof = task - .ctx + .sealing_ctrl + .ctx() .global .processors .snap_prove @@ -381,7 +395,7 @@ pub fn snap_prove(task: &Task) -> Result { // acquire a persist store for sector files, copy the files and return the instance name of the // acquired store -pub fn persist_sector_files(task: &'_ Task<'_>, cache_dir: Entry, sealed_file: Entry) -> Result { +pub fn persist_sector_files(task: &Task, cache_dir: Entry, sealed_file: Entry) -> Result { let sector_id = task.sector_id()?; let proof_type = task.sector_proof_type()?; let sector_size = proof_type.sector_size(); @@ -394,18 +408,14 @@ pub fn persist_sector_files(task: &'_ Task<'_>, cache_dir: Entry, sealed_file: E sector_size + sector_size / 50 }; - let candidates = task.ctx.global.attached.available_instances(); + let candidates = task.sealing_ctrl.ctx().global.attached.available_instances(); if candidates.is_empty() { return Err(anyhow!("no available local persist store candidate")).perm(); } let ins_info = loop { let res = call_rpc! { - task.ctx.global.rpc, - store_reserve_space, - sector_id.clone(), - required_size, - candidates.clone(), + task.rpc() => store_reserve_space(sector_id.clone(), required_size, candidates.clone(),) }?; if let Some(selected) = res { @@ -418,11 +428,13 @@ pub fn persist_sector_files(task: &'_ Task<'_>, cache_dir: Entry, sealed_file: E candidates=?candidates, "no persist store selected, wait for next polling" ); - task.wait_or_interrupted(task.sealing_config.rpc_polling_interval)?; + task.sealing_ctrl + .wait_or_interrupted(task.sealing_ctrl.config().rpc_polling_interval)?; }; let persist_store = task - .ctx + .sealing_ctrl + .ctx() .global .attached .get(&ins_info.name) @@ -430,7 +442,7 @@ pub fn persist_sector_files(task: &'_ Task<'_>, cache_dir: Entry, sealed_file: E .perm()?; let ins_name = persist_store.instance(); - debug!(name = %ins_name, "persist store acquired"); + tracing::debug!(name = %ins_name, "persist store acquired"); let mut wanted = vec![sealed_file]; wanted.extend( @@ -472,7 +484,8 @@ pub fn persist_sector_files(task: &'_ Task<'_>, cache_dir: Entry, sealed_file: E routes: transfer_routes, }; - task.ctx + task.sealing_ctrl + .ctx() .global .processors .transfer @@ -492,11 +505,7 @@ pub fn submit_persisted(task: &Task, is_upgrade: bool) -> Result<(), Failure> { } let checked = call_rpc! 
{ - task.ctx.global.rpc, - submit_persisted_ex, - sector_id.clone(), - instance, - is_upgrade, + task.rpc() => submit_persisted_ex(sector_id.clone(), instance,is_upgrade,) }?; if checked { diff --git a/damocles-worker/src/sealing/sealing_thread/task/sector.rs b/damocles-worker/src/sealing/sealing_thread/planner/common/sector.rs similarity index 94% rename from damocles-worker/src/sealing/sealing_thread/task/sector.rs rename to damocles-worker/src/sealing/sealing_thread/planner/common/sector.rs index e603fb850..45281400f 100644 --- a/damocles-worker/src/sealing/sealing_thread/task/sector.rs +++ b/damocles-worker/src/sealing/sealing_thread/planner/common/sector.rs @@ -6,12 +6,12 @@ use serde_repr::{Deserialize_repr, Serialize_repr}; pub use fil_clock::ChainEpoch; pub use fil_types::{InteractiveSealRandomness, PieceInfo as DealInfo, Randomness}; -use crate::rpc::sealer::{AllocatedSector, Deals, SectorPrivateInfo, SectorPublicInfo, Seed, Ticket, WdPostTaskInfo}; +use crate::rpc::sealer::{AllocatedSector, Deals, SectorPrivateInfo, SectorPublicInfo, Seed, Ticket}; use crate::sealing::processor::{ PieceInfo, ProverId, SealCommitPhase1Output, SealCommitPhase2Output, SealPreCommitPhase1Output, SealPreCommitPhase2Output, SectorId, SnapEncodeOutput, }; -use vc_processors::builtin::tasks::WindowPoStOutput; +use crate::sealing::sealing_thread::default_plan; const CURRENT_SECTOR_VERSION: u32 = 1; @@ -25,7 +25,7 @@ macro_rules! def_state { )+ } - impl State { + impl State { pub fn as_str(&self) -> &'static str { match self { $( @@ -82,6 +82,7 @@ def_state! { SnapTreeDBuilt, SnapDone, Unsealed, + UnsealPrepared, } impl std::fmt::Debug for State { @@ -140,11 +141,6 @@ pub struct Phases { // unseal pub unseal_in: Option, - - // window PoST - pub wd_post_in: Option, - - pub wd_post_out: Option, } #[derive(Debug, Deserialize, Serialize)] @@ -200,4 +196,8 @@ impl Sector { let prev = std::mem::replace(&mut self.state, next); self.prev_state.replace(prev); } + + pub fn plan(&self) -> &str { + self.plan.as_deref().unwrap_or_else(|| default_plan()) + } } diff --git a/damocles-worker/src/sealing/sealing_thread/planner/common/task.rs b/damocles-worker/src/sealing/sealing_thread/planner/common/task.rs new file mode 100644 index 000000000..615df3f14 --- /dev/null +++ b/damocles-worker/src/sealing/sealing_thread/planner/common/task.rs @@ -0,0 +1,216 @@ +use std::path::PathBuf; + +use anyhow::{Context, Result}; +use forest_cid::json::CidJson; + +use crate::sealing::failure::{Failure, IntoFailure, MapErrToFailure}; +use crate::sealing::paths; +use crate::store::Store; +use crate::types::SealProof; +use crate::{ + metadb::{rocks::RocksMeta, MaybeDirty, MetaDocumentDB, PrefixedMetaDB, Saved}, + rpc::sealer::{ReportStateReq, SectorFailure, SectorID, SectorStateChange, WorkerIdentifier}, +}; +use crate::{ + rpc::sealer::SealerClient, + sealing::sealing_thread::{ + entry::Entry, + planner::JobTrait, + util::{call_rpc, field_required}, + SealingCtrl, + }, +}; + +use super::sector::{Sector, Trace}; + +const SECTOR_INFO_KEY: &str = "info"; +const SECTOR_META_PREFIX: &str = "meta"; +const SECTOR_TRACE_PREFIX: &str = "trace"; + +pub struct Task { + pub sector: Saved>, + _trace: Vec, + + pub sealing_ctrl: SealingCtrl<'static>, + store: &'static Store, + ident: WorkerIdentifier, + + _trace_meta: MetaDocumentDB>, +} + +// properties +impl Task { + pub fn sector_id(&self) -> Result<&SectorID, Failure> { + field_required! 
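// `field_required!` (imported from sealing_thread::util in the new module header above)
// binds the named variable from an Option-producing expression and returns early from
// the enclosing function when the value is missing; conceptually something like
//     let sector_id = match self.sector.base.as_ref().map(|b| &b.allocated.id) {
//         Some(v) => v,
//         None => return Err(/* "required field is missing" failure */),
//     };
// The exact error it constructs is defined in util.rs, which is not shown in this patch.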
{ + sector_id, + self.sector.base.as_ref().map(|b| &b.allocated.id) + } + + Ok(sector_id) + } + + pub fn sector_proof_type(&self) -> Result<&SealProof, Failure> { + field_required! { + proof_type, + self.sector.base.as_ref().map(|b| &b.allocated.proof_type) + } + + Ok(proof_type) + } + + pub fn rpc(&self) -> &SealerClient { + self.sealing_ctrl.ctx.global.rpc.as_ref() + } +} + +impl JobTrait for Task { + fn planner(&self) -> &str { + self.sector.plan() + } +} + +impl Task { + pub fn build(sealing_ctrl: SealingCtrl<'static>, s: &'static Store) -> Result { + let sector_meta = PrefixedMetaDB::wrap(SECTOR_META_PREFIX, &s.meta); + + let mut sector: Saved = Saved::load(SECTOR_INFO_KEY, sector_meta, || { + Sector::new(sealing_ctrl.config().plan().to_string()) + }) + .context("load sector")?; + sector.sync().context("init sync sector")?; + + let trace_meta = MetaDocumentDB::wrap(PrefixedMetaDB::wrap(SECTOR_TRACE_PREFIX, &s.meta)); + let instance = sealing_ctrl.ctx().instance.clone(); + + Ok(Task { + sector, + _trace: Vec::with_capacity(16), + sealing_ctrl, + store: s, + ident: WorkerIdentifier { + instance, + location: s.location.to_pathbuf(), + }, + + _trace_meta: trace_meta, + }) + } + + pub fn report_state(&self, state_change: SectorStateChange, fail: Option) -> Result<(), Failure> { + let sector_id = match self.sector.base.as_ref().map(|base| base.allocated.id.clone()) { + Some(sid) => sid, + None => return Ok(()), + }; + + call_rpc! { + self.sealing_ctrl.ctx().global.rpc=>report_state( + sector_id, + ReportStateReq { + worker: self.ident.clone(), + state_change, + failure: fail, + }, + )}?; + + Ok(()) + } + + pub fn report_finalized(&self) -> Result<(), Failure> { + let sector_id = match self.sector.base.as_ref().map(|base| base.allocated.id.clone()) { + Some(sid) => sid, + None => return Ok(()), + }; + + call_rpc! { + self.sealing_ctrl.ctx.global.rpc => report_finalized(sector_id,) + }?; + + Ok(()) + } + + pub fn report_aborted(&self, reason: String) -> Result<(), Failure> { + let sector_id = match self.sector.base.as_ref().map(|base| base.allocated.id.clone()) { + Some(sid) => sid, + None => return Ok(()), + }; + + call_rpc! 
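// Note that report_state, report_finalized and report_aborted all short-circuit with
// Ok(()) when the job has no allocated sector yet (`self.sector.base` is None), so a
// job without an allocation simply skips reporting.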
{ + self.sealing_ctrl.ctx.global.rpc=>report_aborted(sector_id, reason,) + }?; + + Ok(()) + } + + pub fn retry(&mut self, temp_err: anyhow::Error) -> Result<(), Failure> { + if self.sector.retry >= self.sealing_ctrl.config().max_retries { + // reset retry times + self.sync(|s| { + s.retry = 0; + Ok(()) + })?; + + return Err(temp_err.perm()); + } + + self.sync(|s| { + tracing::warn!(retry = s.retry, "temp error occurred: {:?}", temp_err); + + s.retry += 1; + + Ok(()) + })?; + + tracing::info!( + interval = ?self.sealing_ctrl.config().recover_interval, + "wait before recovering" + ); + + self.sealing_ctrl.wait_or_interrupted(self.sealing_ctrl.config().recover_interval)?; + Ok(()) + } + + fn sync) -> Result<()>>(&mut self, modify_fn: F) -> Result<(), Failure> { + modify_fn(self.sector.inner_mut()).crit()?; + self.sector.sync().context("sync sector").crit() + } + + pub fn finalize(&mut self) -> Result<(), Failure> { + self.store.cleanup().context("cleanup store").crit()?; + self.sector + .delete(|| Sector::new(self.sealing_ctrl.config().plan().to_string())) + .context("remove sector") + .crit() + } + + pub fn sector_path(&self, sector_id: &SectorID) -> String { + paths::sector_path(sector_id) + } + + pub fn prepared_dir(&self, sector_id: &SectorID) -> Entry { + Entry::dir(&self.store.data_path, PathBuf::from("prepared").join(self.sector_path(sector_id))) + } + + pub fn cache_dir(&self, sector_id: &SectorID) -> Entry { + Entry::dir(&self.store.data_path, paths::cache_dir(sector_id)) + } + + pub fn sealed_file(&self, sector_id: &SectorID) -> Entry { + Entry::file(&self.store.data_path, paths::sealed_file(sector_id)) + } + + pub fn staged_file(&self, sector_id: &SectorID) -> Entry { + Entry::file(&self.store.data_path, PathBuf::from("unsealed").join(self.sector_path(sector_id))) + } + + pub fn piece_file(&self, piece_cid: &CidJson) -> Entry { + Entry::file(&self.store.data_path, PathBuf::from("unsealed").join(format!("{}", piece_cid.0))) + } + + pub fn update_file(&self, sector_id: &SectorID) -> Entry { + Entry::file(&self.store.data_path, paths::update_file(sector_id)) + } + + pub fn update_cache_dir(&self, sector_id: &SectorID) -> Entry { + Entry::dir(&self.store.data_path, paths::update_cache_dir(sector_id)) + } +} diff --git a/damocles-worker/src/sealing/sealing_thread/planner/mod.rs b/damocles-worker/src/sealing/sealing_thread/planner/mod.rs new file mode 100644 index 000000000..19adf4a46 --- /dev/null +++ b/damocles-worker/src/sealing/sealing_thread/planner/mod.rs @@ -0,0 +1,73 @@ +use crate::{sealing::failure::*, watchdog::Ctx}; +use anyhow::{anyhow, Result}; + +pub const PLANNER_NAME_SEALER: &str = "sealer"; +pub const PLANNER_NAME_SNAPUP: &str = "snapup"; +pub const PLANNER_NAME_REBUILD: &str = "rebuild"; +pub const PLANNER_NAME_UNSEAL: &str = "unseal"; +pub const PLANNER_NAME_WDPOST: &str = "wdpost"; + +mod common; +mod rebuild; +mod sealer; +mod snapup; +mod unseal; +mod wdpost; + +macro_rules! 
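// `plan!` is the transition-table macro shared by all planners: it matches first on the
// current state, then on the incoming event, and evaluates to the next state; any
// (state, event) pair that is not listed makes the enclosing function return an
// "unexpected event/state" error. A call such as
//     plan! { evt, st, State::Empty => { Event::Allocate(_) => State::Allocated, }, }
// expands to nested `match` expressions over `st` and `evt`.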
plan { + ($e:expr, $st:expr, $($prev:pat => {$($evt:pat => $next:expr,)+},)*) => { + match $st { + $( + $prev => { + match $e { + $( + $evt => $next, + )+ + _ => return Err(anyhow::anyhow!("unexpected event {:?} for state {:?}", $e, $st)), + } + } + )* + + other => return Err(anyhow::anyhow!("unexpected state {:?}", other)), + } + }; +} + +pub fn default_plan() -> &'static str { + PLANNER_NAME_SEALER +} + +pub(self) use plan; + +use self::{ + common::CommonSealer, rebuild::RebuildPlanner, sealer::SealerPlanner, snapup::SnapUpPlanner, unseal::UnsealPlanner, + wdpost::WdPostSealer, +}; + +use super::{Sealer, SealingThread}; + +pub trait JobTrait { + fn planner(&self) -> &str; +} + +pub trait PlannerTrait: Default { + type Job: JobTrait; + type State; + type Event; + + fn name(&self) -> &str; + fn plan(&self, evt: &Self::Event, st: &Self::State) -> Result; + fn exec(&self, job: &mut Self::Job) -> Result, Failure>; + fn apply(&self, event: Self::Event, state: Self::State, job: &mut Self::Job) -> Result<()>; +} + +pub fn create_selaer(plan: &str, ctx: &Ctx, st: &SealingThread) -> Result> { + match plan { + PLANNER_NAME_SEALER => Ok(Box::new(CommonSealer::::new(ctx, st)?)), + PLANNER_NAME_SNAPUP => Ok(Box::new(CommonSealer::::new(ctx, st)?)), + PLANNER_NAME_REBUILD => Ok(Box::new(CommonSealer::::new(ctx, st)?)), + PLANNER_NAME_UNSEAL => Ok(Box::new(CommonSealer::::new(ctx, st)?)), + PLANNER_NAME_WDPOST => Ok(Box::new(WdPostSealer::new(st.sealing_ctrl(ctx)))), + unknown => Err(anyhow!("unknown planner: {}", unknown)), + } +} diff --git a/damocles-worker/src/sealing/sealing_thread/task/planner/rebuild.rs b/damocles-worker/src/sealing/sealing_thread/planner/rebuild.rs similarity index 77% rename from damocles-worker/src/sealing/sealing_thread/task/planner/rebuild.rs rename to damocles-worker/src/sealing/sealing_thread/planner/rebuild.rs index 7881cd032..a23283cdd 100644 --- a/damocles-worker/src/sealing/sealing_thread/task/planner/rebuild.rs +++ b/damocles-worker/src/sealing/sealing_thread/planner/rebuild.rs @@ -1,16 +1,26 @@ use anyhow::{anyhow, Context, Result}; use super::{ - super::{call_rpc, field_required, Event, State, Task}, - common, plan, ExecResult, Planner, + super::{call_rpc, field_required}, + common::{self, event::Event, sector::State, task::Task}, + plan, PlannerTrait, PLANNER_NAME_REBUILD, }; use crate::logging::warn; use crate::rpc::sealer::{AllocateSectorSpec, Seed}; use crate::sealing::failure::*; +#[derive(Default)] pub struct RebuildPlanner; -impl Planner for RebuildPlanner { +impl PlannerTrait for RebuildPlanner { + type Job = Task; + type State = State; + type Event = Event; + + fn name(&self) -> &str { + PLANNER_NAME_REBUILD + } + fn plan(&self, evt: &Event, st: &State) -> Result { let next = plan! 
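// The rebuild flow encoded by the table below mirrors the normal sealing pipeline:
// allocate a rebuild sector, re-add pieces, rebuild tree_d, run PC1/PC2, verify the
// sealed data via a C1 check, optionally replay the snapup encode/prove steps when the
// sector was an upgraded one, then persist the files and re-submit persistence.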
{ evt, @@ -69,7 +79,7 @@ impl Planner for RebuildPlanner { Ok(next) } - fn exec(&self, task: &mut Task<'_>) -> Result, Failure> { + fn exec(&self, task: &mut Task) -> Result, Failure> { let state = task.sector.state; let inner = Rebuild { task }; @@ -106,28 +116,30 @@ impl Planner for RebuildPlanner { other => return Err(anyhow!("unexpected state {:?} in rebuild planner", other).abort()), } - .map(From::from) + .map(Some) + } + + fn apply(&self, event: Event, state: State, task: &mut Task) -> Result<()> { + event.apply(state, task) } } -struct Rebuild<'c, 't> { - task: &'t mut Task<'c>, +struct Rebuild<'t> { + task: &'t mut Task, } -impl<'c, 't> Rebuild<'c, 't> { +impl<'t> Rebuild<'t> { fn is_snapup(&self) -> bool { self.task.sector.finalized.is_some() } - fn empty(&self) -> ExecResult { + fn empty(&self) -> Result { let maybe_res = call_rpc! { - self.task.ctx.global.rpc, - allocate_rebuild_sector, - AllocateSectorSpec { - allowed_miners: Some(self.task.sealing_config.allowed_miners.clone()), - allowed_proof_types: Some(self.task.sealing_config.allowed_proof_types.clone()), + self.task.rpc() => allocate_rebuild_sector(AllocateSectorSpec { + allowed_miners: Some(self.task.sealing_ctrl.config().allowed_miners.clone()), + allowed_proof_types: Some(self.task.sealing_ctrl.config().allowed_proof_types.clone()), }, - }; + )}; let maybe_allocated = match maybe_res { Ok(a) => a, @@ -148,7 +160,7 @@ impl<'c, 't> Rebuild<'c, 't> { Ok(Event::AllocatedRebuildSector(allocated)) } - fn add_pieces_for_sealing(&self) -> ExecResult { + fn add_pieces_for_sealing(&self) -> Result { // if this is a snapup sector, then the deals should be used later let maybe_deals = if self.is_snapup() { None } else { self.task.sector.deals.as_ref() }; @@ -157,21 +169,21 @@ impl<'c, 't> Rebuild<'c, 't> { Ok(Event::AddPiece(pieces)) } - fn build_tree_d_for_sealing(&self) -> ExecResult { + fn build_tree_d_for_sealing(&self) -> Result { common::build_tree_d(self.task, true)?; Ok(Event::BuildTreeD) } - fn pc1(&self) -> ExecResult { + fn pc1(&self) -> Result { let (ticket, out) = common::pre_commit1(self.task)?; Ok(Event::PC1(ticket, out)) } - fn pc2(&self) -> ExecResult { + fn pc2(&self) -> Result { common::pre_commit2(self.task).map(Event::PC2) } - fn check_sealed(&self) -> ExecResult { + fn check_sealed(&self) -> Result { field_required! 
{ ticket, self.task.sector.phases.ticket.as_ref() @@ -185,7 +197,7 @@ impl<'c, 't> Rebuild<'c, 't> { common::commit1_with_seed(self.task, seed).map(|_| Event::CheckSealed) } - fn prepare_for_snapup(&self) -> ExecResult { + fn prepare_for_snapup(&self) -> Result { if !self.is_snapup() { return Ok(Event::SkipSnap); } @@ -195,22 +207,22 @@ impl<'c, 't> Rebuild<'c, 't> { common::add_pieces(self.task, deals).map(Event::AddPiece) } - fn build_tree_d_for_snapup(&self) -> ExecResult { + fn build_tree_d_for_snapup(&self) -> Result { common::build_tree_d(self.task, false).map(|_| Event::BuildTreeD) } - fn snap_encode(&self) -> ExecResult { + fn snap_encode(&self) -> Result { let sector_id = self.task.sector_id()?; let proof_type = self.task.sector_proof_type()?; common::snap_encode(self.task, sector_id, proof_type).map(Event::SnapEncode) } - fn snap_prove(&self) -> ExecResult { + fn snap_prove(&self) -> Result { common::snap_prove(self.task).map(Event::SnapProve) } - fn persist(&self) -> ExecResult { + fn persist(&self) -> Result { let sector_id = self.task.sector_id()?; let (cache_dir, sealed_file) = if self.is_snapup() { @@ -222,7 +234,7 @@ impl<'c, 't> Rebuild<'c, 't> { common::persist_sector_files(self.task, cache_dir, sealed_file).map(Event::Persist) } - fn submit_persist(&self) -> ExecResult { + fn submit_persist(&self) -> Result { common::submit_persisted(self.task, self.is_snapup()).map(|_| Event::SubmitPersistance) } } diff --git a/damocles-worker/src/sealing/sealing_thread/task/planner/sealer.rs b/damocles-worker/src/sealing/sealing_thread/planner/sealer.rs similarity index 76% rename from damocles-worker/src/sealing/sealing_thread/task/planner/sealer.rs rename to damocles-worker/src/sealing/sealing_thread/planner/sealer.rs index 1489e26ec..bae0603e4 100644 --- a/damocles-worker/src/sealing/sealing_thread/task/planner/sealer.rs +++ b/damocles-worker/src/sealing/sealing_thread/planner/sealer.rs @@ -4,17 +4,27 @@ use anyhow::{anyhow, Context, Result}; use vc_processors::builtin::tasks::STAGE_NAME_C2; use super::{ - super::{call_rpc, cloned_required, field_required, Event, State, Task}, - common, plan, ExecResult, Planner, + super::{call_rpc, cloned_required, field_required}, + common::{self, event::Event, sector::State, task::Task}, + plan, PlannerTrait, PLANNER_NAME_SEALER, }; use crate::logging::{debug, warn}; use crate::rpc::sealer::{AcquireDealsSpec, AllocateSectorSpec, OnChainState, PreCommitOnChainInfo, ProofOnChainInfo, SubmitResult}; use crate::sealing::failure::*; use crate::sealing::processor::{clear_cache, C2Input}; +#[derive(Default)] pub struct SealerPlanner; -impl Planner for SealerPlanner { +impl PlannerTrait for SealerPlanner { + type Job = Task; + type State = State; + type Event = Event; + + fn name(&self) -> &str { + PLANNER_NAME_SEALER + } + fn plan(&self, evt: &Event, st: &State) -> Result { let next = plan! 
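// The table below encodes the regular sealing pipeline driven by SealerPlanner: allocate
// a sector, optionally acquire deals, add pieces, build tree_d, assign a ticket, run
// PC1/PC2, submit and poll the pre-commit, persist the sector files, wait for the seed,
// run C1/C2, then submit and poll the proof; resubmission edges handle rejected
// pre-commits and proofs.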
{ evt, @@ -90,7 +100,7 @@ impl Planner for SealerPlanner { Ok(next) } - fn exec(&self, task: &mut Task<'_>) -> Result, Failure> { + fn exec(&self, task: &mut Task) -> Result, Failure> { let state = task.sector.state; let inner = Sealer { task }; match state { @@ -134,23 +144,25 @@ impl Planner for SealerPlanner { other => return Err(anyhow!("unexpected state {:?} in sealer planner", other).abort()), } - .map(From::from) + .map(Some) + } + + fn apply(&self, event: Event, state: State, task: &mut Task) -> Result<()> { + event.apply(state, task) } } -struct Sealer<'c, 't> { - task: &'t mut Task<'c>, +struct Sealer<'t> { + task: &'t mut Task, } -impl<'c, 't> Sealer<'c, 't> { - fn handle_empty(&self) -> ExecResult { +impl<'t> Sealer<'t> { + fn handle_empty(&self) -> Result { let maybe_allocated_res = call_rpc! { - self.task.ctx.global.rpc, - allocate_sector, - AllocateSectorSpec { - allowed_miners: Some(self.task.sealing_config.allowed_miners.clone()), - allowed_proof_types: Some(self.task.sealing_config.allowed_proof_types.clone()), - }, + self.task.rpc()=>allocate_sector(AllocateSectorSpec { + allowed_miners: Some(self.task.sealing_ctrl.config().allowed_miners.clone()), + allowed_proof_types: Some(self.task.sealing_ctrl.config().allowed_proof_types.clone()), + },) }; let maybe_allocated = match maybe_allocated_res { @@ -176,9 +188,9 @@ impl<'c, 't> Sealer<'c, 't> { Ok(Event::Allocate(sector)) } - fn handle_allocated(&self) -> ExecResult { - if !self.task.sealing_config.enable_deals { - return Ok(if self.task.sealing_config.disable_cc { + fn handle_allocated(&self) -> Result { + if !self.task.sealing_ctrl.config().enable_deals { + return Ok(if self.task.sealing_ctrl.config().disable_cc { Event::Idle } else { Event::AcquireDeals(None) @@ -188,51 +200,51 @@ impl<'c, 't> Sealer<'c, 't> { let sector_id = self.task.sector_id()?.clone(); let deals = call_rpc! 
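// Deal-acquisition gating in this handler: with `enable_deals` off, the sector proceeds
// as CC unless `disable_cc` is also set, in which case the thread idles; with deals
// enabled, the manager is asked for deals below, and an empty result still falls back to
// CC sealing unless `disable_cc` forces another idle round.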
{ - self.task.ctx.global.rpc, - acquire_deals, - sector_id, - AcquireDealsSpec { - max_deals: self.task.sealing_config.max_deals, - min_used_space: self.task.sealing_config.min_deal_space.map(|b| b.get_bytes() as usize), - }, + self.task.rpc()=>acquire_deals( + sector_id, + AcquireDealsSpec { + max_deals: self.task.sealing_ctrl.config().max_deals, + min_used_space: self.task.sealing_ctrl.config().min_deal_space.map(|b| b.get_bytes() as usize), + }, + ) }?; let deals_count = deals.as_ref().map(|d| d.len()).unwrap_or(0); debug!(count = deals_count, "pieces acquired"); - Ok(if !self.task.sealing_config.disable_cc || deals_count > 0 { + Ok(if !self.task.sealing_ctrl.config().disable_cc || deals_count > 0 { Event::AcquireDeals(deals) } else { Event::Idle }) } - fn handle_deals_acquired(&self) -> ExecResult { + fn handle_deals_acquired(&self) -> Result { let pieces = common::add_pieces(self.task, self.task.sector.deals.as_ref().unwrap_or(&Vec::new()))?; Ok(Event::AddPiece(pieces)) } - fn handle_piece_added(&self) -> ExecResult { + fn handle_piece_added(&self) -> Result { common::build_tree_d(self.task, true)?; Ok(Event::BuildTreeD) } - fn handle_tree_d_built(&self) -> ExecResult { + fn handle_tree_d_built(&self) -> Result { Ok(Event::AssignTicket(None)) } - fn handle_ticket_assigned(&self) -> ExecResult { + fn handle_ticket_assigned(&self) -> Result { let (ticket, out) = common::pre_commit1(self.task)?; Ok(Event::PC1(ticket, out)) } - fn handle_pc1_done(&self) -> ExecResult { + fn handle_pc1_done(&self) -> Result { common::pre_commit2(self.task).map(Event::PC2) } - fn handle_pc2_done(&self) -> ExecResult { + fn handle_pc2_done(&self) -> Result { field_required! { sector, self.task.sector.base.as_ref().map(|b| b.allocated.clone()) @@ -269,11 +281,7 @@ impl<'c, 't> Sealer<'c, 't> { }; let res = call_rpc! { - self.task.ctx.global.rpc, - submit_pre_commit, - sector, - pinfo, - self.task.sector.phases.pc2_re_submit, + self.task.rpc() => submit_pre_commit(sector, pinfo, self.task.sector.phases.pc2_re_submit,) }?; // TODO: handle submit reset correctly @@ -288,14 +296,12 @@ impl<'c, 't> Sealer<'c, 't> { } } - fn handle_pc_submitted(&self) -> ExecResult { + fn handle_pc_submitted(&self) -> Result { let sector_id = self.task.sector_id()?; 'POLL: loop { let state = call_rpc! 
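// This 'POLL loop repeatedly queries the manager for the on-chain pre-commit state:
// `Landed` breaks the loop, `Failed` waits 30s and triggers a re-submit, and states that
// are still pending sleep for `rpc_polling_interval` before the next round (the
// remaining arms are unchanged context not shown in this hunk).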
{ - self.task.ctx.global.rpc, - poll_pre_commit_state, - sector_id.clone(), + self.task.rpc()=>poll_pre_commit_state(sector_id.clone(), ) }?; match state.state { @@ -305,7 +311,7 @@ impl<'c, 't> Sealer<'c, 't> { OnChainState::Failed => { warn!("pre commit on-chain info failed: {:?}", state.desc); // TODO: make it configurable - self.task.wait_or_interrupted(Duration::from_secs(30))?; + self.task.sealing_ctrl.wait_or_interrupted(Duration::from_secs(30))?; return Ok(Event::ReSubmitPC); } @@ -318,11 +324,13 @@ impl<'c, 't> Sealer<'c, 't> { debug!( state = ?state.state, - interval = ?self.task.sealing_config.rpc_polling_interval, + interval = ?self.task.sealing_ctrl.config().rpc_polling_interval, "waiting for next round of polling pre commit state", ); - self.task.wait_or_interrupted(self.task.sealing_config.rpc_polling_interval)?; + self.task + .sealing_ctrl + .wait_or_interrupted(self.task.sealing_ctrl.config().rpc_polling_interval)?; } debug!("pre commit landed"); @@ -330,7 +338,7 @@ impl<'c, 't> Sealer<'c, 't> { Ok(Event::CheckPC) } - fn handle_pc_landed(&self) -> ExecResult { + fn handle_pc_landed(&self) -> Result { let sector_id = self.task.sector_id()?; let cache_dir = self.task.cache_dir(sector_id); let sealed_file = self.task.sealed_file(sector_id); @@ -340,18 +348,16 @@ impl<'c, 't> Sealer<'c, 't> { Ok(Event::Persist(ins_name)) } - fn handle_persisted(&self) -> ExecResult { + fn handle_persisted(&self) -> Result { common::submit_persisted(self.task, false).map(|_| Event::SubmitPersistance) } - fn handle_persistance_submitted(&self) -> ExecResult { + fn handle_persistance_submitted(&self) -> Result { let sector_id = self.task.sector_id()?; let seed = loop { let wait = call_rpc! { - self.task.ctx.global.rpc, - wait_seed, - sector_id.clone(), + self.task.rpc()=>wait_seed(sector_id.clone(), ) }?; if let Some(seed) = wait.seed { @@ -366,13 +372,13 @@ impl<'c, 't> Sealer<'c, 't> { debug!(?delay, "waiting for next round of polling seed"); - self.task.wait_or_interrupted(delay)?; + self.task.sealing_ctrl.wait_or_interrupted(delay)?; }; Ok(Event::AssignSeed(seed)) } - fn handle_seed_assigned(&self) -> ExecResult { + fn handle_seed_assigned(&self) -> Result { cloned_required! { seed, self.task.sector.phases.seed @@ -381,8 +387,8 @@ impl<'c, 't> Sealer<'c, 't> { common::commit1_with_seed(self.task, seed).map(Event::C1) } - fn handle_c1_done(&self) -> ExecResult { - let token = self.task.ctx.global.limit.acquire(STAGE_NAME_C2).crit()?; + fn handle_c1_done(&self) -> Result { + let token = self.task.sealing_ctrl.ctx().global.limit.acquire(STAGE_NAME_C2).crit()?; let miner_id = self.task.sector_id()?.miner; @@ -400,7 +406,8 @@ impl<'c, 't> Sealer<'c, 't> { let out = self .task - .ctx + .sealing_ctrl + .ctx() .global .processors .c2 @@ -416,7 +423,7 @@ impl<'c, 't> Sealer<'c, 't> { Ok(Event::C2(out)) } - fn handle_c2_done(&self) -> ExecResult { + fn handle_c2_done(&self) -> Result { let sector_id = self.task.sector_id()?.clone(); cloned_required! { @@ -427,11 +434,7 @@ impl<'c, 't> Sealer<'c, 't> { let info = ProofOnChainInfo { proof: proof.proof.into() }; let res = call_rpc! { - self.task.ctx.global.rpc, - submit_proof, - sector_id, - info, - self.task.sector.phases.c2_re_submit, + self.task.rpc()=>submit_proof(sector_id, info, self.task.sector.phases.c2_re_submit,) }?; // TODO: submit reset correctly @@ -446,7 +449,7 @@ impl<'c, 't> Sealer<'c, 't> { } } - fn handle_proof_submitted(&self) -> ExecResult { + fn handle_proof_submitted(&self) -> Result { field_required! 
{ allocated, self.task.sector.base.as_ref().map(|b| &b.allocated) @@ -454,12 +457,10 @@ impl<'c, 't> Sealer<'c, 't> { let sector_id = &allocated.id; - if !self.task.sealing_config.ignore_proof_check { + if !self.task.sealing_ctrl.config().ignore_proof_check { 'POLL: loop { let state = call_rpc! { - self.task.ctx.global.rpc, - poll_proof_state, - sector_id.clone(), + self.task.rpc() => poll_proof_state(sector_id.clone(),) }?; match state.state { @@ -469,7 +470,7 @@ impl<'c, 't> Sealer<'c, 't> { OnChainState::Failed => { warn!("proof on-chain info failed: {:?}", state.desc); // TODO: make it configurable - self.task.wait_or_interrupted(Duration::from_secs(30))?; + self.task.sealing_ctrl.wait_or_interrupted(Duration::from_secs(30))?; return Ok(Event::ReSubmitProof); } @@ -482,11 +483,13 @@ impl<'c, 't> Sealer<'c, 't> { debug!( state = ?state.state, - interval = ?self.task.sealing_config.rpc_polling_interval, + interval = ?self.task.sealing_ctrl.config().rpc_polling_interval, "waiting for next round of polling proof state", ); - self.task.wait_or_interrupted(self.task.sealing_config.rpc_polling_interval)?; + self.task + .sealing_ctrl + .wait_or_interrupted(self.task.sealing_ctrl.config().rpc_polling_interval)?; } } diff --git a/damocles-worker/src/sealing/sealing_thread/task/planner/snapup.rs b/damocles-worker/src/sealing/sealing_thread/planner/snapup.rs similarity index 80% rename from damocles-worker/src/sealing/sealing_thread/task/planner/snapup.rs rename to damocles-worker/src/sealing/sealing_thread/planner/snapup.rs index c9eac1675..79e3a163f 100644 --- a/damocles-worker/src/sealing/sealing_thread/task/planner/snapup.rs +++ b/damocles-worker/src/sealing/sealing_thread/planner/snapup.rs @@ -3,8 +3,14 @@ use std::collections::HashMap; use anyhow::{anyhow, Context, Result}; use super::{ - super::{call_rpc, cloned_required, field_required, Finalized}, - common, plan, Event, ExecResult, Planner, State, Task, + super::{call_rpc, cloned_required, field_required}, + common::{ + self, + event::Event, + sector::{Finalized, State}, + task::Task, + }, + plan, PlannerTrait, PLANNER_NAME_SNAPUP, }; use crate::logging::{debug, warn}; use crate::rpc::sealer::{AcquireDealsSpec, AllocateSectorSpec, AllocateSnapUpSpec, SnapUpOnChainInfo, SubmitResult}; @@ -13,9 +19,18 @@ use crate::sealing::processor::{ cached_filenames_for_sector, TransferInput, TransferItem, TransferOption, TransferRoute, TransferStoreInfo, }; +#[derive(Default)] pub struct SnapUpPlanner; -impl Planner for SnapUpPlanner { +impl PlannerTrait for SnapUpPlanner { + type Job = Task; + type State = State; + type Event = Event; + + fn name(&self) -> &str { + PLANNER_NAME_SNAPUP + } + fn plan(&self, evt: &Event, st: &State) -> Result { let next = plan! { evt, @@ -54,7 +69,7 @@ impl Planner for SnapUpPlanner { Ok(next) } - fn exec(&self, task: &mut Task<'_>) -> Result, Failure> { + fn exec(&self, task: &mut Task) -> Result, Failure> { let state = task.sector.state; let inner = SnapUp { task }; match state { @@ -80,30 +95,32 @@ impl Planner for SnapUpPlanner { other => return Err(anyhow!("unexpected state {:?} in snapup planner", other).abort()), } - .map(From::from) + .map(Some) + } + + fn apply(&self, event: Event, state: State, task: &mut Task) -> Result<()> { + event.apply(state, task) } } -struct SnapUp<'c, 't> { - task: &'t mut Task<'c>, +struct SnapUp<'t> { + task: &'t mut Task, } -impl<'c, 't> SnapUp<'c, 't> { - fn empty(&self) -> ExecResult { +impl<'t> SnapUp<'t> { + fn empty(&self) -> Result { let maybe_res = call_rpc! 
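// Snapup allocation bundles the sector spec and the deal-acquisition spec into a single
// `allocate_snapup_sector` call, so the manager hands back the candidate upgrade sector
// together with its deals in one response; the planner then re-adds the pieces, rebuilds
// tree_d, and runs snap encode/prove on top of the existing sector.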
{ - self.task.ctx.global.rpc, - allocate_snapup_sector, - AllocateSnapUpSpec { + self.task.rpc() => allocate_snapup_sector(AllocateSnapUpSpec { sector: AllocateSectorSpec { - allowed_miners: Some(self.task.sealing_config.allowed_miners.clone()), - allowed_proof_types: Some(self.task.sealing_config.allowed_proof_types.clone()), + allowed_miners: Some(self.task.sealing_ctrl.config().allowed_miners.clone()), + allowed_proof_types: Some(self.task.sealing_ctrl.config().allowed_proof_types.clone()), }, deals: AcquireDealsSpec { - max_deals: self.task.sealing_config.max_deals, - min_used_space: self.task.sealing_config.min_deal_space.map(|b| b.get_bytes() as usize), + max_deals: self.task.sealing_ctrl.config().max_deals, + min_used_space: self.task.sealing_ctrl.config().min_deal_space.map(|b| b.get_bytes() as usize), }, }, - }; + )}; let maybe_allocated = match maybe_res { Ok(a) => a, @@ -136,7 +153,7 @@ impl<'c, 't> SnapUp<'c, 't> { )) } - fn add_piece(&self) -> ExecResult { + fn add_piece(&self) -> Result { field_required!(deals, self.task.sector.deals.as_ref()); let pieces = common::add_pieces(self.task, deals)?; @@ -144,12 +161,12 @@ impl<'c, 't> SnapUp<'c, 't> { Ok(Event::AddPiece(pieces)) } - fn build_tree_d(&self) -> ExecResult { + fn build_tree_d(&self) -> Result { common::build_tree_d(self.task, false)?; Ok(Event::BuildTreeD) } - fn snap_encode(&self) -> ExecResult { + fn snap_encode(&self) -> Result { let sector_id = self.task.sector_id()?; let proof_type = self.task.sector_proof_type()?; field_required!( @@ -160,7 +177,8 @@ impl<'c, 't> SnapUp<'c, 't> { debug!("find access store named {}", access_instance); let access_store = self .task - .ctx + .sealing_ctrl + .ctx() .global .attached .get(access_instance) @@ -169,9 +187,7 @@ impl<'c, 't> SnapUp<'c, 't> { debug!("get basic info for access store named {}", access_instance); let access_store_basic_info = call_rpc! { - self.task.ctx.global.rpc, - store_basic_info, - access_instance.clone(), + self.task.rpc() => store_basic_info(access_instance.clone(),) }? .with_context(|| format!("get basic info for store named {}", access_instance)) .perm()?; @@ -241,7 +257,8 @@ impl<'c, 't> SnapUp<'c, 't> { }; self.task - .ctx + .sealing_ctrl + .ctx() .global .processors .transfer @@ -252,11 +269,11 @@ impl<'c, 't> SnapUp<'c, 't> { common::snap_encode(self.task, sector_id, proof_type).map(Event::SnapEncode) } - fn snap_prove(&self) -> ExecResult { + fn snap_prove(&self) -> Result { common::snap_prove(self.task).map(Event::SnapProve) } - fn persist(&self) -> ExecResult { + fn persist(&self) -> Result { let sector_id = self.task.sector_id()?; let update_cache_dir = self.task.update_cache_dir(sector_id); let update_file = self.task.update_file(sector_id); @@ -266,7 +283,7 @@ impl<'c, 't> SnapUp<'c, 't> { Ok(Event::Persist(ins_name)) } - fn submit(&self) -> ExecResult { + fn submit(&self) -> Result { let sector_id = self.task.sector_id()?; field_required!(proof, self.task.sector.phases.snap_prov_out.as_ref()); field_required!(deals, self.task.sector.deals.as_ref()); @@ -275,16 +292,16 @@ impl<'c, 't> SnapUp<'c, 't> { let piece_cids = deals.iter().map(|d| d.piece.cid.clone()).collect(); let res = call_rpc! 
{ - self.task.ctx.global.rpc, - submit_snapup_proof, - sector_id.clone(), - SnapUpOnChainInfo { - comm_r: encode_out.comm_r_new, - comm_d: encode_out.comm_d_new, - access_instance: instance, - pieces: piece_cids, - proof: proof.into(), - }, + self.task.rpc()=>submit_snapup_proof( + sector_id.clone(), + SnapUpOnChainInfo { + comm_r: encode_out.comm_r_new, + comm_d: encode_out.comm_d_new, + access_instance: instance, + pieces: piece_cids, + proof: proof.into(), + }, + ) }?; match res.res { diff --git a/damocles-worker/src/sealing/sealing_thread/task/planner/unseal.rs b/damocles-worker/src/sealing/sealing_thread/planner/unseal.rs similarity index 66% rename from damocles-worker/src/sealing/sealing_thread/task/planner/unseal.rs rename to damocles-worker/src/sealing/sealing_thread/planner/unseal.rs index 4cd13986f..08e1c205c 100644 --- a/damocles-worker/src/sealing/sealing_thread/task/planner/unseal.rs +++ b/damocles-worker/src/sealing/sealing_thread/planner/unseal.rs @@ -1,6 +1,7 @@ use super::{ - super::{call_rpc, field_required, Event, State, Task}, - plan, ExecResult, Planner, + super::{call_rpc, field_required}, + common::{event::Event, sector::State, task::Task}, + plan, PlannerTrait, PLANNER_NAME_UNSEAL, }; use crate::logging::warn; use crate::rpc::sealer::AllocateSectorSpec; @@ -20,11 +21,22 @@ use vc_processors::{ fil_proofs::{UnpaddedByteIndex, UnpaddedBytesAmount}, }; -use crate::sealing::processor::{TransferInput, TransferItem, TransferRoute, TransferStoreInfo}; +use crate::sealing::processor::{ + cached_filenames_for_sector, TransferInput, TransferItem, TransferOption, TransferRoute, TransferStoreInfo, +}; +#[derive(Default)] pub struct UnsealPlanner; -impl Planner for UnsealPlanner { +impl PlannerTrait for UnsealPlanner { + type Job = Task; + type State = State; + type Event = Event; + + fn name(&self) -> &str { + PLANNER_NAME_UNSEAL + } + fn plan(&self, evt: &Event, st: &State) -> Result { let next = plan! { evt, @@ -34,6 +46,9 @@ impl Planner for UnsealPlanner { Event::AllocatedUnsealSector(_) => State::Allocated, }, State::Allocated => { + Event::UnsealReady => State::UnsealPrepared, + }, + State::UnsealPrepared => { Event::UnsealDone(_) => State::Unsealed, }, State::Unsealed => { @@ -44,37 +59,40 @@ impl Planner for UnsealPlanner { Ok(next) } - fn exec(&self, task: &mut Task<'_>) -> Result, Failure> { + fn exec(&self, task: &mut Task) -> Result, Failure> { let state = task.sector.state; let inner = Unseal { task }; match state { State::Empty => inner.acquire_task(), - State::Allocated => inner.unseal(), + State::Allocated => inner.pre_unseal(), + State::UnsealPrepared => inner.unseal(), State::Unsealed => inner.upload_piece(), State::Finished => return Ok(None), other => Err(anyhow!("unexpected state: {:?} in unseal planner", other).abort()), } - .map(From::from) + .map(Some) + } + + fn apply(&self, event: Event, state: State, task: &mut Task) -> Result<()> { + event.apply(state, task) } } // empty -> acquire -> unseal -> upload -> finish -struct Unseal<'c, 't> { - task: &'t mut Task<'c>, +struct Unseal<'t> { + task: &'t mut Task, } -impl<'c, 't> Unseal<'c, 't> { - fn acquire_task(&self) -> ExecResult { +impl<'t> Unseal<'t> { + fn acquire_task(&self) -> Result { let maybe_res = call_rpc! 
{ - self.task.ctx.global.rpc, - allocate_unseal_sector, - AllocateSectorSpec { - allowed_miners: Some(self.task.sealing_config.allowed_miners.clone()), - allowed_proof_types: Some(self.task.sealing_config.allowed_proof_types.clone()), - }, + self.task.rpc()=>allocate_unseal_sector(AllocateSectorSpec { + allowed_miners: Some(self.task.sealing_ctrl.config().allowed_miners.clone()), + allowed_proof_types: Some(self.task.sealing_ctrl.config().allowed_proof_types.clone()), + },) }; let maybe_allocated = match maybe_res { @@ -96,44 +114,124 @@ impl<'c, 't> Unseal<'c, 't> { Ok(Event::AllocatedUnsealSector(allocated)) } - fn unseal(&self) -> ExecResult { - // query token - let _token = self.task.ctx.global.limit.acquire(STAGE_NAME_UNSEAL).crit()?; + fn pre_unseal(&self) -> Result { + let _token = self.task.sealing_ctrl.ctx().global.limit.acquire(STAGE_NAME_TRANSFER).crit()?; + // persist store -> thread store let sector_id = self.task.sector_id()?; let proof_type = self.task.sector_proof_type()?; - - field_required!(unseal_info, self.task.sector.phases.unseal_in.as_ref()); field_required!( - instance_name, + access_instance, self.task.sector.finalized.as_ref().map(|f| &f.private.access_instance) ); - debug!("find access store named {}", instance_name); - let instance = self + debug!("find access store named {}", access_instance); + let access_store = self .task - .ctx + .sealing_ctrl + .ctx() .global .attached - .get(instance_name) - .with_context(|| format!("get access store instance named {}", instance_name)) + .get(access_instance) + .with_context(|| format!("get access store instance named {}", access_instance)) .perm()?; - let sealed_temp = self.task.sealed_file(sector_id); - let sealed_rel = sealed_temp.rel(); - - let cache_temp = self.task.cache_dir(sector_id); - let cache_rel = cache_temp.rel(); + debug!("get basic info for access store named {}", access_instance); + let access_store_basic_info = call_rpc! { + self.task.rpc() => store_basic_info(access_instance.clone(),) + }? 
+ .with_context(|| format!("get basic info for store named {}", access_instance)) + .perm()?; + + // sealed file & persisted cache files should be accessed inside persist store + let sealed_file = self.task.sealed_file(sector_id); + sealed_file.prepare().perm()?; + let sealed_rel = sealed_file.rel(); + + let cache_dir = self.task.cache_dir(sector_id); + + let cached_file_routes = cached_filenames_for_sector(proof_type.into()) + .into_iter() + .map(|fname| { + let cached_file = cache_dir.join(fname); + let cached_rel = cached_file.rel(); + + Ok(TransferRoute { + src: TransferItem { + store_name: Some(access_instance.clone()), + uri: access_store + .uri(cached_rel) + .with_context(|| format!("get uri for cache dir {:?} in {}", cached_rel, access_instance)) + .perm()?, + }, + dest: TransferItem { + store_name: None, + uri: cached_file.full().clone(), + }, + opt: Some(TransferOption { + is_dir: false, + allow_link: true, + }), + }) + }) + .collect::, Failure>>()?; + + let mut transfer_routes = vec![TransferRoute { + src: TransferItem { + store_name: Some(access_instance.clone()), + uri: access_store + .uri(sealed_rel) + .with_context(|| format!("get uri for sealed file {:?} in {}", sealed_rel, access_instance)) + .perm()?, + }, + dest: TransferItem { + store_name: None, + uri: sealed_file.full().clone(), + }, + opt: Some(TransferOption { + is_dir: false, + allow_link: true, + }), + }]; + + transfer_routes.extend(cached_file_routes.into_iter()); + + let transfer = TransferInput { + stores: HashMap::from_iter([( + access_instance.clone(), + TransferStoreInfo { + name: access_instance.clone(), + meta: access_store_basic_info.meta, + }, + )]), + routes: transfer_routes, + }; - let sealed_path = instance - .uri(sealed_rel) - .with_context(|| format!("get uri for sealed file {:?} in {}", sealed_rel, instance_name)) - .perm()?; - let cache_path = instance - .uri(cache_rel) - .with_context(|| format!("get uri for cache file {:?} in {}", cache_rel, instance_name)) + self.task + .sealing_ctrl + .ctx() + .global + .processors + .transfer + .process(transfer) + .context("link unseal sector files") .perm()?; + Ok(Event::UnsealReady) + } + + fn unseal(&self) -> Result { + // query token + let _token = self.task.sealing_ctrl.ctx().global.limit.acquire(STAGE_NAME_UNSEAL).crit()?; + + let sector_id = self.task.sector_id()?; + let proof_type = self.task.sector_proof_type()?; + + field_required!(unseal_info, self.task.sector.phases.unseal_in.as_ref()); + + let cache_dir = self.task.cache_dir(sector_id); + let sealed_file = self.task.sealed_file(sector_id); + let piece_file = self.task.piece_file(&unseal_info.piece_cid); if piece_file.full().exists() { remove_file(&piece_file).context("remove the existing piece file").perm()?; @@ -152,7 +250,8 @@ impl<'c, 't> Unseal<'c, 't> { // call unseal fn let out = self .task - .ctx + .sealing_ctrl + .ctx() .global .processors .unseal @@ -162,8 +261,8 @@ impl<'c, 't> Unseal<'c, 't> { sector_id, comm_d: unseal_info.comm_d, ticket: ticket.ticket.0, - cache_dir: cache_path, - sealed_file: sealed_path, + cache_dir: cache_dir.into(), + sealed_file: sealed_file.into(), unsealed_output: piece_file.into(), offset: UnpaddedByteIndex(unseal_info.offset), num_bytes: UnpaddedBytesAmount(unseal_info.size), @@ -179,8 +278,8 @@ impl<'c, 't> Unseal<'c, 't> { Ok(Event::UnsealDone(out.0)) } - fn upload_piece(&self) -> ExecResult { - let _token = self.task.ctx.global.limit.acquire(STAGE_NAME_TRANSFER).crit()?; + fn upload_piece(&self) -> Result { + let _token = 
self.task.sealing_ctrl.ctx().global.limit.acquire(STAGE_NAME_TRANSFER).crit()?; let sector_id = self.task.sector_id()?; @@ -190,10 +289,7 @@ impl<'c, 't> Unseal<'c, 't> { // parse dest let dests = call_rpc! { - self.task.ctx.global.rpc, - acquire_unseal_dest, - sector_id.clone(), - unseal_info.piece_cid.clone(), + self.task.rpc()=>acquire_unseal_dest(sector_id.clone(), unseal_info.piece_cid.clone(),) }?; if !dests.is_empty() { @@ -279,7 +375,8 @@ impl<'c, 't> Unseal<'c, 't> { let ins_name = ins_name.to_string(); let access_store = self .task - .ctx + .sealing_ctrl + .ctx() .global .attached .get(&ins_name) @@ -288,9 +385,7 @@ impl<'c, 't> Unseal<'c, 't> { debug!("get basic info for access store named {}", ins_name); let access_store_basic_info = call_rpc! { - self.task.ctx.global.rpc, - store_basic_info, - ins_name.clone(), + self.task.rpc()=>store_basic_info(ins_name.clone(),) }? .with_context(|| format!("get basic info for store named {}", ins_name)) .perm()?; @@ -319,7 +414,8 @@ impl<'c, 't> Unseal<'c, 't> { }; self.task - .ctx + .sealing_ctrl + .ctx() .global .processors .transfer @@ -329,7 +425,7 @@ impl<'c, 't> Unseal<'c, 't> { } None => { // use remote piece store by default - let access_store = &self.task.ctx.global.remote_piece_store; + let access_store = &self.task.sealing_ctrl.ctx().global.remote_piece_store; let p = p.trim_matches('/'); let piece_cid = Cid::try_from(p).context(format!("parse cid {}", p)).perm()?; let url = match access_store.get(&piece_cid).unwrap() { @@ -357,11 +453,7 @@ impl<'c, 't> Unseal<'c, 't> { } call_rpc! { - self.task.ctx.global.rpc, - achieve_unseal_sector, - sector_id.clone(), - unseal_info.piece_cid.clone(), - "".to_string(), + self.task.rpc()=>achieve_unseal_sector(sector_id.clone(), unseal_info.piece_cid.clone(), "".to_string(),) }?; Ok(Event::UploadPieceDone) diff --git a/damocles-worker/src/sealing/sealing_thread/planner/wdpost.rs b/damocles-worker/src/sealing/sealing_thread/planner/wdpost.rs new file mode 100644 index 000000000..911ed78dd --- /dev/null +++ b/damocles-worker/src/sealing/sealing_thread/planner/wdpost.rs @@ -0,0 +1,473 @@ +use std::collections::HashMap; +use std::fmt::Display; +use std::sync::Arc; +use std::time::Duration; + +use anyhow::{anyhow, Context, Result}; +use crossbeam_channel::{bounded, Receiver, Sender}; +use jsonrpc_core::ErrorCode; +use jsonrpc_core_client::RpcError; +use tokio::runtime::Handle; +use vc_processors::builtin::tasks::{PoStReplicaInfo, WindowPoSt, WindowPoStOutput}; + +use crate::logging::warn; +use crate::rpc::sealer::{AllocatePoStSpec, AllocatedWdPoStJob, SectorID}; +use crate::sealing::failure::*; +use crate::sealing::paths; +use crate::sealing::sealing_thread::{planner::plan, Sealer, SealingCtrl, R}; + +use super::super::call_rpc; +use super::{JobTrait, PlannerTrait, PLANNER_NAME_WDPOST}; + +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub enum WdPostState { + Empty, + Allocated, + Generated, + Finished, + Aborted, +} + +impl WdPostState { + pub fn from_str(s: &str) -> Option { + Some(match s { + "Empty" => Self::Empty, + "Allocated" => Self::Allocated, + "Generated" => Self::Generated, + "Finished" => Self::Finished, + "Aborted" => Self::Aborted, + _ => return None, + }) + } +} + +impl Display for WdPostState { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{}", + match self { + WdPostState::Empty => "Empty", + WdPostState::Allocated => "Allocated", + WdPostState::Generated => "Generated", + WdPostState::Finished => "Finished", + WdPostState::Aborted => 
"Aborted", + } + ) + } +} + +impl Default for WdPostState { + fn default() -> Self { + Self::Empty + } +} + +#[derive(Clone, Debug)] +pub enum WdPostEvent { + Idle, + #[allow(dead_code)] + Retry, + SetState(WdPostState), + Allocated { + allocated: AllocatedWdPoStJob, + stop_heartbeat_tx: Sender<()>, + }, + Generate(Result), + Finish, +} + +pub struct WdPostSealer { + job: WdPostJob, + planner: WdPostPlanner, + retry: u32, +} + +impl WdPostSealer { + pub fn new(ctrl: SealingCtrl<'static>) -> Self { + Self { + job: WdPostJob::new(ctrl), + planner: WdPostPlanner, + retry: 0, + } + } +} + +impl Sealer for WdPostSealer { + fn seal(&mut self, state: Option<&str>) -> Result { + let mut event = state.and_then(WdPostState::from_str).map(WdPostEvent::SetState); + if let (true, Some(s)) = (event.is_none(), state) { + tracing::error!("unknown state: {}", s); + } + + loop { + self.job.sealing_ctrl.interrupted()?; + + if self.planner.name() != self.job.planner() { + // switch planner + return Ok(R::SwitchPlanner(self.job.planner().to_string())); + } + + if let Some(evt) = event.take() { + match evt { + WdPostEvent::Idle | WdPostEvent::Retry => { + let recover_interval = self.job.sealing_ctrl.config().recover_interval; + tracing::debug!( + sleep = ?recover_interval, + "Event::{:?} captured", evt + ); + + self.job.sealing_ctrl.wait_or_interrupted(recover_interval)?; + } + + _ => { + let state = self.planner.plan(&evt, &self.job.state).crit()?; + self.planner.apply(evt, state, &mut self.job).context("event apply").crit()?; + } + }; + }; + + let span = tracing::warn_span!("handle", current = ?self.job.state); + + let _enter = span.enter(); + self.job + .sealing_ctrl + .ctrl_ctx() + .update_state(|cst| { + let _ = cst.job.state.replace(self.job.state.to_string()); + cst.job.id = self.job.wdpost_job.as_ref().map(|t| t.id.to_owned()); + }) + .crit()?; + + tracing::debug!("handling"); + + let res = self.planner.exec(&mut self.job); + + match res { + Ok(Some(evt)) => { + event.replace(evt); + } + Ok(None) => return Ok(R::Done), + Err(Failure(Level::Temporary, terr)) => { + if self.retry >= self.job.sealing_ctrl.config().max_retries { + // reset retry times; + self.retry = 0; + return Err(terr.abort()); + } + tracing::warn!(retry = self.retry, "temp error occurred: {:?}", terr); + self.retry += 1; + tracing::info!( + interval = ?self.job.sealing_ctrl.config().recover_interval, + "wait before recovering" + ); + + self.job + .sealing_ctrl + .wait_or_interrupted(self.job.sealing_ctrl.config().recover_interval)?; + } + + Err(f) => return Err(f), + } + } + } +} + +#[derive(Clone)] +pub struct WdPostJob { + sealing_ctrl: SealingCtrl<'static>, + + state: WdPostState, + wdpost_job: Option, + wdpost_job_result: Option>, + + stop_heartbeat_tx: Option>, +} + +impl JobTrait for WdPostJob { + fn planner(&self) -> &str { + self.sealing_ctrl.config().plan() + } +} + +impl WdPostJob { + fn new(sealing_ctrl: SealingCtrl<'static>) -> Self { + WdPostJob { + sealing_ctrl, + state: WdPostState::default(), + wdpost_job: None, + wdpost_job_result: None, + stop_heartbeat_tx: None, + } + } +} + +#[derive(Default)] +pub struct WdPostPlanner; + +impl PlannerTrait for WdPostPlanner { + type Job = WdPostJob; + type State = WdPostState; + type Event = WdPostEvent; + + fn name(&self) -> &str { + PLANNER_NAME_WDPOST + } + + fn plan(&self, evt: &Self::Event, st: &Self::State) -> Result { + let next = plan! { + evt, + st, + + WdPostState::Empty => { + // alloc wdpost job + WdPostEvent::Allocated{ .. 
} => WdPostState::Allocated, + }, + WdPostState::Allocated => { + // gen prove + WdPostEvent::Generate(_) => WdPostState::Generated, + }, + WdPostState::Generated => { + WdPostEvent::Finish => WdPostState::Finished, + }, + }; + + tracing::debug!("wdpost plan: {} -> {}", st, next); + + Ok(next) + } + + fn exec(&self, job: &mut Self::Job) -> Result, Failure> { + let inner = WdPost { job }; + + match &inner.job.state { + WdPostState::Empty => inner.acquire(), + WdPostState::Allocated => inner.generate(), + WdPostState::Generated => inner.report_result(), + WdPostState::Finished => return Ok(None), + WdPostState::Aborted => return Err(TaskAborted.into()), + } + .map(Some) + } + + fn apply(&self, event: Self::Event, state: Self::State, job: &mut Self::Job) -> Result<()> { + let next = if let WdPostEvent::SetState(s) = event { s } else { state }; + + if next == job.state { + return Err(anyhow!("state unchanged, may enter an infinite loop")); + } + + match event { + WdPostEvent::Idle => {} + WdPostEvent::SetState(_) => {} + WdPostEvent::Allocated { + allocated, + stop_heartbeat_tx, + } => { + job.wdpost_job = Some(allocated); + job.stop_heartbeat_tx = Some(stop_heartbeat_tx) + } + WdPostEvent::Generate(result) => { + job.wdpost_job_result = Some(result); + } + WdPostEvent::Finish => {} + WdPostEvent::Retry => {} + } + tracing::debug!("apply state: {}", next); + job.state = next; + + Ok(()) + } +} + +struct WdPost<'a> { + job: &'a mut WdPostJob, +} + +impl WdPost<'_> { + fn acquire(&self) -> Result { + let res = call_rpc!(raw, + self.job.sealing_ctrl.ctx().global.rpc =>allocate_wdpost_job( + AllocatePoStSpec { + allowed_miners: Some(self.job.sealing_ctrl.config().allowed_miners.clone()), + allowed_proof_types: Some( + self.job + .sealing_ctrl + .config() + .allowed_proof_types + .iter() + .flat_map(|x| x.to_post_proofs()) + .collect() + ), + }, + 1, + self.job.sealing_ctrl.ctx().instance.clone(), + ) + ); + + let mut allocated = match res { + Ok(a) => a, + Err(RpcError::JsonRpcError(e)) if e.code == ErrorCode::MethodNotFound => { + warn!("damocles-manager may not have enabled the worker-prover module. 
Please enable the worker-prover module first."); + return Ok(WdPostEvent::Idle); + } + Err(e) => { + warn!(err=?e, "window PoSt job is not allocated yet, so we can retry even though we got error."); + return Ok(WdPostEvent::Idle); + } + }; + + tracing::debug!(allocated = allocated.len(), "allocated"); + + if allocated.is_empty() { + return Ok(WdPostEvent::Idle); + } + + let allocated = allocated.swap_remove(0); + let (stop_heartbeat_tx, stop_heartbeat_rx) = bounded(0); + Self::start_heartbeat( + self.job.sealing_ctrl.ctx().global.rpc.clone(), + allocated.id.clone(), + self.job.sealing_ctrl.ctx().instance.clone(), + stop_heartbeat_rx, + ); + Ok(WdPostEvent::Allocated { + allocated, + stop_heartbeat_tx, + }) + } + + fn generate(&self) -> Result { + let wdpost_job = self.job.wdpost_job.as_ref().context("wdpost info not found").abort()?; + + let mut instances = HashMap::new(); + for access in wdpost_job + .input + .sectors + .iter() + .flat_map(|x| [&x.accesses.cache_dir, &x.accesses.sealed_file]) + { + if let std::collections::hash_map::Entry::Vacant(e) = instances.entry(access) { + let instance = self + .job + .sealing_ctrl + .ctx() + .global + .attached + .get(access) + .with_context(|| format!("get access store instance named {}", access)) + .abort()?; + e.insert(instance); + } + } + + // get sealed path and cache path + let replica = wdpost_job + .input + .sectors + .iter() + .map(|sector| { + let sector_id = &SectorID { + miner: wdpost_job.input.miner_id, + number: sector.sector_id.into(), + }; + + let sealed_file = if sector.upgrade { + paths::update_file(sector_id) + } else { + paths::sealed_file(sector_id) + }; + let sealed_path = instances[§or.accesses.sealed_file].uri(&sealed_file).with_context(|| { + format!( + "get uri for sealed file {} in {}", + sealed_file.display(), + sector.accesses.sealed_file + ) + })?; + let cache_dir = if sector.upgrade { + paths::update_cache_dir(sector_id) + } else { + paths::cache_dir(sector_id) + }; + let cache_path = instances[§or.accesses.cache_dir] + .uri(&cache_dir) + .with_context(|| format!("get uri for cache file {} in {}", cache_dir.display(), sector.accesses.cache_dir))?; + + let sector_id = sector.sector_id; + let replica = PoStReplicaInfo { + sector_id, + comm_r: sector.comm_r, + cache_dir: cache_path, + sealed_file: sealed_path, + }; + Ok(replica) + }) + .collect::>>() + .abort()?; + + let post_in = WindowPoSt { + miner_id: wdpost_job.input.miner_id, + proof_type: wdpost_job.input.proof_type, + replicas: replica, + seed: wdpost_job.input.seed, + }; + let res = self.job.sealing_ctrl.ctx().global.processors.wdpost.process(post_in); + if let Err(e) = &res { + tracing::error!(err=?e, job_id=wdpost_job.id,"wdpost error"); + } + Ok(WdPostEvent::Generate(res.map_err(|e| e.to_string()))) + } + + fn report_result(&self) -> Result { + let job_id = self + .job + .wdpost_job + .as_ref() + .context("wdpost job cannot be empty") + .abort()? 
+ .id + .clone(); + let result = self + .job + .wdpost_job_result + .as_ref() + .context("wdpost job result cannot be empty") + .abort()?; + + let (out, error_reason) = match result { + Ok(out) => (Some(out.clone()), String::new()), + Err(err) => (None, err.to_string()), + }; + + call_rpc!(self.job.sealing_ctrl.ctx().global.rpc => wdpost_finish(job_id, out, error_reason,))?; + if let Some(tx) = &self.job.stop_heartbeat_tx { + let _ = tx.send(()); + } + Ok(WdPostEvent::Finish) + } + + fn start_heartbeat(rpc: Arc, job_id: String, worker_name: String, stop_rx: Receiver<()>) { + let handle = Handle::current(); + std::thread::spawn(move || loop { + let _guard = handle.enter(); + + crossbeam_channel::select! { + recv(stop_rx) -> _ => break, + default(Duration::from_secs(3)) => { + let worker_name = worker_name.clone(); + let job_ids = vec![job_id.clone()]; + let res = call_rpc!(raw, rpc => wdpost_heartbeat(job_ids, worker_name,)); + match res { + Ok(_) => {} + Err(RpcError::JsonRpcError(e)) if e.code == ErrorCode::MethodNotFound => { + warn!(err=?e, "damocles-manager may not have enabled the worker-prover module. Please enable the worker-prover module first."); + } + Err(e) => { + warn!(err=?e, job_id=job_id, "failed to send heartbeat") + } + } + tracing::debug!(job_id = job_id, "send heartbeat"); + } + } + }); + } +} diff --git a/damocles-worker/src/sealing/sealing_thread/task/mod.rs b/damocles-worker/src/sealing/sealing_thread/task/mod.rs deleted file mode 100644 index 049326744..000000000 --- a/damocles-worker/src/sealing/sealing_thread/task/mod.rs +++ /dev/null @@ -1,421 +0,0 @@ -use std::path::PathBuf; -use std::time::Duration; - -use anyhow::{Context, Result}; -use crossbeam_channel::select; -use forest_cid::json::CidJson; - -pub use self::planner::default_plan; - -use super::{super::failure::*, CtrlCtx}; -use crate::logging::{debug, error, info, warn, warn_span}; -use crate::metadb::{rocks::RocksMeta, MaybeDirty, MetaDocumentDB, PrefixedMetaDB, Saved}; -use crate::rpc::sealer::{ReportStateReq, SectorFailure, SectorID, SectorStateChange, WorkerIdentifier}; -use crate::sealing::config::Config; -use crate::store::Store; -use crate::types::SealProof; -use crate::watchdog::Ctx; - -pub mod event; -use event::*; - -pub mod sector; -use sector::*; - -mod planner; -use planner::{get_planner, Planner}; - -mod entry; -use entry::*; - -#[macro_use] -mod util; -use util::*; - -const SECTOR_INFO_KEY: &str = "info"; -const SECTOR_META_PREFIX: &str = "meta"; -const SECTOR_TRACE_PREFIX: &str = "trace"; - -pub struct Task<'c> { - sector: Saved>, - _trace: Vec, - - ctx: &'c Ctx, - ctrl_ctx: &'c CtrlCtx, - sealing_config: &'c Config, - store: &'c Store, - ident: WorkerIdentifier, - - _trace_meta: MetaDocumentDB>, -} - -// properties -impl<'c> Task<'c> { - fn sector_id(&self) -> Result<&SectorID, Failure> { - field_required! { - sector_id, - self.sector.base.as_ref().map(|b| &b.allocated.id) - } - - Ok(sector_id) - } - - fn sector_proof_type(&self) -> Result<&SealProof, Failure> { - field_required! 
{ - proof_type, - self.sector.base.as_ref().map(|b| &b.allocated.proof_type) - } - - Ok(proof_type) - } -} - -// public methods -impl<'c> Task<'c> { - pub fn build(ctx: &'c Ctx, ctrl_ctx: &'c CtrlCtx, sealing_config: &'c mut Config, s: &'c mut Store) -> Result { - sealing_config - .reload_if_needed(|_, _| Ok(true)) - .context("reload sealing thread hot config") - .crit()?; - - let sector_meta = PrefixedMetaDB::wrap(SECTOR_META_PREFIX, &s.meta); - - let mut sector: Saved = Saved::load(SECTOR_INFO_KEY, sector_meta, || Sector::new(sealing_config.plan().to_string())) - .context("load sector") - .crit()?; - sector.sync().context("init sync sector").crit()?; - - ctrl_ctx - .update_state(|cst| cst.job.plan = sector.plan.clone().unwrap_or_else(|| default_plan().to_owned())) - .context("update ctrl state") - .perm()?; - - let trace_meta = MetaDocumentDB::wrap(PrefixedMetaDB::wrap(SECTOR_TRACE_PREFIX, &s.meta)); - - Ok(Task { - sector, - _trace: Vec::with_capacity(16), - - ctx, - ctrl_ctx, - sealing_config, - store: s, - ident: WorkerIdentifier { - instance: ctx.instance.clone(), - location: s.location.to_pathbuf(), - }, - - _trace_meta: trace_meta, - }) - } - - fn report_state(&self, state_change: SectorStateChange, fail: Option) -> Result<(), Failure> { - let sector_id = match self.sector.base.as_ref().map(|base| base.allocated.id.clone()) { - Some(sid) => sid, - None => return Ok(()), - }; - - call_rpc! { - self.ctx.global.rpc, - report_state, - sector_id, - ReportStateReq { - worker: self.ident.clone(), - state_change, - failure: fail, - }, - }?; - - Ok(()) - } - - fn report_finalized(&self) -> Result<(), Failure> { - let sector_id = match self.sector.base.as_ref().map(|base| base.allocated.id.clone()) { - Some(sid) => sid, - None => return Ok(()), - }; - - call_rpc! { - self.ctx.global.rpc, - report_finalized, - sector_id, - }?; - - Ok(()) - } - - fn report_aborted(&self, reason: String) -> Result<(), Failure> { - let sector_id = match self.sector.base.as_ref().map(|base| base.allocated.id.clone()) { - Some(sid) => sid, - None => return Ok(()), - }; - - call_rpc! { - self.ctx.global.rpc, - report_aborted, - sector_id, - reason, - }?; - - Ok(()) - } - - fn interrupted(&self) -> Result<(), Failure> { - select! { - recv(self.ctx.done) -> _done_res => { - Err(Interrupt.into()) - } - - recv(self.ctrl_ctx.pause_rx) -> pause_res => { - pause_res.context("pause signal channel closed unexpectedly").crit()?; - Err(Interrupt.into()) - } - - default => { - Ok(()) - } - } - } - - fn wait_or_interrupted(&self, duration: Duration) -> Result<(), Failure> { - select! 
{ - recv(self.ctx.done) -> _done_res => { - Err(Interrupt.into()) - } - - recv(self.ctrl_ctx.pause_rx) -> pause_res => { - pause_res.context("pause signal channel closed unexpectedly").crit()?; - Err(Interrupt.into()) - } - - default(duration) => { - Ok(()) - } - } - } - - pub fn exec(mut self, state: Option) -> Result<(), Failure> { - let mut event = state.map(Event::SetState); - let mut task_idle_count = 0; - loop { - let span = warn_span!( - "seal", - miner = ?self.sector.base.as_ref().map(|b| b.allocated.id.miner), - sector = ?self.sector.base.as_ref().map(|b| b.allocated.id.number), - ?event, - ); - - let _enter = span.enter(); - - let prev = self.sector.state; - let is_empty = match self.sector.base.as_ref() { - None => true, - Some(base) => { - self.ctrl_ctx - .update_state(|cst| { - cst.job.id.replace(base.allocated.id.clone()); - }) - .crit()?; - false - } - }; - - let handle_res = self.handle(event.take()); - if is_empty { - if let Some(base) = self.sector.base.as_ref() { - self.ctrl_ctx - .update_state(|cst| { - cst.job.id.replace(base.allocated.id.clone()); - }) - .crit()?; - } - } else if self.sector.base.is_none() { - self.ctrl_ctx - .update_state(|cst| { - cst.job.id.take(); - }) - .crit()?; - } - - let fail = if let Err(eref) = handle_res.as_ref() { - Some(SectorFailure { - level: format!("{:?}", eref.0), - desc: format!("{:?}", eref.1), - }) - } else { - None - }; - - if let Err(rerr) = self.report_state( - SectorStateChange { - prev: prev.as_str().to_owned(), - next: self.sector.state.as_str().to_owned(), - event: format!("{:?}", event), - }, - fail, - ) { - error!("report state failed: {:?}", rerr); - }; - - match handle_res { - Ok(Some(evt)) => { - if let Event::Idle = evt { - task_idle_count += 1; - if task_idle_count > self.sealing_config.request_task_max_retries { - info!( - "The task has returned `Event::Idle` for more than {} times. break the task", - self.sealing_config.request_task_max_retries - ); - - // when the planner tries to request a task but fails(including no task) for more than - // `config::sealing::request_task_max_retries` times, this task is really considered idle, - // break this task loop. that we have a chance to reload `sealing_thread` hot config file, - // or do something else. 
- - if self.sealing_config.check_modified() { - // cleanup sector if the hot config modified - self.finalize()?; - } - return Ok(()); - } - } - event.replace(evt); - } - - Ok(None) => match self.report_finalized().context("report finalized") { - Ok(_) => { - self.finalize()?; - return Ok(()); - } - Err(terr) => self.retry(terr.1)?, - }, - - Err(Failure(Level::Abort, aerr)) => { - if let Err(rerr) = self.report_aborted(aerr.to_string()) { - error!("report aborted sector failed: {:?}", rerr); - } - - warn!("cleanup aborted sector"); - self.finalize()?; - return Err(aerr.abort()); - } - - Err(Failure(Level::Temporary, terr)) => self.retry(terr)?, - - Err(f) => return Err(f), - } - } - } - - fn retry(&mut self, temp_err: anyhow::Error) -> Result<(), Failure> { - if self.sector.retry >= self.sealing_config.max_retries { - // reset retry times; - self.sync(|s| { - s.retry = 0; - Ok(()) - })?; - - return Err(temp_err.perm()); - } - - self.sync(|s| { - warn!(retry = s.retry, "temp error occurred: {:?}", temp_err); - - s.retry += 1; - - Ok(()) - })?; - - info!( - interval = ?self.sealing_config.recover_interval, - "wait before recovering" - ); - - self.wait_or_interrupted(self.sealing_config.recover_interval)?; - Ok(()) - } - - fn sync) -> Result<()>>(&mut self, modify_fn: F) -> Result<(), Failure> { - modify_fn(self.sector.inner_mut()).crit()?; - self.sector.sync().context("sync sector").crit() - } - - fn finalize(self) -> Result<(), Failure> { - self.store.cleanup().context("cleanup store").crit()?; - self.sector.delete().context("remove sector").crit() - } - - fn sector_path(&self, sector_id: &SectorID) -> String { - format!("s-t0{}-{}", sector_id.miner, sector_id.number) - } - - fn prepared_dir(&self, sector_id: &SectorID) -> Entry { - Entry::dir(&self.store.data_path, PathBuf::from("prepared").join(self.sector_path(sector_id))) - } - - fn cache_dir(&self, sector_id: &SectorID) -> Entry { - Entry::dir(&self.store.data_path, PathBuf::from("cache").join(self.sector_path(sector_id))) - } - - fn sealed_file(&self, sector_id: &SectorID) -> Entry { - Entry::file(&self.store.data_path, PathBuf::from("sealed").join(self.sector_path(sector_id))) - } - - fn staged_file(&self, sector_id: &SectorID) -> Entry { - Entry::file(&self.store.data_path, PathBuf::from("unsealed").join(self.sector_path(sector_id))) - } - - fn piece_file(&self, piece_cid: &CidJson) -> Entry { - Entry::file(&self.store.data_path, PathBuf::from("unsealed").join(format!("{}", piece_cid.0))) - } - - fn update_file(&self, sector_id: &SectorID) -> Entry { - Entry::file(&self.store.data_path, PathBuf::from("update").join(self.sector_path(sector_id))) - } - - fn update_cache_dir(&self, sector_id: &SectorID) -> Entry { - Entry::dir( - &self.store.data_path, - PathBuf::from("update-cache").join(self.sector_path(sector_id)), - ) - } - - fn handle(&mut self, event: Option) -> Result, Failure> { - self.interrupted()?; - - let prev = self.sector.state; - let planner = get_planner(self.sector.plan.as_deref()).perm()?; - - if let Some(evt) = event { - match evt { - Event::Idle | Event::Retry => { - debug!( - prev = ?self.sector.state, - sleep = ?self.sealing_config.recover_interval, - "Event::{:?} captured", evt - ); - - self.wait_or_interrupted(self.sealing_config.recover_interval)?; - } - - other => { - self.sync(|s| other.apply(&planner, s))?; - } - }; - }; - - let span = warn_span!("handle", ?prev, current = ?self.sector.state); - - let _enter = span.enter(); - - self.ctrl_ctx - .update_state(|cst| { - let _ = std::mem::replace(&mut 
cst.job.state, self.sector.state); - }) - .crit()?; - - debug!("handling"); - - planner.exec(self) - } -} diff --git a/damocles-worker/src/sealing/sealing_thread/task/planner/mod.rs b/damocles-worker/src/sealing/sealing_thread/task/planner/mod.rs deleted file mode 100644 index d9b84c5db..000000000 --- a/damocles-worker/src/sealing/sealing_thread/task/planner/mod.rs +++ /dev/null @@ -1,76 +0,0 @@ -use super::{Event, State, Task}; -use crate::sealing::failure::*; -use anyhow::{anyhow, Result}; - -pub const PLANNER_NAME_SEALER: &str = "sealer"; -pub const PLANNER_NAME_SNAPUP: &str = "snapup"; -pub const PLANNER_NAME_REBUILD: &str = "rebuild"; -pub const PLANNER_NAME_UNSEAL: &str = "unseal"; - -mod sealer; - -mod snapup; - -mod rebuild; - -mod common; - -mod unseal; - -mod wdpost; - -type ExecResult = Result; - -macro_rules! plan { - ($e:expr, $st:expr, $($prev:pat => {$($evt:pat => $next:expr,)+},)*) => { - match $st { - $( - $prev => { - match $e { - $( - $evt => $next, - )+ - _ => return Err(anyhow::anyhow!("unexpected event {:?} for state {:?}", $e, $st)), - } - } - )* - - other => return Err(anyhow::anyhow!("unexpected state {:?}", other)), - } - }; -} - -pub fn get_planner(p: Option<&str>) -> Result> { - match p { - None | Some(PLANNER_NAME_SEALER) => Ok(Box::new(sealer::SealerPlanner)), - - Some(PLANNER_NAME_SNAPUP) => Ok(Box::new(snapup::SnapUpPlanner)), - - Some(PLANNER_NAME_REBUILD) => Ok(Box::new(rebuild::RebuildPlanner)), - - Some(PLANNER_NAME_UNSEAL) => Ok(Box::new(unseal::UnsealPlanner)), - - Some(other) => Err(anyhow!("unknown planner {}", other)), - } -} - -pub fn default_plan() -> &'static str { - PLANNER_NAME_SEALER -} - -pub(self) use plan; - -pub trait Planner { - fn plan(&self, evt: &Event, st: &State) -> Result; - fn exec(&self, task: &mut Task<'_>) -> Result, Failure>; -} - -impl Planner for Box { - fn plan(&self, evt: &Event, st: &State) -> Result { - self.as_ref().plan(evt, st) - } - - fn exec(&self, task: &mut Task<'_>) -> Result, Failure> { - self.as_ref().exec(task) - } -} diff --git a/damocles-worker/src/sealing/sealing_thread/task/planner/wdpost.rs b/damocles-worker/src/sealing/sealing_thread/task/planner/wdpost.rs deleted file mode 100644 index 42929dfeb..000000000 --- a/damocles-worker/src/sealing/sealing_thread/task/planner/wdpost.rs +++ /dev/null @@ -1,271 +0,0 @@ -use super::super::{call_rpc, Event, State, Task}; -use super::{plan, ExecResult, Planner}; -use crate::logging::{error, warn}; -use crate::rpc::sealer::{AllocateSectorSpec, SectorID, WdPoStResult, WdpostState}; -use crate::sealing::failure::MapErrToFailure; -use crate::sealing::failure::{Failure, IntoFailure}; -use anyhow::{anyhow, Context, Result}; -use std::time::Duration; -use tracing::debug; -use vc_processors::builtin::tasks::{PoStReplicaInfo, WindowPoSt}; - -pub struct WdPostPlanner; - -impl Planner for WdPostPlanner { - fn plan(&self, evt: &Event, st: &State) -> Result { - let next = plan! 
{ - evt, - st, - - State::Empty => { - // alloc wdpost task - Event::AcquireWdPostTask(_) => State::Allocated, - }, - State::Allocated => { - // gen prove and report persistent - Event::WdPostGenerated(_) => State::WdPostGenerated, - }, - State::WdPostGenerated => { - // verify prove - Event::Finish => State::Finished, - }, - }; - - Ok(next) - } - - fn exec(&self, task: &mut Task<'_>) -> Result, Failure> { - let state = task.sector.state; - let inner = WdPost { task }; - - match state { - State::Empty => inner.acquire(), - State::Allocated => inner.generate(), - State::WdPostGenerated => inner.upload(), - other => Err(anyhow!("unexpected state: {:?} in window post planner", other).abort()), - } - .map(From::from) - } -} - -struct WdPost<'c, 't> { - task: &'t mut Task<'c>, -} - -impl WdPost<'_, '_> { - fn acquire(&self) -> ExecResult { - let maybe_res = call_rpc!( - self.task.ctx.global.rpc, - allocate_wd_post_task, - AllocateSectorSpec { - allowed_miners: Some(self.task.sealing_config.allowed_miners.clone()), - allowed_proof_types: Some(self.task.sealing_config.allowed_proof_types.clone()), - }, - ); - - let maybe_allocated = match maybe_res { - Ok(a) => a, - Err(e) => { - warn!( - "window PoST task is not allocated yet, so we can retry even though we got the err {:?}", - e - ); - return Ok(Event::Idle); - } - }; - - let allocated = match maybe_allocated { - Some(a) => a, - None => return Ok(Event::Idle), - }; - - Ok(Event::AcquireWdPostTask(allocated)) - } - - fn upload(&self) -> ExecResult { - let out = self - .task - .sector - .phases - .wd_post_out - .clone() - .context("wdpost out info not found") - .abort()?; - - let wdpost_res = WdPoStResult { - state: WdpostState::Done, - proofs: Some(out.proofs), - faults: Some(out.faults), - error: None, - }; - - self.report(wdpost_res); - - Ok(Event::Finish) - } - - fn generate(&self) -> ExecResult { - let task_info = self - .task - .sector - .phases - .wd_post_in - .as_ref() - .context("wdpost info not found") - .abort()?; - - let instance_name = &task_info.instance; - debug!("find access store named {}", instance_name); - let instance = self - .task - .ctx - .global - .attached - .get(instance_name) - .with_context(|| format!("get access store instance named {}", instance_name)) - .perm()?; - - // get sealed path and cache path - let replica = task_info - .sectors - .iter() - .map(|sector| { - let sector_id = &SectorID { - miner: task_info.miner_id, - number: sector.sector_id.into(), - }; - - let sealed_temp = self.task.sealed_file(sector_id); - let sealed_rel = sealed_temp.rel(); - - let cache_temp = self.task.cache_dir(sector_id); - let cache_rel = cache_temp.rel(); - - let sealed_path = instance - .uri(sealed_rel) - .with_context(|| format!("get uri for sealed file {:?} in {}", sealed_rel, instance_name))?; - let cache_path = instance - .uri(cache_rel) - .with_context(|| format!("get uri for cache file {:?} in {}", cache_rel, instance_name))?; - - let sector_id = sector.sector_id; - let replica = PoStReplicaInfo { - sector_id, - comm_r: sector.comm_r, - cache_dir: cache_path, - sealed_file: sealed_path, - }; - Ok(replica) - }) - .collect::>>() - .perm()?; - - let post_in = WindowPoSt { - miner_id: task_info.miner_id, - proof_type: task_info.proof_type, - replicas: replica, - seed: task_info.seed, - }; - - let rt = tokio::runtime::Runtime::new().unwrap(); - let (tx_res, mut rx_res) = tokio::sync::oneshot::channel::>(); - let (tx_sync, rx_sync) = tokio::sync::oneshot::channel(); - - let rpc = self.task.ctx.global.rpc.clone(); - let miner_id = 
task_info.miner_id; - let deadline_id = task_info.deadline_id; - - rt.spawn(async move { - let mut interval = tokio::time::interval(Duration::from_secs(20)); - - let mut rep = WdPoStResult { - state: WdpostState::Generating, - proofs: None, - faults: None, - error: None, - }; - - let report = |rep: WdPoStResult| { - if let Err(e) = call_rpc!(rpc, wd_post_heartbeat, miner_id, deadline_id, rep,) { - error!("report wdpost result failed: {:?}", e); - } - }; - - loop { - tokio::select! { - res = &mut rx_res => { - match res { - Ok(Ok(_)) => { - rep.state = WdpostState::Generated; - report(rep) - } - Ok(Err(e)) => { - rep.state = WdpostState::Failed; - rep.error = Some(format!("{:?}", e)); - report(rep) - } - Err(_) => { - error!("receive finish signal failed"); - } - } - break; - } - _ = interval.tick() => { - report(rep.clone()); - } - } - } - tx_sync.send(()).unwrap(); - }); - - let _rt_guard = rt.enter(); - - let out_maybe = self - .task - .ctx - .global - .processors - .wdpost - .process(post_in) - .context("generate window post"); - - // notify crond - match &out_maybe { - Ok(_) => { - if tx_res.send(Ok(())).is_err() { - warn!("send finish signal failed"); - } - } - Err(e) => { - if tx_res.send(Err(anyhow!("generate window post failed: {:?}", e))).is_err() { - warn!("send finish signal failed"); - } - warn!("generate window post failed: {:?}", e); - } - }; - - // wait for crond to finish - rx_sync.blocking_recv().unwrap(); - - let out = out_maybe.context("generate window post").temp()?; - - Ok(Event::WdPostGenerated(out)) - } - - fn report(&self, res: WdPoStResult) { - if let Some(task_info) = self.task.sector.phases.wd_post_in.as_ref() { - let resp = call_rpc!( - self.task.ctx.global.rpc, - wd_post_heartbeat, - task_info.miner_id, - task_info.deadline_id, - res, - ); - if let Err(e) = resp { - warn!("report wdpost result failed: {:?}", e); - } - } else { - warn!("wdpost info not found"); - } - } -} diff --git a/damocles-worker/src/sealing/sealing_thread/task/util.rs b/damocles-worker/src/sealing/sealing_thread/util.rs similarity index 82% rename from damocles-worker/src/sealing/sealing_thread/task/util.rs rename to damocles-worker/src/sealing/sealing_thread/util.rs index 97a35e1c3..3a76d9279 100644 --- a/damocles-worker/src/sealing/sealing_thread/task/util.rs +++ b/damocles-worker/src/sealing/sealing_thread/util.rs @@ -1,17 +1,9 @@ macro_rules! call_rpc { - ($client:expr, $method:ident, $($arg:expr,)*) => { + (raw, $client:expr=>$method:ident($($arg:expr,)*)) => { { crate::metrics::rpc::VIEW.call.method(stringify!($method)).incr(); let now = std::time::Instant::now(); - let res = crate::block_on($client.$method($($arg,)*)).map_err(|e| { - if let jsonrpc_core_client::RpcError::JsonRpcError(ref je) = e { - if je.code == jsonrpc_core::types::error::ErrorCode::ServerError(crate::rpc::APIErrCode::SectorStateNotFound as i64) { - return anyhow::anyhow!("from error code: sector state not found, with msg: {}", je.message).abort() - } - } - - anyhow::anyhow!("rpc error: {:?}", e).temp() - }); + let res = crate::block_on($client.$method($($arg,)*)); crate::metrics::rpc::VIEW.timing.method(stringify!($method)).record(now.elapsed()); @@ -22,6 +14,19 @@ macro_rules! 
call_rpc { res } }; + ($client:expr=>$method:ident($($arg:expr,)*)) => { + { + call_rpc!(raw, $client=>$method($($arg,)*)).map_err(|e| { + if let jsonrpc_core_client::RpcError::JsonRpcError(ref je) = e { + if je.code == jsonrpc_core::types::error::ErrorCode::ServerError(crate::rpc::APIErrCode::SectorStateNotFound as i64) { + return anyhow::anyhow!("from error code: sector state not found, with msg: {}", je.message).abort() + } + } + + anyhow::anyhow!("rpc error: {}", e).temp() + }) + } + }; } pub(super) use call_rpc; diff --git a/damocles-worker/src/sealing/service.rs b/damocles-worker/src/sealing/service.rs index 2d7229b7e..ae35d0308 100644 --- a/damocles-worker/src/sealing/service.rs +++ b/damocles-worker/src/sealing/service.rs @@ -1,4 +1,5 @@ use std::env; +use std::path::PathBuf; use std::sync::Arc; use crossbeam_channel::select; @@ -29,10 +30,10 @@ impl Worker for ServiceImpl { self.ctrls .iter() .map(|(idx, ctrl)| { - let (state, sector_id, plan, last_error, paused_at) = ctrl + let (state, job_id, plan, last_error, paused_at) = ctrl .load_state(|cst| { ( - cst.job.state, + cst.job.state.clone(), cst.job.id.to_owned(), cst.job.plan.clone(), cst.job.last_error.to_owned(), @@ -46,13 +47,17 @@ impl Worker for ServiceImpl { })?; Ok(WorkerInfo { - location: ctrl.location.to_pathbuf(), + location: ctrl + .location + .as_ref() + .map(|loc| loc.to_pathbuf()) + .unwrap_or_else(|| PathBuf::from("-")), plan, - sector_id, + job_id, index: *idx, paused: paused_at.is_some(), paused_elapsed: paused_at.map(|ins| ins.elapsed().as_secs()), - state: state.as_str().to_owned(), + state: state.unwrap_or(String::new()), last_error, }) }) diff --git a/damocles-worker/src/types.rs b/damocles-worker/src/types.rs index d09353c4a..a6f7ce66f 100644 --- a/damocles-worker/src/types.rs +++ b/damocles-worker/src/types.rs @@ -4,6 +4,7 @@ use std::convert::TryFrom; use anyhow::{anyhow, Error}; use serde_repr::{Deserialize_repr, Serialize_repr}; +use vc_processors::fil_proofs::RegisteredPoStProof; use crate::sealing::processor::{RegisteredSealProof, RegisteredUpdateProof}; @@ -65,6 +66,21 @@ impl SealProof { SealProof::StackedDrg64GiBV1_1 => SIZE_64G, } } + + /// returns post proof types for the seal proof type + pub fn to_post_proofs(&self) -> Vec { + use RegisteredPoStProof::*; + use SealProof::*; + match self { + StackedDrg2KiBV1 | StackedDrg2KiBV1_1 => vec![StackedDrgWinning2KiBV1, StackedDrgWindow2KiBV1, StackedDrgWindow2KiBV1_2], + StackedDrg8MiBV1 | StackedDrg8MiBV1_1 => vec![StackedDrgWinning8MiBV1, StackedDrgWindow8MiBV1, StackedDrgWindow8MiBV1_2], + StackedDrg512MiBV1 | StackedDrg512MiBV1_1 => { + vec![StackedDrgWinning512MiBV1, StackedDrgWindow512MiBV1, StackedDrgWindow512MiBV1_2] + } + StackedDrg32GiBV1 | StackedDrg32GiBV1_1 => vec![StackedDrgWinning32GiBV1, StackedDrgWindow32GiBV1, StackedDrgWindow32GiBV1_2], + StackedDrg64GiBV1 | StackedDrg64GiBV1_1 => vec![StackedDrgWinning64GiBV1, StackedDrgWindow64GiBV1, StackedDrgWindow64GiBV1_2], + } + } } impl TryFrom for SealProof { From a0c1412909ba5cb59e8b000759f264cb27d0a453 Mon Sep 17 00:00:00 2001 From: 0x5459 <0x5459@protonmail.com> Date: Fri, 28 Jul 2023 11:55:39 +0800 Subject: [PATCH 13/18] fix(worker-prover): exec windowpost --- .../damocles-manager/internal/util_worker.go | 4 +- damocles-manager/core/types_wdpost.go | 6 +- damocles-manager/dep/prover.go | 13 +- damocles-manager/modules/config.go | 22 +++ .../modules/impl/prover/worker/config.go | 21 --- .../modules/impl/prover/worker/job_mgr_kv.go | 41 +++--- .../modules/impl/prover/worker/prover.go | 132 
+++++++++++++----- .../modules/impl/prover/worker/rpc.go | 4 +- .../src/bin/damocles-worker/store.rs | 4 +- damocles-worker/src/config.rs | 1 + .../src/sealing/sealing_thread/mod.rs | 2 +- .../src/sealing/sealing_thread/planner/mod.rs | 2 +- 12 files changed, 164 insertions(+), 88 deletions(-) delete mode 100644 damocles-manager/modules/impl/prover/worker/config.go diff --git a/damocles-manager/cmd/damocles-manager/internal/util_worker.go b/damocles-manager/cmd/damocles-manager/internal/util_worker.go index bd4586856..8061ae3fd 100644 --- a/damocles-manager/cmd/damocles-manager/internal/util_worker.go +++ b/damocles-manager/cmd/damocles-manager/internal/util_worker.go @@ -344,9 +344,9 @@ var utilWdPostListCmd = &cli.Command{ w := tabwriter.NewWriter(os.Stdout, 2, 4, 2, ' ', 0) if detail { - _, err = w.Write([]byte("ID\tPrefix\tMiner\tDDL\tWorker\tState\tTry\tCreateAt\tStartedAt\tHeartbeatAt\tFinishedAt\tUpdatedAt\tError\n")) + _, err = w.Write([]byte("JobID\tPrefix\tMiner\tDDL\tWorker\tState\tTry\tCreateAt\tStartedAt\tHeartbeatAt\tFinishedAt\tUpdatedAt\tError\n")) } else { - _, err = w.Write([]byte("ID\tMinerID\tDDL\tWorker\tState\tTry\tCreateAt\tElapsed\tError\n")) + _, err = w.Write([]byte("JobID\tMinerID\tDDL\tWorker\tState\tTry\tCreateAt\tElapsed\tError\n")) } if err != nil { return err diff --git a/damocles-manager/core/types_wdpost.go b/damocles-manager/core/types_wdpost.go index b89ccb0a7..be18cdd9a 100644 --- a/damocles-manager/core/types_wdpost.go +++ b/damocles-manager/core/types_wdpost.go @@ -62,7 +62,7 @@ func (t *WdPoStJob) Succeed() bool { if t.State != string(WdPoStJobFinished) { return false } - return t.Output != nil + return t.ErrorReason == "" } func (t *WdPoStJob) DisplayState() string { @@ -88,12 +88,12 @@ type WdPoStAllocatedJob struct { type AllocateWdPoStJobSpec struct { AllowedMiners []abi.ActorID - AllowedProofTypes []abi.RegisteredPoStProof + AllowedProofTypes []string } type WorkerWdPoStJobManager interface { All(ctx context.Context, filter func(*WdPoStJob) bool) ([]*WdPoStJob, error) - ListByJobIDs(ctx context.Context, state WdPoStJobState, jobIDs ...string) ([]*WdPoStJob, error) + ListByJobIDs(ctx context.Context, jobIDs ...string) ([]*WdPoStJob, error) Create(ctx context.Context, deadlineIdx uint64, input WdPoStInput) (*WdPoStJob, error) AllocateJobs(ctx context.Context, spec AllocateWdPoStJobSpec, num uint32, workerName string) (allocatedJobs []*WdPoStAllocatedJob, err error) Heartbeat(ctx context.Context, jobIDs []string, workerName string) error diff --git a/damocles-manager/dep/prover.go b/damocles-manager/dep/prover.go index 6bbd700c6..7934e2ce8 100644 --- a/damocles-manager/dep/prover.go +++ b/damocles-manager/dep/prover.go @@ -4,6 +4,7 @@ import ( "bytes" "context" "fmt" + "time" "github.com/BurntSushi/toml" "github.com/dtynn/dix" @@ -32,7 +33,6 @@ func ExtProver() dix.Option { func WorkerProver() dix.Option { return dix.Options( dix.Override(new(WorkerProverStore), BuildWorkerProverStore), - dix.Override(new(*proverworker.Config), proverworker.DefaultConfig), dix.Override(new(core.WorkerWdPoStJobManager), BuildWorkerWdPoStJobManager), dix.Override(new(core.WorkerWdPoStAPI), proverworker.NewWdPoStAPIImpl), dix.Override(new(core.Prover), BuildWorkerProver), @@ -92,8 +92,15 @@ func BuildWorkerProverStore(gctx GlobalContext, db UnderlyingDB) (WorkerProverSt return db.OpenCollection(gctx, "prover") } -func BuildWorkerProver(lc fx.Lifecycle, jobMgr core.WorkerWdPoStJobManager, sectorTracker core.SectorTracker, config *proverworker.Config) (core.Prover, 
error) { - p := proverworker.NewProver(jobMgr, sectorTracker, config) +func BuildWorkerProver(lc fx.Lifecycle, jobMgr core.WorkerWdPoStJobManager, sectorTracker core.SectorTracker, scfg *modules.SafeConfig) (core.Prover, error) { + cfg := scfg.MustCommonConfig() + p := proverworker.NewProver(jobMgr, sectorTracker, &proverworker.Config{ + RetryFailedJobsInterval: 10 * time.Second, + JobMaxTry: cfg.Proving.WorkerProver.JobMaxTry, + HeartbeatTimeout: cfg.Proving.WorkerProver.HeartbeatTimeout, + CleanupExpiredJobsInterval: 30 * time.Minute, + JobLifetime: cfg.Proving.WorkerProver.JobLifetime, + }) lc.Append(fx.Hook{ OnStart: func(ctx context.Context) error { p.Start(ctx) diff --git a/damocles-manager/modules/config.go b/damocles-manager/modules/config.go index cf1afb5dd..e80848f4c 100644 --- a/damocles-manager/modules/config.go +++ b/damocles-manager/modules/config.go @@ -205,6 +205,8 @@ type ProvingConfig struct { // WARNING: Setting this value too high risks missing PoSt deadline in case IO operations related to this partition are // blocked or slow PartitionCheckTimeout Duration + + WorkerProver *WorkerProverConfig } func defaultProvingConfig() ProvingConfig { @@ -212,10 +214,30 @@ func defaultProvingConfig() ProvingConfig { ParallelCheckLimit: 128, PartitionCheckTimeout: Duration(20 * time.Minute), SingleCheckTimeout: Duration(10 * time.Minute), + WorkerProver: DefaultWorkerProverConfig(), } return cfg } +type WorkerProverConfig struct { + // The maximum number of attempts of the WindowPoSt job, + // job that exceeds the JobMaxTry number can only be re-executed by manual reset + JobMaxTry uint32 + // The timeout of the WindowPoSt job's heartbeat + // jobs that have not sent a heartbeat for more than this time will be set to fail and retried + HeartbeatTimeout time.Duration + // WindowPoSt jobs created longer than this time will be deleted + JobLifetime time.Duration +} + +func DefaultWorkerProverConfig() *WorkerProverConfig { + return &WorkerProverConfig{ + JobMaxTry: 2, + HeartbeatTimeout: 15 * time.Second, + JobLifetime: 25 * time.Hour, + } +} + type CommonConfig struct { API CommonAPIConfig Plugins *PluginConfig diff --git a/damocles-manager/modules/impl/prover/worker/config.go b/damocles-manager/modules/impl/prover/worker/config.go deleted file mode 100644 index 98c77614d..000000000 --- a/damocles-manager/modules/impl/prover/worker/config.go +++ /dev/null @@ -1,21 +0,0 @@ -package worker - -import "time" - -type Config struct { - RetryFailedJobsInterval time.Duration - JobMaxTry uint32 - HeartbeatTimeout time.Duration - CleanupExpiredJobsInterval time.Duration - JobLifetime time.Duration -} - -func DefaultConfig() *Config { - return &Config{ - RetryFailedJobsInterval: 10 * time.Second, - JobMaxTry: 2, - HeartbeatTimeout: 15 * time.Second, - CleanupExpiredJobsInterval: 30 * time.Minute, - JobLifetime: 25 * time.Hour, - } -} diff --git a/damocles-manager/modules/impl/prover/worker/job_mgr_kv.go b/damocles-manager/modules/impl/prover/worker/job_mgr_kv.go index 9ffa39238..6d8dea5fa 100644 --- a/damocles-manager/modules/impl/prover/worker/job_mgr_kv.go +++ b/damocles-manager/modules/impl/prover/worker/job_mgr_kv.go @@ -10,7 +10,6 @@ import ( "strings" "time" - "github.com/filecoin-project/go-state-types/abi" "github.com/ipfs-force-community/damocles/damocles-manager/core" "github.com/ipfs-force-community/damocles/damocles-manager/pkg/extproc/stage" "github.com/ipfs-force-community/damocles/damocles-manager/pkg/kvstore" @@ -47,10 +46,14 @@ func (tm *kvJobManager) filter(ctx context.Context, 
txn kvstore.TxnExt, state co return } +func allStates() []core.WdPoStJobState { + return []core.WdPoStJobState{core.WdPoStJobReadyToRun, core.WdPoStJobRunning, core.WdPoStJobFinished} +} + func (tm *kvJobManager) All(ctx context.Context, filter func(*core.WdPoStJob) bool) (jobs []*core.WdPoStJob, err error) { jobs = make([]*core.WdPoStJob, 0) err = tm.kv.ViewMustNoConflict(ctx, func(txn kvstore.TxnExt) error { - for _, state := range []core.WdPoStJobState{core.WdPoStJobReadyToRun, core.WdPoStJobRunning, core.WdPoStJobFinished} { + for _, state := range allStates() { ts, err := tm.filter(ctx, txn, state, math.MaxUint32, filter) if err != nil { return err @@ -65,19 +68,21 @@ func (tm *kvJobManager) All(ctx context.Context, filter func(*core.WdPoStJob) bo return } -func (tm *kvJobManager) ListByJobIDs(ctx context.Context, state core.WdPoStJobState, jobIDs ...string) ([]*core.WdPoStJob, error) { +func (tm *kvJobManager) ListByJobIDs(ctx context.Context, jobIDs ...string) ([]*core.WdPoStJob, error) { jobs := make([]*core.WdPoStJob, 0, len(jobIDs)) err := tm.kv.ViewMustNoConflict(ctx, func(txn kvstore.TxnExt) error { for _, jobID := range jobIDs { - var job core.WdPoStJob - err := txn.Peek(kvstore.Key(makeWdPoStKey(state, jobID)), kvstore.LoadJSON(&job)) - if errors.Is(err, kvstore.ErrKeyNotFound) { - continue + for _, state := range allStates() { + var job core.WdPoStJob + err := txn.Peek(kvstore.Key(makeWdPoStKey(state, jobID)), kvstore.LoadJSON(&job)) + if errors.Is(err, kvstore.ErrKeyNotFound) { + continue + } + if err != nil { + return err + } + jobs = append(jobs, &job) } - if err != nil { - return err - } - jobs = append(jobs, &job) } return nil }) @@ -87,7 +92,7 @@ func (tm *kvJobManager) ListByJobIDs(ctx context.Context, state core.WdPoStJobSt func (tm *kvJobManager) Create(ctx context.Context, deadlineIdx uint64, input core.WdPoStInput) (*core.WdPoStJob, error) { var ( jobID string - job *core.WdPoStJob + job core.WdPoStJob ) err := tm.kv.UpdateMustNoConflict(ctx, func(txn kvstore.TxnExt) error { rawInput, err := json.Marshal(input) @@ -97,7 +102,7 @@ func (tm *kvJobManager) Create(ctx context.Context, deadlineIdx uint64, input co jobID = GenJobID(rawInput) // check if job exists _, err = txn.PeekAny( - kvstore.LoadJSON(job), + kvstore.LoadJSON(&job), kvstore.Key(makeWdPoStKey(core.WdPoStJobReadyToRun, jobID)), kvstore.Key(makeWdPoStKey(core.WdPoStJobRunning, jobID)), kvstore.Key(makeWdPoStKey(core.WdPoStJobFinished, jobID)), @@ -111,7 +116,7 @@ func (tm *kvJobManager) Create(ctx context.Context, deadlineIdx uint64, input co } now := time.Now().Unix() - job = &core.WdPoStJob{ + job = core.WdPoStJob{ ID: jobID, State: string(core.WdPoStJobReadyToRun), DeadlineIdx: deadlineIdx, @@ -126,13 +131,13 @@ func (tm *kvJobManager) Create(ctx context.Context, deadlineIdx uint64, input co CreatedAt: uint64(now), UpdatedAt: uint64(now), } - return txn.PutJson([]byte(makeWdPoStKey(core.WdPoStJobReadyToRun, jobID)), job) + return txn.PutJson([]byte(makeWdPoStKey(core.WdPoStJobReadyToRun, jobID)), &job) }) if err == nil { log.Infof("wdPoSt job created: %s", jobID) } - return job, err + return &job, err } func (tm *kvJobManager) AllocateJobs(ctx context.Context, spec core.AllocateWdPoStJobSpec, n uint32, workerName string) (allocatedJobs []*core.WdPoStAllocatedJob, err error) { @@ -143,9 +148,7 @@ func (tm *kvJobManager) AllocateJobs(ctx context.Context, spec core.AllocateWdPo if len(spec.AllowedMiners) > 0 && !slices.Contains(spec.AllowedMiners, t.Input.MinerID) { return false } - if 
len(spec.AllowedProofTypes) > 0 && !slices.ContainsFunc(spec.AllowedProofTypes, func(allowed abi.RegisteredPoStProof) bool { - return stage.ProofType2String(allowed) == t.Input.ProofType - }) { + if len(spec.AllowedProofTypes) > 0 && !slices.Contains(spec.AllowedProofTypes, t.Input.ProofType) { return false } return true diff --git a/damocles-manager/modules/impl/prover/worker/prover.go b/damocles-manager/modules/impl/prover/worker/prover.go index 77099782a..e514649a6 100644 --- a/damocles-manager/modules/impl/prover/worker/prover.go +++ b/damocles-manager/modules/impl/prover/worker/prover.go @@ -3,6 +3,7 @@ package worker import ( "context" "encoding/binary" + "errors" "fmt" "sync" "time" @@ -22,6 +23,8 @@ var log = logging.New("worker prover") var _ core.Prover = (*WorkerProver)(nil) +var ErrJobRemovedManually = fmt.Errorf("job was manually removed") + func GenJobID(rawInput []byte) string { b := make([]byte, 8) binary.LittleEndian.PutUint64(b, xxhash.Sum64(rawInput)) @@ -30,7 +33,20 @@ func GenJobID(rawInput []byte) string { type R struct { output *stage.WindowPoStOutput - err string + err error +} + +type Config struct { + RetryFailedJobsInterval time.Duration + // The maximum number of attempts of the WindowPoSt job, + // job that exceeds the JobMaxTry number can only be re-executed by manual reset + JobMaxTry uint32 + // The timeout of the WindowPoSt job's heartbeat + // jobs that have not sent a heartbeat for more than this time will be set to fail and retried + HeartbeatTimeout time.Duration + CleanupExpiredJobsInterval time.Duration + // WindowPoSt jobs created longer than this time will be deleted + JobLifetime time.Duration } type WorkerProver struct { @@ -76,13 +92,46 @@ func (p *WorkerProver) runNotifyJobDone(ctx context.Context) { } p.inflightJobsLock.Unlock() - finishedJobs, err := p.jobMgr.ListByJobIDs(ctx, core.WdPoStJobFinished, inflightJobIDs...) + jobs, err := p.jobMgr.ListByJobIDs(ctx, inflightJobIDs...) 
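+ // ListByJobIDs now scans every job state, so a job that was manually removed is simply absent from `jobs`;
+ // the bookkeeping below diffs the result against `inflightJobIDs`, treats the missing IDs as manually removed,
+ // and completes their waiting channels with ErrJobRemovedManually before reporting normal job results.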
if err != nil { log.Errorf("failed to list jobs: %s", err) } + // find all manually deleted jobs + var ( + removed []string + notRemoved map[string]struct{} = make(map[string]struct{}) + ) + for _, job := range jobs { + notRemoved[job.ID] = struct{}{} + } + for _, jobID := range inflightJobIDs { + if _, ok := notRemoved[jobID]; !ok { + removed = append(removed, jobID) + log.Infow("job was manually removed", "jobID", jobID) + } + } + p.inflightJobsLock.Lock() - for _, job := range finishedJobs { + // notify `pster module` that jobs have been manually deleted + for _, jobID := range removed { + chs, ok := p.inflightJobs[jobID] + if !ok { + continue + } + delete(p.inflightJobs, jobID) + + for _, ch := range chs { + ch <- R{ + output: nil, + err: ErrJobRemovedManually, + } + close(ch) + } + } + + // notify the poster module of the results of jobs + for _, job := range jobs { chs, ok := p.inflightJobs[job.ID] if !ok { continue @@ -93,9 +142,15 @@ func (p *WorkerProver) runNotifyJobDone(ctx context.Context) { delete(p.inflightJobs, job.ID) for _, ch := range chs { + var err error + if job.ErrorReason == "" { + err = nil + } else { + err = fmt.Errorf("error from worker: %s", job.ErrorReason) + } ch <- R{ output: job.Output, - err: job.ErrorReason, + err: err, } close(ch) } @@ -146,6 +201,7 @@ func (p *WorkerProver) AggregateSealProofs(ctx context.Context, aggregateInfo co } func (p *WorkerProver) GenerateWindowPoSt(ctx context.Context, deadlineIdx uint64, minerID abi.ActorID, proofType abi.RegisteredPoStProof, sectors []builtin.ExtendedSectorInfo, randomness abi.PoStRandomness) (proof []builtin.PoStProof, skipped []abi.SectorID, err error) { + randomness[31] &= 0x3f sis := make([]core.WdPoStSectorInfo, len(sectors)) for i, s := range sectors { @@ -173,56 +229,66 @@ func (p *WorkerProver) GenerateWindowPoSt(ctx context.Context, deadlineIdx uint6 } copy(input.Seed[:], randomness[:]) - job, err := p.jobMgr.Create(ctx, deadlineIdx, input) - if err != nil { - return nil, nil, fmt.Errorf("create wdPoSt job: %w", err) - } - - ch := make(chan R, 1) - - p.inflightJobsLock.Lock() - p.inflightJobs[job.ID] = append(p.inflightJobs[job.ID], ch) - p.inflightJobsLock.Unlock() - - var result R - select { - case <-ctx.Done(): - err = fmt.Errorf("failed to generate window post before context cancellation: %w", ctx.Err()) - return - case res, ok := <-ch: - if !ok { - return nil, nil, fmt.Errorf("wdPoSt result channel was closed unexpectedly") + var output *stage.WindowPoStOutput + for { + output, err = p.doWindowPoSt(ctx, deadlineIdx, input) + if !errors.Is(err, ErrJobRemovedManually) { + break } - result = res } - - if result.err != "" { - return nil, nil, fmt.Errorf("error from worker: %s", result.err) + if err != nil { + return nil, nil, err } - if faultCount := len(result.output.Faults); faultCount != 0 { + if faultCount := len(output.Faults); faultCount != 0 { faults := make([]abi.SectorID, faultCount) - for fi := range result.output.Faults { + for fi := range output.Faults { faults[fi] = abi.SectorID{ Miner: minerID, - Number: result.output.Faults[fi], + Number: output.Faults[fi], } } return nil, faults, fmt.Errorf("got %d fault sectors", faultCount) } - proofs := make([]builtin.PoStProof, len(result.output.Proofs)) - for pi := range result.output.Proofs { + proofs := make([]builtin.PoStProof, len(output.Proofs)) + for pi := range output.Proofs { proofs[pi] = builtin.PoStProof{ PoStProof: proofType, - ProofBytes: result.output.Proofs[pi], + ProofBytes: output.Proofs[pi], } } return proofs, nil, nil } +func (p 
*WorkerProver) doWindowPoSt(ctx context.Context, deadlineIdx uint64, input core.WdPoStInput) (output *stage.WindowPoStOutput, err error) { + job, err := p.jobMgr.Create(ctx, deadlineIdx, input) + if err != nil { + return nil, fmt.Errorf("create wdPoSt job: %w", err) + } + + ch := make(chan R, 1) + + p.inflightJobsLock.Lock() + p.inflightJobs[job.ID] = append(p.inflightJobs[job.ID], ch) + p.inflightJobsLock.Unlock() + + select { + case <-ctx.Done(): + err = fmt.Errorf("failed to generate window post before context cancellation: %w", ctx.Err()) + return + case res, ok := <-ch: + if !ok { + return nil, fmt.Errorf("wdPoSt result channel was closed unexpectedly") + } + output = res.output + err = res.err + } + return +} + func (p *WorkerProver) GenerateWinningPoSt(ctx context.Context, minerID abi.ActorID, proofType abi.RegisteredPoStProof, sectors []builtin.ExtendedSectorInfo, randomness abi.PoStRandomness) ([]builtin.PoStProof, error) { return p.localProver.GenerateWinningPoSt(ctx, minerID, proofType, sectors, randomness) } diff --git a/damocles-manager/modules/impl/prover/worker/rpc.go b/damocles-manager/modules/impl/prover/worker/rpc.go index 0c59172aa..d3879ed3a 100644 --- a/damocles-manager/modules/impl/prover/worker/rpc.go +++ b/damocles-manager/modules/impl/prover/worker/rpc.go @@ -10,16 +10,14 @@ import ( "github.com/ipfs-force-community/damocles/damocles-manager/pkg/kvstore" ) -func NewWdPoStAPIImpl(jobMgr core.WorkerWdPoStJobManager, config *Config) core.WorkerWdPoStAPI { +func NewWdPoStAPIImpl(jobMgr core.WorkerWdPoStJobManager) core.WorkerWdPoStAPI { return &WdPoStAPIImpl{ jobMgr: jobMgr, - config: config, } } type WdPoStAPIImpl struct { jobMgr core.WorkerWdPoStJobManager - config *Config } func (api WdPoStAPIImpl) WdPoStHeartbeatJobs(ctx context.Context, runningJobIDs []string, workerName string) (core.Meta, error) { diff --git a/damocles-worker/src/bin/damocles-worker/store.rs b/damocles-worker/src/bin/damocles-worker/store.rs index d0653fd38..d5c30e316 100644 --- a/damocles-worker/src/bin/damocles-worker/store.rs +++ b/damocles-worker/src/bin/damocles-worker/store.rs @@ -28,13 +28,13 @@ pub(crate) enum StoreCommand { }, HugepageFileInit { /// Specify the numa node - #[arg(short = 'n', long, alias = "numa_node_index")] + #[arg(short = 'n', long = "node", alias = "numa_node_index")] numa_node_index: u32, /// Specify the size of each hugepage memory file. (e.g., 1B, 2KB, 3kiB, 1MB, 2MiB, 3GB, 1GiB, ...) #[arg(short = 's', long)] size: bytesize::ByteSize, /// Specify the number of hugepage memory files to be created - #[arg(short = 'c', long, alias = "number_of_files")] + #[arg(short = 'c', long = "num", alias = "number_of_files")] number_of_files: usize, /// Specify the path to the output hugepage memory files and using the default pattern (/specified_hugepage_file_path/numa_$NUMA_NODE_INDEX). 
/// The created files looks like this: diff --git a/damocles-worker/src/config.rs b/damocles-worker/src/config.rs index 6b1a9fc55..6823abeb2 100644 --- a/damocles-worker/src/config.rs +++ b/damocles-worker/src/config.rs @@ -274,6 +274,7 @@ pub struct Config { pub sector_manager: SectorManagerConfig, /// section for common sealing + #[serde(default)] pub sealing: SealingOptional, /// section for list of local sealing stores diff --git a/damocles-worker/src/sealing/sealing_thread/mod.rs b/damocles-worker/src/sealing/sealing_thread/mod.rs index 5177b65db..92a573eff 100644 --- a/damocles-worker/src/sealing/sealing_thread/mod.rs +++ b/damocles-worker/src/sealing/sealing_thread/mod.rs @@ -126,7 +126,7 @@ impl SealingThread { .update_state(|cst| cst.job.plan = plan.clone()) .context("update ctrl state") .crit()?; - let mut sealer = planner::create_selaer(&plan, ctx, self).crit()?; + let mut sealer = planner::create_sealer(&plan, ctx, self).crit()?; match sealer.seal(state.as_deref())? { R::SwitchPlanner(new_plan) => { tracing::info!(new_plan = new_plan, "switch planner"); diff --git a/damocles-worker/src/sealing/sealing_thread/planner/mod.rs b/damocles-worker/src/sealing/sealing_thread/planner/mod.rs index 19adf4a46..9463aa37d 100644 --- a/damocles-worker/src/sealing/sealing_thread/planner/mod.rs +++ b/damocles-worker/src/sealing/sealing_thread/planner/mod.rs @@ -61,7 +61,7 @@ pub trait PlannerTrait: Default { fn apply(&self, event: Self::Event, state: Self::State, job: &mut Self::Job) -> Result<()>; } -pub fn create_selaer(plan: &str, ctx: &Ctx, st: &SealingThread) -> Result> { +pub fn create_sealer(plan: &str, ctx: &Ctx, st: &SealingThread) -> Result> { match plan { PLANNER_NAME_SEALER => Ok(Box::new(CommonSealer::::new(ctx, st)?)), PLANNER_NAME_SNAPUP => Ok(Box::new(CommonSealer::::new(ctx, st)?)), From f22eaa61fe92e3864116aa657ec0f26670715802 Mon Sep 17 00:00:00 2001 From: 0x5459 <0x5459@protonmail.com> Date: Fri, 28 Jul 2023 13:03:36 +0800 Subject: [PATCH 14/18] docs(worker-prover): add docs --- docs/en/03.damocles-worker-config.md | 4 +- docs/en/04.damocles-manager-config.md | 25 ++- ...15\347\275\256\350\247\243\346\236\220.md" | 4 +- ...15\347\275\256\350\247\243\346\236\220.md" | 25 ++- ...\232\204poster\350\212\202\347\202\271.md" | 160 +++++++++++++++++- 5 files changed, 209 insertions(+), 9 deletions(-) diff --git a/docs/en/03.damocles-worker-config.md b/docs/en/03.damocles-worker-config.md index d79c8e87a..cec77ca7c 100644 --- a/docs/en/03.damocles-worker-config.md +++ b/docs/en/03.damocles-worker-config.md @@ -314,7 +314,7 @@ location = "/mnt/nvme1/store" # task type, optional, string type # Default value is null -# Optionally use `sealer` or `snapup`, when left blank, the default is `sealer` +# All options: sealer | snapup | rebuild | unseal | wdpost, when left blank, the default is `sealer` # plan = "snapup" # Custom parameters of the sealing process, only applies to the current worker thread @@ -391,7 +391,7 @@ The `sealing_thread` in `damocles-worker` will check the `config.toml` file in t # Task type, optional, string type # Default value is null -# Other options sealer | snapup | rebuild, if not filled, the default is sealer +# All options: sealer | snapup | rebuild | unseal | wdpost, if not filled, the default is sealer # plan = "rebuild" # Custom parameters of the sealing process, only effective on the current worker thread diff --git a/docs/en/04.damocles-manager-config.md b/docs/en/04.damocles-manager-config.md index dcfc64e3f..a58453280 100644 --- 
a/docs/en/04.damocles-manager-config.md +++ b/docs/en/04.damocles-manager-config.md @@ -43,7 +43,11 @@ After initialization, we can get a copy of the default configuration: #ParallelCheckLimit = 128 #SingleCheckTimeout = "10m0s" #PartitionCheckTimeout = "20m0s" -# +[Common.Proving.WorkerProver] +JobMaxTry = 2 +HeartbeatTimeout = "15s" +JobLifetime = "25h0m0s" + [[Miners]] #Actor = 10086 [Miners.Sector] @@ -351,6 +355,25 @@ example: #PartitionCheckTimeout = "20m0s" ``` +### [Common.Proving.WorkerProver] +Used to configure the worker prover module + +example: +```toml +# The maximum number of attempts of the WindowPoSt job, optional, number type +# Default is 2 +# job that exceeds the JobMaxTry number can only be re-executed by manual reset +JobMaxTry = 2 +# The timeout of the WindowPoSt job's heartbeat, optional, time type +# Default is 15s +# jobs that have not sent a heartbeat for more than this time will be set to fail and retried +HeartbeatTimeout = "15s" +# The heartbeat timeout of the WindowPoSt job, optional, time type +# Default is 25h +# WindowPoSt jobs created longer than this time will be deleted +JobLifetime = "25h0m0s" +``` + ### [Common.DB] `Common.DB` is used to configure KV database used by `damocles-manager` during sealing. Currently, the `badger` local database and `mongo` database are supported. diff --git "a/docs/zh/03.damocles-worker\347\232\204\351\205\215\347\275\256\350\247\243\346\236\220.md" "b/docs/zh/03.damocles-worker\347\232\204\351\205\215\347\275\256\350\247\243\346\236\220.md" index 4e3f9e848..ac6cd00e8 100644 --- "a/docs/zh/03.damocles-worker\347\232\204\351\205\215\347\275\256\350\247\243\346\236\220.md" +++ "b/docs/zh/03.damocles-worker\347\232\204\351\205\215\347\275\256\350\247\243\346\236\220.md" @@ -325,7 +325,7 @@ location = "/mnt/nvme1/store" # 任务类型,选填项,字符串类型 # 默认值为 null -# 可选填 sealer 或 snapup, 当不填写时,默认等效为 sealer +# 可选项: sealer | snapup | rebuild | unseal | wdpost, 当不填写时,默认等效为 sealer # plan = "snapup" # 封装过程的定制参数,仅对当前工作线程生效 @@ -409,7 +409,7 @@ damocles-worker 中的 `sealing_thread` 会在新的扇区任务开始之前检 # 任务类型,选填项,字符串类型 # 默认值为 null -# 可选填 sealer | snapup | rebuild, 当不填写时,默认等效为 sealer +# 可选项: sealer | snapup | rebuild | unseal | wdpost, 当不填写时,默认等效为 sealer # plan = "rebuild" # 封装过程的定制参数,仅对当前工作线程生效 diff --git "a/docs/zh/04.damocles-manager\347\232\204\351\205\215\347\275\256\350\247\243\346\236\220.md" "b/docs/zh/04.damocles-manager\347\232\204\351\205\215\347\275\256\350\247\243\346\236\220.md" index 82f554c23..243611baa 100644 --- "a/docs/zh/04.damocles-manager\347\232\204\351\205\215\347\275\256\350\247\243\346\236\220.md" +++ "b/docs/zh/04.damocles-manager\347\232\204\351\205\215\347\275\256\350\247\243\346\236\220.md" @@ -54,7 +54,11 @@ #ParallelCheckLimit = 128 #SingleCheckTimeout = "10m0s" #PartitionCheckTimeout = "20m0s" -# +[Common.Proving.WorkerProver] +JobMaxTry = 2 +HeartbeatTimeout = "15s" +JobLifetime = "25h0m0s" + [[Miners]] #Actor = 10086 [Miners.Sector] @@ -422,6 +426,25 @@ DatabaseName = "test" #PartitionCheckTimeout = "20m0s" ``` +### [Common.Proving.WorkerProver] +用于配置 worker prover 模块 + +配置范例: +```toml +# WindowPoSt 任务的最大尝试次数, 可选项, 数字类型 +# 默认值为 2 +# 尝试次数超过 JobMaxTry 的 WindowPoSt 任务只能通过手动 reset 的方式被重新执行 +JobMaxTry = 2 +# WindowPoSt 任务的心跳超时时间, 可选项, 时间字符串类型 +# 默认值为 15s +# 超过此时间没有发送心跳的任务将会被设置为失败并重试 +HeartbeatTimeout = "15s" +# WindowPoSt 任务的心跳超时时间, 可选项, 时间字符串类型 +# 默认值为 25h +# 创建时间超过此时间的 WindowPoSt 任务将会被删除 +JobLifetime = "25h0m0s" +``` + ### [Common.DB] `Common.DB` 用于配置 sealing 过程中使用的 KV 数据库。目前支持 `badger` 本地数据库和 `mongo` 数据库。 diff --git 
"a/docs/zh/09.\347\213\254\347\253\213\350\277\220\350\241\214\347\232\204poster\350\212\202\347\202\271.md" "b/docs/zh/09.\347\213\254\347\253\213\350\277\220\350\241\214\347\232\204poster\350\212\202\347\202\271.md" index 0bd5693ae..8034180f7 100644 --- "a/docs/zh/09.\347\213\254\347\253\213\350\277\220\350\241\214\347\232\204poster\350\212\202\347\202\271.md" +++ "b/docs/zh/09.\347\213\254\347\253\213\350\277\220\350\241\214\347\232\204poster\350\212\202\347\202\271.md" @@ -5,6 +5,163 @@ 以下,我们会介绍这些新的功能点,并提供一种通过这些功能完成独立 PoSter 节点部署的实践。后续文档都以开启 `--poster` 的节点作为示例,独立的 `--miner` 节点运作方式与之类似,不再单独阐述。 +--- + +在 v0.8.0 版本中,damocles 支持三种方式独立运行 PoSter 节点,分别是 worker-prover 模式、代理节点模式、ext-prover 模式(外部执行器模式)。 + +## worker-prover 模式 +worker-prover 模式是 v0.8.0 新增的功能,特点是简单,可以非常轻松的支持多机 wdpost。 + +### 基本原理 +worker-prover 模式利用 damocles-worker 计算 window post 证明, 通过 RPC 的方式从 damocles-manager 获取 window post 任务和返回计算的结果。 + +damocles-worker 新增 wdpost planner 用于执行 window post 任务。 +#### Architecture +``` + +-----------------------------------+ + | damocles-manager daemon | + | with --worker-prover falg | + | | + | +-----------------+ | + | |damocles-manager | | + | | poster module | | + | +-------+-^-------+ | + | send | |recv | + | | | | + | +-------v-+-------+ | + | | worker-prover | | + +--------+--------> module <--------+--------+ + | | +--------^--------+ | | + | | | | | + | +-----------------+-----------------+ | + | | | +-------+--------------------------+--------------------------+------------ + | | | + pull | job pull | job pull | job + push | res pull | res pull | res + by | rpc by | rpc by | rpc + | | | ++------+--------+ +-------+-------+ +------+--------+ +|damocles-worker| |damocles-worker| |damocles-worker| +|wdpost planner | |wdpost planner | ... |wdpost planner | ++---------------+ +---------------+ +---------------+ +``` + +### damocles-manager 配置与启动 + +新增配置: +```toml +# ~/.damocles-manager/sector-manager.cfg + +# ... + +[Common.Proving.WorkerProver] +# WindowPoSt 任务的最大尝试次数, 可选项, 数字类型 +# 默认值为 2 +# 尝试次数超过 JobMaxTry 的 WindowPoSt 任务只能通过手动 reset 的方式被重新执行 +JobMaxTry = 2 +# WindowPoSt 任务的心跳超时时间, 可选项, 时间字符串类型 +# 默认值为 15s +# 超过此时间没有发送心跳的任务将会被设置为失败并重试 +HeartbeatTimeout = "15s" +# WindowPoSt 任务的心跳超时时间, 可选项, 时间字符串类型 +# 默认值为 25h +# 创建时间超过此时间的 WindowPoSt 任务将会被删除 +JobLifetime = "25h0m0s" + +# ... +``` + +启动 damocles-manager 进程: +```sh +# --miner flag 可选添加,表示启动 miner 模块用于执行 WinningPoSt 并出块 +# --poster flag 必须添加,表示启动 WindowPoSt 模块 +# --worker-prover 必须添加,表示使用 WorkerProver 模块执行 WindowPoSt +./damocles-manager daemon run --miner --poster --worker-prover +``` + +### damocles-worker 配置 +```toml +[[sealing_thread]] +# 配置使用 wdpost plan +plan = "wdpost" + +[[attached]] +# 配置此 worker 执行 window post 任务过程中会用到的永久存储 +name = "miner-6666-store" +location = "/mnt/miner-6666-store" + +[[processors.wdpost]] +# 使用自定义 wdpost 算法 (可选) +bin="~/my_algorithm" +args = ["window_post"] +envs = { BELLMAN_GPU_INDEXS="0",CUDA_VISIBLE_DEVICES="0", ... } +``` + +##### 一份最简的只启动一个 wdpost sealing_thread 的配置如下: + +```toml +# /path/to/your-damocles-worker-config.toml + +[worker] +name = "damocles-worker-USA-01" + +[sector_manager] +rpc_client.addr = "/ip4/your-damocles-manager-address-here/tcp/1789" + +[[sealing_thread]] +plan = "wdpost" +# 尝试领取任务的时间间隔,默认为 60s, +# 针对 wdpost plan 我们可以调小此值便于更快的领取到新的 wdpost 任务 +sealing.recover_interval = "15s" +# sealing.allowed_miners = [6666] +# sealing.allowed_sizes = ["32GiB"] +# ... 
+ +[[attached]] +name = "miner-6666-store" +location = "/mnt/miner-6666-store" +``` + + +### 管理 window post 任务 +- #### 显示 window post 任务列表 +```sh +# 默认显示未完成的任务和失败的任务, 其中 DDL 字段表示任务的 deadline Index, Try 字段是任务的尝试次数 +./damocles-manager util worker wdpost list + +JobID MinerID DDL Worker State Try CreateAt Elapsed Error +3FgfEnvrub1 1037 3 10.122.63.30 ReadyToRun 1 07-27 16:37:31 - +gbCVH4TUgEf 1037 2 ReadyToRun 0 07-27 16:35:56 - +CrotWCLaXLa 1037 1 10.122.63.30 Succeed 1 07-27 17:19:04 6m38s(done) + +# 显示全部任务 +./damocles-manager util worker wdpost list --all +# ... + +# 显示 window post 任务详细信息 +./damocles-manager util worker wdpost list --detail +# ... +``` + +- #### 重置任务 +当 window post 任务执行失败且自动重试次数达到上限时,可以手动重置任务状态,使其可以继续被 damocles-worker 领取并执行。 +```sh +./damocles-manager util worker wdpost reset gbCVH4TUgEf 3FgfEnvrub1 +``` + +- #### 删除任务 +删除任务和重置任务能达到的效果类似。当执行了删除任务的命令后,damocles-manager 的重试机制会检测当前 deadline 的 window post 任务是否存在于数据库中,如果不存在则会重新下发一遍任务,并记录到数据库中。 + +另外 worker-prover 会自动的定时删除创建时间超过一定时间的任务 (默认为 25 小时,时间可配置)。 +```sh +# 删除指定的任务 +./damocles-manager util worker wdpost remove gbCVH4TUgEf 3FgfEnvrub1 + +# 删除全部任务 +./damocles-manager util worker wdpost remove-all --really-do-it +``` + ## 代理节点模式 我们知道,对于 PoSter 节点来说,最重要的能力是获取实时、准确的扇区定位信息。在当前 `damocles-manager` 版本中,我们暂时仅提供基于本地内嵌式 kv 数据库的元数据管理方式。 @@ -166,7 +323,4 @@ winning post 和 window post 之间不会因设备使用而形成冲突 当目前为止,我们已经讲解了独立 `PoSter` 节点依托的功能、原理和简单的使用范例。 但是,这种模式对于超大规模的 `SP` 集群仍然有一些局限性,具体体现在: -- 除非将配置拆分,让每个 `PoSter` 节点仅针对部分矿工提供时空证明支持,否则难以跨机器提供横向扩展能力; - 时空证明的调度、证明窗口期的严重冲突,仍然需要在一定程度依赖运维层面的调配; - -总体来说,上面这些局限性依赖于完全去状态化、分布式的 `damocles-manager` 实现,这也是我们未来会关注的方向之一。 From 49be5abcd0744f7c67e223af14ca9f95365aa830 Mon Sep 17 00:00:00 2001 From: 0x5459 <0x5459@protonmail.com> Date: Fri, 28 Jul 2023 15:06:48 +0800 Subject: [PATCH 15/18] rebase main --- .../sealing_thread/planner/common/event.rs | 4 - .../sealing_thread/planner/common/sector.rs | 1 - .../sealing/sealing_thread/planner/unseal.rs | 137 ++++-------------- 3 files changed, 26 insertions(+), 116 deletions(-) diff --git a/damocles-worker/src/sealing/sealing_thread/planner/common/event.rs b/damocles-worker/src/sealing/sealing_thread/planner/common/event.rs index ec22ac969..6738249d3 100644 --- a/damocles-worker/src/sealing/sealing_thread/planner/common/event.rs +++ b/damocles-worker/src/sealing/sealing_thread/planner/common/event.rs @@ -80,8 +80,6 @@ pub enum Event { UnsealDone(u64), UploadPieceDone, - - UnsealReady, } impl Debug for Event { @@ -150,8 +148,6 @@ impl Debug for Event { Self::UnsealDone(_) => "Unsealed", Self::UploadPieceDone => "UploadPieceDone", - - Self::UnsealReady => "UnsealReady", }; f.write_str(name) diff --git a/damocles-worker/src/sealing/sealing_thread/planner/common/sector.rs b/damocles-worker/src/sealing/sealing_thread/planner/common/sector.rs index 45281400f..b60832d60 100644 --- a/damocles-worker/src/sealing/sealing_thread/planner/common/sector.rs +++ b/damocles-worker/src/sealing/sealing_thread/planner/common/sector.rs @@ -82,7 +82,6 @@ def_state! 
{ SnapTreeDBuilt, SnapDone, Unsealed, - UnsealPrepared, } impl std::fmt::Debug for State { diff --git a/damocles-worker/src/sealing/sealing_thread/planner/unseal.rs b/damocles-worker/src/sealing/sealing_thread/planner/unseal.rs index 08e1c205c..c059733a9 100644 --- a/damocles-worker/src/sealing/sealing_thread/planner/unseal.rs +++ b/damocles-worker/src/sealing/sealing_thread/planner/unseal.rs @@ -21,9 +21,7 @@ use vc_processors::{ fil_proofs::{UnpaddedByteIndex, UnpaddedBytesAmount}, }; -use crate::sealing::processor::{ - cached_filenames_for_sector, TransferInput, TransferItem, TransferOption, TransferRoute, TransferStoreInfo, -}; +use crate::sealing::processor::{TransferInput, TransferItem, TransferRoute, TransferStoreInfo}; #[derive(Default)] pub struct UnsealPlanner; @@ -46,9 +44,6 @@ impl PlannerTrait for UnsealPlanner { Event::AllocatedUnsealSector(_) => State::Allocated, }, State::Allocated => { - Event::UnsealReady => State::UnsealPrepared, - }, - State::UnsealPrepared => { Event::UnsealDone(_) => State::Unsealed, }, State::Unsealed => { @@ -65,8 +60,7 @@ impl PlannerTrait for UnsealPlanner { match state { State::Empty => inner.acquire_task(), - State::Allocated => inner.pre_unseal(), - State::UnsealPrepared => inner.unseal(), + State::Allocated => inner.unseal(), State::Unsealed => inner.upload_piece(), State::Finished => return Ok(None), @@ -114,123 +108,44 @@ impl<'t> Unseal<'t> { Ok(Event::AllocatedUnsealSector(allocated)) } - fn pre_unseal(&self) -> Result { - let _token = self.task.sealing_ctrl.ctx().global.limit.acquire(STAGE_NAME_TRANSFER).crit()?; + fn unseal(&self) -> Result { + // query token + let _token = self.task.sealing_ctrl.ctx().global.limit.acquire(STAGE_NAME_UNSEAL).crit()?; - // persist store -> thread store let sector_id = self.task.sector_id()?; let proof_type = self.task.sector_proof_type()?; + + field_required!(unseal_info, self.task.sector.phases.unseal_in.as_ref()); field_required!( - access_instance, + instance_name, self.task.sector.finalized.as_ref().map(|f| &f.private.access_instance) ); - debug!("find access store named {}", access_instance); - let access_store = self + debug!("find access store named {}", instance_name); + let instance = self .task .sealing_ctrl .ctx() .global .attached - .get(access_instance) - .with_context(|| format!("get access store instance named {}", access_instance)) - .perm()?; - - debug!("get basic info for access store named {}", access_instance); - let access_store_basic_info = call_rpc! { - self.task.rpc() => store_basic_info(access_instance.clone(),) - }? 
- .with_context(|| format!("get basic info for store named {}", access_instance)) - .perm()?; - - // sealed file & persisted cache files should be accessed inside persist store - let sealed_file = self.task.sealed_file(sector_id); - sealed_file.prepare().perm()?; - let sealed_rel = sealed_file.rel(); - - let cache_dir = self.task.cache_dir(sector_id); - - let cached_file_routes = cached_filenames_for_sector(proof_type.into()) - .into_iter() - .map(|fname| { - let cached_file = cache_dir.join(fname); - let cached_rel = cached_file.rel(); - - Ok(TransferRoute { - src: TransferItem { - store_name: Some(access_instance.clone()), - uri: access_store - .uri(cached_rel) - .with_context(|| format!("get uri for cache dir {:?} in {}", cached_rel, access_instance)) - .perm()?, - }, - dest: TransferItem { - store_name: None, - uri: cached_file.full().clone(), - }, - opt: Some(TransferOption { - is_dir: false, - allow_link: true, - }), - }) - }) - .collect::, Failure>>()?; - - let mut transfer_routes = vec![TransferRoute { - src: TransferItem { - store_name: Some(access_instance.clone()), - uri: access_store - .uri(sealed_rel) - .with_context(|| format!("get uri for sealed file {:?} in {}", sealed_rel, access_instance)) - .perm()?, - }, - dest: TransferItem { - store_name: None, - uri: sealed_file.full().clone(), - }, - opt: Some(TransferOption { - is_dir: false, - allow_link: true, - }), - }]; - - transfer_routes.extend(cached_file_routes.into_iter()); - - let transfer = TransferInput { - stores: HashMap::from_iter([( - access_instance.clone(), - TransferStoreInfo { - name: access_instance.clone(), - meta: access_store_basic_info.meta, - }, - )]), - routes: transfer_routes, - }; - - self.task - .sealing_ctrl - .ctx() - .global - .processors - .transfer - .process(transfer) - .context("link unseal sector files") + .get(instance_name) + .with_context(|| format!("get access store instance named {}", instance_name)) .perm()?; - Ok(Event::UnsealReady) - } + let sealed_temp = self.task.sealed_file(sector_id); + let sealed_rel = sealed_temp.rel(); - fn unseal(&self) -> Result { - // query token - let _token = self.task.sealing_ctrl.ctx().global.limit.acquire(STAGE_NAME_UNSEAL).crit()?; + let cache_temp = self.task.cache_dir(sector_id); + let cache_rel = cache_temp.rel(); - let sector_id = self.task.sector_id()?; - let proof_type = self.task.sector_proof_type()?; - - field_required!(unseal_info, self.task.sector.phases.unseal_in.as_ref()); - - let cache_dir = self.task.cache_dir(sector_id); - let sealed_file = self.task.sealed_file(sector_id); + let sealed_path = instance + .uri(sealed_rel) + .with_context(|| format!("get uri for sealed file {:?} in {}", sealed_rel, instance_name)) + .perm()?; + let cache_path = instance + .uri(cache_rel) + .with_context(|| format!("get uri for cache file {:?} in {}", cache_rel, instance_name)) + .perm()?; let piece_file = self.task.piece_file(&unseal_info.piece_cid); if piece_file.full().exists() { @@ -261,8 +176,8 @@ impl<'t> Unseal<'t> { sector_id, comm_d: unseal_info.comm_d, ticket: ticket.ticket.0, - cache_dir: cache_dir.into(), - sealed_file: sealed_file.into(), + cache_dir: cache_path, + sealed_file: sealed_path, unsealed_output: piece_file.into(), offset: UnpaddedByteIndex(unseal_info.offset), num_bytes: UnpaddedBytesAmount(unseal_info.size), From a082196b3d1d3497aa005a4df3daf94672014dcf Mon Sep 17 00:00:00 2001 From: 0x5459 <0x5459@protonmail.com> Date: Fri, 28 Jul 2023 16:54:40 +0800 Subject: [PATCH 16/18] chore(unified name): wdpost -> window_post --- 
damocles-worker/src/config.rs | 4 ++-- damocles-worker/src/run.rs | 4 ++-- .../sealing/sealing_thread/planner/wdpost.rs | 6 ++++-- damocles-worker/src/watchdog.rs | 2 +- ...\232\204poster\350\212\202\347\202\271.md" | 20 +++++++++++++------ 5 files changed, 23 insertions(+), 13 deletions(-) diff --git a/damocles-worker/src/config.rs b/damocles-worker/src/config.rs index 6823abeb2..a2dbdfb89 100644 --- a/damocles-worker/src/config.rs +++ b/damocles-worker/src/config.rs @@ -210,8 +210,8 @@ pub struct Processors { /// section for unseal processor pub unseal: Option>, - /// section for fetch processor - pub wdpost: Option>, + /// section for window_post processor + pub window_post: Option>, } impl Processors { diff --git a/damocles-worker/src/run.rs b/damocles-worker/src/run.rs index bcaeb416c..9dcfb324f 100644 --- a/damocles-worker/src/run.rs +++ b/damocles-worker/src/run.rs @@ -313,7 +313,7 @@ fn start_processors(cfg: &config::Config, locks: &Arc) -> Result let unseal: processor::ArcUnsealProcessor = construct_sub_processor!(unseal, cfg, locks); - let wdpost: processor::ArcWdPostProcessor = construct_sub_processor!(wdpost, cfg, locks); + let window_post: processor::ArcWdPostProcessor = construct_sub_processor!(window_post, cfg, locks); Ok(GlobalProcessors { add_pieces, @@ -325,7 +325,7 @@ fn start_processors(cfg: &config::Config, locks: &Arc) -> Result snap_prove, transfer, unseal, - wdpost, + window_post, }) } diff --git a/damocles-worker/src/sealing/sealing_thread/planner/wdpost.rs b/damocles-worker/src/sealing/sealing_thread/planner/wdpost.rs index 911ed78dd..6558257ab 100644 --- a/damocles-worker/src/sealing/sealing_thread/planner/wdpost.rs +++ b/damocles-worker/src/sealing/sealing_thread/planner/wdpost.rs @@ -8,7 +8,7 @@ use crossbeam_channel::{bounded, Receiver, Sender}; use jsonrpc_core::ErrorCode; use jsonrpc_core_client::RpcError; use tokio::runtime::Handle; -use vc_processors::builtin::tasks::{PoStReplicaInfo, WindowPoSt, WindowPoStOutput}; +use vc_processors::builtin::tasks::{PoStReplicaInfo, WindowPoSt, WindowPoStOutput, STAGE_NAME_WINDOW_POST}; use crate::logging::warn; use crate::rpc::sealer::{AllocatePoStSpec, AllocatedWdPoStJob, SectorID}; @@ -337,6 +337,8 @@ impl WdPost<'_> { } fn generate(&self) -> Result { + let _token = self.job.sealing_ctrl.ctx().global.limit.acquire(STAGE_NAME_WINDOW_POST).crit()?; + let wdpost_job = self.job.wdpost_job.as_ref().context("wdpost info not found").abort()?; let mut instances = HashMap::new(); @@ -410,7 +412,7 @@ impl WdPost<'_> { replicas: replica, seed: wdpost_job.input.seed, }; - let res = self.job.sealing_ctrl.ctx().global.processors.wdpost.process(post_in); + let res = self.job.sealing_ctrl.ctx().global.processors.window_post.process(post_in); if let Err(e) = &res { tracing::error!(err=?e, job_id=wdpost_job.id,"wdpost error"); } diff --git a/damocles-worker/src/watchdog.rs b/damocles-worker/src/watchdog.rs index 3a6e09b48..9728cd725 100644 --- a/damocles-worker/src/watchdog.rs +++ b/damocles-worker/src/watchdog.rs @@ -61,7 +61,7 @@ pub struct GlobalProcessors { pub snap_prove: ArcSnapProveProcessor, pub transfer: ArcTransferProcessor, pub unseal: ArcUnsealProcessor, - pub wdpost: ArcWdPostProcessor, + pub window_post: ArcWdPostProcessor, } impl Module for Box { diff --git "a/docs/zh/09.\347\213\254\347\253\213\350\277\220\350\241\214\347\232\204poster\350\212\202\347\202\271.md" "b/docs/zh/09.\347\213\254\347\253\213\350\277\220\350\241\214\347\232\204poster\350\212\202\347\202\271.md" index 8034180f7..f4b9297d2 100644 --- 
"a/docs/zh/09.\347\213\254\347\253\213\350\277\220\350\241\214\347\232\204poster\350\212\202\347\202\271.md" +++ "b/docs/zh/09.\347\213\254\347\253\213\350\277\220\350\241\214\347\232\204poster\350\212\202\347\202\271.md" @@ -37,9 +37,9 @@ damocles-worker 新增 wdpost planner 用于执行 window post 任务。 | | | -------+--------------------------+--------------------------+------------ | | | - pull | job pull | job pull | job - push | res pull | res pull | res - by | rpc by | rpc by | rpc + | pull job | pull job | pull job + | push res | push res | push res + | by rpc | by rpc | by rpc | | | +------+--------+ +-------+-------+ +------+--------+ |damocles-worker| |damocles-worker| |damocles-worker| @@ -91,11 +91,19 @@ plan = "wdpost" name = "miner-6666-store" location = "/mnt/miner-6666-store" -[[processors.wdpost]] -# 使用自定义 wdpost 算法 (可选) + +# 控制 window_post 任务并发 (可选),不配置则不限制 +[processors.limitation.concurrent] +window_post = 2 + +[[processors.window_post]] +# 使用自定义 wdpost 算法 (可选), 如果不配置 bin,则默认使用内置算法 bin="~/my_algorithm" args = ["window_post"] -envs = { BELLMAN_GPU_INDEXS="0",CUDA_VISIBLE_DEVICES="0", ... } +# 配置自定义算法的环境变量 (可选) +envs = { BELLMAN_GPU_INDEXS="0", CUDA_VISIBLE_DEVICES="0", ... } +# 配置本进程最大并发数量 (可选),不配置则不限制 +concurrent = 1 ``` ##### 一份最简的只启动一个 wdpost sealing_thread 的配置如下: From 916a5294bc533132398826b547c67128899d316e Mon Sep 17 00:00:00 2001 From: 0x5459 <0x5459@protonmail.com> Date: Fri, 28 Jul 2023 17:19:48 +0800 Subject: [PATCH 17/18] chore: make lint happy --- damocles-manager/core/types_wdpost.go | 3 +- .../modules/impl/prover/worker/prover.go | 36 +++++++++---------- 2 files changed, 18 insertions(+), 21 deletions(-) diff --git a/damocles-manager/core/types_wdpost.go b/damocles-manager/core/types_wdpost.go index be18cdd9a..5a0627901 100644 --- a/damocles-manager/core/types_wdpost.go +++ b/damocles-manager/core/types_wdpost.go @@ -74,9 +74,8 @@ func (t *WdPoStJob) DisplayState() string { case WdPoStJobFinished: if t.Succeed() { return "Succeed" - } else { - return "Failed" } + return "Failed" } return t.State } diff --git a/damocles-manager/modules/impl/prover/worker/prover.go b/damocles-manager/modules/impl/prover/worker/prover.go index e514649a6..c7266f596 100644 --- a/damocles-manager/modules/impl/prover/worker/prover.go +++ b/damocles-manager/modules/impl/prover/worker/prover.go @@ -21,7 +21,7 @@ import ( var log = logging.New("worker prover") -var _ core.Prover = (*WorkerProver)(nil) +var _ core.Prover = (*Prover)(nil) var ErrJobRemovedManually = fmt.Errorf("job was manually removed") @@ -49,7 +49,7 @@ type Config struct { JobLifetime time.Duration } -type WorkerProver struct { +type Prover struct { jobMgr core.WorkerWdPoStJobManager sectorTracker core.SectorTracker localProver core.Prover @@ -59,8 +59,8 @@ type WorkerProver struct { config *Config } -func NewProver(jobMgr core.WorkerWdPoStJobManager, sectorTracker core.SectorTracker, config *Config) *WorkerProver { - return &WorkerProver{ +func NewProver(jobMgr core.WorkerWdPoStJobManager, sectorTracker core.SectorTracker, config *Config) *Prover { + return &Prover{ jobMgr: jobMgr, sectorTracker: sectorTracker, localProver: prover.NewProdProver(sectorTracker), @@ -70,13 +70,13 @@ func NewProver(jobMgr core.WorkerWdPoStJobManager, sectorTracker core.SectorTrac } } -func (p *WorkerProver) Start(ctx context.Context) { +func (p *Prover) Start(ctx context.Context) { go p.runNotifyJobDone(ctx) go p.runRetryFailedJobs(ctx) go p.runCleanupExpiredJobs(ctx) } -func (p *WorkerProver) runNotifyJobDone(ctx context.Context) { +func (p 
*Prover) runNotifyJobDone(ctx context.Context) { ticker := time.NewTicker(3 * time.Second) defer ticker.Stop() for { @@ -98,10 +98,8 @@ func (p *WorkerProver) runNotifyJobDone(ctx context.Context) { } // find all manually deleted jobs - var ( - removed []string - notRemoved map[string]struct{} = make(map[string]struct{}) - ) + var removed []string + notRemoved := make(map[string]struct{}) for _, job := range jobs { notRemoved[job.ID] = struct{}{} } @@ -160,7 +158,7 @@ func (p *WorkerProver) runNotifyJobDone(ctx context.Context) { } } -func (p *WorkerProver) runRetryFailedJobs(ctx context.Context) { +func (p *Prover) runRetryFailedJobs(ctx context.Context) { ticker := time.NewTicker(p.config.RetryFailedJobsInterval) defer ticker.Stop() for { @@ -180,7 +178,7 @@ func (p *WorkerProver) runRetryFailedJobs(ctx context.Context) { } } -func (p *WorkerProver) runCleanupExpiredJobs(ctx context.Context) { +func (p *Prover) runCleanupExpiredJobs(ctx context.Context) { ticker := time.NewTicker(p.config.CleanupExpiredJobsInterval) for { if err := p.jobMgr.CleanupExpiredJobs(ctx, p.config.JobLifetime, 128); err != nil { @@ -196,11 +194,11 @@ func (p *WorkerProver) runCleanupExpiredJobs(ctx context.Context) { } } -func (p *WorkerProver) AggregateSealProofs(ctx context.Context, aggregateInfo core.AggregateSealVerifyProofAndInfos, proofs [][]byte) ([]byte, error) { +func (p *Prover) AggregateSealProofs(ctx context.Context, aggregateInfo core.AggregateSealVerifyProofAndInfos, proofs [][]byte) ([]byte, error) { return p.localProver.AggregateSealProofs(ctx, aggregateInfo, proofs) } -func (p *WorkerProver) GenerateWindowPoSt(ctx context.Context, deadlineIdx uint64, minerID abi.ActorID, proofType abi.RegisteredPoStProof, sectors []builtin.ExtendedSectorInfo, randomness abi.PoStRandomness) (proof []builtin.PoStProof, skipped []abi.SectorID, err error) { +func (p *Prover) GenerateWindowPoSt(ctx context.Context, deadlineIdx uint64, minerID abi.ActorID, proofType abi.RegisteredPoStProof, sectors []builtin.ExtendedSectorInfo, randomness abi.PoStRandomness) (proof []builtin.PoStProof, skipped []abi.SectorID, err error) { randomness[31] &= 0x3f sis := make([]core.WdPoStSectorInfo, len(sectors)) @@ -263,7 +261,7 @@ func (p *WorkerProver) GenerateWindowPoSt(ctx context.Context, deadlineIdx uint6 return proofs, nil, nil } -func (p *WorkerProver) doWindowPoSt(ctx context.Context, deadlineIdx uint64, input core.WdPoStInput) (output *stage.WindowPoStOutput, err error) { +func (p *Prover) doWindowPoSt(ctx context.Context, deadlineIdx uint64, input core.WdPoStInput) (output *stage.WindowPoStOutput, err error) { job, err := p.jobMgr.Create(ctx, deadlineIdx, input) if err != nil { return nil, fmt.Errorf("create wdPoSt job: %w", err) @@ -289,18 +287,18 @@ func (p *WorkerProver) doWindowPoSt(ctx context.Context, deadlineIdx uint64, inp return } -func (p *WorkerProver) GenerateWinningPoSt(ctx context.Context, minerID abi.ActorID, proofType abi.RegisteredPoStProof, sectors []builtin.ExtendedSectorInfo, randomness abi.PoStRandomness) ([]builtin.PoStProof, error) { +func (p *Prover) GenerateWinningPoSt(ctx context.Context, minerID abi.ActorID, proofType abi.RegisteredPoStProof, sectors []builtin.ExtendedSectorInfo, randomness abi.PoStRandomness) ([]builtin.PoStProof, error) { return p.localProver.GenerateWinningPoSt(ctx, minerID, proofType, sectors, randomness) } -func (p *WorkerProver) GeneratePoStFallbackSectorChallenges(ctx context.Context, proofType abi.RegisteredPoStProof, minerID abi.ActorID, randomness abi.PoStRandomness, 
sectorIds []abi.SectorNumber) (*core.FallbackChallenges, error) { +func (p *Prover) GeneratePoStFallbackSectorChallenges(ctx context.Context, proofType abi.RegisteredPoStProof, minerID abi.ActorID, randomness abi.PoStRandomness, sectorIds []abi.SectorNumber) (*core.FallbackChallenges, error) { return p.localProver.GeneratePoStFallbackSectorChallenges(ctx, proofType, minerID, randomness, sectorIds) } -func (p *WorkerProver) GenerateSingleVanillaProof(ctx context.Context, replica core.FFIPrivateSectorInfo, challenges []uint64) ([]byte, error) { +func (p *Prover) GenerateSingleVanillaProof(ctx context.Context, replica core.FFIPrivateSectorInfo, challenges []uint64) ([]byte, error) { return p.localProver.GenerateSingleVanillaProof(ctx, replica, challenges) } -func (p *WorkerProver) GenerateWinningPoStWithVanilla(ctx context.Context, proofType abi.RegisteredPoStProof, minerID abi.ActorID, randomness abi.PoStRandomness, proofs [][]byte) ([]core.PoStProof, error) { +func (p *Prover) GenerateWinningPoStWithVanilla(ctx context.Context, proofType abi.RegisteredPoStProof, minerID abi.ActorID, randomness abi.PoStRandomness, proofs [][]byte) ([]core.PoStProof, error) { return p.localProver.GenerateWinningPoStWithVanilla(ctx, proofType, minerID, randomness, proofs) } From 554101429cdf8eaaa3efc3a6635c57a458454977 Mon Sep 17 00:00:00 2001 From: 0x5459 <0x5459@protonmail.com> Date: Fri, 28 Jul 2023 18:04:19 +0800 Subject: [PATCH 18/18] pref: reduce the amount of rpc data transfer --- damocles-manager/core/api.go | 2 +- damocles-manager/core/types_wdpost.go | 11 +++++++++++ damocles-manager/go.mod | 2 +- damocles-manager/modules/impl/prover/worker/rpc.go | 14 ++++++++++++-- 4 files changed, 25 insertions(+), 4 deletions(-) diff --git a/damocles-manager/core/api.go b/damocles-manager/core/api.go index 3c8dce840..a4799a6a7 100644 --- a/damocles-manager/core/api.go +++ b/damocles-manager/core/api.go @@ -158,5 +158,5 @@ type WorkerWdPoStAPI interface { WdPoStFinishJob(ctx context.Context, jobID string, output *stage.WindowPoStOutput, errorReason string) (Meta, error) WdPoStResetJob(ctx context.Context, jobID string) (Meta, error) WdPoStRemoveJob(ctx context.Context, jobID string) (Meta, error) - WdPoStAllJobs(ctx context.Context) ([]*WdPoStJob, error) + WdPoStAllJobs(ctx context.Context) ([]WdPoStJobBrief, error) } diff --git a/damocles-manager/core/types_wdpost.go b/damocles-manager/core/types_wdpost.go index 5a0627901..fb3677459 100644 --- a/damocles-manager/core/types_wdpost.go +++ b/damocles-manager/core/types_wdpost.go @@ -2,6 +2,7 @@ package core import ( "context" + "encoding/json" "time" "github.com/filecoin-project/go-state-types/abi" @@ -80,6 +81,16 @@ func (t *WdPoStJob) DisplayState() string { return t.State } +type WdPoStJobBrief struct { + *WdPoStJob +} + +func (j *WdPoStJobBrief) MarshalJSON() ([]byte, error) { + j.WdPoStJob.Input = WdPoStInput{} + j.WdPoStJob.Output = &stage.WindowPoStOutput{} + return json.Marshal(&j.WdPoStJob) +} + type WdPoStAllocatedJob struct { ID string `json:"Id"` Input WdPoStInput diff --git a/damocles-manager/go.mod b/damocles-manager/go.mod index bb9659518..c0bd53c10 100644 --- a/damocles-manager/go.mod +++ b/damocles-manager/go.mod @@ -39,6 +39,7 @@ require ( github.com/jbenet/go-random v0.0.0-20190219211222-123a90aedc0c github.com/libp2p/go-libp2p v0.23.4 github.com/mitchellh/go-homedir v1.1.0 + github.com/mr-tron/base58 v1.2.0 github.com/mroth/weightedrand v0.4.1 github.com/multiformats/go-multiaddr v0.8.0 github.com/multiformats/go-multihash v0.2.1 @@ -169,7 +170,6 
@@ require ( github.com/minio/blake2b-simd v0.0.0-20160723061019-3f5f724cb5b1 // indirect github.com/minio/sha256-simd v1.0.0 // indirect github.com/montanaflynn/stats v0.6.6 // indirect - github.com/mr-tron/base58 v1.2.0 // indirect github.com/multiformats/go-base32 v0.1.0 // indirect github.com/multiformats/go-base36 v0.1.0 // indirect github.com/multiformats/go-multiaddr-dns v0.3.1 // indirect diff --git a/damocles-manager/modules/impl/prover/worker/rpc.go b/damocles-manager/modules/impl/prover/worker/rpc.go index d3879ed3a..4eb2d84f0 100644 --- a/damocles-manager/modules/impl/prover/worker/rpc.go +++ b/damocles-manager/modules/impl/prover/worker/rpc.go @@ -48,6 +48,16 @@ func (api WdPoStAPIImpl) WdPoStRemoveJob(ctx context.Context, jobID string) (cor return nil, err } -func (api WdPoStAPIImpl) WdPoStAllJobs(ctx context.Context) ([]*core.WdPoStJob, error) { - return api.jobMgr.All(ctx, func(_ *core.WdPoStJob) bool { return true }) +func (api WdPoStAPIImpl) WdPoStAllJobs(ctx context.Context) ([]core.WdPoStJobBrief, error) { + jobs, err := api.jobMgr.All(ctx, func(_ *core.WdPoStJob) bool { return true }) + if err != nil { + return nil, err + } + ret := make([]core.WdPoStJobBrief, len(jobs)) + for i, job := range jobs { + ret[i] = core.WdPoStJobBrief{ + WdPoStJob: job, + } + } + return ret, nil }
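
The final patch trims the `WdPoStAllJobs` RPC payload by returning `WdPoStJobBrief` values whose custom `MarshalJSON` blanks the bulky `Input`/`Output` fields before encoding. The sketch below reproduces that technique in isolation; `Job` and `JobBrief` are simplified stand-ins rather than the real damocles-manager types, and as an assumption of the sketch it blanks a copy of the job instead of the embedded pointer so the caller's value is left untouched.

```go
package main

// Minimal sketch of the "brief" marshalling idea: embed the full job, but strip
// the heavy fields in MarshalJSON so listing RPCs ship only job metadata.
// Job/JobBrief are illustrative stand-ins, not the real core.WdPoStJob types.

import (
	"encoding/json"
	"fmt"
)

type Job struct {
	ID     string
	State  string
	Input  []byte // stands in for the large WdPoSt input payload
	Output []byte // stands in for the WindowPoSt output payload
}

type JobBrief struct {
	*Job
}

func (b JobBrief) MarshalJSON() ([]byte, error) {
	if b.Job == nil {
		return []byte("null"), nil
	}
	// Blank a copy so the caller's Job keeps its Input/Output intact.
	trimmed := *b.Job
	trimmed.Input = nil
	trimmed.Output = nil
	return json.Marshal(trimmed) // plain struct, so no recursion back into this method
}

func main() {
	job := &Job{ID: "3FgfEnvrub1", State: "ReadyToRun", Input: make([]byte, 1<<20)}
	full, _ := json.Marshal(job)
	brief, _ := json.Marshal(JobBrief{Job: job})
	fmt.Printf("full: %d bytes, brief: %d bytes\n", len(full), len(brief))
}
```

With this toy data the brief encoding is a few dozen bytes while the full encoding carries the megabyte of input, which is the point of returning briefs from the all-jobs listing.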
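
Earlier in the series, the worker prover routes results back to waiting `GenerateWindowPoSt` calls through a map of per-job result channels: each call registers a buffered channel under its job ID, and the background notify loop later pushes either the job output or a sentinel error such as `ErrJobRemovedManually` to every waiter and closes the channel. The following sketch shows that fan-out pattern on its own; the `notifier` and `result` names are illustrative, not the actual prover types.

```go
package main

// Sketch of the per-job channel fan-out: register buffered channels per job ID,
// deliver one result to each waiter, then close the channels.

import (
	"errors"
	"fmt"
	"sync"
)

var errRemovedManually = errors.New("job was manually removed")

type result struct {
	output string
	err    error
}

type notifier struct {
	mu       sync.Mutex
	inflight map[string][]chan result
}

func (n *notifier) wait(jobID string) <-chan result {
	ch := make(chan result, 1) // buffered so notify never blocks on a slow waiter
	n.mu.Lock()
	n.inflight[jobID] = append(n.inflight[jobID], ch)
	n.mu.Unlock()
	return ch
}

func (n *notifier) notify(jobID string, res result) {
	n.mu.Lock()
	chs := n.inflight[jobID]
	delete(n.inflight, jobID)
	n.mu.Unlock()
	for _, ch := range chs {
		ch <- res
		close(ch)
	}
}

func main() {
	n := &notifier{inflight: make(map[string][]chan result)}
	ch := n.wait("3FgfEnvrub1")
	n.notify("3FgfEnvrub1", result{err: errRemovedManually})
	fmt.Println((<-ch).err) // job was manually removed
}
```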