Skip to content

Commit

Permalink
* replace all usages of prop `AllAfterInTrackingOrParsed` with `AllParsed`, partial revert b81b64a
Browse files Browse the repository at this point in the history

* rename prop `AllAfter` to `AllAfterInTrackingOrParsed`
* rename prop `Existing` to `ExistingInTracking`
@ SaverChangeSet.cs

* move entity class `ThreadMissingFirstReply` & `ReplySignature` into nested namespace `Related`
@ crawler

* mark prop `ImageId` as `virtual` to prevent shadowing prop with the same name in nested class `AsKey` @ EntityWithImageId.cs
@ shared
@ c#
  • Loading branch information
n0099 committed Jul 27, 2024
1 parent d4e3f0d commit 50c22e7
Show file tree
Hide file tree
Showing 7 changed files with 15 additions and 14 deletions.
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
// ReSharper disable PropertyCanBeMadeInitOnly.Global
namespace tbm.Crawler.Db.Post;
namespace tbm.Crawler.Db.Post.Related;

public class ReplySignature : RowVersionedEntity
{
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
// ReSharper disable PropertyCanBeMadeInitOnly.Global
namespace tbm.Crawler.Db.Post;
namespace tbm.Crawler.Db.Post.Related;

public class ThreadMissingFirstReply : RowVersionedEntity
{
Expand Down
1 change: 1 addition & 0 deletions c#/crawler/src/GlobalUsings.cs
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
global using tbm.Crawler.Db;
global using tbm.Crawler.Db.Post;
global using tbm.Crawler.Db.Post.PostContent;
global using tbm.Crawler.Db.Post.Related;
global using tbm.Crawler.Db.Revision;
global using tbm.Crawler.Db.Revision.Splitting;
global using tbm.Crawler.Tieba;
Expand Down
6 changes: 3 additions & 3 deletions c#/crawler/src/Tieba/Crawl/CrawlPost.cs
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ public async Task<SavedThreadsList> CrawlThreads
if (currentPageChangeSet != null)
{
savedThreads.Add(currentPageChangeSet);
var threadsLatestReplyPostedAt = currentPageChangeSet.AllAfter
var threadsLatestReplyPostedAt = currentPageChangeSet.AllParsed
.Select(th => th.LatestReplyPostedAt).ToList();
minLatestReplyPostedAt = threadsLatestReplyPostedAt.Min();
if (crawlingPage == 1)
Expand Down Expand Up @@ -76,7 +76,7 @@ public async Task<SavedRepliesKeyByTid> CrawlReplies
.Aggregate(new HashSet<Tid>(), (shouldCrawl, threads) =>
{
shouldCrawl.UnionWith(threads.NewlyAdded.Select(th => th.Tid));
shouldCrawl.UnionWith(threads.Existing
shouldCrawl.UnionWith(threads.ExistingInTracking
.Where(t => t.Before.ReplyCount != t.After.ReplyCount
|| t.Before.LatestReplyPostedAt != t.After.LatestReplyPostedAt
|| t.Before.LatestReplierId != t.After.LatestReplierId)
Expand Down Expand Up @@ -108,7 +108,7 @@ public async Task CrawlSubReplies(
var (tid, replies) = pair;
shouldCrawl.UnionWith(replies.NewlyAdded
.Where(r => r.SubReplyCount != null).Select(r => (tid, r.Pid)));
shouldCrawl.UnionWith(replies.Existing.Where(t =>
shouldCrawl.UnionWith(replies.ExistingInTracking.Where(t =>
{
var (before, after) = t;
return after.SubReplyCount != null && before.SubReplyCount != after.SubReplyCount;
Expand Down
4 changes: 2 additions & 2 deletions c#/crawler/src/Tieba/Crawl/Saver/SaverChangeSet.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ public class SaverChangeSet<TPostEntity, TParsedPost>(
where TPostEntity : IPost
where TParsedPost : TPostEntity, IPost.IParsed
{
public IReadOnlyCollection<(TPostEntity Before, TPostEntity After)> Existing { get; } = existingBefore
public IReadOnlyCollection<(TPostEntity Before, TPostEntity After)> ExistingInTracking { get; } = existingBefore
.OrderBy(postIdSelector)
.EquiZip(existingAfter
.IntersectBy(existingBefore.Select(postIdSelector), postIdSelector)
Expand All @@ -21,7 +21,7 @@ public class SaverChangeSet<TPostEntity, TParsedPost>(
.ToList().AsReadOnly();

// https://stackoverflow.com/questions/3404975/left-outer-join-in-linq/23558389#23558389
public IReadOnlyCollection<TPostEntity> AllAfter { get; } = (
public IReadOnlyCollection<TPostEntity> AllAfterInTrackingOrParsed { get; } = (
from notTracked in parsed
join inTracking in existingAfter
on postIdSelector(notTracked) equals postIdSelector(inTracking) into inTrackings
Expand Down
10 changes: 5 additions & 5 deletions c#/crawler/src/Worker/ArchiveCrawlWorker.cs
Original file line number Diff line number Diff line change
Expand Up @@ -64,14 +64,14 @@ string GetHumanizedElapsedTimeThenRestart()
if (cancellationToken.IsCancellationRequested) return;
var savedThreads = await CrawlThreads((Page)page, _forumName, _fid, cancellationToken);
if (savedThreads == null) return;
var savedThreadCount = savedThreads.AllAfter.Count;
var savedThreadCount = savedThreads.AllParsed.Count;
logger.LogInformation("Archive for {} threads in the page {} of forum {} finished after {:F2}s",
savedThreadCount, page, _forumName, GetHumanizedElapsedTimeThenRestart());
_ = Interlocked.Add(ref totalSavedThreadCount, savedThreadCount);

if (cancellationToken.IsCancellationRequested) return;
var savedReplies = await CrawlReplies(savedThreads, _fid, cancellationToken);
var savedReplyCount = savedReplies.Sum(pair => pair.Value.AllAfter.Count);
var savedReplyCount = savedReplies.Sum(pair => pair.Value.AllParsed.Count);
logger.LogInformation("Archive for {} replies within {} threads in the page {} of forum {} finished after {:F2}s",
savedReplyCount, savedThreadCount, page, _forumName, GetHumanizedElapsedTimeThenRestart());
_ = Interlocked.Add(ref totalSavedReplyCount, savedReplyCount);
Expand Down Expand Up @@ -143,7 +143,7 @@ private async Task<SavedRepliesKeyByTid> CrawlReplies
// we choose TO crawl these rare threads' replies for archive since most threads will have replies
// the following SQL finds existing replies that do not match their parent thread's subReplyNum in db:
// SELECT COUNT(*) FROM tbmc_f{fid}_thread AS T INNER JOIN tbmc_f{fid}_reply AS R ON T.tid = R.tid AND T.replyNum IS NULL
await Task.WhenAll(savedThreads.AllAfter.Select(th => th.Tid).Distinct().Select(async tid =>
await Task.WhenAll(savedThreads.AllParsed.Select(th => th.Tid).Distinct().Select(async tid =>
{
if (stoppingToken.IsCancellationRequested) return;
await using var facadeFactory = replyCrawlFacadeFactory();
Expand All @@ -165,7 +165,7 @@ private async Task<int> CrawlSubReplies
// we choose NOT TO crawl these rare replies' sub replies for archive since most replies won't have sub replies
// the following SQL finds existing sub replies that do not match their parent reply's SubReplyCount in db:
// SELECT COUNT(*) FROM tbmc_f{fid}_reply AS R INNER JOIN tbmc_f{fid}_subReply AS SR ON R.pid = SR.pid AND R.subReplyCount IS NULL
shouldCrawl.UnionWith(replies.AllAfter
shouldCrawl.UnionWith(replies.AllParsed
.Where(r => r.SubReplyCount != null).Select(r => (tid, r.Pid)));
return shouldCrawl;
});
Expand All @@ -178,7 +178,7 @@ await Task.WhenAll(shouldCrawlParentPosts.Select(async t =>
var saved = (await facadeFactory.Value(fid, tid, pid)
.CrawlPageRange(1, stoppingToken: stoppingToken)).SaveCrawled(stoppingToken);
if (saved == null) return;
_ = Interlocked.Add(ref savedSubReplyCount, saved.AllAfter.Count);
_ = Interlocked.Add(ref savedSubReplyCount, saved.AllParsed.Count);
}));
return savedSubReplyCount;
}
Expand Down
4 changes: 2 additions & 2 deletions c#/shared/src/Db/EntityWithImageId.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@ namespace tbm.Shared.Db;

// Abstract base for entities that carry an ImageId; derives from RowVersionedEntity.
// NOTE(review): this is a diff rendering without +/- markers, so each property below
// appears twice: the removed declaration first, then the added one.
public abstract class EntityWithImageId : RowVersionedEntity
{
// removed by this commit: non-virtual declaration
public uint ImageId { get; set; }
// added by this commit: virtual, so the nested AsKey class can override it
// instead of hiding it with `new`
public virtual uint ImageId { get; set; }

// Nested variant of the entity whose ImageId is annotated with [Key].
// NOTE(review): presumably the data-annotations KeyAttribute marking the primary key;
// confirm against the file's usings.
public abstract class AsKey : EntityWithImageId
{
// removed by this commit: `new` hid the base property, leaving two distinct
// ImageId properties on AsKey (the hidden base one and this one)
[Key] public new uint ImageId { get; set; }
// added by this commit: overrides the base virtual property, so there is a single
// ImageId slot and the [Key] attribute is attached to the overriding declaration
[Key] public override uint ImageId { get; set; }
}
}

0 comments on commit 50c22e7

Please sign in to comment.