diff --git a/crawler/src/Tieba/Crawl/Facade/ReplyCrawlFacade.cs b/crawler/src/Tieba/Crawl/Facade/ReplyCrawlFacade.cs index 203cbdc6..d998e8f5 100644 --- a/crawler/src/Tieba/Crawl/Facade/ReplyCrawlFacade.cs +++ b/crawler/src/Tieba/Crawl/Facade/ReplyCrawlFacade.cs @@ -20,6 +20,19 @@ public ReplyCrawlFacade(ILogger logger, _tid = tid; } + protected override void ThrowIfEmptyUsersEmbedInPosts() => + throw new TiebaException($"User list in the response of reply request for fid {Fid}, tid {_tid} is empty."); + + protected override void ParsePostsEmbeddedUsers(List usersEmbedInPosts, IList postsInCurrentResponse) => + ParsedPosts.Values // only mutate posts which occurs in current response + .IntersectBy(postsInCurrentResponse.Select(r => r.Pid), r => r.Pid) + .ForEach(r => + { // fill the values for some field of reply from user list which is out of post list + var author = usersEmbedInPosts.First(u => u.Uid == r.AuthorUid); + r.AuthorManagerType = author.BawuType.NullIfWhiteSpace(); // will be null if he's not a moderator + r.AuthorExpGrade = (ushort)author.LevelId; // will be null when author is a historical anonymous user + }); + protected override void PostParseHook(ReplyResponse response, CrawlRequestFlag flag) { ParsedPosts.Values.ForEach(r => r.Tid = _tid); @@ -43,17 +56,6 @@ protected override void PostParseHook(ReplyResponse response, CrawlRequestFlag f } } } - - var users = data.UserList; - if (!users.Any() && !ParsedPosts.IsEmpty) - throw new TiebaException($"User list in response of reply list for fid {Fid}, tid {_tid} is empty."); - Users.ParseUsers(users); - ParsedPosts.Values.IntersectBy(data.PostList.Select(r => r.Pid), r => r.Pid).ForEach(r => // only mutate posts which occurs in current response - { // fill the values for some field of reply from user list which is out of post list - var author = users.First(u => u.Uid == r.AuthorUid); - r.AuthorManagerType = author.BawuType.NullIfWhiteSpace(); // will be null if he's not a moderator - r.AuthorExpGrade = (ushort)author.LevelId; // will be null when author is a historical anonymous user - }); } protected override void PostCommitSaveHook(SaverChangeSet savedPosts) => diff --git a/crawler/src/Tieba/Crawl/Facade/SubReplyCrawlFacade.cs b/crawler/src/Tieba/Crawl/Facade/SubReplyCrawlFacade.cs index fc8730f2..31ce5151 100644 --- a/crawler/src/Tieba/Crawl/Facade/SubReplyCrawlFacade.cs +++ b/crawler/src/Tieba/Crawl/Facade/SubReplyCrawlFacade.cs @@ -20,7 +20,7 @@ public SubReplyCrawlFacade(ILogger logger, TbmDbContext.New } protected override void ThrowIfEmptyUsersEmbedInPosts() => - throw new TiebaException($"User list in response of sub reply list for fid {Fid}, tid {_tid}, pid {_pid} is empty."); + throw new TiebaException($"User list in the response of sub reply request for fid {Fid}, tid {_tid}, pid {_pid} is empty."); protected override void PostParseHook(SubReplyResponse response, CrawlRequestFlag flag) => ParsedPosts.Values.ForEach(sr => diff --git a/crawler/src/Tieba/Crawl/Facade/ThreadCrawlFacade.cs b/crawler/src/Tieba/Crawl/Facade/ThreadCrawlFacade.cs index f036a47c..b8eb0b31 100644 --- a/crawler/src/Tieba/Crawl/Facade/ThreadCrawlFacade.cs +++ b/crawler/src/Tieba/Crawl/Facade/ThreadCrawlFacade.cs @@ -43,7 +43,7 @@ protected void ParseLatestRepliers(ThreadResponse.Types.Data data) => .ForEach(u => _latestRepliers[u.Uid] = u); protected override void ThrowIfEmptyUsersEmbedInPosts() => - throw new TiebaException($"User list in response of thread list for fid {Fid} is empty."); + throw new TiebaException($"User list in the response of thread request for fid {Fid} is empty."); protected override void PostParseHook(ThreadResponse response, CrawlRequestFlag flag) { diff --git a/crawler/src/Tieba/Crawl/Parser/ReplyParser.cs b/crawler/src/Tieba/Crawl/Parser/ReplyParser.cs index 304a93f4..e415b408 100644 --- a/crawler/src/Tieba/Crawl/Parser/ReplyParser.cs +++ b/crawler/src/Tieba/Crawl/Parser/ReplyParser.cs @@ -7,7 +7,12 @@ public class ReplyParser : BaseParser private static readonly Regex ImgUrlExtractingRegex = new(@"^https?://(tiebapic|imgsrc)\.baidu\.com/forum/pic/item/(?.*?)\.jpg(\?.*)*$", RegexOptions.Compiled, TimeSpan.FromSeconds(1)); protected override PostId PostIdSelector(ReplyPost post) => post.Pid; - protected override IEnumerable ParsePostsInternal(IEnumerable inPosts, List outUsers) => inPosts.Select(Convert); + protected override IEnumerable ParsePostsInternal(IEnumerable inPosts, List outUsers) => + inPosts.Select(r => + { + outUsers.Add(r.Author); + return Convert(r); + }); protected override ReplyPost Convert(Reply inPost) {