Skip to content

Commit

Permalink
* now api responses with embedded author user in replies @ ReplyParse…
Browse files Browse the repository at this point in the history
…r.cs

* move the assignment of the fields `Reply.Author(ManagerType|ExpGrade)` from `PostParseHook()` into the overridden method `ParsePostsEmbeddedUsers()` @ ReplyCrawlFacade.cs
@ crawler
  • Loading branch information
n0099 committed Dec 30, 2022
1 parent 83d9450 commit 31cd3ad
Show file tree
Hide file tree
Showing 4 changed files with 21 additions and 14 deletions.
24 changes: 13 additions & 11 deletions crawler/src/Tieba/Crawl/Facade/ReplyCrawlFacade.cs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,19 @@ public ReplyCrawlFacade(ILogger<ReplyCrawlFacade> logger,
_tid = tid;
}

/// <summary>
/// Unconditionally throws a <see cref="TiebaException"/> whose message reports that the user list
/// in the response of the reply request is empty, including the current <c>Fid</c> and <c>_tid</c>.
/// </summary>
/// <exception cref="TiebaException">Always thrown.</exception>
protected override void ThrowIfEmptyUsersEmbedInPosts()
{
    // NOTE(review): the emptiness check itself is not performed here; presumably the caller decides when to invoke this — confirm in the base class.
    throw new TiebaException($"User list in the response of reply request for fid {Fid}, tid {_tid} is empty.");
}

/// <summary>
/// Fills author-related fields of already parsed replies from the users embedded in the response,
/// since those values live in the user list rather than in the post list.
/// </summary>
/// <param name="usersEmbedInPosts">Users searched for each reply's author, matched by <c>Uid</c> against the reply's <c>AuthorUid</c>.</param>
/// <param name="postsInCurrentResponse">Replies of the current response; only parsed posts whose <c>Pid</c> occurs here are mutated.</param>
/// <exception cref="InvalidOperationException">Thrown by <c>First</c> when no user matches a reply's <c>AuthorUid</c>.</exception>
protected override void ParsePostsEmbeddedUsers(List<User> usersEmbedInPosts, IList<Reply> postsInCurrentResponse)
{
    var pidsInCurrentResponse = postsInCurrentResponse.Select(r => r.Pid);

    // restrict the mutation to parsed posts that also occur in the current response
    foreach (var reply in ParsedPosts.Values.IntersectBy(pidsInCurrentResponse, r => r.Pid))
    {
        var author = usersEmbedInPosts.First(u => u.Uid == reply.AuthorUid);

        // presumably null when the author is not a moderator (blank BawuType) — verify NullIfWhiteSpace()
        reply.AuthorManagerType = author.BawuType.NullIfWhiteSpace();

        // NOTE(review): the original comment says this "will be null" for historical anonymous users,
        // but the cast yields a non-null ushort here — confirm the intended semantics
        reply.AuthorExpGrade = (ushort)author.LevelId;
    }
}

protected override void PostParseHook(ReplyResponse response, CrawlRequestFlag flag)
{
ParsedPosts.Values.ForEach(r => r.Tid = _tid);
Expand All @@ -43,17 +56,6 @@ protected override void PostParseHook(ReplyResponse response, CrawlRequestFlag f
}
}
}

var users = data.UserList;
if (!users.Any() && !ParsedPosts.IsEmpty)
throw new TiebaException($"User list in response of reply list for fid {Fid}, tid {_tid} is empty.");
Users.ParseUsers(users);
ParsedPosts.Values.IntersectBy(data.PostList.Select(r => r.Pid), r => r.Pid).ForEach(r => // only mutate posts which occurs in current response
{ // fill the values for some field of reply from user list which is out of post list
var author = users.First(u => u.Uid == r.AuthorUid);
r.AuthorManagerType = author.BawuType.NullIfWhiteSpace(); // will be null if he's not a moderator
r.AuthorExpGrade = (ushort)author.LevelId; // will be null when author is a historical anonymous user
});
}

protected override void PostCommitSaveHook(SaverChangeSet<ReplyPost> savedPosts) =>
Expand Down
2 changes: 1 addition & 1 deletion crawler/src/Tieba/Crawl/Facade/SubReplyCrawlFacade.cs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ public SubReplyCrawlFacade(ILogger<SubReplyCrawlFacade> logger, TbmDbContext.New
}

protected override void ThrowIfEmptyUsersEmbedInPosts() =>
throw new TiebaException($"User list in response of sub reply list for fid {Fid}, tid {_tid}, pid {_pid} is empty.");
throw new TiebaException($"User list in the response of sub reply request for fid {Fid}, tid {_tid}, pid {_pid} is empty.");

protected override void PostParseHook(SubReplyResponse response, CrawlRequestFlag flag) =>
ParsedPosts.Values.ForEach(sr =>
Expand Down
2 changes: 1 addition & 1 deletion crawler/src/Tieba/Crawl/Facade/ThreadCrawlFacade.cs
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ protected void ParseLatestRepliers(ThreadResponse.Types.Data data) =>
.ForEach(u => _latestRepliers[u.Uid] = u);

protected override void ThrowIfEmptyUsersEmbedInPosts() =>
throw new TiebaException($"User list in response of thread list for fid {Fid} is empty.");
throw new TiebaException($"User list in the response of thread request for fid {Fid} is empty.");

protected override void PostParseHook(ThreadResponse response, CrawlRequestFlag flag)
{
Expand Down
7 changes: 6 additions & 1 deletion crawler/src/Tieba/Crawl/Parser/ReplyParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,12 @@ public class ReplyParser : BaseParser<ReplyPost, Reply>
private static readonly Regex ImgUrlExtractingRegex = new(@"^https?://(tiebapic|imgsrc)\.baidu\.com/forum/pic/item/(?<hash>.*?)\.jpg(\?.*)*$", RegexOptions.Compiled, TimeSpan.FromSeconds(1));
protected override PostId PostIdSelector(ReplyPost post) => post.Pid;

protected override IEnumerable<ReplyPost> ParsePostsInternal(IEnumerable<Reply> inPosts, List<User> outUsers) => inPosts.Select(Convert);
protected override IEnumerable<ReplyPost> ParsePostsInternal(IEnumerable<Reply> inPosts, List<User> outUsers) =>
inPosts.Select(r =>
{
outUsers.Add(r.Author);
return Convert(r);
});

protected override ReplyPost Convert(Reply inPost)
{
Expand Down

0 comments on commit 31cd3ad

Please sign in to comment.