Skip to content

Commit

Permalink
Fix #346 LikedBy crawler not working
Browse files Browse the repository at this point in the history
  • Loading branch information
thomas694 committed Sep 15, 2022
1 parent b721c00 commit 484b7ff
Show file tree
Hide file tree
Showing 2 changed files with 193 additions and 106 deletions.
4 changes: 2 additions & 2 deletions src/TumblThree/SharedAssemblyInfo.cs
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,5 @@

[assembly: ComVisible(false)]
[assembly: NeutralResourcesLanguage("en-US", UltimateResourceFallbackLocation.MainAssembly)]
[assembly: AssemblyVersion("2.8.0.0")]
[assembly: AssemblyFileVersion("2.8.0.0")]
[assembly: AssemblyVersion("2.8.1.0")]
[assembly: AssemblyFileVersion("2.8.1.0")]
295 changes: 191 additions & 104 deletions src/TumblThree/TumblThree.Applications/Crawler/TumblrLikedByCrawler.cs
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,8 @@ private async Task CrawlPageAsync(int crawlerNumber)
{
try
{
var isLikesUrl = TumblrLikedByBlog.IsLikesUrl(Blog.Url);

while (true)
{
if (CheckIfShouldStop())
Expand All @@ -157,6 +159,10 @@ private async Task CrawlPageAsync(int crawlerNumber)
try
{
document = await GetRequestAsync(url);
if (!isLikesUrl)
{
document = Regex.Unescape(document);
}
}
catch (Exception ex)
{
Expand All @@ -176,7 +182,7 @@ private async Task CrawlPageAsync(int crawlerNumber)
pagination = ExtractNextPageLink(document);
pageNumber++;
var notWithinTimespan = !CheckIfWithinTimespan(pagination);
if (TumblrLikedByBlog.IsLikesUrl(Blog.Url))
if (isLikesUrl)
{
if (pagination >= prevPagination)
{
Expand All @@ -185,10 +191,17 @@ private async Task CrawlPageAsync(int crawlerNumber)
}
prevPagination = pagination;
}
nextPage.Add(Blog.Url + (TumblrLikedByBlog.IsLikesUrl(Blog.Url) ? "?before=" : "/page/" + pageNumber + "/") + pagination);
nextPage.Add(Blog.Url + (isLikesUrl ? "?before=" : "/page/" + pageNumber + "/") + pagination);

var posts = ExtractPosts(document);
await DownloadPage(posts);
if (isLikesUrl)
{
var posts = ExtractPosts(document);
await DownloadPage(posts);
}
else
{
await AddUrlsToDownloadListAsync(document);
}

Interlocked.Increment(ref numberOfPagesCrawled);
UpdateProgressQueueInformation(Resources.ProgressGetUrlShort, numberOfPagesCrawled);
Expand All @@ -213,6 +226,8 @@ private async Task CrawlPageAsync(int crawlerNumber)
}
}

#region "Likes download"

private static string InlineSearch(DataModels.TumblrSearchJson.Data post, DataModels.TumblrSearchJson.Content content)
{
string text = post.Summary;
Expand Down Expand Up @@ -293,72 +308,6 @@ private bool PostWithinTimespan(DataModels.TumblrSearchJson.Data post)
return downloadFromUnixTime < postTime && postTime < downloadToUnixTime;
}

public override async Task IsBlogOnlineAsync()
{
try
{
await GetRequestAsync(Blog.Url);
Blog.Online = true;
}
catch (WebException webException)
{
if (webException.Status == WebExceptionStatus.RequestCanceled)
{
return;
}

Logger.Error("TumblrLikedByCrawler:IsBlogOnlineAsync:WebException {0}", webException);
ShellService.ShowError(webException, Resources.BlogIsOffline, Blog.Name);
Blog.Online = false;
}
catch (TimeoutException timeoutException)
{
HandleTimeoutException(timeoutException, Resources.OnlineChecking);
Blog.Online = false;
}
catch (Exception ex) when (ex.Message == "Acceptance of privacy consent needed!")
{
Blog.Online = false;
}
}

private long CreateStartPagination()
{
if (string.IsNullOrEmpty(Blog.DownloadTo))
{
return DateTimeOffset.Now.ToUnixTimeSeconds();
}

DateTime downloadTo = DateTime.ParseExact(Blog.DownloadTo, "yyyyMMdd", CultureInfo.InvariantCulture,
DateTimeStyles.None);
var dateTimeOffset = new DateTimeOffset(downloadTo);
return dateTimeOffset.ToUnixTimeSeconds();
}

private bool CheckIfPageCountReached(int pageCount)
{
int numberOfPages = RangeToSequence(Blog.DownloadPages).Count();
return pageCount >= numberOfPages;
}

private async Task<bool> CheckIfLoggedInAsync()
{
try
{
string document = await GetRequestAsync(Blog.Url + "/page/1");
return !document.Contains("<div class=\"signup_view account login\"");
}
catch (WebException webException) when (webException.Status == WebExceptionStatus.RequestCanceled)
{
return true;
}
catch (TimeoutException timeoutException)
{
HandleTimeoutException(timeoutException, Resources.Crawling);
return false;
}
}

private static List<DataModels.TumblrSearchJson.Data> ExtractPosts(string document)
{
var extracted = extractJsonFromLikes.Match(document).Groups[1].Value;
Expand Down Expand Up @@ -417,40 +366,6 @@ private async Task DownloadPage(List<DataModels.TumblrSearchJson.Data> posts)
}
}

private static long ExtractNextPageLink(string document)
{
// Example pagination:
//
// <div id="pagination" class="pagination "><a id="previous_page_link" href="/liked/by/wallpaperfx/page/3/-1457140452" class="previous button chrome">Previous</a>
// <a id="next_page_link" href="/liked/by/wallpaperfx/page/5/1457139681" class="next button chrome blue">Next</a></div></div>

const string htmlPagination = "(id=\"next_page_link\" href=\"[A-Za-z0-9_/:.-]+/([0-9]+)/([A-Za-z0-9]+))\"";
const string jsonPagination = "&before=([0-9]*)";

long.TryParse(Regex.Match(document, htmlPagination).Groups[3].Value, out var unixTime);

if(unixTime == 0)
{
var r = Regex.Match(document, jsonPagination);
long.TryParse(r.Groups[1].Value, out unixTime);
}

return unixTime;
}

private bool CheckIfWithinTimespan(long pagination)
{
if (string.IsNullOrEmpty(Blog.DownloadFrom))
{
return true;
}

DateTime downloadFrom = DateTime.ParseExact(Blog.DownloadFrom, "yyyyMMdd", CultureInfo.InvariantCulture,
DateTimeStyles.None);
var dateTimeOffset = new DateTimeOffset(downloadFrom);
return pagination >= dateTimeOffset.ToUnixTimeSeconds();
}

private void DownloadText(DataModels.TumblrSearchJson.Data post, Post data)
{
if (Blog.DownloadText && new string[] { "regular", "quote", "note", "link", "conversation" }.Contains(post.OriginalType))
Expand Down Expand Up @@ -551,6 +466,178 @@ private void DownloadMedia(DataModels.TumblrSearchJson.Content content, Post dat
}
}

#endregion

#region "Liked/By download"

private async Task AddUrlsToDownloadListAsync(string document)
{
try
{
AddPhotoUrlToDownloadList(document);
AddVideoUrlToDownloadList(document);
await Task.CompletedTask;
}
catch (NullReferenceException e)
{
Logger.Verbose("TumblrLikedByCrawler.AddUrlsToDownloadListAsync: {0}", e);
}
}

private void AddPhotoUrlToDownloadList(string document)
{
if (!Blog.DownloadPhoto)
{
return;
}

var post = new Post()
{
Date = DateTime.Now.ToString("R"),
DateGmt = DateTime.Now.ToString("R"),
UnixTimestamp = (int)((DateTimeOffset)DateTime.Now).ToUnixTimeSeconds(),
Type = "",
Id = "",
Tags = new List<string>(),
Slug = "",
RegularTitle = "",
RebloggedFromName = "",
RebloggedRootName = "",
ReblogKey = "",
Tumblelog = new TumbleLog2() { Name = "" }
};
AddTumblrPhotoUrl(document, post);

if (Blog.RegExPhotos)
{
AddGenericPhotoUrl(document, post);
}
}

private void AddVideoUrlToDownloadList(string document)
{
if (!Blog.DownloadVideo && !Blog.DownloadVideoThumbnail)
{
return;
}

var post = new Post()
{
Id = "",
Tumblelog = new TumbleLog2() { Name = "" },
UnixTimestamp = (int)((DateTimeOffset)DateTime.Now).ToUnixTimeSeconds()
};
AddTumblrVideoUrl(document, post);
AddInlineTumblrVideoUrl(document, TumblrParser.GetTumblrVVideoUrlRegex(), TumblrParser.GetTumblrThumbnailUrlRegex());

if (Blog.DownloadVideo && Blog.RegExVideos)
{
AddGenericVideoUrl(document, post);
}
}

#endregion

public override async Task IsBlogOnlineAsync()
{
try
{
await GetRequestAsync(Blog.Url);
Blog.Online = true;
}
catch (WebException webException)
{
if (webException.Status == WebExceptionStatus.RequestCanceled)
{
return;
}

Logger.Error("TumblrLikedByCrawler:IsBlogOnlineAsync:WebException {0}", webException);
ShellService.ShowError(webException, Resources.BlogIsOffline, Blog.Name);
Blog.Online = false;
}
catch (TimeoutException timeoutException)
{
HandleTimeoutException(timeoutException, Resources.OnlineChecking);
Blog.Online = false;
}
catch (Exception ex) when (ex.Message == "Acceptance of privacy consent needed!")
{
Blog.Online = false;
}
}

private long CreateStartPagination()
{
if (string.IsNullOrEmpty(Blog.DownloadTo))
{
return DateTimeOffset.Now.ToUnixTimeSeconds();
}

DateTime downloadTo = DateTime.ParseExact(Blog.DownloadTo, "yyyyMMdd", CultureInfo.InvariantCulture,
DateTimeStyles.None);
var dateTimeOffset = new DateTimeOffset(downloadTo);
return dateTimeOffset.ToUnixTimeSeconds();
}

private bool CheckIfPageCountReached(int pageCount)
{
int numberOfPages = RangeToSequence(Blog.DownloadPages).Count();
return pageCount >= numberOfPages;
}

private async Task<bool> CheckIfLoggedInAsync()
{
try
{
string document = await GetRequestAsync(Blog.Url + "/page/1");
return !document.Contains("<div class=\"signup_view account login\"");
}
catch (WebException webException) when (webException.Status == WebExceptionStatus.RequestCanceled)
{
return true;
}
catch (TimeoutException timeoutException)
{
HandleTimeoutException(timeoutException, Resources.Crawling);
return false;
}
}

private static long ExtractNextPageLink(string document)
{
// Example pagination:
//
// <div id="pagination" class="pagination "><a id="previous_page_link" href="/liked/by/wallpaperfx/page/3/-1457140452" class="previous button chrome">Previous</a>
// <a id="next_page_link" href="/liked/by/wallpaperfx/page/5/1457139681" class="next button chrome blue">Next</a></div></div>

const string htmlPagination = "(id=\"next_page_link\" href=\"[A-Za-z0-9_/:.-]+/([0-9]+)/([A-Za-z0-9]+))\"";
const string jsonPagination = "&before=([0-9]*)";

long.TryParse(Regex.Match(document, htmlPagination).Groups[3].Value, out var unixTime);

if(unixTime == 0)
{
var r = Regex.Match(document, jsonPagination);
long.TryParse(r.Groups[1].Value, out unixTime);
}

return unixTime;
}

private bool CheckIfWithinTimespan(long pagination)
{
if (string.IsNullOrEmpty(Blog.DownloadFrom))
{
return true;
}

DateTime downloadFrom = DateTime.ParseExact(Blog.DownloadFrom, "yyyyMMdd", CultureInfo.InvariantCulture,
DateTimeStyles.None);
var dateTimeOffset = new DateTimeOffset(downloadFrom);
return pagination >= dateTimeOffset.ToUnixTimeSeconds();
}

private async Task GetAlreadyExistingCrawlerDataFilesAsync()
{
foreach (var filepath in Directory.GetFiles(Blog.DownloadLocation(), "*.json"))
Expand Down

0 comments on commit 484b7ff

Please sign in to comment.