Cache fixes and improvements #736

Merged
67 changes: 41 additions & 26 deletions lib/html_proofer/cache.rb
@@ -26,12 +26,17 @@ def initialize(runner, options)

if blank?(options)
define_singleton_method(:enabled?) { false }
define_singleton_method(:external_enabled?) { false }
define_singleton_method(:internal_enabled?) { false }
else
# we still consider the cache enabled, regardless of the specific timeframes
define_singleton_method(:enabled?) { true }
setup_cache!(options)

@external_timeframe = parsed_timeframe(options[:timeframe][:external])
define_singleton_method(:external_enabled?) { !@external_timeframe.nil? }
@internal_timeframe = parsed_timeframe(options[:timeframe][:internal])
define_singleton_method(:internal_enabled?) { !@internal_timeframe.nil? }
end
end

@@ -55,15 +60,15 @@ def parsed_timeframe(timeframe)
end

def add_internal(url, metadata, found)
return unless enabled?
return unless internal_enabled?

@cache_log[:internal][url] = { time: @cache_time, metadata: [] } if @cache_log[:internal][url].nil?

@cache_log[:internal][url][:metadata] << construct_internal_link_metadata(metadata, found)
end

def add_external(url, filenames, status_code, msg)
return unless enabled?
return unless external_enabled?

found = status_code.between?(200, 299)

@@ -73,10 +78,10 @@ def add_external(url, filenames, status_code, msg)
end

def detect_url_changes(urls_detected, type)
additions = determine_additions(urls_detected, type)

determine_deletions(urls_detected, type)

additions = determine_additions(urls_detected, type)

additions
end

@@ -96,13 +101,6 @@ def retrieve_urls(urls_detected, type)

urls_to_check = detect_url_changes(urls_detected, type)

@cache_log[type].each_pair do |url, cache|
within_timeframe = type == :external ? within_external_timeframe?(cache[:time]) : within_internal_timeframe?(cache[:time])
next if within_timeframe

urls_to_check[url] = cache[:metadata] # recheck expired links
end

urls_to_check
end

@@ -146,7 +144,11 @@ def size(type)
private def determine_external_additions(urls_detected)
urls_detected.reject do |url, _metadata|
if @cache_log[:external].include?(url)
@cache_log[:external][url][:found] # if this is false, we're trying again
found = @cache_log[:external][url][:found] # if this is false, we're trying again
unless found
@logger.log(:debug, "Adding #{url} to external cache (not found)")
end
found
else
@logger.log(:debug, "Adding #{url} to external cache")
false
@@ -155,40 +157,53 @@ def size(type)
end

private def determine_internal_additions(urls_detected)
urls_detected.each_with_object({}) do |(url, metadata), hsh|
urls_detected.each_with_object({}) do |(url, detected_metadata), hsh|
# url is not even in cache
if @cache_log[:internal][url].nil?
hsh[url] = metadata
@logger.log(:debug, "Adding #{url} to internal cache")
hsh[url] = detected_metadata
next
end

# detect metadata additions
# NOTE: the time-stamp for the whole url key will not be updated,
# so that it reflects the earliest time any of the metadata was checked
cache_metadata = @cache_log[:internal][url][:metadata]
incoming_metadata = urls_detected[url].each_with_object([]) do |incoming_url, arr|
existing_cache_metadata = cache_metadata.find { |k, _| k[:filename] == incoming_url[:filename] }

metadata_additions = detected_metadata.reject do |detected|
existing_cache_metadata = cache_metadata.find { |cached, _| cached[:filename] == detected[:filename] }
# cache for this url, from an existing path, exists as found
if !existing_cache_metadata.nil? && !existing_cache_metadata.empty? && existing_cache_metadata[:found]
metadata.find { |m| m[:filename] == existing_cache_metadata[:filename] }[:found] = true
next
found = !existing_cache_metadata.nil? && !existing_cache_metadata.empty? && existing_cache_metadata[:found]
unless found
@logger.log(:debug, "Adding #{detected} to internal cache for #{url}")
end
found
end

@logger.log(:debug, "Adding #{incoming_url} to internal cache")
arr << incoming_url
if metadata_additions.empty?
next
end

hsh[url] = incoming_metadata
hsh[url] = metadata_additions
# remove from the cache the detected metadata additions as they correspond to failures to be rechecked
# (this works assuming the detected url metadata have "found" set to false)
@cache_log[:internal][url][:metadata] = cache_metadata.difference(metadata_additions)
end
end

# remove from cache URLs that no longer exist
private def determine_deletions(urls_detected, type)
deletions = 0

@cache_log[type].delete_if do |url, _|
if urls_detected.include?(url)
@cache_log[type].delete_if do |url, cache|
expired_timeframe = type == :external ? !within_external_timeframe?(cache[:time]) : !within_internal_timeframe?(cache[:time])
if expired_timeframe
@logger.log(:debug, "Removing #{url} from #{type} cache (expired timeframe)")
deletions += 1
true
elsif urls_detected.include?(url)
false
elsif url_matches_type?(url, type)
@logger.log(:debug, "Removing #{url} from #{type} cache")
@logger.log(:debug, "Removing #{url} from #{type} cache (not detected anymore)")
deletions += 1
true
end
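The cache.rb changes above split the single enabled? gate into per-type gates: external_enabled? and internal_enabled? are true only when a timeframe was supplied for that link type, so each cache can be toggled independently, while enabled? still reports whether caching is configured at all. A minimal usage sketch of the resulting behaviour, assuming the public HTMLProofer.check_file entry point and the cache option shape used in the specs further down (paths and filenames here are illustrative):

```ruby
require "html-proofer"

# Only :external gets a timeframe, so external_enabled? is true and external
# links are re-checked at most once every 30 days. Omitting :internal leaves
# internal_enabled? false, so internal links are verified on every run.
HTMLProofer.check_file(
  "./out/index.html",                      # hypothetical file, for illustration
  cache: {
    timeframe: { external: "30d" },
    cache_file: ".htmlproofer_cache.json", # hypothetical cache filename
  },
).run
```

With an empty timeframe hash the cache as a whole still counts as enabled (the cache file is written), but both per-type gates stay off; the new specs below exercise exactly that case.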
2 changes: 0 additions & 2 deletions lib/html_proofer/runner.rb
@@ -231,8 +231,6 @@ def load_external_cache
@logger.log(:debug, "Found #{cache_text} in the cache")

urls_to_check = @cache.retrieve_urls(ivar, type)
urls_detected = pluralize(urls_to_check.count, "#{type} link", "#{type} links")
@logger.log(:info, "Checking #{urls_detected}")

urls_to_check
end
11 changes: 5 additions & 6 deletions lib/html_proofer/url_validator/external.rb
@@ -22,12 +22,11 @@ def initialize(runner, external_urls)
end

def validate
if @cache.enabled?
urls_to_check = @runner.load_external_cache
run_external_link_checker(urls_to_check)
else
run_external_link_checker(@external_urls)
end
urls_to_check = @cache.external_enabled? ? @runner.load_external_cache : @external_urls
urls_detected = pluralize(urls_to_check.count, "external link", "external links")
@logger.log(:info, "Checking #{urls_detected}")

run_external_link_checker(urls_to_check)

@failed_checks
end
11 changes: 5 additions & 6 deletions lib/html_proofer/url_validator/internal.rb
@@ -12,12 +12,11 @@ def initialize(runner, internal_urls)
end

def validate
if @cache.enabled?
urls_to_check = @runner.load_internal_cache
run_internal_link_checker(urls_to_check)
else
run_internal_link_checker(@internal_urls)
end
urls_to_check = @cache.internal_enabled? ? @runner.load_internal_cache : @internal_urls
urls_detected = pluralize(urls_to_check.count, "internal link", "internal links")
@logger.log(:info, "Checking #{urls_detected}")

run_internal_link_checker(urls_to_check)

@failed_checks
end
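With the per-type gates in place, both URL validators follow the same shape: pick the URL set (the cache-backed set when that cache type is enabled, otherwise everything detected), log how many links will actually be checked, then run the checker. That is also why the "Checking ..." log line moved out of runner.rb above. A condensed paraphrase of the external variant, using only names that appear in the diff:

```ruby
def validate
  # Consult the cache only when an external timeframe was configured.
  urls_to_check = @cache.external_enabled? ? @runner.load_external_cache : @external_urls

  # Count what will actually be checked, not the raw detected total.
  urls_detected = pluralize(urls_to_check.count, "external link", "external links")
  @logger.log(:info, "Checking #{urls_detected}")

  run_external_link_checker(urls_to_check)
  @failed_checks
end
```

The internal validator mirrors this with internal_enabled?, load_internal_cache, and run_internal_link_checker.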
64 changes: 63 additions & 1 deletion spec/html-proofer/cache_spec.rb
@@ -140,6 +140,64 @@ def read_cache(cache_filename)
File.delete(cache_filepath) if File.exist?(cache_filepath)
end

it "internal cache is disabled w/o internal timeframe" do
test_file = File.join(cache_fixture_dir, "internal_and_external_example.html")
cache_filename = File.join(version, ".htmlproofer_test.json")
cache_filepath = File.join(cache_fixture_dir, cache_filename)

allow(HTMLProofer::Cache).to(receive(:write).once)

expect_any_instance_of(HTMLProofer::Runner).to_not(receive(:load_internal_cache))

run_proofer(test_file, :file,
cache: { timeframe: { external: "30d" }, cache_file: cache_filename }.merge(default_cache_options))

cache = read_cache(cache_filename)
expect(cache["external"].keys.length).to(eq(1))
expect(cache["internal"].keys.length).to(eq(0))

File.delete(cache_filepath) if File.exist?(cache_filepath)
end

it "external cache is disabled w/o external timeframe" do
test_file = File.join(cache_fixture_dir, "internal_and_external_example.html")
cache_filename = File.join(version, ".htmlproofer_test.json")
cache_filepath = File.join(cache_fixture_dir, cache_filename)

allow(HTMLProofer::Cache).to(receive(:write).once)

expect_any_instance_of(HTMLProofer::Runner).to_not(receive(:load_external_cache))

run_proofer(test_file, :file,
cache: { timeframe: { internal: "30d" }, cache_file: cache_filename }.merge(default_cache_options))

cache = read_cache(cache_filename)
expect(cache["internal"].keys.length).to(eq(1))
expect(cache["external"].keys.length).to(eq(0))

File.delete(cache_filepath) if File.exist?(cache_filepath)
end

it "internal/external cache is disabled w/o internal/external timeframe" do
test_file = File.join(cache_fixture_dir, "internal_and_external_example.html")
cache_filename = File.join(version, ".htmlproofer_test.json")
cache_filepath = File.join(cache_fixture_dir, cache_filename)

allow(HTMLProofer::Cache).to(receive(:write).once)

expect_any_instance_of(HTMLProofer::Runner).to_not(receive(:load_internal_cache))
expect_any_instance_of(HTMLProofer::Runner).to_not(receive(:load_external_cache))

run_proofer(test_file, :file,
cache: { timeframe: {}, cache_file: cache_filename }.merge(default_cache_options))

cache = read_cache(cache_filename)
expect(cache["internal"].keys.length).to(eq(0))
expect(cache["external"].keys.length).to(eq(0))

File.delete(cache_filepath) if File.exist?(cache_filepath)
end

context "external links" do
context "dates" do
let(:cache_filename) { File.join(version, ".within_date_external.json") }
@@ -285,7 +343,7 @@ def read_cache(cache_filename)

# we expect an add since we are mocking outside the timeframe
expect_any_instance_of(HTMLProofer::Cache).to(receive(:add_internal).with(
"/tel_link.html", { base_url: "", filename: "spec/html-proofer/fixtures/links/working_root_link_internal.html", found: true, line: 5, source: test_file }, true
"/tel_link.html", { base_url: "", filename: "spec/html-proofer/fixtures/links/working_root_link_internal.html", found: false, line: 5, source: test_file }, true
))

run_proofer(test_file, :file, disable_external: true,
@@ -471,16 +529,20 @@ def read_cache(cache_filename)
cache = read_cache(cache_filename)
external_links = cache["external"]
expect(external_links.keys.first).to(eq("https://github.com/gjtorikian/html-proofer"))
expect(external_links["https://github.com/gjtorikian/html-proofer"]["metadata"].count).to(eq(1))
internal_links = cache["internal"]
expect(internal_links.keys.first).to(eq("/somewhere.html"))
expect(internal_links["/somewhere.html"]["metadata"].count).to(eq(1))

run_proofer(file, :file, cache: { timeframe: { external: "1d", internal: "1d" }, cache_file: cache_filename }.merge(default_cache_options))

cache = read_cache(cache_filename)
external_links = cache["external"]
expect(external_links.keys.first).to(eq("https://github.com/gjtorikian/html-proofer"))
expect(external_links["https://github.com/gjtorikian/html-proofer"]["metadata"].count).to(eq(1))
internal_links = cache["internal"]
expect(internal_links.keys.first).to(eq("/somewhere.html"))
expect(internal_links["/somewhere.html"]["metadata"].count).to(eq(1))
end
end
end
@@ -1 +1 @@
{"version":2,"internal":{"/somewhere.html":{"time":"2022-01-06 12:00:00 -0500","metadata":[{"source":"spec/html-proofer/fixtures/cache/internal_and_external_example.html","filename":"spec/html-proofer/fixtures/cache/internal_and_external_example.html","line":11,"base_url":"","found":false},{"source":"spec/html-proofer/fixtures/cache/internal_and_external_example.html","filename":"spec/html-proofer/fixtures/cache/internal_and_external_example.html","line":11,"base_url":"","found":false}]}},"external":{"https://github.com/gjtorikian/html-proofer":{"time":"2022-01-06 12:00:00 -0500","found":true,"status_code":200,"message":"OK","metadata":[{"filename":"spec/html-proofer/fixtures/cache/internal_and_external_example.html","line":7}]}}}
{"version":2,"internal":{"/somewhere.html":{"time":"2022-01-06 12:00:00 -0500","metadata":[{"source":"spec/html-proofer/fixtures/cache/internal_and_external_example.html","filename":"spec/html-proofer/fixtures/cache/internal_and_external_example.html","line":11,"base_url":"","found":false}]}},"external":{"https://github.com/gjtorikian/html-proofer":{"time":"2022-01-06 12:00:00 -0500","found":true,"status_code":200,"message":"OK","metadata":[{"filename":"spec/html-proofer/fixtures/cache/internal_and_external_example.html","line":7}]}}}