Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Find competitor JS with analyzer #905

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions adserver/analyzer/backends/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,15 @@
class BaseAnalyzerBackend:
"""Base class that all analyzers should extend."""

# CSS selectors that attempt to find competing ad networks
# This is a predictor of publishers moving off the platform
COMPETITORS_SELECTORS = (
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This feels like something that should probably live in our private -ext repo, since I don't know how relevant it is for the OSS codebase. Or it should at least come from an ENV variable rather than being hard-coded.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suppose I can move it to -ext and create a separate analyzer backend specifically for this.

"script[src*='carbonads.com']",
# Google
"script[src*='adsbygoogle.js']",
"script[src*='show_ads.js']",
)

# CSS selectors to select the "main" content of the page
# The first of these to match anything is used
MAIN_CONTENT_SELECTORS = (
Expand Down
13 changes: 13 additions & 0 deletions adserver/analyzer/backends/naive.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
"""Naive keyword analyzer that is simply based on keyword counts."""

import collections
import logging

from bs4 import BeautifulSoup

from ...models import Topic
from .base import BaseAnalyzerBackend


log = logging.getLogger(__name__) # noqa


class NaiveKeywordAnalyzerBackend(BaseAnalyzerBackend):
"""
A very naive keyword analyzer.
Expand Down Expand Up @@ -36,6 +40,15 @@ def analyze_response(self, resp):

soup = BeautifulSoup(resp.content, features="html.parser")

for selector in self.COMPETITORS_SELECTORS:
results = soup.select(selector)
if results:
log.warning(
"Found competitor ads on publisher page. url=%s, selector=%s",
resp.url,
selector,
)

for selector in self.REMOVE_CONTENT_SELECTORS:
for nodes in soup.select(selector):
nodes.decompose()
Expand Down
Loading