Skip to content

Commit

Permalink
Merge branch 'backend/improve-comment-scraping'
Browse files Browse the repository at this point in the history
  • Loading branch information
Imafikus committed Oct 10, 2021
2 parents 4671b2a + d83e382 commit a12177b
Show file tree
Hide file tree
Showing 6 changed files with 43 additions and 20 deletions.
5 changes: 3 additions & 2 deletions backend/.env.example
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
GITHUB_ACCESS_TOKEN =
GITHUB_USERNAME =
MAX_COMMENT_NUMBER =
ENV =
MAX_COMMENT_NUMBER_PER_REPO =
MAX_COMMENT_NUMBER_GLOBAL =
ENV =
34 changes: 27 additions & 7 deletions backend/comment_extractor.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from models import ChoosenComment
from typing import List
from typing import List, Optional
import re
import data_extractor
import github_api
Expand All @@ -15,7 +15,8 @@
logging.basicConfig(format='[%(levelname)s]: %(asctime)s @ %(filename)s/%(funcName)s:%(lineno)d - %(message)s ', datefmt='%d-%b-%y %H:%M:%S', level=logging.INFO)


MAX_COMMENT_NUMBER = int(os.environ['MAX_COMMENT_NUMBER'])
MAX_COMMENT_NUMBER_PER_REPO = int(os.environ['MAX_COMMENT_NUMBER_PER_REPO'])
MAX_COMMENT_NUMBER_GLOBAL = int(os.environ['MAX_COMMENT_NUMBER_GLOBAL'])
CURRENT_ENV = os.environ['ENV']

test_comments = [
Expand All @@ -41,7 +42,7 @@
),
]

def extract_python_comments(file_content, file_url) -> List[ChoosenComment]:
def extract_single_random_python_comment(file_content, file_url) -> Optional[ChoosenComment]:
comments = re.findall(r'#.*', file_content)
cleaned_comments = []

Expand All @@ -53,8 +54,12 @@ def extract_python_comments(file_content, file_url) -> List[ChoosenComment]:
url=file_url
)
cleaned_comments.append(comment)

if len(cleaned_comments) == 0:
logging.info('No comments found, returning None...')
return None

return cleaned_comments
return random.choice(cleaned_comments)

def get_all_choosen_comments() -> List[ChoosenComment]:

Expand All @@ -66,17 +71,32 @@ def get_all_choosen_comments() -> List[ChoosenComment]:

repos = data_extractor.get_repos_with_supported_languages()
for repo in repos:
logging.info(f'Currently extracting from repo: {repo.name}')

repo_comments = []
files = data_extractor.get_all_files(repo)
language_specific_files = data_extractor.extract_all_language_files(files, repo.language)
logging.info(f'Number of language specific files found: {len(language_specific_files)}')


for f in language_specific_files:

file_content = github_api.get_raw_data(f.download_url)
chosen_comments += extract_python_comments(file_content, f.html_url)
single_comment = extract_single_random_python_comment(file_content, f.html_url)
if single_comment is not None:
repo_comments.append(single_comment)

if len(repo_comments) >= MAX_COMMENT_NUMBER_PER_REPO:
logging.info(f'Max comment number reached for repo: {repo.name}')
break

logging.info(f'Current number of comments: {len(chosen_comments)}')
chosen_comments += repo_comments
logging.info(f'Current comment number: {len(chosen_comments)}')

if len(chosen_comments) >= MAX_COMMENT_NUMBER:
if len(chosen_comments) >= MAX_COMMENT_NUMBER_GLOBAL:
logging.info(f'Max comment number reached globally, wrapping up...')
break

return chosen_comments

def main(message, context):
Expand Down
9 changes: 2 additions & 7 deletions backend/data_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,11 @@
from typing import List
import trending
import github_api
import logging

SUPPORTED_LANGUAGES = ['Python']
EXTENSION_MAPPINGS = {
'Python': 'py'
}
MAX_FILES = 5

def extract_files(data: List[SingleGetContentObj]) -> List[SingleGetContentObj]:
    """Return the entries of ``data`` whose ``type`` attribute equals ``'file'``.

    The input order is preserved and a new list is returned; ``data`` itself
    is not modified.
    """
    return [entry for entry in data if entry.type == 'file']
Expand All @@ -17,7 +15,6 @@ def extract_dirs(data: List[SingleGetContentObj]) -> List[SingleGetContentObj]:
return list(filter(lambda item: item.type == 'dir', data))

def extract_all_language_files(data: List[SingleGetContentObj], language: str) -> List[SingleGetContentObj]:
"Extract all files related to language, uses EXTENSION_MAPPINGS dict"
language_files = []

for f in data:
Expand All @@ -29,23 +26,21 @@ def extract_all_language_files(data: List[SingleGetContentObj], language: str) -

return language_files


def get_repos_with_supported_languages() -> List[trending.Repository]:
    """Return the trending repositories whose language is in SUPPORTED_LANGUAGES.

    The candidates come from ``trending.extract_trending_repos()``; their
    relative order is kept.
    """
    return [
        candidate
        for candidate in trending.extract_trending_repos()
        if candidate.language in SUPPORTED_LANGUAGES
    ]

def get_all_files(repo: trending.Repository) -> List[SingleGetContentObj]:
"Extracts up to MAX_FILES from target repo"
content = github_api.get_repo_contents(repo.owner, repo.name)
dirs = extract_dirs(content)

files = extract_files(content)

if len(dirs) == 0:
return files

while dirs != [] and len(files) < MAX_FILES:
while dirs != []:
current_dir = dirs.pop()
new_content = github_api.get_content(current_dir.url)
new_dirs = extract_dirs(new_content)
Expand Down
2 changes: 1 addition & 1 deletion backend/github_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,5 +40,5 @@ def get_raw_data(url: str):
return data.text

def get_trending_page():
    """Fetch GitHub's daily trending page for Python and return the response text.

    The language (``python``), the time window (``since=daily``) and the
    spoken language (``en``) are all fixed in the URL.

    Returns:
        The ``text`` attribute of the response returned by
        ``_make_get_request``.
    """
    # TODO: Should be implemented differently when more languages are added
    # Issue the request exactly once; a second identical call would only
    # repeat the network round-trip and throw the first response away.
    data = _make_get_request('https://github.com/trending/python?since=daily&spoken_language_code=en')
    return data.text
3 changes: 2 additions & 1 deletion backend/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,8 @@ module "data_extractor" {
GITHUB_USERNAME = var.github_username
GITHUB_ACCESS_TOKEN = var.github_access_token

MAX_COMMENT_NUMBER = var.max_comment_number
MAX_COMMENT_NUMBER_PER_REPO = var.max_comment_number_per_repo
MAX_COMMENT_NUMBER_GLOBAL = var.max_comment_number_global
ENV = local.stage
}

Expand Down
10 changes: 8 additions & 2 deletions backend/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,16 @@ variable "github_access_token" {
description = "Token used when consuming GitHub API"
sensitive = true
}
variable "max_comment_number" {
variable "max_comment_number_per_repo" {
type = string
description = "Max number of comments extracted from a single repo"
default = "20"
default = "10"
}

# Upper bound on the total number of comments collected across all repos.
# Kept as a string because it is handed to the function as the
# MAX_COMMENT_NUMBER_GLOBAL environment variable (see main.tf); the Python
# code converts it with int() when reading it from the environment.
variable "max_comment_number_global" {
type = string
description = "Max number of comments extracted globally"
default = "100"
}

variable "commit_sha" {
Expand Down

0 comments on commit a12177b

Please sign in to comment.