Skip to content

Commit

Permalink
Merge pull request #151 from rabinadk1/word_cloud
Browse files Browse the repository at this point in the history
Word cloud for Backend
  • Loading branch information
rabinadk1 authored Apr 14, 2022
2 parents 3e2e7ac + c86acbc commit bb57db0
Show file tree
Hide file tree
Showing 11 changed files with 644 additions and 12 deletions.
5 changes: 5 additions & 0 deletions client/src/components/WordCloud.jsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
const WordCloud = () => {
return <div>WordCloud</div>;
};

export default WordCloud;
32 changes: 32 additions & 0 deletions client/yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -1967,6 +1967,18 @@
dependencies:
"@babel/types" "^7.3.0"

"@types/d3-cloud@^1.2.5":
version "1.2.5"
resolved "https://registry.yarnpkg.com/@types/d3-cloud/-/d3-cloud-1.2.5.tgz#0300bedc826aacd505ae6c41c5f8c4ab75c45135"
integrity sha512-vEIER9DsEBUOdpRiwCh3n1qE+cV6h4e1LhxhY2sLt+m8LPNAIkOOhTlqk0JDiBwD+ZPM8ynFAOU3AuPuVYBFBA==
dependencies:
"@types/d3" "^3"

"@types/d3@^3":
version "3.5.47"
resolved "https://registry.yarnpkg.com/@types/d3/-/d3-3.5.47.tgz#b81042fcb0195c583fc037bc857d161469a7d175"
integrity sha512-VkWIQoZXLFdcBGe5pdBKJmTU3fmpXvo/KV6ixvTzOMl1yJ2hbTXpfvsziag0kcaerPDwas2T0vxojwQG3YwivQ==

"@types/eslint@^7.28.2":
version "7.29.0"
resolved "https://registry.npmjs.org/@types/eslint/-/eslint-7.29.0.tgz"
Expand Down Expand Up @@ -3478,6 +3490,14 @@ chart.js@^3.7.1:
resolved "https://registry.yarnpkg.com/chart.js/-/chart.js-3.7.1.tgz#0516f690c6a8680c6c707e31a4c1807a6f400ada"
integrity sha512-8knRegQLFnPQAheZV8MjxIXc5gQEfDFD897BJgv/klO/vtIyFFmgMXrNfgrXpbTr/XbTturxRgxIXx/Y+ASJBA==

chartjs-chart-wordcloud@^3.7.0:
version "3.7.0"
resolved "https://registry.yarnpkg.com/chartjs-chart-wordcloud/-/chartjs-chart-wordcloud-3.7.0.tgz#81c1e92eb4aa30e9cc4c047261aeb8b3afc67daf"
integrity sha512-7LxEmbFoo94zwIAP2VIsQKaeCC0l+RaxagaxyeNQAY4mvLVY9AnbdLNIyC3hlCycezzT4Za8K7Gwoyc42CURtg==
dependencies:
"@types/d3-cloud" "^1.2.5"
d3-cloud "^1.2.5"

chartjs-plugin-zoom@^1.2.1:
version "1.2.1"
resolved "https://registry.yarnpkg.com/chartjs-plugin-zoom/-/chartjs-plugin-zoom-1.2.1.tgz#7e350ba20d907f397d0c055239dcc67d326df705"
Expand Down Expand Up @@ -4217,6 +4237,18 @@ cyclist@^1.0.1:
resolved "https://registry.npmjs.org/cyclist/-/cyclist-1.0.1.tgz"
integrity sha1-WW6WmP0MgOEgOMK4LW6xs1tiJNk=

d3-cloud@^1.2.5:
version "1.2.5"
resolved "https://registry.yarnpkg.com/d3-cloud/-/d3-cloud-1.2.5.tgz#3e91564f2d27fba47fcc7d812eb5081ea24c603d"
integrity sha512-4s2hXZgvs0CoUIw31oBAGrHt9Kt/7P9Ik5HIVzISFiWkD0Ga2VLAuO/emO/z1tYIpE7KG2smB4PhMPfFMJpahw==
dependencies:
d3-dispatch "^1.0.3"

d3-dispatch@^1.0.3:
version "1.0.6"
resolved "https://registry.yarnpkg.com/d3-dispatch/-/d3-dispatch-1.0.6.tgz#00d37bcee4dd8cd97729dd893a0ac29caaba5d58"
integrity sha512-fVjoElzjhCEy+Hbn8KygnmMS7Or0a9sI2UzGwoB7cCtvI1XpVN9GpoYlnb3xt2YV66oXYb1fLJ8GMvP4hdU1RA==

d@1, d@^1.0.1:
version "1.0.1"
resolved "https://registry.npmjs.org/d/-/d-1.0.1.tgz"
Expand Down
2 changes: 2 additions & 0 deletions server/app/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from .auth import router as auth_router
from .pseudo_tweets import router as pseudo_router
from .tweets import router as tweets_router
from .tweets_common import router as tweets_common_router

from .database import create_tables # isort: skip

Expand All @@ -27,6 +28,7 @@ def on_startup():
app.include_router(auth_router)
app.include_router(pseudo_router)
app.include_router(tweets_router)
app.include_router(tweets_common_router)


# Cache the output for maximum 10 items
Expand Down
2 changes: 1 addition & 1 deletion server/app/pseudo_tweets/routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def read_pseudo_tweets(
"""
Read pseudo tweets within the offset and limit
"""
selection = get_filtered_selection(topics, day, month, PseudoTweet)
selection = get_filtered_selection(topics, PseudoTweet, day, month)

# others should be exclusively provided, hence the last check
is_others = topics is not None and len(topics) and topics[0] == Topics.others
Expand Down
2 changes: 1 addition & 1 deletion server/app/tweets/routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def read_tweets(
"""
Read tweets within the offset and limit
"""
selection = get_filtered_selection(topics, day, month, Tweet)
selection = get_filtered_selection(topics, Tweet, day, month)

tweets = session.exec(
selection.order_by(Tweet.id.desc()).offset(offset).limit(limit)
Expand Down
10 changes: 10 additions & 0 deletions server/app/tweets_common/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from fastapi import APIRouter

# Router for endpoints shared by tweets and pseudo tweets.
router = APIRouter(prefix="/tweets_commons", tags=["tweets_commons"])

# Load the stop words once at import time: one word per line, kept in a set
# for O(1) membership tests while filtering tokens.
# The path is relative to the process's working directory.
# The encoding is pinned because open() otherwise falls back to the
# locale-dependent default, which can mis-decode (or fail on) non-ASCII
# stop words on systems whose locale is not UTF-8.
# NOTE(review): assumes stopwords.txt is stored as UTF-8 -- confirm.
with open("stopwords.txt", encoding="utf-8") as fp:
    STOP_WORDS = set(fp.read().splitlines())

# Import all routes
from . import routes # noqa
12 changes: 6 additions & 6 deletions server/app/tweets_common/helper_functions.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from datetime import date
from typing import Callable, List, Optional, Tuple, TypeVar
from typing import Callable, Collection, List, Optional, Tuple, TypeVar

from fastapi import HTTPException
from pydantic import PositiveInt
Expand All @@ -14,11 +14,11 @@


def get_filtered_selection(
topics: Optional[List[Topics]],
day: Optional[date],
month: Optional[Month],
topics: Optional[Collection[Topics]],
Model: ModelType,
fields: Optional[List[str]] = None,
day: Optional[date] = None,
month: Optional[Month] = None,
fields: Optional[Collection[str]] = None,
):
"""
Get selection query with filter depending upon topics provided
Expand Down Expand Up @@ -99,7 +99,7 @@ def assert_not_null(tweet: Optional[ModelType], id: PositiveInt, Model: ModelTyp


def get_scalar_select(
Model: ModelType, fields: Optional[List[str]] = None
Model: ModelType, fields: Optional[Collection[str]] = None
) -> Select[tuple]:
"""
Get a select statement for the Model with others column
Expand Down
48 changes: 48 additions & 0 deletions server/app/tweets_common/routes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from datetime import date
from typing import List, Optional

from fastapi import Depends, Query
from nltk import FreqDist
from sqlmodel import Session, select, union_all

from ..database import get_session
from . import router
from .helper_functions import get_filtered_selection
from .models import PseudoTweet, Topics, Tweet
from .types import Month
from .word_cloud_helper import word_tokenize_nepali


@router.get("/")
def get_word_cloud(
    topics: Optional[List[Topics]] = Query(None),
    day: Optional[date] = None,
    month: Optional[Month] = None,
    session: Session = Depends(get_session),
):
    """
    Return the 100 most common tokens (with their counts) found in the text
    of tweets and pseudo tweets.

    `topics`, `day` and `month` are forwarded unchanged to
    `get_filtered_selection` for both models.
    """
    # Only the text column is needed to count words
    columns = ("text",)

    # Build the same filtered selection for each model, then stack the rows
    selections = [
        get_filtered_selection(topics, Model, day, month, columns)
        for Model in (Tweet, PseudoTweet)
    ]
    merged_columns = union_all(*selections).subquery().c

    # The text column is picked by name here; update if `columns` changes
    texts = session.exec(select(merged_columns.text)).all()

    # Tokenize every text lazily and count all tokens in a single pass
    word_freq = FreqDist(
        token for text in texts for token in word_tokenize_nepali(text)
    )

    return word_freq.most_common(100)
47 changes: 47 additions & 0 deletions server/app/tweets_common/word_cloud_helper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import re
from typing import AnyStr

from nltk.tokenize import word_tokenize

from . import STOP_WORDS

# Compiled pattern matching one or more consecutive characters drawn from the
# code-point ranges below; remove_emojis() deletes every such run.
#
# Every range starts at U+200D or above, so Devanagari (U+0900-U+097F) and
# ASCII are never matched.
#
# NOTE(review): several ranges overlap. "\U000024C2-\U0001F251" and
# "\U00010000-\U0010ffff" together already contain most of the other entries,
# and the former also spans non-emoji BMP characters above U+24C2 (for example
# CJK ideographs at U+4E00-U+9FFF). Confirm that this breadth is intended.
emoj_regex = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicator symbols (flags)
    "\U00002500-\U00002BEF"  # box drawing through misc symbols and arrows (not Chinese characters)
    "\U00002702-\U000027B0"  # dingbats
    "\U00002702-\U000027B0"  # duplicate of the previous range (no effect)
    "\U000024C2-\U0001F251"  # very wide range, see the NOTE above
    "\U0001f926-\U0001f937"  # gesture emoji (face palm through shrug)
    "\U00010000-\U0010ffff"  # every code point outside the Basic Multilingual Plane
    "\u2640-\u2642"  # female sign through male sign
    "\u2600-\u2B55"  # miscellaneous symbols through heavy large circle
    "\u200d"  # zero width joiner
    "\u23cf"  # eject symbol
    "\u23e9"  # fast-forward symbol
    "\u231a"  # watch
    "\ufe0f"  # variation selector-16 (emoji presentation), not dingbats
    "\u3030"  # wavy dash
    "]+",
    re.UNICODE,  # already the default for str patterns in Python 3
)


def remove_emojis(data: AnyStr) -> AnyStr:
    """
    Return `data` with every run of characters matched by `emoj_regex` removed.

    NOTE: `emoj_regex` is compiled from a str pattern, so although the
    signature is annotated with AnyStr, passing bytes raises TypeError.
    """
    return emoj_regex.sub("", data)


def word_tokenize_nepali(text: str):
    """
    Clean `text` and split it into tokens, dropping stop words.

    Cleaning steps, in order: strip emojis, blank out digit runs, blank out
    punctuation runs, collapse whitespace, then drop '#' characters.
    Returns a tuple of the tokens from `word_tokenize` that are not present
    in STOP_WORDS.
    """
    cleaned = remove_emojis(text)

    # Digit runs become a single space
    cleaned = re.sub(r"\d+", " ", cleaned)

    # Punctuation runs (including the danda and typographic quotes/dashes)
    # become a single space
    cleaned = re.sub(r"[,)({}[\]\.:;`_–\-``!‘’''“”?\-।/—%\|]+", " ", cleaned)

    # Collapse any whitespace run into one space
    cleaned = re.sub(r"\s+", " ", cleaned)

    # Drop hashtag markers. The underscore replacement is kept for parity,
    # although '_' has already been consumed by the punctuation class above.
    cleaned = cleaned.replace("#", "")
    cleaned = cleaned.replace("_", " ")

    kept = []
    for token in word_tokenize(cleaned):
        if token in STOP_WORDS:
            continue
        kept.append(token)
    return tuple(kept)
4 changes: 0 additions & 4 deletions server/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,7 @@ idna==3.3
iniconfig==1.1.1
joblib==1.1.0
nltk==3.7
numpy==1.22.3
packaging==21.3
pandas==1.4.2
passlib==1.7.4
pluggy==1.0.0
py==1.11.0
Expand All @@ -27,10 +25,8 @@ pydantic==1.9.0
PyJWT==2.3.0
pyparsing==3.0.8
pytest==7.1.1
python-dateutil==2.8.2
python-dotenv==0.20.0
python-multipart==0.0.5
pytz==2022.1
regex==2022.3.15
requests==2.27.1
six==1.16.0
Expand Down
Loading

0 comments on commit bb57db0

Please sign in to comment.