Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Word cloud for Backend #151

Merged
merged 13 commits into from
Apr 14, 2022
Merged
1 change: 1 addition & 0 deletions client/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
"@types/react-dom": "^18.0.0",
"axios": "^0.26.1",
"chart.js": "^3.7.1",
"chartjs-chart-wordcloud": "^3.7.0",
"chartjs-plugin-zoom": "^1.2.1",
"react": "^17.0.2",
"react-chartjs-2": "^4.0.1",
Expand Down
9 changes: 9 additions & 0 deletions client/src/components/WordCloud.jsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
import { useState } from "react";

const WordCloud = () => {
const [wordCount, setWordCount] = useState([]);
rabinadk1 marked this conversation as resolved.
Show resolved Hide resolved

return <div>WordCloud</div>;
};

export default WordCloud;
32 changes: 32 additions & 0 deletions client/yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -1967,6 +1967,18 @@
dependencies:
"@babel/types" "^7.3.0"

"@types/d3-cloud@^1.2.5":
version "1.2.5"
resolved "https://registry.yarnpkg.com/@types/d3-cloud/-/d3-cloud-1.2.5.tgz#0300bedc826aacd505ae6c41c5f8c4ab75c45135"
integrity sha512-vEIER9DsEBUOdpRiwCh3n1qE+cV6h4e1LhxhY2sLt+m8LPNAIkOOhTlqk0JDiBwD+ZPM8ynFAOU3AuPuVYBFBA==
dependencies:
"@types/d3" "^3"

"@types/d3@^3":
version "3.5.47"
resolved "https://registry.yarnpkg.com/@types/d3/-/d3-3.5.47.tgz#b81042fcb0195c583fc037bc857d161469a7d175"
integrity sha512-VkWIQoZXLFdcBGe5pdBKJmTU3fmpXvo/KV6ixvTzOMl1yJ2hbTXpfvsziag0kcaerPDwas2T0vxojwQG3YwivQ==

"@types/eslint@^7.28.2":
version "7.29.0"
resolved "https://registry.npmjs.org/@types/eslint/-/eslint-7.29.0.tgz"
Expand Down Expand Up @@ -3478,6 +3490,14 @@ chart.js@^3.7.1:
resolved "https://registry.yarnpkg.com/chart.js/-/chart.js-3.7.1.tgz#0516f690c6a8680c6c707e31a4c1807a6f400ada"
integrity sha512-8knRegQLFnPQAheZV8MjxIXc5gQEfDFD897BJgv/klO/vtIyFFmgMXrNfgrXpbTr/XbTturxRgxIXx/Y+ASJBA==

chartjs-chart-wordcloud@^3.7.0:
version "3.7.0"
resolved "https://registry.yarnpkg.com/chartjs-chart-wordcloud/-/chartjs-chart-wordcloud-3.7.0.tgz#81c1e92eb4aa30e9cc4c047261aeb8b3afc67daf"
integrity sha512-7LxEmbFoo94zwIAP2VIsQKaeCC0l+RaxagaxyeNQAY4mvLVY9AnbdLNIyC3hlCycezzT4Za8K7Gwoyc42CURtg==
dependencies:
"@types/d3-cloud" "^1.2.5"
d3-cloud "^1.2.5"

chartjs-plugin-zoom@^1.2.1:
version "1.2.1"
resolved "https://registry.yarnpkg.com/chartjs-plugin-zoom/-/chartjs-plugin-zoom-1.2.1.tgz#7e350ba20d907f397d0c055239dcc67d326df705"
Expand Down Expand Up @@ -4217,6 +4237,18 @@ cyclist@^1.0.1:
resolved "https://registry.npmjs.org/cyclist/-/cyclist-1.0.1.tgz"
integrity sha1-WW6WmP0MgOEgOMK4LW6xs1tiJNk=

d3-cloud@^1.2.5:
version "1.2.5"
resolved "https://registry.yarnpkg.com/d3-cloud/-/d3-cloud-1.2.5.tgz#3e91564f2d27fba47fcc7d812eb5081ea24c603d"
integrity sha512-4s2hXZgvs0CoUIw31oBAGrHt9Kt/7P9Ik5HIVzISFiWkD0Ga2VLAuO/emO/z1tYIpE7KG2smB4PhMPfFMJpahw==
dependencies:
d3-dispatch "^1.0.3"

d3-dispatch@^1.0.3:
version "1.0.6"
resolved "https://registry.yarnpkg.com/d3-dispatch/-/d3-dispatch-1.0.6.tgz#00d37bcee4dd8cd97729dd893a0ac29caaba5d58"
integrity sha512-fVjoElzjhCEy+Hbn8KygnmMS7Or0a9sI2UzGwoB7cCtvI1XpVN9GpoYlnb3xt2YV66oXYb1fLJ8GMvP4hdU1RA==

d@1, d@^1.0.1:
version "1.0.1"
resolved "https://registry.npmjs.org/d/-/d-1.0.1.tgz"
Expand Down
2 changes: 2 additions & 0 deletions server/app/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from .auth import router as auth_router
from .pseudo_tweets import router as pseudo_router
from .tweets import router as tweets_router
from .tweets_common import router as tweets_common_router

from .database import create_tables # isort: skip

Expand All @@ -27,6 +28,7 @@ def on_startup():
# Register every feature router on the application.
app.include_router(auth_router)
app.include_router(pseudo_router)
app.include_router(tweets_router)
app.include_router(tweets_common_router)


# Cache the output for maximum 10 items
Expand Down
10 changes: 10 additions & 0 deletions server/app/tweets_common/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from fastapi import APIRouter

# Router for endpoints shared across tweet types, mounted under /commons.
router = APIRouter(prefix="/commons", tags=["commons"])

# Load the stopword set once at import time.
# NOTE(review): the path is resolved against the process's current working
# directory — confirm the server is always started from the directory that
# contains stopwords.txt, or resolve it relative to __file__ instead.
with open("stopwords.txt", mode="r", encoding="utf-8") as fp:
    STOP_WORDS = set(line.strip() for line in fp)

# Import all routes so they register themselves on the router above.
# Must stay at the bottom to avoid a circular import with .routes.
from . import routes  # noqa
75 changes: 73 additions & 2 deletions server/app/tweets_common/helper_functions.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,18 @@
from typing import Any, Callable, Optional, Tuple, TypeVar

from typing import Any, Callable, Optional, Tuple, TypeVar, List, Union
from datetime import date
from fastapi import HTTPException
from pydantic import PositiveInt
from sqlmodel import Integer, Session, and_, func, not_, select, text, union_all

from .models import PseudoTweet, Topics, Tweet, TweetRead, TweetUpdate

import nltk
import re
import numpy as np

from . import STOP_WORDS


# Make a Generic Type to get the original type completion back
ModelType = TypeVar("ModelType", Tweet, PseudoTweet)

Expand All @@ -28,6 +35,38 @@ def get_filtered_selection(filter_topic: Optional[Topics], Model: ModelType):
return selection


def get_filtered_column_selection(
    filter_topic: Optional[Topics],
    Model: ModelType,
    fields: List[str],
):
    """
    Build a selection of the given Model columns, optionally filtered by topic.

    Args:
        filter_topic: topic to filter rows by; None selects all rows.
        Model: the Tweet or PseudoTweet model class.
        fields: names of the Model columns to select.

    Returns:
        A selectable restricted to the requested columns and topic.
    """

    def get_model_attr(field: str):
        """Convert a field name to the corresponding Model.field column."""
        return getattr(Model, field)

    tweet_attr = tuple(map(get_model_attr, fields))
    selection = select(*tweet_attr)

    if filter_topic is not None:
        if filter_topic == Topics.others:
            # "others" is a computed column, so it must be part of the
            # selection itself; filter on it by name via text().
            others_column = get_others_column(Model)
            selection = select(*tweet_attr, others_column)
            # renamed from `filter` to avoid shadowing the builtin
            topic_filter = text(Topics.others)
        else:
            # Regular topics are real model columns.
            topic_filter = getattr(Model, filter_topic)

        selection = selection.filter(topic_filter)

    return selection


def get_a_tweet(session: Session, tweet_id: PositiveInt, Model: ModelType) -> dict:
"""
Get a not-None tweet from the database with others column as a dictonary
Expand Down Expand Up @@ -158,3 +197,35 @@ def get_model_attr(attr: str):
)

return get_db_overview(session, all_model)

# Compiled once at module load instead of on every call.
# Matches emoji and related pictographic characters.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U00002500-\U00002BEF"  # box drawing, misc symbols, arrows
    "\U00002702-\U000027B0"  # dingbats (duplicate range removed)
    "\U000024C2-\U0001F251"
    "\U0001f926-\U0001f937"
    "\U00010000-\U0010ffff"
    "\u2640-\u2642"          # gender symbols
    "\u2600-\u2B55"
    "\u200d"                 # zero-width joiner
    "\u23cf"
    "\u23e9"
    "\u231a"
    "\ufe0f"                 # variation selector
    "\u3030"
    "]+",
    flags=re.UNICODE,
)


async def remove_emojis(data: str) -> str:
    """Return *data* with all emoji characters removed."""
    return _EMOJI_PATTERN.sub("", data)

async def word_tokenize_nepali(text: str):
    """
    Tokenize Nepali tweet text, dropping emojis, digits, punctuation,
    hashtag markers and stopwords.

    Args:
        text: raw tweet text.

    Returns:
        numpy array (object dtype) of the surviving tokens.
    """
    text = await remove_emojis(text)
    text = re.sub(r"\d+", " ", text)  # remove any digits
    text = text.replace("#", "")  # keep hashtag words, drop the marker
    # Strip punctuation (Latin and Devanagari). Underscores become spaces,
    # which also splits words_joined_with_underscores — so the old trailing
    # .replace("_", " ") was dead code and has been removed.
    text = re.sub(r"[,)({}[\]\.:;`_–\-``!‘’''“”?\-।/—%\|]+", " ", text)
    # Collapse runs of whitespace into a single space.
    text = re.sub(r"\s+", " ", text)
    tokens = [
        token
        for token in nltk.tokenize.word_tokenize(text)
        if token not in STOP_WORDS
    ]
    return np.array(tokens)
43 changes: 43 additions & 0 deletions server/app/tweets_common/routes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import typing
import pandas as pd
import numpy as np
import nltk


from typing import Optional

from fastapi import Depends
from sqlmodel import Session
from datetime import date, timedelta, datetime

from .models import Tweet, Topics, PseudoTweet
from . import router
from ..database import get_session
from .helper_functions import get_filtered_column_selection, word_tokenize_nepali


@router.get("/")
async def get_word_cloud(
    filter_topic: Optional[Topics] = None,
    filter_date: Optional[date] = None,  # TODO: not yet applied to the queries
    session: Session = Depends(get_session),
):
    """
    Get the word-count distribution over tweets and pseudo-tweets.

    Returns the 100 most common (word, count) pairs, optionally filtered
    by topic. Date filtering is accepted but not implemented yet.
    """
    # Bug fix: the first query must read from Tweet — the original passed
    # PseudoTweet for both, so real tweets were never counted.
    tweets = session.exec(
        get_filtered_column_selection(filter_topic, Tweet, ["text"])
    ).all()
    pseudo_tweets = session.exec(
        get_filtered_column_selection(filter_topic, PseudoTweet, ["text"])
    ).all()

    token_arrays = [
        await word_tokenize_nepali(text) for text in tweets + pseudo_tweets
    ]
    # np.hstack raises on an empty sequence; short-circuit to no tokens.
    tokens = (
        np.hstack(np.array(token_arrays, dtype=object)).tolist()
        if token_arrays
        else []
    )
    word_freq = nltk.FreqDist(tokens)

    return word_freq.most_common(100)
Loading