Merge pull request #151 from rabinadk1/word_cloud

Word cloud for Backend
naamiinepal · Apr 14, 2022 · bb57db0 · bb57db0
2 parents 3e2e7ac + c86acbc
commit bb57db0
Show file tree

Hide file tree

Showing 11 changed files with 644 additions and 12 deletions.
diff --git a/client/src/components/WordCloud.jsx b/client/src/components/WordCloud.jsx
@@ -0,0 +1,5 @@
+// Placeholder for the word-cloud view; currently renders static text only.
+function WordCloud() {
+  return <div>WordCloud</div>;
+}
+
+export default WordCloud;
diff --git a/client/yarn.lock b/client/yarn.lock
@@ -1967,6 +1967,18 @@
   dependencies:
     "@babel/types" "^7.3.0"
 
+"@types/d3-cloud@^1.2.5":
+  version "1.2.5"
+  resolved "https://registry.yarnpkg.com/@types/d3-cloud/-/d3-cloud-1.2.5.tgz#0300bedc826aacd505ae6c41c5f8c4ab75c45135"
+  integrity sha512-vEIER9DsEBUOdpRiwCh3n1qE+cV6h4e1LhxhY2sLt+m8LPNAIkOOhTlqk0JDiBwD+ZPM8ynFAOU3AuPuVYBFBA==
+  dependencies:
+    "@types/d3" "^3"
+
+"@types/d3@^3":
+  version "3.5.47"
+  resolved "https://registry.yarnpkg.com/@types/d3/-/d3-3.5.47.tgz#b81042fcb0195c583fc037bc857d161469a7d175"
+  integrity sha512-VkWIQoZXLFdcBGe5pdBKJmTU3fmpXvo/KV6ixvTzOMl1yJ2hbTXpfvsziag0kcaerPDwas2T0vxojwQG3YwivQ==
+
 "@types/eslint@^7.28.2":
   version "7.29.0"
   resolved "https://registry.npmjs.org/@types/eslint/-/eslint-7.29.0.tgz"
@@ -3478,6 +3490,14 @@ chart.js@^3.7.1:
   resolved "https://registry.yarnpkg.com/chart.js/-/chart.js-3.7.1.tgz#0516f690c6a8680c6c707e31a4c1807a6f400ada"
   integrity sha512-8knRegQLFnPQAheZV8MjxIXc5gQEfDFD897BJgv/klO/vtIyFFmgMXrNfgrXpbTr/XbTturxRgxIXx/Y+ASJBA==
 
+chartjs-chart-wordcloud@^3.7.0:
+  version "3.7.0"
+  resolved "https://registry.yarnpkg.com/chartjs-chart-wordcloud/-/chartjs-chart-wordcloud-3.7.0.tgz#81c1e92eb4aa30e9cc4c047261aeb8b3afc67daf"
+  integrity sha512-7LxEmbFoo94zwIAP2VIsQKaeCC0l+RaxagaxyeNQAY4mvLVY9AnbdLNIyC3hlCycezzT4Za8K7Gwoyc42CURtg==
+  dependencies:
+    "@types/d3-cloud" "^1.2.5"
+    d3-cloud "^1.2.5"
+
 chartjs-plugin-zoom@^1.2.1:
   version "1.2.1"
   resolved "https://registry.yarnpkg.com/chartjs-plugin-zoom/-/chartjs-plugin-zoom-1.2.1.tgz#7e350ba20d907f397d0c055239dcc67d326df705"
@@ -4217,6 +4237,18 @@ cyclist@^1.0.1:
   resolved "https://registry.npmjs.org/cyclist/-/cyclist-1.0.1.tgz"
   integrity sha1-WW6WmP0MgOEgOMK4LW6xs1tiJNk=
 
+d3-cloud@^1.2.5:
+  version "1.2.5"
+  resolved "https://registry.yarnpkg.com/d3-cloud/-/d3-cloud-1.2.5.tgz#3e91564f2d27fba47fcc7d812eb5081ea24c603d"
+  integrity sha512-4s2hXZgvs0CoUIw31oBAGrHt9Kt/7P9Ik5HIVzISFiWkD0Ga2VLAuO/emO/z1tYIpE7KG2smB4PhMPfFMJpahw==
+  dependencies:
+    d3-dispatch "^1.0.3"
+
+d3-dispatch@^1.0.3:
+  version "1.0.6"
+  resolved "https://registry.yarnpkg.com/d3-dispatch/-/d3-dispatch-1.0.6.tgz#00d37bcee4dd8cd97729dd893a0ac29caaba5d58"
+  integrity sha512-fVjoElzjhCEy+Hbn8KygnmMS7Or0a9sI2UzGwoB7cCtvI1XpVN9GpoYlnb3xt2YV66oXYb1fLJ8GMvP4hdU1RA==
+
 d@1, d@^1.0.1:
   version "1.0.1"
   resolved "https://registry.npmjs.org/d/-/d-1.0.1.tgz"

diff --git a/server/app/__init__.py b/server/app/__init__.py
@@ -9,6 +9,7 @@
 from .auth import router as auth_router
 from .pseudo_tweets import router as pseudo_router
 from .tweets import router as tweets_router
+from .tweets_common import router as tweets_common_router
 
 from .database import create_tables  # isort: skip
 
@@ -27,6 +28,7 @@ def on_startup():
 app.include_router(auth_router)
 app.include_router(pseudo_router)
 app.include_router(tweets_router)
+app.include_router(tweets_common_router)
 
 
 # Cache the output for maximum 10 items

diff --git a/server/app/pseudo_tweets/routes.py b/server/app/pseudo_tweets/routes.py
@@ -53,7 +53,7 @@ def read_pseudo_tweets(
     """
     Read pseudo tweets within the offset and limit
     """
-    selection = get_filtered_selection(topics, day, month, PseudoTweet)
+    selection = get_filtered_selection(topics, PseudoTweet, day, month)
 
     # others should be exclusively provided, hence the last check
     is_others = topics is not None and len(topics) and topics[0] == Topics.others

diff --git a/server/app/tweets/routes.py b/server/app/tweets/routes.py
@@ -42,7 +42,7 @@ def read_tweets(
     """
     Read tweets within the offset and limit
     """
-    selection = get_filtered_selection(topics, day, month, Tweet)
+    selection = get_filtered_selection(topics, Tweet, day, month)
 
     tweets = session.exec(
         selection.order_by(Tweet.id.desc()).offset(offset).limit(limit)

diff --git a/server/app/tweets_common/__init__.py b/server/app/tweets_common/__init__.py
@@ -0,0 +1,10 @@
+from fastapi import APIRouter
+
+router = APIRouter(prefix="/tweets_commons", tags=["tweets_commons"])
+
+# Load the stop words once at import time, one word per line.
+# The encoding is passed explicitly so decoding does not depend on the
+# platform locale (assumes the file is UTF-8 -- TODO confirm).
+# NOTE(review): the path is relative to the process working directory, not
+# to this package -- confirm the server is always started from the
+# directory that holds stopwords.txt.
+with open("stopwords.txt", encoding="utf-8") as fp:
+    STOP_WORDS = set(fp.read().splitlines())
+
+# Import all routes
+# Kept at the bottom: the routes module imports `router` from this package
+# (and, through word_cloud_helper, STOP_WORDS), so both must exist first.
+from . import routes  # noqa
diff --git a/server/app/tweets_common/helper_functions.py b/server/app/tweets_common/helper_functions.py
@@ -1,5 +1,5 @@
 from datetime import date
-from typing import Callable, List, Optional, Tuple, TypeVar
+from typing import Callable, Collection, List, Optional, Tuple, TypeVar
 
 from fastapi import HTTPException
 from pydantic import PositiveInt
@@ -14,11 +14,11 @@
 
 
 def get_filtered_selection(
-    topics: Optional[List[Topics]],
-    day: Optional[date],
-    month: Optional[Month],
+    topics: Optional[Collection[Topics]],
     Model: ModelType,
-    fields: Optional[List[str]] = None,
+    day: Optional[date] = None,
+    month: Optional[Month] = None,
+    fields: Optional[Collection[str]] = None,
 ):
     """
     Get selection query with filter depending upon topics provided
@@ -99,7 +99,7 @@ def assert_not_null(tweet: Optional[ModelType], id: PositiveInt, Model: ModelTyp
 
 
 def get_scalar_select(
-    Model: ModelType, fields: Optional[List[str]] = None
+    Model: ModelType, fields: Optional[Collection[str]] = None
 ) -> Select[tuple]:
     """
     Get a select statement for the Model with others column

diff --git a/server/app/tweets_common/routes.py b/server/app/tweets_common/routes.py
@@ -0,0 +1,48 @@
+from datetime import date
+from typing import List, Optional
+
+from fastapi import Depends, Query
+from nltk import FreqDist
+from sqlmodel import Session, select, union_all
+
+from ..database import get_session
+from . import router
+from .helper_functions import get_filtered_selection
+from .models import PseudoTweet, Topics, Tweet
+from .types import Month
+from .word_cloud_helper import word_tokenize_nepali
+
+
+@router.get("/")
+def get_word_cloud(
+    topics: Optional[List[Topics]] = Query(None),
+    day: Optional[date] = None,
+    month: Optional[Month] = None,
+    session: Session = Depends(get_session),
+):
+    """
+    Get the 100 most common words, with their counts, from the text of the
+    tweets and pseudo tweets selected by the given topics, day and month
+    """
+    # Only the text column is needed for counting words
+    columns = ("text",)
+
+    # Same filters applied to both tables, Tweet first
+    selections = [
+        get_filtered_selection(topics, Model, day, month, columns)
+        for Model in (Tweet, PseudoTweet)
+    ]
+    merged_columns = union_all(*selections).subquery().c
+
+    # The text column is referenced by name here; keep in sync with columns
+    texts = session.exec(select(merged_columns.text)).all()
+
+    # Each text yields a tuple of tokens; flatten them into a single list
+    all_tokens: List[str] = [
+        token for tokens in map(word_tokenize_nepali, texts) for token in tokens
+    ]
+
+    return FreqDist(all_tokens).most_common(100)
diff --git a/server/app/tweets_common/word_cloud_helper.py b/server/app/tweets_common/word_cloud_helper.py
@@ -0,0 +1,47 @@
+import re
+from typing import AnyStr
+
+from nltk.tokenize import word_tokenize
+
+from . import STOP_WORDS
+
+# Character class matching runs of emojis and other pictographic symbols.
+# NOTE(review): several ranges repeat or overlap (the dingbat range appears
+# twice, and \U000024C2-\U0001F251 already spans many of the others). That is
+# harmless inside a character class, but the wide range also covers
+# non-emoji scripts such as CJK -- confirm that is intended.
+emoj_regex = re.compile(
+    "["
+    "\U0001F600-\U0001F64F"  # emoticons
+    "\U0001F300-\U0001F5FF"  # symbols & pictographs
+    "\U0001F680-\U0001F6FF"  # transport & map symbols
+    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
+    "\U00002500-\U00002BEF"  # box drawing through misc symbols and arrows
+    "\U00002702-\U000027B0"  # dingbats
+    "\U00002702-\U000027B0"
+    "\U000024C2-\U0001F251"
+    "\U0001f926-\U0001f937"
+    "\U00010000-\U0010ffff"  # every code point outside the BMP
+    "\u2640-\u2642"  # female and male signs
+    "\u2600-\u2B55"
+    "\u200d"  # zero width joiner
+    "\u23cf"
+    "\u23e9"
+    "\u231a"
+    "\ufe0f"  # variation selector-16 (emoji presentation)
+    "\u3030"
+    "]+",
+    re.UNICODE,
+)
+
+
+def remove_emojis(data: AnyStr) -> AnyStr:
+    """
+    Return data with every run of characters matched by emoj_regex removed
+    """
+    # NOTE(review): emoj_regex is compiled from a str pattern, so bytes input
+    # raises TypeError despite the AnyStr annotation.
+    return emoj_regex.sub("", data)
+
+
+def word_tokenize_nepali(text: str):
+    """
+    Clean the text and split it into a tuple of tokens without stop words
+
+    Emojis, digits and punctuation are stripped before tokenizing, and any
+    token found in STOP_WORDS is dropped.
+    """
+    cleaned = remove_emojis(text)
+    # digits are replaced by a separator
+    cleaned = re.sub(r"\d+", " ", cleaned)
+    # punctuation (including the Devanagari danda) becomes a separator
+    cleaned = re.sub(r"[,)({}[\]\.:;`_–\-``!‘’''“”?\-।/—%\|]+", " ", cleaned)
+    # squeeze runs of whitespace into a single space
+    cleaned = re.sub(r"\s+", " ", cleaned)
+    # remove #, then break words containing underscore
+    cleaned = cleaned.replace("#", "")
+    cleaned = cleaned.replace("_", " ")
+
+    kept = []
+    for word in word_tokenize(cleaned):
+        if word not in STOP_WORDS:
+            kept.append(word)
+    return tuple(kept)
diff --git a/server/requirements.txt b/server/requirements.txt
@@ -16,9 +16,7 @@ idna==3.3
 iniconfig==1.1.1
 joblib==1.1.0
 nltk==3.7
-numpy==1.22.3
 packaging==21.3
-pandas==1.4.2
 passlib==1.7.4
 pluggy==1.0.0
 py==1.11.0
@@ -27,10 +25,8 @@ pydantic==1.9.0
 PyJWT==2.3.0
 pyparsing==3.0.8
 pytest==7.1.1
-python-dateutil==2.8.2
 python-dotenv==0.20.0
 python-multipart==0.0.5
-pytz==2022.1
 regex==2022.3.15
 requests==2.27.1
 six==1.16.0