Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Word cloud for Backend #151

Merged
merged 13 commits into from
Apr 14, 2022
Merged
1 change: 1 addition & 0 deletions client/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
"@types/react-dom": "^18.0.0",
"axios": "^0.26.1",
"chart.js": "^3.7.1",
"chartjs-chart-wordcloud": "^3.7.0",
"chartjs-plugin-zoom": "^1.2.1",
"react": "^17.0.2",
"react-chartjs-2": "^4.0.1",
Expand Down
9 changes: 9 additions & 0 deletions client/src/components/WordCloud.jsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
import { useState } from "react";

const WordCloud = () => {
const [wordCount, setWordCount] = useState([]);
rabinadk1 marked this conversation as resolved.
Show resolved Hide resolved

return <div>WordCloud</div>;
};

export default WordCloud;
32 changes: 32 additions & 0 deletions client/yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -1967,6 +1967,18 @@
dependencies:
"@babel/types" "^7.3.0"

"@types/d3-cloud@^1.2.5":
version "1.2.5"
resolved "https://registry.yarnpkg.com/@types/d3-cloud/-/d3-cloud-1.2.5.tgz#0300bedc826aacd505ae6c41c5f8c4ab75c45135"
integrity sha512-vEIER9DsEBUOdpRiwCh3n1qE+cV6h4e1LhxhY2sLt+m8LPNAIkOOhTlqk0JDiBwD+ZPM8ynFAOU3AuPuVYBFBA==
dependencies:
"@types/d3" "^3"

"@types/d3@^3":
version "3.5.47"
resolved "https://registry.yarnpkg.com/@types/d3/-/d3-3.5.47.tgz#b81042fcb0195c583fc037bc857d161469a7d175"
integrity sha512-VkWIQoZXLFdcBGe5pdBKJmTU3fmpXvo/KV6ixvTzOMl1yJ2hbTXpfvsziag0kcaerPDwas2T0vxojwQG3YwivQ==

"@types/eslint@^7.28.2":
version "7.29.0"
resolved "https://registry.npmjs.org/@types/eslint/-/eslint-7.29.0.tgz"
Expand Down Expand Up @@ -3478,6 +3490,14 @@ chart.js@^3.7.1:
resolved "https://registry.yarnpkg.com/chart.js/-/chart.js-3.7.1.tgz#0516f690c6a8680c6c707e31a4c1807a6f400ada"
integrity sha512-8knRegQLFnPQAheZV8MjxIXc5gQEfDFD897BJgv/klO/vtIyFFmgMXrNfgrXpbTr/XbTturxRgxIXx/Y+ASJBA==

chartjs-chart-wordcloud@^3.7.0:
version "3.7.0"
resolved "https://registry.yarnpkg.com/chartjs-chart-wordcloud/-/chartjs-chart-wordcloud-3.7.0.tgz#81c1e92eb4aa30e9cc4c047261aeb8b3afc67daf"
integrity sha512-7LxEmbFoo94zwIAP2VIsQKaeCC0l+RaxagaxyeNQAY4mvLVY9AnbdLNIyC3hlCycezzT4Za8K7Gwoyc42CURtg==
dependencies:
"@types/d3-cloud" "^1.2.5"
d3-cloud "^1.2.5"

chartjs-plugin-zoom@^1.2.1:
version "1.2.1"
resolved "https://registry.yarnpkg.com/chartjs-plugin-zoom/-/chartjs-plugin-zoom-1.2.1.tgz#7e350ba20d907f397d0c055239dcc67d326df705"
Expand Down Expand Up @@ -4217,6 +4237,18 @@ cyclist@^1.0.1:
resolved "https://registry.npmjs.org/cyclist/-/cyclist-1.0.1.tgz"
integrity sha1-WW6WmP0MgOEgOMK4LW6xs1tiJNk=

d3-cloud@^1.2.5:
version "1.2.5"
resolved "https://registry.yarnpkg.com/d3-cloud/-/d3-cloud-1.2.5.tgz#3e91564f2d27fba47fcc7d812eb5081ea24c603d"
integrity sha512-4s2hXZgvs0CoUIw31oBAGrHt9Kt/7P9Ik5HIVzISFiWkD0Ga2VLAuO/emO/z1tYIpE7KG2smB4PhMPfFMJpahw==
dependencies:
d3-dispatch "^1.0.3"

d3-dispatch@^1.0.3:
version "1.0.6"
resolved "https://registry.yarnpkg.com/d3-dispatch/-/d3-dispatch-1.0.6.tgz#00d37bcee4dd8cd97729dd893a0ac29caaba5d58"
integrity sha512-fVjoElzjhCEy+Hbn8KygnmMS7Or0a9sI2UzGwoB7cCtvI1XpVN9GpoYlnb3xt2YV66oXYb1fLJ8GMvP4hdU1RA==

d@1, d@^1.0.1:
version "1.0.1"
resolved "https://registry.npmjs.org/d/-/d-1.0.1.tgz"
Expand Down
2 changes: 2 additions & 0 deletions server/app/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from .auth import router as auth_router
from .pseudo_tweets import router as pseudo_router
from .tweets import router as tweets_router
from .tweets_common import router as tweets_common_router

from .database import create_tables # isort: skip

Expand All @@ -27,6 +28,7 @@ def on_startup():
# Register every feature router on the application.
app.include_router(auth_router)
app.include_router(pseudo_router)
app.include_router(tweets_router)
app.include_router(tweets_common_router)


# Cache the output for maximum 10 items
Expand Down
10 changes: 10 additions & 0 deletions server/app/tweets_common/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from fastapi import APIRouter

# Router for endpoints shared across tweet types, mounted under /commons.
router = APIRouter(prefix="/commons", tags=["commons"])

# Load the stopword set once at import time.
# NOTE(review): the path is resolved against the process's current working
# directory — confirm the server is always started from the directory that
# contains stopwords.txt, or resolve it relative to __file__ instead.
with open("stopwords.txt", mode="r", encoding="utf-8") as fp:
    STOP_WORDS = set(line.strip() for line in fp)

# Import all routes so they register themselves on the router above.
# Must stay at the bottom to avoid a circular import with .routes.
from . import routes  # noqa
75 changes: 73 additions & 2 deletions server/app/tweets_common/helper_functions.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,18 @@
from typing import Any, Callable, Optional, Tuple, TypeVar

from typing import Any, Callable, Optional, Tuple, TypeVar, List, Union
from datetime import date
from fastapi import HTTPException
from pydantic import PositiveInt
from sqlmodel import Integer, Session, and_, func, not_, select, text, union_all

from .models import PseudoTweet, Topics, Tweet, TweetRead, TweetUpdate

import nltk
import re
import numpy as np

from . import STOP_WORDS


# Make a Generic Type to get the original type completion back
ModelType = TypeVar("ModelType", Tweet, PseudoTweet)

Expand All @@ -28,6 +35,38 @@ def get_filtered_selection(filter_topic: Optional[Topics], Model: ModelType):
return selection


def get_filtered_column_selection(
    filter_topic: Optional[Topics],
    Model: ModelType,
    fields: List[str],
):
    """
    Build a selection of the given Model columns, optionally filtered by topic.

    Args:
        filter_topic: topic to filter rows by; None selects all rows.
        Model: the Tweet or PseudoTweet model class.
        fields: names of the Model columns to select.

    Returns:
        A selectable restricted to the requested columns and topic.
    """

    def get_model_attr(field: str):
        """Convert a field name to the corresponding Model.field column."""
        return getattr(Model, field)

    tweet_attr = tuple(map(get_model_attr, fields))
    selection = select(*tweet_attr)

    if filter_topic is not None:
        if filter_topic == Topics.others:
            # "others" is a computed column, so it must be part of the
            # selection itself; filter on it by name via text().
            others_column = get_others_column(Model)
            selection = select(*tweet_attr, others_column)
            # renamed from `filter` to avoid shadowing the builtin
            topic_filter = text(Topics.others)
        else:
            # Regular topics are real model columns.
            topic_filter = getattr(Model, filter_topic)

        selection = selection.filter(topic_filter)

    return selection


def get_a_tweet(session: Session, tweet_id: PositiveInt, Model: ModelType) -> dict:
"""
Get a not-None tweet from the database with others column as a dictonary
Expand Down Expand Up @@ -158,3 +197,35 @@ def get_model_attr(attr: str):
)

return get_db_overview(session, all_model)

# Compiled once at module load instead of on every call.
# Matches emoji and related pictographic characters.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U00002500-\U00002BEF"  # box drawing, misc symbols, arrows
    "\U00002702-\U000027B0"  # dingbats (duplicate range removed)
    "\U000024C2-\U0001F251"
    "\U0001f926-\U0001f937"
    "\U00010000-\U0010ffff"
    "\u2640-\u2642"          # gender symbols
    "\u2600-\u2B55"
    "\u200d"                 # zero-width joiner
    "\u23cf"
    "\u23e9"
    "\u231a"
    "\ufe0f"                 # variation selector
    "\u3030"
    "]+",
    flags=re.UNICODE,
)


async def remove_emojis(data: str) -> str:
    """Return *data* with all emoji characters removed."""
    return _EMOJI_PATTERN.sub("", data)

async def word_tokenize_nepali(text: str):
    """
    Tokenize Nepali tweet text, dropping emojis, digits, punctuation,
    hashtag markers and stopwords.

    Args:
        text: raw tweet text.

    Returns:
        numpy array (object dtype) of the surviving tokens.
    """
    text = await remove_emojis(text)
    text = re.sub(r"\d+", " ", text)  # remove any digits
    text = text.replace("#", "")  # keep hashtag words, drop the marker
    # Strip punctuation (Latin and Devanagari). Underscores become spaces,
    # which also splits words_joined_with_underscores — so the old trailing
    # .replace("_", " ") was dead code and has been removed.
    text = re.sub(r"[,)({}[\]\.:;`_–\-``!‘’''“”?\-।/—%\|]+", " ", text)
    # Collapse runs of whitespace into a single space.
    text = re.sub(r"\s+", " ", text)
    tokens = [
        token
        for token in nltk.tokenize.word_tokenize(text)
        if token not in STOP_WORDS
    ]
    return np.array(tokens)
43 changes: 43 additions & 0 deletions server/app/tweets_common/routes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import typing
import pandas as pd
import numpy as np
import nltk


from typing import Optional

from fastapi import Depends
from sqlmodel import Session
from datetime import date, timedelta, datetime

from .models import Tweet, Topics, PseudoTweet
from . import router
from ..database import get_session
from .helper_functions import get_filtered_column_selection, word_tokenize_nepali


@router.get("/")
async def get_word_cloud(
    filter_topic: Optional[Topics] = None,
    filter_date: Optional[date] = None,  # TODO: not yet applied to the queries
    session: Session = Depends(get_session),
):
    """
    Get the word-count distribution over tweets and pseudo-tweets.

    Returns the 100 most common (word, count) pairs, optionally filtered
    by topic. Date filtering is accepted but not implemented yet.
    """
    # Bug fix: the first query must read from Tweet — the original passed
    # PseudoTweet for both, so real tweets were never counted.
    tweets = session.exec(
        get_filtered_column_selection(filter_topic, Tweet, ["text"])
    ).all()
    pseudo_tweets = session.exec(
        get_filtered_column_selection(filter_topic, PseudoTweet, ["text"])
    ).all()

    token_arrays = [
        await word_tokenize_nepali(text) for text in tweets + pseudo_tweets
    ]
    # np.hstack raises on an empty sequence; short-circuit to no tokens.
    tokens = (
        np.hstack(np.array(token_arrays, dtype=object)).tolist()
        if token_arrays
        else []
    )
    word_freq = nltk.FreqDist(tokens)

    return word_freq.most_common(100)
Loading