diff --git a/client/src/components/WordCloud.jsx b/client/src/components/WordCloud.jsx
new file mode 100644
index 00000000..e523cc6b
--- /dev/null
+++ b/client/src/components/WordCloud.jsx
@@ -0,0 +1,5 @@
+const WordCloud = () => {
+ return
WordCloud
;
+};
+
+export default WordCloud;
diff --git a/client/yarn.lock b/client/yarn.lock
index 657740a9..0416f301 100644
--- a/client/yarn.lock
+++ b/client/yarn.lock
@@ -1967,6 +1967,18 @@
dependencies:
"@babel/types" "^7.3.0"
+"@types/d3-cloud@^1.2.5":
+ version "1.2.5"
+ resolved "https://registry.yarnpkg.com/@types/d3-cloud/-/d3-cloud-1.2.5.tgz#0300bedc826aacd505ae6c41c5f8c4ab75c45135"
+ integrity sha512-vEIER9DsEBUOdpRiwCh3n1qE+cV6h4e1LhxhY2sLt+m8LPNAIkOOhTlqk0JDiBwD+ZPM8ynFAOU3AuPuVYBFBA==
+ dependencies:
+ "@types/d3" "^3"
+
+"@types/d3@^3":
+ version "3.5.47"
+ resolved "https://registry.yarnpkg.com/@types/d3/-/d3-3.5.47.tgz#b81042fcb0195c583fc037bc857d161469a7d175"
+ integrity sha512-VkWIQoZXLFdcBGe5pdBKJmTU3fmpXvo/KV6ixvTzOMl1yJ2hbTXpfvsziag0kcaerPDwas2T0vxojwQG3YwivQ==
+
"@types/eslint@^7.28.2":
version "7.29.0"
resolved "https://registry.npmjs.org/@types/eslint/-/eslint-7.29.0.tgz"
@@ -3478,6 +3490,14 @@ chart.js@^3.7.1:
resolved "https://registry.yarnpkg.com/chart.js/-/chart.js-3.7.1.tgz#0516f690c6a8680c6c707e31a4c1807a6f400ada"
integrity sha512-8knRegQLFnPQAheZV8MjxIXc5gQEfDFD897BJgv/klO/vtIyFFmgMXrNfgrXpbTr/XbTturxRgxIXx/Y+ASJBA==
+chartjs-chart-wordcloud@^3.7.0:
+ version "3.7.0"
+ resolved "https://registry.yarnpkg.com/chartjs-chart-wordcloud/-/chartjs-chart-wordcloud-3.7.0.tgz#81c1e92eb4aa30e9cc4c047261aeb8b3afc67daf"
+ integrity sha512-7LxEmbFoo94zwIAP2VIsQKaeCC0l+RaxagaxyeNQAY4mvLVY9AnbdLNIyC3hlCycezzT4Za8K7Gwoyc42CURtg==
+ dependencies:
+ "@types/d3-cloud" "^1.2.5"
+ d3-cloud "^1.2.5"
+
chartjs-plugin-zoom@^1.2.1:
version "1.2.1"
resolved "https://registry.yarnpkg.com/chartjs-plugin-zoom/-/chartjs-plugin-zoom-1.2.1.tgz#7e350ba20d907f397d0c055239dcc67d326df705"
@@ -4217,6 +4237,18 @@ cyclist@^1.0.1:
resolved "https://registry.npmjs.org/cyclist/-/cyclist-1.0.1.tgz"
integrity sha1-WW6WmP0MgOEgOMK4LW6xs1tiJNk=
+d3-cloud@^1.2.5:
+ version "1.2.5"
+ resolved "https://registry.yarnpkg.com/d3-cloud/-/d3-cloud-1.2.5.tgz#3e91564f2d27fba47fcc7d812eb5081ea24c603d"
+ integrity sha512-4s2hXZgvs0CoUIw31oBAGrHt9Kt/7P9Ik5HIVzISFiWkD0Ga2VLAuO/emO/z1tYIpE7KG2smB4PhMPfFMJpahw==
+ dependencies:
+ d3-dispatch "^1.0.3"
+
+d3-dispatch@^1.0.3:
+ version "1.0.6"
+ resolved "https://registry.yarnpkg.com/d3-dispatch/-/d3-dispatch-1.0.6.tgz#00d37bcee4dd8cd97729dd893a0ac29caaba5d58"
+ integrity sha512-fVjoElzjhCEy+Hbn8KygnmMS7Or0a9sI2UzGwoB7cCtvI1XpVN9GpoYlnb3xt2YV66oXYb1fLJ8GMvP4hdU1RA==
+
d@1, d@^1.0.1:
version "1.0.1"
resolved "https://registry.npmjs.org/d/-/d-1.0.1.tgz"
diff --git a/server/app/__init__.py b/server/app/__init__.py
index 9113d8f6..948cd01c 100644
--- a/server/app/__init__.py
+++ b/server/app/__init__.py
@@ -9,6 +9,7 @@
from .auth import router as auth_router
from .pseudo_tweets import router as pseudo_router
from .tweets import router as tweets_router
+from .tweets_common import router as tweets_common_router
from .database import create_tables # isort: skip
@@ -27,6 +28,7 @@ def on_startup():
app.include_router(auth_router)
app.include_router(pseudo_router)
app.include_router(tweets_router)
+app.include_router(tweets_common_router)
# Cache the output for maximum 10 items
diff --git a/server/app/pseudo_tweets/routes.py b/server/app/pseudo_tweets/routes.py
index e4e48085..4e7e1aae 100644
--- a/server/app/pseudo_tweets/routes.py
+++ b/server/app/pseudo_tweets/routes.py
@@ -53,7 +53,7 @@ def read_pseudo_tweets(
"""
Read pseudo tweets within the offset and limit
"""
- selection = get_filtered_selection(topics, day, month, PseudoTweet)
+ selection = get_filtered_selection(topics, PseudoTweet, day, month)
# others should be exclusively provided, hence the last check
is_others = topics is not None and len(topics) and topics[0] == Topics.others
diff --git a/server/app/tweets/routes.py b/server/app/tweets/routes.py
index de6734ad..9e6b7844 100644
--- a/server/app/tweets/routes.py
+++ b/server/app/tweets/routes.py
@@ -42,7 +42,7 @@ def read_tweets(
"""
Read tweets within the offset and limit
"""
- selection = get_filtered_selection(topics, day, month, Tweet)
+ selection = get_filtered_selection(topics, Tweet, day, month)
tweets = session.exec(
selection.order_by(Tweet.id.desc()).offset(offset).limit(limit)
diff --git a/server/app/tweets_common/__init__.py b/server/app/tweets_common/__init__.py
index e69de29b..04bdb509 100644
--- a/server/app/tweets_common/__init__.py
+++ b/server/app/tweets_common/__init__.py
@@ -0,0 +1,10 @@
+from fastapi import APIRouter
+
+router = APIRouter(prefix="/tweets_commons", tags=["tweets_commons"])
+
+# get all the stopwords
+with open("stopwords.txt") as fp:
+ STOP_WORDS = set(fp.read().splitlines())
+
+# Import all routes
+from . import routes # noqa
diff --git a/server/app/tweets_common/helper_functions.py b/server/app/tweets_common/helper_functions.py
index 5b309fad..e8933d63 100644
--- a/server/app/tweets_common/helper_functions.py
+++ b/server/app/tweets_common/helper_functions.py
@@ -1,5 +1,5 @@
from datetime import date
-from typing import Callable, List, Optional, Tuple, TypeVar
+from typing import Callable, Collection, List, Optional, Tuple, TypeVar
from fastapi import HTTPException
from pydantic import PositiveInt
@@ -14,11 +14,11 @@
def get_filtered_selection(
- topics: Optional[List[Topics]],
- day: Optional[date],
- month: Optional[Month],
+ topics: Optional[Collection[Topics]],
Model: ModelType,
- fields: Optional[List[str]] = None,
+ day: Optional[date] = None,
+ month: Optional[Month] = None,
+ fields: Optional[Collection[str]] = None,
):
"""
Get selection query with filter depending upon topics provided
@@ -99,7 +99,7 @@ def assert_not_null(tweet: Optional[ModelType], id: PositiveInt, Model: ModelTyp
def get_scalar_select(
- Model: ModelType, fields: Optional[List[str]] = None
+ Model: ModelType, fields: Optional[Collection[str]] = None
) -> Select[tuple]:
"""
Get a select statement for the Model with others column
diff --git a/server/app/tweets_common/routes.py b/server/app/tweets_common/routes.py
new file mode 100644
index 00000000..5d2ef214
--- /dev/null
+++ b/server/app/tweets_common/routes.py
@@ -0,0 +1,48 @@
+from datetime import date
+from typing import List, Optional
+
+from fastapi import Depends, Query
+from nltk import FreqDist
+from sqlmodel import Session, select, union_all
+
+from ..database import get_session
+from . import router
+from .helper_functions import get_filtered_selection
+from .models import PseudoTweet, Topics, Tweet
+from .types import Month
+from .word_cloud_helper import word_tokenize_nepali
+
+
+@router.get("/")
+def get_word_cloud(
+ topics: Optional[List[Topics]] = Query(None),
+ day: Optional[date] = None,
+ month: Optional[Month] = None,
+ session: Session = Depends(get_session),
+):
+ """
+ Get the word-count distribution within the given time range
+ """
+ fields = ("text",)
+
+ tweet_selection = get_filtered_selection(topics, Tweet, day, month, fields)
+ pseudo_tweet_selection = get_filtered_selection(
+ topics, PseudoTweet, day, month, fields
+ )
+
+ combined_model = union_all(tweet_selection, pseudo_tweet_selection).subquery().c
+
+ # Manually selected the text here, need to change if needed
+ combined_tweets = session.exec(select(combined_model.text)).all()
+
+ # It is a generator of tuples
+ two_dimensional_tokens = map(word_tokenize_nepali, combined_tweets)
+
+ flat_tokens: List[str] = []
+
+ for token in two_dimensional_tokens:
+ flat_tokens.extend(token)
+
+ word_freq = FreqDist(flat_tokens)
+
+ return word_freq.most_common(100)
diff --git a/server/app/tweets_common/word_cloud_helper.py b/server/app/tweets_common/word_cloud_helper.py
new file mode 100644
index 00000000..9e366c5f
--- /dev/null
+++ b/server/app/tweets_common/word_cloud_helper.py
@@ -0,0 +1,47 @@
+import re
+from typing import AnyStr
+
+from nltk.tokenize import word_tokenize
+
+from . import STOP_WORDS
+
+emoj_regex = re.compile( # matches runs of characters drawn from the ranges listed below
+ "["
+ "\U0001F600-\U0001F64F" # emoticons
+ "\U0001F300-\U0001F5FF" # symbols & pictographs
+ "\U0001F680-\U0001F6FF" # transport & map symbols
+ "\U0001F1E0-\U0001F1FF" # regional indicator symbols (flag pairs)
+ "\U00002500-\U00002BEF" # box drawing through misc. symbols and arrows (not CJK text)
+ "\U00002702-\U000027B0" # dingbats
+ "\U00002702-\U000027B0" # duplicate of the previous range; harmless inside a character class
+ "\U000024C2-\U0001F251" # very wide span: also matches every BMP character above U+24C2 (e.g. CJK, Hangul)
+ "\U0001f926-\U0001f937" # person-gesture emoji; subset of the supplementary-plane range below
+ "\U00010000-\U0010ffff" # every supplementary-plane code point (subsumes the U+1Fxxx ranges above)
+ "\u2640-\u2642" # female sign through male sign
+ "\u2600-\u2B55" # misc. symbols through U+2B55 (heavy large circle)
+ "\u200d" # zero-width joiner used in emoji sequences
+ "\u23cf" # eject symbol
+ "\u23e9" # fast-forward symbol
+ "\u231a" # watch
+ "\ufe0f" # variation selector-16 (emoji presentation)
+ "\u3030" # wavy dash
+ "]+", # one or more consecutive characters from the class
+ re.UNICODE, # already the default for str patterns in Python 3
+)
+
+
+def remove_emojis(data: AnyStr) -> AnyStr:
+ return re.sub(emoj_regex, "", data)
+
+
+def word_tokenize_nepali(text: str):
+ text = remove_emojis(text)
+ text = re.sub(r"\d+", " ", text) # remove any digits
+ text = re.sub(r"[,)({}[\]\.:;`_–\-``!‘’''“”?\-।/—%\|]+", " ", text)
+ text = re.sub(
+ r"\s+", " ", text
+ ) # replace multiple whitespaces with single whitespace
+ text = text.replace("#", "").replace(
+ "_", " "
+ ) # remove #, and break words containing underscore
+ return tuple(token for token in word_tokenize(text) if token not in STOP_WORDS)
diff --git a/server/requirements.txt b/server/requirements.txt
index 938538f4..fe8011c5 100644
--- a/server/requirements.txt
+++ b/server/requirements.txt
@@ -16,9 +16,7 @@ idna==3.3
iniconfig==1.1.1
joblib==1.1.0
nltk==3.7
-numpy==1.22.3
packaging==21.3
-pandas==1.4.2
passlib==1.7.4
pluggy==1.0.0
py==1.11.0
@@ -27,10 +25,8 @@ pydantic==1.9.0
PyJWT==2.3.0
pyparsing==3.0.8
pytest==7.1.1
-python-dateutil==2.8.2
python-dotenv==0.20.0
python-multipart==0.0.5
-pytz==2022.1
regex==2022.3.15
requests==2.27.1
six==1.16.0
diff --git a/server/stopwords.txt b/server/stopwords.txt
new file mode 100644
index 00000000..b24f0388
--- /dev/null
+++ b/server/stopwords.txt
@@ -0,0 +1,492 @@
+अक्सर
+अगाडि
+अगाडी
+अघि
+अझै
+अठार
+अथवा
+अनि
+अनुसार
+अन्तर्गत
+अन्य
+अन्यत्र
+अन्यथा
+अब
+अरु
+अरुलाई
+अरू
+अर्को
+अर्थात
+अर्थात्
+अलग
+अलि
+अवस्था
+अहिले
+आ
+आए
+आएका
+आएको
+आज
+आजको
+आठ
+आत्म
+आदि
+आदिलाई
+आफनो
+आफू
+आफूलाई
+आफै
+आफैँ
+आफ्नै
+आफ्नो
+आयो
+उ
+उक्त
+उदाहरण
+उनको
+उनलाई
+उनले
+उनि
+उनी
+उनीहरुको
+उन्नाइस
+उप
+उसको
+उसलाई
+उसले
+उहालाई
+ऊ
+एउटा
+एउटै
+एक
+एकदम
+एघार
+ओठ
+औ
+औं
+कता
+कति
+कतै
+कम
+कमसेकम
+कसरि
+कसरी
+कसै
+कसैको
+कसैलाई
+कसैले
+कसैसँग
+कस्तो
+कहाँबाट
+कहिलेकाहीं
+का
+काम
+कारण
+कि
+किन
+किनभने
+कुन
+कुनै
+कुन्नी
+कुरा
+कृपया
+के
+केहि
+केही
+को
+कोहि
+कोहिपनि
+कोही
+कोहीपनि
+क्रमशः
+गए
+गएको
+गएर
+गयौ
+गरि
+गरी
+गरे
+गरेका
+गरेको
+गरेर
+गरौं
+गर्छ
+गर्छन्
+गर्छु
+गर्दा
+गर्दै
+गर्न
+गर्नु
+गर्नुपर्छ
+गर्ने
+गैर
+घर
+चार
+चाले
+चाहनुहुन्छ
+चाहन्छु
+चाहिं
+चाहिए
+चाहिंले
+चाहीं
+चाहेको
+चाहेर
+चोटी
+चौथो
+चौध
+छ
+छन
+छन्
+छु
+छू
+छैन
+छैनन्
+छौ
+छौं
+जता
+जताततै
+जना
+जनाको
+जनालाई
+जनाले
+जब
+जबकि
+जबकी
+जसको
+जसबाट
+जसमा
+जसरी
+जसलाई
+जसले
+जस्ता
+जस्तै
+जस्तो
+जस्तोसुकै
+जहाँ
+जान
+जाने
+जाहिर
+जुन
+जुनै
+जे
+जो
+जोपनि
+जोपनी
+झैं
+ठाउँमा
+ठीक
+ठूलो
+त
+तता
+तत्काल
+तथा
+तथापि
+तथापी
+तदनुसार
+तपाइ
+तपाई
+तपाईको
+तब
+तर
+तर्फ
+तल
+तसरी
+तापनि
+तापनी
+तिन
+तिनि
+तिनिहरुलाई
+तिनी
+तिनीहरु
+तिनीहरुको
+तिनीहरू
+तिनीहरूको
+तिनै
+तिमी
+तिर
+तिरको
+ती
+तीन
+तुरन्त
+तुरुन्त
+तुरुन्तै
+तेश्रो
+तेस्कारण
+तेस्रो
+तेह्र
+तैपनि
+तैपनी
+त्यत्तिकै
+त्यत्तिकैमा
+त्यस
+त्यसकारण
+त्यसको
+त्यसले
+त्यसैले
+त्यसो
+त्यस्तै
+त्यस्तो
+त्यहाँ
+त्यहिँ
+त्यही
+त्यहीँ
+त्यहीं
+त्यो
+त्सपछि
+त्सैले
+थप
+थरि
+थरी
+थाहा
+थिए
+थिएँ
+थिएन
+थियो
+दर्ता
+दश
+दिए
+दिएको
+दिन
+दिनुभएको
+दिनुहुन्छ
+दुइ
+दुइवटा
+दुई
+देखि
+देखिन्छ
+देखियो
+देखे
+देखेको
+देखेर
+दोश्री
+दोश्रो
+दोस्रो
+द्वारा
+धन्न
+धेरै
+धौ
+न
+नगर्नु
+नगर्नू
+नजिकै
+नत्र
+नत्रभने
+नभई
+नभएको
+नभनेर
+नयाँ
+नि
+निकै
+निम्ति
+निम्न
+निम्नानुसार
+निर्दिष्ट
+नै
+नौ
+पक्का
+पक्कै
+पछाडि
+पछाडी
+पछि
+पछिल्लो
+पछी
+पटक
+पनि
+पन्ध्र
+पर्छ
+पर्थ्यो
+पर्दैन
+पर्ने
+पर्नेमा
+पर्याप्त
+पहिले
+पहिलो
+पहिल्यै
+पाँच
+पांच
+पाचौँ
+पाँचौं
+पिच्छे
+पूर्व
+पो
+प्रति
+प्रतेक
+प्रत्यक
+प्राय
+प्लस
+फरक
+फेरि
+फेरी
+बढी
+बताए
+बने
+बरु
+बाट
+बारे
+बाहिर
+बाहेक
+बाह्र
+बिच
+बिचमा
+बिरुद्ध
+बिशेष
+बिस
+बीच
+बीचमा
+बीस
+भए
+भएँ
+भएका
+भएकालाई
+भएको
+भएन
+भएर
+भन
+भने
+भनेको
+भनेर
+भन्
+भन्छन्
+भन्छु
+भन्दा
+भन्दै
+भन्नुभयो
+भन्ने
+भन्या
+भयेन
+भयो
+भर
+भरि
+भरी
+भा
+भित्र
+भित्री
+भीत्र
+म
+मध्य
+मध्ये
+मलाई
+मा
+मात्र
+मात्रै
+माथि
+माथी
+मुख्य
+मुनि
+मुन्तिर
+मेरो
+मैले
+यति
+यथोचित
+यदि
+यद्ध्यपि
+यद्यपि
+यस
+यसका
+यसको
+यसपछि
+यसबाहेक
+यसमा
+यसरी
+यसले
+यसो
+यस्तै
+यस्तो
+यहाँ
+यहाँसम्म
+यही
+या
+यी
+यो
+र
+रही
+रहेका
+रहेको
+रहेछ
+राखे
+राख्छ
+राम्रो
+रुपमा
+रूप
+रे
+लगभग
+लगायत
+लाई
+लाख
+लागि
+लागेको
+ले
+वटा
+वरीपरी
+वा
+वाट
+वापत
+वास्तवमा
+शायद
+स
+सक्छ
+सक्ने
+सँग
+संग
+सँगको
+सँगसँगै
+सँगै
+संगै
+सङ्ग
+सङ्गको
+सट्टा
+सत्र
+सधै
+सबै
+सबैको
+सबैलाई
+समय
+समेत
+सम्भव
+सम्म
+सय
+सरह
+सहित
+सहितै
+सही
+साँच्चै
+सात
+साथ
+साथै
+सायद
+सारा
+सुनेको
+सुनेर
+सुरु
+सुरुको
+सुरुमै
+सो
+सोचेको
+सोचेर
+सोही
+सोह्र
+स्थित
+स्पष्ट
+हजार
+हरे
+हरेक
+हामी
+हामीले
+हाम्रा
+हाम्रो
+हुँदैन
+हुन
+हुनत
+हुनु
+हुने
+हुनेछ
+हुन्
+हुन्छ
+हुन्थ्यो
+हैन
+हो
+होइन
+होकि
+होला
+``
+''
\ No newline at end of file