diff --git a/client/src/components/WordCloud.jsx b/client/src/components/WordCloud.jsx new file mode 100644 index 00000000..e523cc6b --- /dev/null +++ b/client/src/components/WordCloud.jsx @@ -0,0 +1,5 @@ +const WordCloud = () => { + return
WordCloud
; +}; + +export default WordCloud; diff --git a/client/yarn.lock b/client/yarn.lock index 657740a9..0416f301 100644 --- a/client/yarn.lock +++ b/client/yarn.lock @@ -1967,6 +1967,18 @@ dependencies: "@babel/types" "^7.3.0" +"@types/d3-cloud@^1.2.5": + version "1.2.5" + resolved "https://registry.yarnpkg.com/@types/d3-cloud/-/d3-cloud-1.2.5.tgz#0300bedc826aacd505ae6c41c5f8c4ab75c45135" + integrity sha512-vEIER9DsEBUOdpRiwCh3n1qE+cV6h4e1LhxhY2sLt+m8LPNAIkOOhTlqk0JDiBwD+ZPM8ynFAOU3AuPuVYBFBA== + dependencies: + "@types/d3" "^3" + +"@types/d3@^3": + version "3.5.47" + resolved "https://registry.yarnpkg.com/@types/d3/-/d3-3.5.47.tgz#b81042fcb0195c583fc037bc857d161469a7d175" + integrity sha512-VkWIQoZXLFdcBGe5pdBKJmTU3fmpXvo/KV6ixvTzOMl1yJ2hbTXpfvsziag0kcaerPDwas2T0vxojwQG3YwivQ== + "@types/eslint@^7.28.2": version "7.29.0" resolved "https://registry.npmjs.org/@types/eslint/-/eslint-7.29.0.tgz" @@ -3478,6 +3490,14 @@ chart.js@^3.7.1: resolved "https://registry.yarnpkg.com/chart.js/-/chart.js-3.7.1.tgz#0516f690c6a8680c6c707e31a4c1807a6f400ada" integrity sha512-8knRegQLFnPQAheZV8MjxIXc5gQEfDFD897BJgv/klO/vtIyFFmgMXrNfgrXpbTr/XbTturxRgxIXx/Y+ASJBA== +chartjs-chart-wordcloud@^3.7.0: + version "3.7.0" + resolved "https://registry.yarnpkg.com/chartjs-chart-wordcloud/-/chartjs-chart-wordcloud-3.7.0.tgz#81c1e92eb4aa30e9cc4c047261aeb8b3afc67daf" + integrity sha512-7LxEmbFoo94zwIAP2VIsQKaeCC0l+RaxagaxyeNQAY4mvLVY9AnbdLNIyC3hlCycezzT4Za8K7Gwoyc42CURtg== + dependencies: + "@types/d3-cloud" "^1.2.5" + d3-cloud "^1.2.5" + chartjs-plugin-zoom@^1.2.1: version "1.2.1" resolved "https://registry.yarnpkg.com/chartjs-plugin-zoom/-/chartjs-plugin-zoom-1.2.1.tgz#7e350ba20d907f397d0c055239dcc67d326df705" @@ -4217,6 +4237,18 @@ cyclist@^1.0.1: resolved "https://registry.npmjs.org/cyclist/-/cyclist-1.0.1.tgz" integrity sha1-WW6WmP0MgOEgOMK4LW6xs1tiJNk= +d3-cloud@^1.2.5: + version "1.2.5" + resolved 
"https://registry.yarnpkg.com/d3-cloud/-/d3-cloud-1.2.5.tgz#3e91564f2d27fba47fcc7d812eb5081ea24c603d" + integrity sha512-4s2hXZgvs0CoUIw31oBAGrHt9Kt/7P9Ik5HIVzISFiWkD0Ga2VLAuO/emO/z1tYIpE7KG2smB4PhMPfFMJpahw== + dependencies: + d3-dispatch "^1.0.3" + +d3-dispatch@^1.0.3: + version "1.0.6" + resolved "https://registry.yarnpkg.com/d3-dispatch/-/d3-dispatch-1.0.6.tgz#00d37bcee4dd8cd97729dd893a0ac29caaba5d58" + integrity sha512-fVjoElzjhCEy+Hbn8KygnmMS7Or0a9sI2UzGwoB7cCtvI1XpVN9GpoYlnb3xt2YV66oXYb1fLJ8GMvP4hdU1RA== + d@1, d@^1.0.1: version "1.0.1" resolved "https://registry.npmjs.org/d/-/d-1.0.1.tgz" diff --git a/server/app/__init__.py b/server/app/__init__.py index 9113d8f6..948cd01c 100644 --- a/server/app/__init__.py +++ b/server/app/__init__.py @@ -9,6 +9,7 @@ from .auth import router as auth_router from .pseudo_tweets import router as pseudo_router from .tweets import router as tweets_router +from .tweets_common import router as tweets_common_router from .database import create_tables # isort: skip @@ -27,6 +28,7 @@ def on_startup(): app.include_router(auth_router) app.include_router(pseudo_router) app.include_router(tweets_router) +app.include_router(tweets_common_router) # Cache the output for maximum 10 items diff --git a/server/app/pseudo_tweets/routes.py b/server/app/pseudo_tweets/routes.py index e4e48085..4e7e1aae 100644 --- a/server/app/pseudo_tweets/routes.py +++ b/server/app/pseudo_tweets/routes.py @@ -53,7 +53,7 @@ def read_pseudo_tweets( """ Read pseudo tweets within the offset and limit """ - selection = get_filtered_selection(topics, day, month, PseudoTweet) + selection = get_filtered_selection(topics, PseudoTweet, day, month) # others should be exclusively provided, hence the last check is_others = topics is not None and len(topics) and topics[0] == Topics.others diff --git a/server/app/tweets/routes.py b/server/app/tweets/routes.py index de6734ad..9e6b7844 100644 --- a/server/app/tweets/routes.py +++ b/server/app/tweets/routes.py @@ -42,7 +42,7 @@ 
"""Router and stop-word set shared by the tweets_common endpoints."""
from fastapi import APIRouter

router = APIRouter(prefix="/tweets_commons", tags=["tweets_commons"])

# Load the stop words once at import time, one word per line.
# The list is Devanagari text, so the encoding is pinned to UTF-8 instead of
# relying on the platform default, which is not UTF-8 on every system.
# NOTE(review): the path is resolved against the process working directory,
# so the server has to be started from the directory holding stopwords.txt.
with open("stopwords.txt", encoding="utf-8") as fp:
    STOP_WORDS = set(fp.read().splitlines())

# Import all routes. This stays at the bottom because the routes module
# imports `router` from this package, so `router` must already exist.
from . import routes  # noqa
@router.get("/")
def get_word_cloud(
    topics: Optional[List[Topics]] = Query(None),
    day: Optional[date] = None,
    month: Optional[Month] = None,
    session: Session = Depends(get_session),
):
    """
    Get the 100 most common words over tweets and pseudo tweets

    The same topics/day/month filter is applied to both tables through
    get_filtered_selection; the texts of both are then tokenized and counted
    together. The result is a list of (word, count) pairs, most frequent first.
    """
    text_only = ("text",)

    # One filtered selection per table, tweets first and pseudo tweets second
    selections = [
        get_filtered_selection(topics, Model, day, month, text_only)
        for Model in (Tweet, PseudoTweet)
    ]
    merged_columns = union_all(*selections).subquery().c

    # Only the text column is read here; adjust if more columns are needed
    texts = session.exec(select(merged_columns.text)).all()

    # Feed the tokens to the counter lazily instead of building a flat list
    frequencies = FreqDist(
        word for text in texts for word in word_tokenize_nepali(text)
    )

    return frequencies.most_common(100)
# Emoji, pictographs and related symbol / joiner code points that are stripped
# from the text before tokenizing.
emoj_regex = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U00002500-\U00002BEF"  # box drawing up to misc symbols and arrows
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2-\U0001F251"  # broad span; also covers everything in between
    "\U0001f926-\U0001f937"
    "\U00010000-\U0010ffff"  # every supplementary-plane code point
    "\u2640-\u2642"  # gender signs
    "\u2600-\u2B55"
    "\u200d"  # zero width joiner
    "\u23cf"
    "\u23e9"
    "\u231a"
    "\ufe0f"  # variation selector-16
    "\u3030"
    "]+",
    re.UNICODE,
)

# Compiled once at import time instead of being passed as strings per call.
_DIGITS_RE = re.compile(r"\d+")
_PUNCTUATION_RE = re.compile(r"[,)({}[\]\.:;`_–\-``!‘’''“”?\-।/—%\|]+")
_WHITESPACE_RE = re.compile(r"\s+")


def remove_emojis(data: str) -> str:
    """
    Return data with every run of characters matched by emoj_regex removed.

    Only str input is supported: the pattern is a str pattern, so passing
    bytes raises TypeError.
    """
    return emoj_regex.sub("", data)


def word_tokenize_nepali(text: str) -> tuple:
    """
    Clean a piece of text and split it into tokens, dropping stop words.

    Cleaning steps, in order: strip emojis, replace digit runs with a space,
    replace punctuation runs (underscore included) with a space, collapse
    whitespace, and remove "#". The cleaned text is then tokenized with
    nltk's word_tokenize and tokens present in STOP_WORDS are discarded.

    Returns a tuple of the remaining tokens in their original order.
    """
    text = remove_emojis(text)
    text = _DIGITS_RE.sub(" ", text)  # remove any digits
    # Underscores are part of this class, so words joined by "_" are already
    # split here; no separate underscore replacement is needed afterwards.
    text = _PUNCTUATION_RE.sub(" ", text)
    text = _WHITESPACE_RE.sub(" ", text)  # collapse runs of whitespace
    text = text.replace("#", "")  # keep the hashtag word, drop the marker
    return tuple(token for token in word_tokenize(text) if token not in STOP_WORDS)
+आफनो +आफू +आफूलाई +आफै +आफैँ +आफ्नै +आफ्नो +आयो +उ +उक्त +उदाहरण +उनको +उनलाई +उनले +उनि +उनी +उनीहरुको +उन्नाइस +उप +उसको +उसलाई +उसले +उहालाई +ऊ +एउटा +एउटै +एक +एकदम +एघार +ओठ +औ +औं +कता +कति +कतै +कम +कमसेकम +कसरि +कसरी +कसै +कसैको +कसैलाई +कसैले +कसैसँग +कस्तो +कहाँबाट +कहिलेकाहीं +का +काम +कारण +कि +किन +किनभने +कुन +कुनै +कुन्नी +कुरा +कृपया +के +केहि +केही +को +कोहि +कोहिपनि +कोही +कोहीपनि +क्रमशः +गए +गएको +गएर +गयौ +गरि +गरी +गरे +गरेका +गरेको +गरेर +गरौं +गर्छ +गर्छन् +गर्छु +गर्दा +गर्दै +गर्न +गर्नु +गर्नुपर्छ +गर्ने +गैर +घर +चार +चाले +चाहनुहुन्छ +चाहन्छु +चाहिं +चाहिए +चाहिंले +चाहीं +चाहेको +चाहेर +चोटी +चौथो +चौध +छ +छन +छन् +छु +छू +छैन +छैनन् +छौ +छौं +जता +जताततै +जना +जनाको +जनालाई +जनाले +जब +जबकि +जबकी +जसको +जसबाट +जसमा +जसरी +जसलाई +जसले +जस्ता +जस्तै +जस्तो +जस्तोसुकै +जहाँ +जान +जाने +जाहिर +जुन +जुनै +जे +जो +जोपनि +जोपनी +झैं +ठाउँमा +ठीक +ठूलो +त +तता +तत्काल +तथा +तथापि +तथापी +तदनुसार +तपाइ +तपाई +तपाईको +तब +तर +तर्फ +तल +तसरी +तापनि +तापनी +तिन +तिनि +तिनिहरुलाई +तिनी +तिनीहरु +तिनीहरुको +तिनीहरू +तिनीहरूको +तिनै +तिमी +तिर +तिरको +ती +तीन +तुरन्त +तुरुन्त +तुरुन्तै +तेश्रो +तेस्कारण +तेस्रो +तेह्र +तैपनि +तैपनी +त्यत्तिकै +त्यत्तिकैमा +त्यस +त्यसकारण +त्यसको +त्यसले +त्यसैले +त्यसो +त्यस्तै +त्यस्तो +त्यहाँ +त्यहिँ +त्यही +त्यहीँ +त्यहीं +त्यो +त्सपछि +त्सैले +थप +थरि +थरी +थाहा +थिए +थिएँ +थिएन +थियो +दर्ता +दश +दिए +दिएको +दिन +दिनुभएको +दिनुहुन्छ +दुइ +दुइवटा +दुई +देखि +देखिन्छ +देखियो +देखे +देखेको +देखेर +दोश्री +दोश्रो +दोस्रो +द्वारा +धन्न +धेरै +धौ +न +नगर्नु +नगर्नू +नजिकै +नत्र +नत्रभने +नभई +नभएको +नभनेर +नयाँ +नि +निकै +निम्ति +निम्न +निम्नानुसार +निर्दिष्ट +नै +नौ +पक्का +पक्कै +पछाडि +पछाडी +पछि +पछिल्लो +पछी +पटक +पनि +पन्ध्र +पर्छ +पर्थ्यो +पर्दैन +पर्ने +पर्नेमा +पर्याप्त +पहिले +पहिलो +पहिल्यै +पाँच +पांच +पाचौँ +पाँचौं +पिच्छे +पूर्व +पो +प्रति +प्रतेक +प्रत्यक +प्राय +प्लस +फरक +फेरि +फेरी +बढी +बताए +बने +बरु +बाट +बारे +बाहिर +बाहेक +बाह्र +बिच +बिचमा +बिरुद्ध +बिशेष +बिस +बीच +बीचमा +बीस +भए +भएँ +भएका 
+भएकालाई +भएको +भएन +भएर +भन +भने +भनेको +भनेर +भन् +भन्छन् +भन्छु +भन्दा +भन्दै +भन्नुभयो +भन्ने +भन्या +भयेन +भयो +भर +भरि +भरी +भा +भित्र +भित्री +भीत्र +म +मध्य +मध्ये +मलाई +मा +मात्र +मात्रै +माथि +माथी +मुख्य +मुनि +मुन्तिर +मेरो +मैले +यति +यथोचित +यदि +यद्ध्यपि +यद्यपि +यस +यसका +यसको +यसपछि +यसबाहेक +यसमा +यसरी +यसले +यसो +यस्तै +यस्तो +यहाँ +यहाँसम्म +यही +या +यी +यो +र +रही +रहेका +रहेको +रहेछ +राखे +राख्छ +राम्रो +रुपमा +रूप +रे +लगभग +लगायत +लाई +लाख +लागि +लागेको +ले +वटा +वरीपरी +वा +वाट +वापत +वास्तवमा +शायद +स +सक्छ +सक्ने +सँग +संग +सँगको +सँगसँगै +सँगै +संगै +सङ्ग +सङ्गको +सट्टा +सत्र +सधै +सबै +सबैको +सबैलाई +समय +समेत +सम्भव +सम्म +सय +सरह +सहित +सहितै +सही +साँच्चै +सात +साथ +साथै +सायद +सारा +सुनेको +सुनेर +सुरु +सुरुको +सुरुमै +सो +सोचेको +सोचेर +सोही +सोह्र +स्थित +स्पष्ट +हजार +हरे +हरेक +हामी +हामीले +हाम्रा +हाम्रो +हुँदैन +हुन +हुनत +हुनु +हुने +हुनेछ +हुन् +हुन्छ +हुन्थ्यो +हैन +हो +होइन +होकि +होला +`` +'' \ No newline at end of file