Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Trabalho - Raquel Vicente Picanço #1

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
imdb*

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Expand Down
54 changes: 54 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
asttokens==2.4.1
Bottleneck @ file:///C:/b/abs_f7un855idq/croot/bottleneck_1709069969633/work
Brotli @ file:///C:/b/abs_3d36mno480/croot/brotli-split_1714483178642/work
certifi @ file:///C:/b/abs_35d7n66oz9/croot/certifi_1707229248467/work/certifi
charset-normalizer @ file:///tmp/build/80754af9/charset-normalizer_1630003229654/work
colorama==0.4.6
comm==0.2.2
debugpy==1.8.1
decorator==5.1.1
executing==2.0.1
idna @ file:///C:/b/abs_aad84bnnw5/croot/idna_1714398896795/work
ipykernel==6.29.4
ipython==8.24.0
jedi==0.19.1
jupyter_client==8.6.1
jupyter_core==5.7.2
matplotlib-inline==0.1.7
mkl-fft @ file:///C:/Users/dev-admin/perseverance-python-buildout/croot/mkl_fft_1699473528480/work
mkl-random @ file:///C:/Users/dev-admin/perseverance-python-buildout/croot/mkl_random_1699473588250/work
mkl-service==2.4.0
nest-asyncio==1.6.0
numexpr @ file:///C:/Users/dev-admin/perseverance-python-buildout/croot/numexpr_1699503421264/work
numpy @ file:///C:/b/abs_c1ywpu18ar/croot/numpy_and_numpy_base_1708638681471/work/dist/numpy-1.26.4-cp312-cp312-win_amd64.whl#sha256=becc06674317799ad0165a939a7613809d0bee9bd328a1e4308c57c39cacf08c
packaging==24.0
pandas @ file:///C:/b/abs_3awk0iw2ab/croot/pandas_1709590545218/work/dist/pandas-2.2.1-cp312-cp312-win_amd64.whl#sha256=8f9837b9f672189b7e2df7dcf64b91243a78b0fad6e1125220b33a39b5c9e598
parso==0.8.4
platformdirs==4.2.2
prompt-toolkit==3.0.43
psutil==5.9.8
pure-eval==0.2.2
py4j @ file:///C:/Users/dev-admin/perseverance-python-buildout/croot/py4j_1699562107675/work
pyarrow @ file:///C:/b/abs_93i_y2dub4/croot/pyarrow_1707330894046/work/python
Pygments==2.18.0
PySocks @ file:///C:/Users/dev-admin/perseverance-python-buildout/croot/pysocks_1699473336188/work
pyspark @ file:///C:/Users/dev-admin/perseverance-python-buildout/croot/pyspark_1701815041952/work
python-dateutil==2.9.0.post0
pytz @ file:///C:/b/abs_6ap4tsz1ox/croot/pytz_1713974360290/work
pywin32==306
pyzmq==26.0.3
requests @ file:///C:/b/abs_474vaa3x9e/croot/requests_1707355619957/work
rm==2020.12.3  # NOTE(review): "rm" is not a real project dependency (junk freeze entry mirroring the old shell command) — confirm and drop
setuptools==69.5.1
six==1.16.0
stack-data==0.6.3
tornado==6.4
traitlets==5.14.3
tzdata @ file:///croot/python-tzdata_1690578112552/work
unzip==1.0.0  # NOTE(review): the "unzip" PyPI package is unrelated to the unzip tool and unused (the notebook uses zipfile) — confirm and drop
urllib3 @ file:///C:/b/abs_8e4z8_gh1l/croot/urllib3_1715636317140/work
values==2020.12.3  # NOTE(review): not imported anywhere; same suspicious date-version as the "rm" pin — likely junk, confirm and drop
wcwidth==0.2.13
wget==3.2  # NOTE(review): unused since the notebook downloads via requests — confirm and drop
wheel==0.43.0
win-inet-pton @ file:///C:/Users/dev-admin/perseverance-python-buildout/croot/win_inet_pton_1699472992992/work
211 changes: 153 additions & 58 deletions trabalho_big_data.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -20,29 +20,43 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!wget https://raw.githubusercontent.com/N-CPUninter/Big_Data/main/data/imdb-reviews-pt-br.zip -O imdb-reviews-pt-br.zip\n",
"!unzip imdb-reviews-pt-br.zip\n",
"!rm imdb-reviews-pt-br.zip"
]
},
{
"cell_type": "markdown",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloaded imdb-reviews-pt-br.zip successfully!\n",
"Extracted files from imdb-reviews-pt-br.zip\n",
"Removed imdb-reviews-pt-br.zip\n"
]
}
],
"source": [
"## Instalação manual das dependências para uso do pyspark no Google Colab"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!pip install pyspark"
"import os\n",
"import requests\n",
"import zipfile\n",
"\n",
"# Download the IMDB pt-br reviews dataset, extract it, then delete the zip.\n",
"url = \"https://raw.githubusercontent.com/N-CPUninter/Big_Data/main/data/imdb-reviews-pt-br.zip\"\n",
"filename = \"imdb-reviews-pt-br.zip\"\n",
"\n",
"response = requests.get(url, stream=True)\n",
"\n",
"if response.status_code == 200:\n",
"    # Stream to disk in 1 KiB chunks so the whole zip is never held in memory.\n",
"    with open(filename, \"wb\") as f:\n",
"        for chunk in response.iter_content(1024):\n",
"            f.write(chunk)\n",
"    # Fix: interpolate the real filename (the f-string held the literal text\n",
"    # '(unknown)'; the recorded output shows the filename was intended).\n",
"    print(f\"Downloaded {filename} successfully!\")\n",
"else:\n",
"    print(f\"Error downloading file: {response.status_code}\")\n",
"\n",
"with zipfile.ZipFile(filename, 'r') as zip_ref:\n",
"    zip_ref.extractall()\n",
"    print(\"Extracted files from\", filename)\n",
"\n",
"# Remove the archive once its contents are extracted.\n",
"os.remove(filename)\n",
"print(f\"Removed {filename}\")"
]
},
{
Expand All @@ -54,16 +68,28 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Iniciando spark session para Raquel - RU 3803786\n",
"Sessão iniciado com sucesso! 🚀\n"
]
}
],
"source": [
"from pyspark.sql import SparkSession\n",
"\n",
"appName = \"PySpark Trabalho de Big Data\"\n",
"MEU_RU = \"3803786\"\n",
"appName = f\"PySpark Trabalho de Big Data - {MEU_RU}\"\n",
"master = \"local\"\n",
"\n",
"spark = SparkSession.builder.appName(appName).master(master).getOrCreate()"
"print(f\"Iniciando spark session para Raquel - RU {MEU_RU}\")\n",
"spark: SparkSession = SparkSession.builder.appName(appName).master(master).getOrCreate()\n",
"print(\"Sessão iniciado com sucesso! 🚀\")"
]
},
{
Expand All @@ -75,14 +101,25 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"imdb_df = spark.read.csv('imdb-reviews-pt-br.csv', \n",
" header=True, \n",
" quote=\"\\\"\", \n",
" escape=\"\\\"\", \n",
"from pyspark.sql import DataFrame\n",
"from pyspark.sql.types import StructType, StructField, StringType\n",
"\n",
"# Explicit schema for imdb-reviews-pt-br.csv: four nullable string columns,\n",
"# matching the CSV header names (ids are cast to int later, where needed).\n",
"schema = StructType([\n",
"    StructField(\"id\", StringType(), True),\n",
"    StructField(\"text_en\", StringType(), True),\n",
"    StructField(\"text_pt\", StringType(), True),\n",
"    StructField(\"sentiment\", StringType(), True),\n",
"])\n",
"\n",
"\n",
"# Fix: actually pass schema= (the StructType above was defined but unused).\n",
"# All-string columns match the default non-inferSchema read, so behavior for\n",
"# existing callers is unchanged.\n",
"imdb_df: DataFrame = spark.read.csv('imdb-reviews-pt-br.csv',\n",
"                        header=True,\n",
"                        schema=schema,\n",
"                        quote=\"\\\"\",\n",
"                        escape=\"\\\"\",\n",
"                        encoding=\"UTF-8\")"
]
},
Expand All @@ -103,14 +140,17 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def map1(x):\n",
" # Coloque aqui o seu código para retornar a tupla necessária.\n",
" # Apague a linha abaixo para iniciar seu código.\n",
" pass"
"from pyspark.sql import DataFrame\n",
"\n",
"def filter_negative_reviews(data: DataFrame) -> DataFrame:\n",
"    \"\"\"Keep only the rows of *data* whose sentiment column equals 'neg'.\"\"\"\n",
"    MEU_RU = \"3803786\"\n",
"    print(f\"Meu RU é {MEU_RU}\")\n",
"\n",
"    # where() is an exact alias of filter(); attribute access resolves the\n",
"    # same 'sentiment' Column as bracket indexing did.\n",
"    is_negative = data.sentiment == \"neg\"\n",
"    return data.where(is_negative)"
]
},
{
Expand All @@ -124,14 +164,18 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def reduceByKey1(x,y):\n",
" # Coloque aqui o seu código para retornar o resultado necessário.\n",
" # Apague a linha abaixo para iniciar seu código.\n",
" pass"
"from pyspark.sql import DataFrame\n",
"# NOTE: 'sum' here deliberately shadows the builtin with Spark's aggregate.\n",
"from pyspark.sql.functions import col, sum\n",
"\n",
"def sum_negative_ids(reviews: DataFrame) -> DataFrame:\n",
"    \"\"\"Return a one-row DataFrame holding the sum of the (int-cast) id column.\"\"\"\n",
"    MEU_RU = \"3803786\"\n",
"    print(f\"Lembrando que meu RU é {MEU_RU}\")\n",
"\n",
"    # id is read as a string column, so cast it to int before aggregating.\n",
"    ids_as_int = reviews.withColumn(\"id\", col(\"id\").cast(\"int\"))\n",
"    return ids_as_int.select(sum(\"id\"))"
]
},
{
Expand All @@ -143,12 +187,24 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Meu RU é 3803786\n",
"Lembrando que meu RU é 3803786\n",
"Soma de IDs das reviews negativas: 459568555\n"
]
}
],
"source": [
"# Coloque aqui a sua linha de código para aplicar o map/reduce no seu \n",
"# dataframe spark e realize o collect() ao final para visualizar os dados.\n"
"negative_reviews = filter_negative_reviews(imdb_df)\n",
"sum_of_negative_ids = sum_negative_ids(negative_reviews).collect()[0][0]\n",
"\n",
"print(f\"Soma de IDs das reviews negativas: {sum_of_negative_ids}\")"
]
},
{
Expand All @@ -168,14 +224,18 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def map2(x):\n",
" # Coloque aqui o seu código para retornar a tupla necessária.\n",
" # Apague a linha abaixo para iniciar seu código.\n",
" pass"
"from pyspark.sql import DataFrame\n",
"from pyspark.sql.functions import col, split, size\n",
"\n",
"def map_sentiment_to_word_count(data: DataFrame) -> DataFrame:\n",
"    \"\"\"Map each review to its sentiment plus per-language word counts.\"\"\"\n",
"    MEU_RU = \"3803786\"\n",
"    print(f\"Oi! Sou Raquel e meu RU é {MEU_RU}\")\n",
"\n",
"    # Word count = number of whitespace-delimited tokens in each text column.\n",
"    en_words = size(split(col(\"text_en\"), \"\\\\s+\")).alias(\"text_en_word_count\")\n",
"    pt_words = size(split(col(\"text_pt\"), \"\\\\s+\")).alias(\"text_pt_word_count\")\n",
"    return data.select(col(\"sentiment\"), en_words, pt_words)"
]
},
{
Expand All @@ -189,14 +249,17 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"def reduceByKey2(x,y):\n",
" # Coloque aqui o seu código para retornar o resultado necessário.\n",
" # Apague a linha abaixo para iniciar seu código.\n",
" pass"
"from pyspark.sql import DataFrame\n",
"# Fix: import Spark's sum explicitly. This cell previously relied on\n",
"# 'from pyspark.sql.functions import col, sum' having run in an earlier cell;\n",
"# with the builtin sum, agg(sum(\"text_en_word_count\")) raises TypeError.\n",
"from pyspark.sql.functions import sum\n",
"\n",
"def reduce_word_count_by_sentiment(sentiment_word_counts: DataFrame) -> DataFrame:\n",
"    \"\"\"Reduce step: total the per-review word counts for each sentiment.\"\"\"\n",
"    MEU_RU = \"3803786\"\n",
"    print(f\"Já falei que meu RU é {MEU_RU}?\")\n",
"\n",
"    return sentiment_word_counts.groupBy(\"sentiment\").agg(sum(\"text_en_word_count\").alias(\"total_text_en_words\"), sum(\"text_pt_word_count\").alias(\"total_text_pt_words\"))\n"
]
},
{
Expand All @@ -212,17 +275,49 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 9,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Meu RU é 3803786\n",
"Oi! Sou Raquel e meu RU é 3803786\n",
"Já falei que meu RU é 3803786?\n",
"Diferença entre a contagem de palavras: 54976 (Texto em PT - Texto em EN)\n"
]
}
],
"source": [
"# Coloque aqui suas linhas de código final\n"
"negative_data = filter_negative_reviews(imdb_df)\n",
"\n",
"sentiment_word_counts = map_sentiment_to_word_count(negative_data)\n",
"total_word_counts = reduce_word_count_by_sentiment(sentiment_word_counts)\n",
"result = total_word_counts.collect()[0]\n",
"word_count_difference = result[2] - result[1]\n",
"\n",
"print(f\"Diferença entre a contagem de palavras: {word_count_difference} (Texto em PT - Texto em EN)\")\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python"
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
},
"orig_nbformat": 4
},
Expand Down