diff --git a/.gitignore b/.gitignore index 68bc17f..3a2797f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +imdb* + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..311f598 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,54 @@ +asttokens==2.4.1 +Bottleneck @ file:///C:/b/abs_f7un855idq/croot/bottleneck_1709069969633/work +Brotli @ file:///C:/b/abs_3d36mno480/croot/brotli-split_1714483178642/work +certifi @ file:///C:/b/abs_35d7n66oz9/croot/certifi_1707229248467/work/certifi +charset-normalizer @ file:///tmp/build/80754af9/charset-normalizer_1630003229654/work +colorama==0.4.6 +comm==0.2.2 +debugpy==1.8.1 +decorator==5.1.1 +executing==2.0.1 +idna @ file:///C:/b/abs_aad84bnnw5/croot/idna_1714398896795/work +ipykernel==6.29.4 +ipython==8.24.0 +jedi==0.19.1 +jupyter_client==8.6.1 +jupyter_core==5.7.2 +matplotlib-inline==0.1.7 +mkl-fft @ file:///C:/Users/dev-admin/perseverance-python-buildout/croot/mkl_fft_1699473528480/work +mkl-random @ file:///C:/Users/dev-admin/perseverance-python-buildout/croot/mkl_random_1699473588250/work +mkl-service==2.4.0 +nest-asyncio==1.6.0 +numexpr @ file:///C:/Users/dev-admin/perseverance-python-buildout/croot/numexpr_1699503421264/work +numpy @ file:///C:/b/abs_c1ywpu18ar/croot/numpy_and_numpy_base_1708638681471/work/dist/numpy-1.26.4-cp312-cp312-win_amd64.whl#sha256=becc06674317799ad0165a939a7613809d0bee9bd328a1e4308c57c39cacf08c +packaging==24.0 +pandas @ file:///C:/b/abs_3awk0iw2ab/croot/pandas_1709590545218/work/dist/pandas-2.2.1-cp312-cp312-win_amd64.whl#sha256=8f9837b9f672189b7e2df7dcf64b91243a78b0fad6e1125220b33a39b5c9e598 +parso==0.8.4 +platformdirs==4.2.2 +prompt-toolkit==3.0.43 +psutil==5.9.8 +pure-eval==0.2.2 +py4j @ file:///C:/Users/dev-admin/perseverance-python-buildout/croot/py4j_1699562107675/work +pyarrow @ file:///C:/b/abs_93i_y2dub4/croot/pyarrow_1707330894046/work/python +Pygments==2.18.0 +PySocks @ 
file:///C:/Users/dev-admin/perseverance-python-buildout/croot/pysocks_1699473336188/work +pyspark @ file:///C:/Users/dev-admin/perseverance-python-buildout/croot/pyspark_1701815041952/work +python-dateutil==2.9.0.post0 +pytz @ file:///C:/b/abs_6ap4tsz1ox/croot/pytz_1713974360290/work +pywin32==306 +pyzmq==26.0.3 +requests @ file:///C:/b/abs_474vaa3x9e/croot/requests_1707355619957/work +rm==2020.12.3 +setuptools==69.5.1 +six==1.16.0 +stack-data==0.6.3 +tornado==6.4 +traitlets==5.14.3 +tzdata @ file:///croot/python-tzdata_1690578112552/work +unzip==1.0.0 +urllib3 @ file:///C:/b/abs_8e4z8_gh1l/croot/urllib3_1715636317140/work +values==2020.12.3 +wcwidth==0.2.13 +wget==3.2 +wheel==0.43.0 +win-inet-pton @ file:///C:/Users/dev-admin/perseverance-python-buildout/croot/win_inet_pton_1699472992992/work diff --git a/trabalho_big_data.ipynb b/trabalho_big_data.ipynb index f2a005c..5e50b8b 100644 --- a/trabalho_big_data.ipynb +++ b/trabalho_big_data.ipynb @@ -20,29 +20,43 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!wget https://raw.githubusercontent.com/N-CPUninter/Big_Data/main/data/imdb-reviews-pt-br.zip -O imdb-reviews-pt-br.zip\n", - "!unzip imdb-reviews-pt-br.zip\n", - "!rm imdb-reviews-pt-br.zip" - ] - }, - { - "cell_type": "markdown", + "execution_count": 1, "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloaded imdb-reviews-pt-br.zip successfully!\n", + "Extracted files from imdb-reviews-pt-br.zip\n", + "Removed imdb-reviews-pt-br.zip\n" + ] + } + ], "source": [ - "## Instalação manual das dependências para uso do pyspark no Google Colab" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!pip install pyspark" + "import os\n", + "import requests\n", + "import zipfile\n", + "\n", + "url = \"https://raw.githubusercontent.com/N-CPUninter/Big_Data/main/data/imdb-reviews-pt-br.zip\"\n", + 
"filename = \"imdb-reviews-pt-br.zip\"\n", + "\n", + "response = requests.get(url, stream=True)\n", + "\n", + "if response.status_code == 200:\n", + "    with open(filename, \"wb\") as f:\n", + "        for chunk in response.iter_content(1024):\n", + "            f.write(chunk)\n", + "    print(f\"Downloaded {filename} successfully!\")\n", + "else:\n", + "    print(f\"Error downloading file: {response.status_code}\")\n", + "\n", + "with zipfile.ZipFile(filename, 'r') as zip_ref:\n", + "    zip_ref.extractall()\n", + "    print(\"Extracted files from\", filename)\n", + "\n", + "os.remove(filename)\n", + "print(f\"Removed {filename}\")" ] }, { @@ -54,16 +68,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Iniciando spark session para Raquel - RU 3803786\n", + "Sessão iniciado com sucesso! 🚀\n" + ] + } + ], "source": [ "from pyspark.sql import SparkSession\n", "\n", - "appName = \"PySpark Trabalho de Big Data\"\n", + "MEU_RU = \"3803786\"\n", + "appName = f\"PySpark Trabalho de Big Data - {MEU_RU}\"\n", "master = \"local\"\n", "\n", - "spark = SparkSession.builder.appName(appName).master(master).getOrCreate()" + "print(f\"Iniciando spark session para Raquel - RU {MEU_RU}\")\n", + "spark: SparkSession = SparkSession.builder.appName(appName).master(master).getOrCreate()\n", + "print(\"Sessão iniciado com sucesso! 
🚀\")" ] }, { @@ -75,14 +101,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ - "imdb_df = spark.read.csv('imdb-reviews-pt-br.csv', \n", - " header=True, \n", - " quote=\"\\\"\", \n", - " escape=\"\\\"\", \n", + "from pyspark.sql import DataFrame\n", + "from pyspark.sql.types import StructType, StructField, StringType\n", + "\n", + "schema = StructType([\n", + " StructField(\"id\", StringType(), True),\n", + " StructField(\"text_en\", StringType(), True),\n", + " StructField(\"text_pt\", StringType(), True),\n", + " StructField(\"sentiment\", StringType(), True),\n", + "])\n", + "\n", + "\n", + "imdb_df: DataFrame = spark.read.csv('imdb-reviews-pt-br.csv',\n", + " header=True,\n", + " quote=\"\\\"\",\n", + " escape=\"\\\"\",\n", " encoding=\"UTF-8\")" ] }, @@ -103,14 +140,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ - "def map1(x):\n", - " # Coloque aqui o seu código para retornar a tupla necessária.\n", - " # Apague a linha abaixo para iniciar seu código.\n", - " pass" + "from pyspark.sql import DataFrame\n", + "\n", + "def filter_negative_reviews(data: DataFrame) -> DataFrame:\n", + " MEU_RU = \"3803786\"\n", + " print(f\"Meu RU é {MEU_RU}\")\n", + "\n", + " return data.filter(data[\"sentiment\"] == \"neg\")" ] }, { @@ -124,14 +164,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ - "def reduceByKey1(x,y):\n", - " # Coloque aqui o seu código para retornar o resultado necessário.\n", - " # Apague a linha abaixo para iniciar seu código.\n", - " pass" + "from pyspark.sql import DataFrame\n", + "from pyspark.sql.functions import col, sum\n", + "\n", + "def sum_negative_ids(reviews: DataFrame) -> DataFrame:\n", + " MEU_RU = \"3803786\"\n", + " print(f\"Lembrando que meu RU é {MEU_RU}\")\n", + "\n", + " return reviews.withColumn(\"id\", 
col(\"id\").cast(\"int\")).select(sum(\"id\"))" ] }, { @@ -143,12 +187,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Meu RU é 3803786\n", + "Lembrando que meu RU é 3803786\n", + "Soma de IDs das reviews negativas: 459568555\n" + ] + } + ], "source": [ - "# Coloque aqui a sua linha de código para aplicar o map/reduce no seu \n", - "# dataframe spark e realize o collect() ao final para visualizar os dados.\n" + "negative_reviews = filter_negative_reviews(imdb_df)\n", + "sum_of_negative_ids = sum_negative_ids(negative_reviews).collect()[0][0]\n", + "\n", + "print(f\"Soma de IDs das reviews negativas: {sum_of_negative_ids}\")" ] }, { @@ -168,14 +224,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ - "def map2(x):\n", - " # Coloque aqui o seu código para retornar a tupla necessária.\n", - " # Apague a linha abaixo para iniciar seu código.\n", - " pass" + "from pyspark.sql import DataFrame\n", + "from pyspark.sql.functions import col, split, size\n", + "\n", + "def map_sentiment_to_word_count(data: DataFrame) -> DataFrame:\n", + " MEU_RU = \"3803786\"\n", + " print(f\"Oi! 
Sou Raquel e meu RU é {MEU_RU}\")\n", + "\n", + " return data.select(col(\"sentiment\"), size(split(col(\"text_en\"), \"\\\\s+\")).alias(\"text_en_word_count\"), size(split(col(\"text_pt\"), \"\\\\s+\")).alias(\"text_pt_word_count\"))" ] }, { @@ -189,14 +249,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ - "def reduceByKey2(x,y):\n", - " # Coloque aqui o seu código para retornar o resultado necessário.\n", - " # Apague a linha abaixo para iniciar seu código.\n", - " pass" + "from pyspark.sql import DataFrame\n", + "\n", + "def reduce_word_count_by_sentiment(sentiment_word_counts: DataFrame) -> DataFrame:\n", + " MEU_RU = \"3803786\"\n", + " print(f\"Já falei que meu RU é {MEU_RU}?\")\n", + "\n", + " return sentiment_word_counts.groupBy(\"sentiment\").agg(sum(\"text_en_word_count\").alias(\"total_text_en_words\"), sum(\"text_pt_word_count\").alias(\"total_text_pt_words\"))\n" ] }, { @@ -212,17 +275,49 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Meu RU é 3803786\n", + "Oi! 
Sou Raquel e meu RU é 3803786\n", + "Já falei que meu RU é 3803786?\n", + "Diferença entre a contagem de palavras: 54976 (Texto em PT - Texto em EN)\n" + ] + } + ], "source": [ - "# Coloque aqui suas linhas de código final\n" + "negative_data = filter_negative_reviews(imdb_df)\n", + "\n", + "sentiment_word_counts = map_sentiment_to_word_count(negative_data)\n", + "total_word_counts = reduce_word_count_by_sentiment(sentiment_word_counts)\n", + "result = total_word_counts.collect()[0]\n", + "word_count_difference = result[2] - result[1]\n", + "\n", + "print(f\"Diferença entre a contagem de palavras: {word_count_difference} (Texto em PT - Texto em EN)\")\n" ] } ], "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, "language_info": { - "name": "python" + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" }, "orig_nbformat": 4 },