#!/usr/bin/env bash
# Install Apache Spark 3.2.0 (Hadoop 3.2 build) under /opt/spark and start
# a standalone master plus the Scala shell.
set -euo pipefail

sudo apt install -y curl mlocate git scala

# -f: fail on HTTP errors instead of saving an error page; -L: follow redirects.
curl -fLO https://archive.apache.org/dist/spark/spark-3.2.0/spark-3.2.0-bin-hadoop3.2.tgz
# Extracting into the current directory needs no root privileges.
tar xf spark-3.2.0-bin-hadoop3.2.tgz

sudo mkdir -p /opt/spark
sudo mv spark-3.2.0-bin-hadoop3.2/* /opt/spark
# 755 (read/execute for everyone) is enough to run Spark; world-writable 777
# would let any local user tamper with the installation.
sudo chmod -R 755 /opt/spark

# Persist the environment for future shells (appending is reproducible,
# unlike opening nano interactively), then apply it to this shell too.
{
  echo 'export SPARK_HOME=/opt/spark'
  echo 'export PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin'
} >> ~/.bashrc
export SPARK_HOME=/opt/spark
export PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin

# Launch a standalone master, then open the interactive Scala shell.
start-master.sh
"$SPARK_HOME/bin/spark-shell"
# Write the sample corpus for the word-count demo (printf is the portable
# alternative to echo; '%s\n' produces the identical line + newline).
printf '%s\n' 'hello world hello spark hadoop hadoop mapreduce spark' > ~/input.txt
// Word count run inside spark-shell, which provides the SparkContext `sc`.
// NOTE(review): the path hardcodes user "ailab" while the file was written to
// the current user's home — adjust the path if your username differs.
val lines = sc.textFile("file:///home/ailab/input.txt")
val tokens = lines.flatMap(_.split("\\s+"))
val counts = tokens.map((_, 1)).reduceByKey(_ + _)
counts.collect().foreach(println)
# Install the PySpark package, then launch the interactive PySpark shell.
pip install pyspark
pyspark
If the PySpark shell opens successfully, the installation is working correctly.
Create a file named sample.txt with the following contents:
hello world
hello spark
apache spark is fast
Then create wordcount.py with the following script:
def tokenize(line):
    """Split a line into words on runs of whitespace.

    Uses str.split() with no argument so consecutive spaces/tabs do not
    produce empty-string tokens (split(" ") would count "" as a word).
    """
    return line.split()


def main():
    """Run the word count over sample.txt and save results to ./output."""
    # Imported here so the module (and tokenize) can be used without pyspark.
    from pyspark import SparkContext
    import shutil

    sc = SparkContext("local", "WordCount")
    try:
        # Load the text file
        text_file = sc.textFile("sample.txt")
        # WordCount logic
        counts = (
            text_file.flatMap(tokenize)
            .map(lambda word: (word, 1))
            .reduceByKey(lambda a, b: a + b)
        )
        # saveAsTextFile raises if the target directory already exists
        # (e.g. from a previous run), so clear any stale output first.
        shutil.rmtree("output", ignore_errors=True)
        counts.saveAsTextFile("output")
        # counts.foreach(print)  # To print instead of saving
    finally:
        # Always release the SparkContext, even if a stage fails.
        sc.stop()


if __name__ == "__main__":
    main()
# Run the word-count job with a local Spark master.
python3 wordcount.py
# Spark writes one part-* file per partition inside ./output; show them all.
cat output/part-*