diff --git a/p4/.classpath b/p4/.classpath new file mode 100644 index 0000000..39abf1c --- /dev/null +++ b/p4/.classpath @@ -0,0 +1,49 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/p4/.project b/p4/.project new file mode 100644 index 0000000..fc4eb90 --- /dev/null +++ b/p4/.project @@ -0,0 +1,23 @@ + + + p4 + + + + + + org.eclipse.jdt.core.javabuilder + + + + + org.eclipse.m2e.core.maven2Builder + + + + + + org.eclipse.jdt.core.javanature + org.eclipse.m2e.core.maven2Nature + + diff --git a/p4/.settings/org.eclipse.jdt.apt.core.prefs b/p4/.settings/org.eclipse.jdt.apt.core.prefs new file mode 100644 index 0000000..d4313d4 --- /dev/null +++ b/p4/.settings/org.eclipse.jdt.apt.core.prefs @@ -0,0 +1,2 @@ +eclipse.preferences.version=1 +org.eclipse.jdt.apt.aptEnabled=false diff --git a/p4/.settings/org.eclipse.jdt.core.prefs b/p4/.settings/org.eclipse.jdt.core.prefs new file mode 100644 index 0000000..1b6e1ef --- /dev/null +++ b/p4/.settings/org.eclipse.jdt.core.prefs @@ -0,0 +1,9 @@ +eclipse.preferences.version=1 +org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8 +org.eclipse.jdt.core.compiler.compliance=1.8 +org.eclipse.jdt.core.compiler.problem.enablePreviewFeatures=disabled +org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning +org.eclipse.jdt.core.compiler.problem.reportPreviewFeatures=ignore +org.eclipse.jdt.core.compiler.processAnnotations=disabled +org.eclipse.jdt.core.compiler.release=disabled +org.eclipse.jdt.core.compiler.source=1.8 diff --git a/p4/.settings/org.eclipse.m2e.core.prefs b/p4/.settings/org.eclipse.m2e.core.prefs new file mode 100644 index 0000000..f897a7f --- /dev/null +++ b/p4/.settings/org.eclipse.m2e.core.prefs @@ -0,0 +1,4 @@ +activeProfiles= +eclipse.preferences.version=1 +resolveWorkspaceProjects=true +version=1 diff --git a/p4/Makefile b/p4/Makefile new file mode 100644 index 0000000..1035657 --- /dev/null +++ b/p4/Makefile @@ -0,0 +1,15 @@ +.PHONY=clean build submit + +all: package submit + +clean: + mvn clean + +build: + mvn package + +submit: + spark-submit --class cs448.App target/p4-1.0-SNAPSHOT.jar -i "/user/mocull/input" + +warmup: + spark-submit --class cs448.App target/p4-1.0-SNAPSHOT.jar -i "/user/mocull/input" -warmup diff --git a/p4/src/main/java/cs448/Project4.java b/p4/src/main/java/cs448/Project4.java index 3b46cc1..04853fe 100644 --- a/p4/src/main/java/cs448/Project4.java +++ b/p4/src/main/java/cs448/Project4.java @@ -1,5 +1,25 @@ package cs448; +import org.apache.commons.cli.*; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.log4j.Level; +import org.apache.log4j.Logger; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.Optional; +import org.apache.spark.api.java.function.Function; +import org.apache.spark.api.java.function.Function2; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; +import scala.Tuple2; +import scala.Tuple4; + +import java.io.IOException; +import java.io.Serializable; +import java.util.List; + public class Project4 { public void runSparkApp1(App.Conf conf){ System.out.println("Running Your First Spark App!"); @@ -8,10 +28,37 @@ public class Project4 { */ // Create a Spark Session. + SparkSession spark = SparkSession.builder().appName("CS 448 Project 4 -- Query 1").getOrCreate(); // Write data processing code here + String dataFiles[] = {conf.usersFName, conf.moviesFName, conf.ratingsFName}; + Dataset data; + + //// Reading, Parsing and counting lines for each of the data files + JavaRDD userRDD = spark.read().textFile(CS448Utils.resolveUri(conf.inPath,conf.usersFName)).cache() + .javaRDD() + .map(User::parseUser); + Dataset userDF = spark.createDataFrame(userRDD, User.class); + userDF.createOrReplaceTempView("User"); + + JavaRDD movieRDD = spark.read().textFile(CS448Utils.resolveUri(conf.inPath,conf.moviesFName)).cache() + .javaRDD() + .map(Movie::parseMovie); + Dataset movieDF = spark.createDataFrame(movieRDD, Movie.class); + movieDF.createOrReplaceTempView("Movie"); + + JavaRDD ratingRDD = spark.read().textFile(CS448Utils.resolveUri(conf.inPath,conf.ratingsFName)).cache() + .javaRDD() + .map(Rating::parseRating); + Dataset ratingDF = spark.createDataFrame(ratingRDD, Rating.class); + ratingDF.createOrReplaceTempView("Rating"); + + Dataset resultDF = spark.sql("SELECT DISTINCT m.title FROM Movie m, Rating r, User u WHERE m.movieId = r.movieId AND r.userId = u.userId AND u.occupation = " + + conf.q1Occupation + " AND r.rating >= " + conf.q1Rating); + resultDF.show(); //Don't forget to stop spark session + spark.stop(); } public void runSparkApp2(App.Conf conf){ diff --git a/p4/target/maven-archiver/pom.properties b/p4/target/maven-archiver/pom.properties new file mode 100644 index 0000000..03873ea --- /dev/null +++ b/p4/target/maven-archiver/pom.properties @@ -0,0 +1,4 @@ +#Created by Apache Maven 3.6.0 +version=1.0-SNAPSHOT +groupId=cs448 +artifactId=p4 diff --git a/p4/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst b/p4/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst new file mode 100644 index 0000000..b95557e --- /dev/null +++ b/p4/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst @@ -0,0 +1,7 @@ +cs448/User.class +cs448/Rating.class +cs448/App$Conf.class +cs448/App.class +cs448/Project4.class +cs448/CS448Utils.class +cs448/Movie.class diff --git a/p4/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst b/p4/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst new file mode 100644 index 0000000..b8af298 --- /dev/null +++ b/p4/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst @@ -0,0 +1,6 @@ +/home/max/src/cs448/p4/src/main/java/cs448/Rating.java +/home/max/src/cs448/p4/src/main/java/cs448/Movie.java +/home/max/src/cs448/p4/src/main/java/cs448/CS448Utils.java +/home/max/src/cs448/p4/src/main/java/cs448/User.java +/home/max/src/cs448/p4/src/main/java/cs448/App.java +/home/max/src/cs448/p4/src/main/java/cs448/Project4.java diff --git a/p4/target/maven-status/maven-compiler-plugin/testCompile/default-testCompile/inputFiles.lst b/p4/target/maven-status/maven-compiler-plugin/testCompile/default-testCompile/inputFiles.lst new file mode 100644 index 0000000..e69de29 diff --git a/p4/target/p4-1.0-SNAPSHOT.jar b/p4/target/p4-1.0-SNAPSHOT.jar new file mode 100644 index 0000000..4b7e749 Binary files /dev/null and b/p4/target/p4-1.0-SNAPSHOT.jar differ