Discussion:
mahout git commit: NOJIRA Fix LastFM CCO Row Cardinality Bug closes apache/mahout#351
r***@apache.org
2017-11-28 21:37:56 UTC
Permalink
Repository: mahout
Updated Branches:
refs/heads/master 1d198100a -> defbbd20f


NOJIRA Fix LastFM CCO Row Cardinality Bug closes apache/mahout#351


Project: http://git-wip-us.apache.org/repos/asf/mahout/repo
Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/defbbd20
Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/defbbd20
Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/defbbd20

Branch: refs/heads/master
Commit: defbbd20f78c7b9e0bcc3a81d3d79d76be32cf23
Parents: 1d19810
Author: Trevor a.k.a @rawkintrevo <***@gmail.com>
Authored: Tue Nov 28 15:37:29 2017 -0600
Committer: Trevor a.k.a @rawkintrevo <***@gmail.com>
Committed: Tue Nov 28 15:37:29 2017 -0600

----------------------------------------------------------------------
.../docs/tutorials/cco-lastfm/cco-lastfm.scala | 33 ++++++++++++++++++--
1 file changed, 31 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mahout/blob/defbbd20/website/docs/tutorials/cco-lastfm/cco-lastfm.scala
----------------------------------------------------------------------
diff --git a/website/docs/tutorials/cco-lastfm/cco-lastfm.scala b/website/docs/tutorials/cco-lastfm/cco-lastfm.scala
index 6ba46a9..709ab2a 100644
--- a/website/docs/tutorials/cco-lastfm/cco-lastfm.scala
+++ b/website/docs/tutorials/cco-lastfm/cco-lastfm.scala
@@ -32,10 +32,39 @@ val userArtistsIDS = IndexedDatasetSpark.apply(userArtistsRDD)(sc)
val userFriendsRDD = sc.textFile("/path/to/data/lastfm/user_friends.dat").map(line => line.split("\t")).map(a => (a(0), a(1))).filter(_._1 != "userID")
val userFriendsIDS = IndexedDatasetSpark.apply(userFriendsRDD)(sc)

-import org.apache.mahout.math.cf.SimilarityAnalysis
+val primaryIDS = userFriendsIDS
+val secondaryActionRDDs = List(userArtistsRDD, userTagsRDD)
+
+import org.apache.mahout.math.indexeddataset.{IndexedDataset, BiDictionary}
+
+def adjustRowCardinality(rowCardinality: Integer, datasetA: IndexedDataset): IndexedDataset = {
+ val returnedA = if (rowCardinality != datasetA.matrix.nrow) datasetA.newRowCardinality(rowCardinality)
+ else datasetA // this guarantees matching cardinality
+
+ returnedA
+}
+
+var rowCardinality = primaryIDS.rowIDs.size

-val artistReccosLlrDrmListByArtist = SimilarityAnalysis.cooccurrencesIDSs(Array(userArtistsIDS, userTagsIDS, userFriendsIDS), maxInterestingItemsPerThing = 20, maxNumInteractions = 500, randomSeed = 1234)
+val secondaryActionIDS: Array[IndexedDataset] = new Array[IndexedDataset](secondaryActionRDDs.length)
+for (i <- secondaryActionRDDs.indices) {
+
+ val bcPrimaryRowIDs = sc.broadcast(primaryIDS.rowIDs)
+ bcPrimaryRowIDs.value
+
+ val tempRDD = secondaryActionRDDs(i).filter(a => bcPrimaryRowIDs.value.contains(a._1))
+
+ var tempIDS = IndexedDatasetSpark.apply(tempRDD, existingRowIDs = Some(primaryIDS.rowIDs))(sc)
+ secondaryActionIDS(i) = adjustRowCardinality(rowCardinality,tempIDS)
+}
+
+import org.apache.mahout.math.cf.SimilarityAnalysis

+val artistReccosLlrDrmListByArtist = SimilarityAnalysis.cooccurrencesIDSs(
+ Array(primaryIDS, secondaryActionIDS(0), secondaryActionIDS(1)),
+ maxInterestingItemsPerThing = 20,
+ maxNumInteractions = 500,
+ randomSeed = 1234)
// Anonymous User

val artistMap = sc.textFile("/path/to/lastfm/artists.dat").map(line => line.split("\t")).map(a => (a(1), a(0))).filter(_._1 != "name").collect.toMap
Loading...