[ https://issues.apache.org/jira/browse/MAHOUT-1853?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15431100#comment-15431100 ]
ASF GitHub Bot commented on MAHOUT-1853: ---------------------------------------- Github user dlyubimov commented on a diff in the pull request: https://github.com/apache/mahout/pull/251#discussion_r75708308 --- Diff: spark/src/test/scala/org/apache/mahout/cf/SimilarityAnalysisSuite.scala --- @@ -191,14 +193,115 @@ class SimilarityAnalysisSuite extends FunSuite with MahoutSuite with Distributed //cross similarity val matrixCrossCooc = drmCooc(1).checkpoint().collect - val diff2Matrix = matrixCrossCooc.minus(matrixLLRCoocBtANonSymmetric) + val diff2Matrix = matrixCrossCooc.minus(matrixLLRCoocAtBNonSymmetric) n = (new MatrixOps(m = diff2Matrix)).norm //cooccurrence without LLR is just a A'B //val inCoreAtB = a.transpose().times(b) //val bp = 0 } + test("Cross-occurrence two IndexedDatasets"){ + val a = dense( + (1, 1, 0, 0, 0), + (0, 0, 1, 1, 0), + (0, 0, 0, 0, 1), + (1, 0, 0, 1, 0)) + + val b = dense( + (0, 1, 1, 0), + (1, 1, 1, 0), + (0, 0, 1, 0), + (1, 1, 0, 1)) + + val users = Seq("u1", "u2", "u3", "u4") + val itemsA = Seq("a1", "a2", "a3", "a4", "a5") + val itemsB = Seq("b1", "b2", "b3", "b4") + val userDict = new BiDictionary(users) + val itemsADict = new BiDictionary(itemsA) + val itemsBDict = new BiDictionary(itemsB) + + // this is downsampled to the top 2 values per row to match the calc + val matrixLLRCoocAtBNonSymmetric = dense( + (0.0, 1.7260924347106847, 1.7260924347106847, 0.0), --- End diff -- this is our accepted convention, good > Improvements to CCO (Correlated Cross-Occurrence) > ------------------------------------------------- > > Key: MAHOUT-1853 > URL: https://issues.apache.org/jira/browse/MAHOUT-1853 > Project: Mahout > Issue Type: New Feature > Affects Versions: 0.12.0 > Reporter: Andrew Palumbo > Assignee: Pat Ferrel > Fix For: 0.13.0 > > > Improvements to CCO (Correlated Cross-Occurrence) to include auto-threshold > calculation for LLR downsampling, and possible multiple fixed thresholds for > A’A, A’B etc. 
This is to account for the vast difference in dimensionality between indicator types. -- This message was sent by Atlassian JIRA (v6.3.4#6332)