Discussion:
[01/51] [partial] mahout git commit: MAHOUT-2042 and MAHOUT-2045 Delete directories which were moved/no longer in use
r***@apache.org
2018-06-27 14:51:29 UTC
Repository: mahout
Updated Branches:
refs/heads/branch-0.14.0 0908c521a -> e0573de33


http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/solver/LSMR.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/solver/LSMR.java b/math/src/main/java/org/apache/mahout/math/solver/LSMR.java
deleted file mode 100644
index 1f3e706..0000000
--- a/math/src/main/java/org/apache/mahout/math/solver/LSMR.java
+++ /dev/null
@@ -1,565 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.solver;
-
-import org.apache.mahout.math.DenseVector;
-import org.apache.mahout.math.Matrix;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.function.Functions;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * Solves sparse least-squares using the LSMR algorithm.
- * <p/>
- * LSMR solves the system of linear equations A * X = B. If the system is inconsistent, it solves
- * the least-squares problem min ||b - Ax||_2. A is a rectangular matrix of dimension m-by-n, where
- * all cases are allowed: m=n, m>n, or m&lt;n. B is a vector of length m. The matrix A may be dense
- * or sparse (usually sparse).
- * <p/>
- * Some additional configurable properties adjust the behavior of the algorithm.
- * <p/>
- * If you set lambda to a non-zero value then LSMR solves the regularized least-squares problem
- *
- *   min || [ B ]  -  [     A    ] X ||
- *       || [ 0 ]     [ LAMBDA*I ]   ||_2
- *
- * where LAMBDA is a scalar. If LAMBDA is not set, the system is solved without regularization.
- * <p/>
- * You can also set aTolerance and bTolerance. These cause LSMR to iterate until a certain backward
- * error estimate is smaller than some quantity depending on ATOL and BTOL. Let RES = B - A*X be
- * the residual vector for the current approximate solution X. If A*X = B seems to be consistent,
- * LSMR terminates when NORM(RES) <= ATOL*NORM(A)*NORM(X) + BTOL*NORM(B). Otherwise, LSMR terminates
- * when NORM(A'*RES) <= ATOL*NORM(A)*NORM(RES). If both tolerances are 1.0e-6 (say), the final
- * NORM(RES) should be accurate to about 6 digits. (The final X will usually have fewer correct
- * digits, depending on cond(A) and the size of LAMBDA.)
- * <p/>
- * The default value for ATOL and BTOL is 1e-6.
- * <p/>
- * Ideally, they should be estimates of the relative error in the entries of A and B respectively.
- * For example, if the entries of A have 7 correct digits, set ATOL = 1e-7. This prevents the
- * algorithm from doing unnecessary work beyond the uncertainty of the input data.
- * <p/>
- * You can also set conditionLimit. In that case, LSMR terminates if an estimate of cond(A) exceeds
- * conditionLimit. For compatible systems Ax = b, conditionLimit could be as large as 1.0e+12 (say).
- * For least-squares problems, conditionLimit should be less than 1.0e+8. If conditionLimit is not
- * set, the default value is 1e+8. Maximum precision can be obtained by setting aTolerance =
- * bTolerance = conditionLimit = 0, but the number of iterations may then be excessive.
- * <p/>
- * Setting iterationLimit causes LSMR to terminate if the number of iterations reaches
- * iterationLimit. The default is iterationLimit = min(m,n). For ill-conditioned systems, a
- * larger value of ITNLIM may be needed.
- * <p/>
- * Setting localSize causes LSMR to run with reorthogonalization on the last localSize v_k's
- * (the v-vectors generated by Golub-Kahan bidiagonalization). If localSize is not set, LSMR runs
- * without reorthogonalization. A localSize > max(n,m) performs reorthogonalization on all v_k's.
- * Reorthogonalizing only u_k, or both u_k and v_k, is not an option here. Details are discussed
- * in the SIAM paper.
- * <p/>
- * getTerminationReason() gives the reason for termination:
- * ISTOP = 0 means X = 0 is a solution.
- * ISTOP = 1 means X is an approximate solution to A*X = B, according to ATOL and BTOL.
- * ISTOP = 2 means X approximately solves the least-squares problem according to ATOL.
- * ISTOP = 3 means COND(A) seems to be greater than CONLIM.
- * ISTOP = 4 is the same as 1 with ATOL = BTOL = EPS.
- * ISTOP = 5 is the same as 2 with ATOL = EPS.
- * ISTOP = 6 is the same as 3 with CONLIM = 1/EPS.
- * ISTOP = 7 means ITN reached ITNLIM before the other stopping conditions were satisfied.
- * <p/>
- * getIterationCount() gives ITN = the number of LSMR iterations.
- * <p/>
- * getResidualNorm() gives an estimate of the residual norm: NORMR = norm(B-A*X).
- * <p/>
- * getNormalEquationResidual() gives an estimate of the residual for the normal equation: NORMAR =
- * NORM(A'*(B-A*X)).
- * <p/>
- * getANorm() gives an estimate of the Frobenius norm of A.
- * <p/>
- * getCondition() gives an estimate of the condition number of A.
- * <p/>
- * getXNorm() gives an estimate of NORM(X).
- * <p/>
- * LSMR uses an iterative method. For further information, see D. C.-L. Fong and M. A. Saunders,
- * "LSMR: An iterative algorithm for least-squares problems", draft of 03 Apr 2010, to be
- * submitted to SISC.
- * <p/>
- * David Chin-lung Fong ***@stanford.edu Institute for Computational and Mathematical
- * Engineering Stanford University
- * <p/>
- * Michael Saunders ***@stanford.edu Systems Optimization Laboratory Dept of
- * MS&E, Stanford University. -----------------------------------------------------------------------
- */
-public final class LSMR {
-
- private static final Logger log = LoggerFactory.getLogger(LSMR.class);
-
- private final double lambda;
- private int localSize;
- private int iterationLimit;
- private double conditionLimit;
- private double bTolerance;
- private double aTolerance;
- private int localPointer;
- private Vector[] localV;
- private double residualNorm;
- private double normalEquationResidual;
- private double xNorm;
- private int iteration;
- private double normA;
- private double condA;
-
- public int getIterationCount() {
- return iteration;
- }
-
- public double getResidualNorm() {
- return residualNorm;
- }
-
- public double getNormalEquationResidual() {
- return normalEquationResidual;
- }
-
- public double getANorm() {
- return normA;
- }
-
- public double getCondition() {
- return condA;
- }
-
- public double getXNorm() {
- return xNorm;
- }
-
- /**
- * LSMR uses an iterative method to solve a linear system. For further information, see D. C.-L.
- * Fong and M. A. Saunders, "LSMR: An iterative algorithm for least-squares problems", draft of
- * 03 Apr 2010, to be submitted to SISC.
- * <p/>
- * 08 Dec 2009: First release version of LSMR.
- * 09 Apr 2010: Updated documentation and default parameters.
- * 14 Apr 2010: Updated documentation.
- * 03 Jun 2010: LSMR with local reorthogonalization (full reorthogonalization is also implemented).
- * <p/>
- * David Chin-lung Fong ***@stanford.edu Institute for Computational and
- * Mathematical Engineering Stanford University
- * <p/>
- * Michael Saunders ***@stanford.edu Systems Optimization Laboratory Dept of
- * MS&E, Stanford University. -----------------------------------------------------------------------
- */
-
- public LSMR() {
- // Set default parameters.
- lambda = 0;
- aTolerance = 1.0e-6;
- bTolerance = 1.0e-6;
- conditionLimit = 1.0e8;
- iterationLimit = -1;
- localSize = 0;
- }
-
- public Vector solve(Matrix A, Vector b) {
- /*
- % Initialize.
-
-
- hdg1 = ' itn x(1) norm r norm A''r';
- hdg2 = ' compatible LS norm A cond A';
- pfreq = 20; % print frequency (for repeating the heading)
- pcount = 0; % print counter
-
- % Determine dimensions m and n, and
- % form the first vectors u and v.
- % These satisfy beta*u = b, alpha*v = A'u.
- */
- log.debug(" itn x(1) norm r norm A'r");
- log.debug(" compatible LS norm A cond A");
-
- Matrix transposedA = A.transpose();
- Vector u = b;
-
- double beta = u.norm(2);
- if (beta > 0) {
- u = u.divide(beta);
- }
-
- Vector v = transposedA.times(u);
- int m = A.numRows();
- int n = A.numCols();
-
- int minDim = Math.min(m, n);
- if (iterationLimit == -1) {
- iterationLimit = minDim;
- }
-
- if (log.isDebugEnabled()) {
- log.debug("LSMR - Least-squares solution of Ax = b, based on Matlab Version 1.02, 14 Apr 2010, "
- + "Mahout version {}", getClass().getPackage().getImplementationVersion());
- log.debug(String.format("The matrix A has %d rows and %d cols, lambda = %.4g, atol = %g, btol = %g",
- m, n, lambda, aTolerance, bTolerance));
- }
-
- double alpha = v.norm(2);
- if (alpha > 0) {
- v.assign(Functions.div(alpha));
- }
-
-
- // Initialization for local reorthogonalization
- localPointer = 0;
-
- // Preallocate storage for the last few v_k. With orthogonal v_k's, a
- // Krylov subspace method converges in no more iterations than the
- // number of singular values, so more space is not necessary.
- localV = new Vector[Math.min(localSize, minDim)];
- boolean localOrtho = false;
- if (localSize > 0) {
- localOrtho = true;
- localV[0] = v;
- }
-
-
- // Initialize variables for 1st iteration.
-
- iteration = 0;
- double zetabar = alpha * beta;
- double alphabar = alpha;
-
- Vector h = v;
- Vector hbar = zeros(n);
- Vector x = zeros(n);
-
- // Initialize variables for estimation of ||r||.
-
- double betadd = beta;
-
- // Initialize variables for estimation of ||A|| and cond(A)
-
- double aNorm = alpha * alpha;
-
- // Items for use in stopping rules.
- double normb = beta;
-
- double ctol = 0;
- if (conditionLimit > 0) {
- ctol = 1 / conditionLimit;
- }
- residualNorm = beta;
-
- // Exit if b=0 or A'b = 0.
-
- normalEquationResidual = alpha * beta;
- if (normalEquationResidual == 0) {
- return x;
- }
-
- // Heading for iteration log.
-
-
- if (log.isDebugEnabled()) {
- double test2 = alpha / beta;
-// log.debug('{} {}', hdg1, hdg2);
- log.debug("{} {}", iteration, x.get(0));
- log.debug("{} {}", residualNorm, normalEquationResidual);
- double test1 = 1;
- log.debug("{} {}", test1, test2);
- }
-
-
- //------------------------------------------------------------------
- // Main iteration loop.
- //------------------------------------------------------------------
- double rho = 1;
- double rhobar = 1;
- double cbar = 1;
- double sbar = 0;
- double betad = 0;
- double rhodold = 1;
- double tautildeold = 0;
- double thetatilde = 0;
- double zeta = 0;
- double d = 0;
- double maxrbar = 0;
- double minrbar = 1.0e+100;
- StopCode stop = StopCode.CONTINUE;
- while (iteration <= iterationLimit && stop == StopCode.CONTINUE) {
-
- iteration++;
-
- // Perform the next step of the bidiagonalization to obtain the
- // next beta, u, alpha, v. These satisfy the relations
- // beta*u = A*v - alpha*u,
- // alpha*v = A'*u - beta*v.
-
- u = A.times(v).minus(u.times(alpha));
- beta = u.norm(2);
- if (beta > 0) {
- u.assign(Functions.div(beta));
-
- // store data for local-reorthogonalization of V
- if (localOrtho) {
- localVEnqueue(v);
- }
- v = transposedA.times(u).minus(v.times(beta));
- // local-reorthogonalization of V
- if (localOrtho) {
- v = localVOrtho(v);
- }
- alpha = v.norm(2);
- if (alpha > 0) {
- v.assign(Functions.div(alpha));
- }
- }
-
- // At this point, beta = beta_{k+1}, alpha = alpha_{k+1}.
-
- // Construct rotation Qhat_{k,2k+1}.
-
- double alphahat = Math.hypot(alphabar, lambda);
- double chat = alphabar / alphahat;
- double shat = lambda / alphahat;
-
- // Use a plane rotation (Q_i) to turn B_i to R_i
-
- double rhoold = rho;
- rho = Math.hypot(alphahat, beta);
- double c = alphahat / rho;
- double s = beta / rho;
- double thetanew = s * alpha;
- alphabar = c * alpha;
-
- // Use a plane rotation (Qbar_i) to turn R_i^T to R_i^bar
-
- double rhobarold = rhobar;
- double zetaold = zeta;
- double thetabar = sbar * rho;
- double rhotemp = cbar * rho;
- rhobar = Math.hypot(cbar * rho, thetanew);
- cbar = cbar * rho / rhobar;
- sbar = thetanew / rhobar;
- zeta = cbar * zetabar;
- zetabar = -sbar * zetabar;
-
-
- // Update h, h_hat, x.
-
- hbar = h.minus(hbar.times(thetabar * rho / (rhoold * rhobarold)));
-
- x.assign(hbar.times(zeta / (rho * rhobar)), Functions.PLUS);
- h = v.minus(h.times(thetanew / rho));
-
- // Estimate of ||r||.
-
- // Apply rotation Qhat_{k,2k+1}.
- double betaacute = chat * betadd;
- double betacheck = -shat * betadd;
-
- // Apply rotation Q_{k,k+1}.
- double betahat = c * betaacute;
- betadd = -s * betaacute;
-
- // Apply rotation Qtilde_{k-1}.
- // betad = betad_{k-1} here.
-
- double thetatildeold = thetatilde;
- double rhotildeold = Math.hypot(rhodold, thetabar);
- double ctildeold = rhodold / rhotildeold;
- double stildeold = thetabar / rhotildeold;
- thetatilde = stildeold * rhobar;
- rhodold = ctildeold * rhobar;
- betad = -stildeold * betad + ctildeold * betahat;
-
- // betad = betad_k here.
- // rhodold = rhod_k here.
-
- tautildeold = (zetaold - thetatildeold * tautildeold) / rhotildeold;
- double taud = (zeta - thetatilde * tautildeold) / rhodold;
- d += betacheck * betacheck;
- residualNorm = Math.sqrt(d + (betad - taud) * (betad - taud) + betadd * betadd);
-
- // Estimate ||A||.
- aNorm += beta * beta;
- normA = Math.sqrt(aNorm);
- aNorm += alpha * alpha;
-
- // Estimate cond(A).
- maxrbar = Math.max(maxrbar, rhobarold);
- if (iteration > 1) {
- minrbar = Math.min(minrbar, rhobarold);
- }
- condA = Math.max(maxrbar, rhotemp) / Math.min(minrbar, rhotemp);
-
- // Test for convergence.
-
- // Compute norms for convergence testing.
- normalEquationResidual = Math.abs(zetabar);
- xNorm = x.norm(2);
-
- // Now use these norms to estimate certain other quantities,
- // some of which will be small near a solution.
-
- double test1 = residualNorm / normb;
- double test2 = normalEquationResidual / (normA * residualNorm);
- double test3 = 1 / condA;
- double t1 = test1 / (1 + normA * xNorm / normb);
- double rtol = bTolerance + aTolerance * normA * xNorm / normb;
-
- // The following tests guard against extremely small values of
- // atol, btol or ctol. (The user may have set any or all of
- // the parameters atol, btol, conlim to 0.)
- // The effect is equivalent to the normal tests using
- // atol = eps, btol = eps, conlim = 1/eps.
-
- if (iteration > iterationLimit) {
- stop = StopCode.ITERATION_LIMIT;
- }
- if (1 + test3 <= 1) {
- stop = StopCode.CONDITION_MACHINE_TOLERANCE;
- }
- if (1 + test2 <= 1) {
- stop = StopCode.LEAST_SQUARE_CONVERGED_MACHINE_TOLERANCE;
- }
- if (1 + t1 <= 1) {
- stop = StopCode.CONVERGED_MACHINE_TOLERANCE;
- }
-
- // Allow for tolerances set by the user.
-
- if (test3 <= ctol) {
- stop = StopCode.CONDITION;
- }
- if (test2 <= aTolerance) {
- stop = StopCode.CONVERGED;
- }
- if (test1 <= rtol) {
- stop = StopCode.TRIVIAL;
- }
-
- // See if it is time to print something.
- if (log.isDebugEnabled()) {
- if ((n <= 40) || (iteration <= 10) || (iteration >= iterationLimit - 10) || ((iteration % 10) == 0)
- || (test3 <= 1.1 * ctol) || (test2 <= 1.1 * aTolerance) || (test1 <= 1.1 * rtol)
- || (stop != StopCode.CONTINUE)) {
- statusDump(x, normA, condA, test1, test2);
- }
- }
- } // iteration loop
-
- // Print the stopping condition.
- log.debug("Finished: {}", stop.getMessage());
-
- return x;
- /*
-
-
- if show
- fprintf('\n\nLSMR finished')
- fprintf('\n%s', msg(istop+1,:))
- fprintf('\nistop =%8g normr =%8.1e' , istop, normr )
- fprintf(' normA =%8.1e normAr =%8.1e', normA, normAr)
- fprintf('\nitn =%8g condA =%8.1e' , itn , condA )
- fprintf(' normx =%8.1e\n', normx)
- end
- */
- }
-
- private void statusDump(Vector x, double normA, double condA, double test1, double test2) {
- log.debug("{} {}", residualNorm, normalEquationResidual);
- log.debug("{} {}", iteration, x.get(0));
- log.debug("{} {}", test1, test2);
- log.debug("{} {}", normA, condA);
- }
-
- private static Vector zeros(int n) {
- return new DenseVector(n);
- }
-
- //-----------------------------------------------------------------------
- // stores v into the circular buffer localV
- //-----------------------------------------------------------------------
-
- private void localVEnqueue(Vector v) {
- if (localV.length > 0) {
- localV[localPointer] = v;
- localPointer = (localPointer + 1) % localV.length;
- }
- }
-
- //-----------------------------------------------------------------------
- // Perform local reorthogonalization of V
- //-----------------------------------------------------------------------
-
- private Vector localVOrtho(Vector v) {
- for (Vector old : localV) {
- if (old != null) {
- double x = v.dot(old);
- v = v.minus(old.times(x));
- }
- }
- return v;
- }
-
- private enum StopCode {
- CONTINUE("Not done"),
- TRIVIAL("The exact solution is x = 0"),
- CONVERGED("Ax - b is small enough, given atol, btol"),
- LEAST_SQUARE_CONVERGED("The least-squares solution is good enough, given atol"),
- CONDITION("The estimate of cond(Abar) has exceeded condition limit"),
- CONVERGED_MACHINE_TOLERANCE("Ax - b is small enough for this machine"),
- LEAST_SQUARE_CONVERGED_MACHINE_TOLERANCE("The least-squares solution is good enough for this machine"),
- CONDITION_MACHINE_TOLERANCE("Cond(Abar) seems to be too large for this machine"),
- ITERATION_LIMIT("The iteration limit has been reached");
-
- private final String message;
-
- StopCode(String message) {
- this.message = message;
- }
-
- public String getMessage() {
- return message;
- }
- }
-
- public void setAtolerance(double aTolerance) {
- this.aTolerance = aTolerance;
- }
-
- public void setBtolerance(double bTolerance) {
- this.bTolerance = bTolerance;
- }
-
- public void setConditionLimit(double conditionLimit) {
- this.conditionLimit = conditionLimit;
- }
-
- public void setIterationLimit(int iterationLimit) {
- this.iterationLimit = iterationLimit;
- }
-
- public void setLocalSize(int localSize) {
- this.localSize = localSize;
- }
-
- public double getLambda() {
- return lambda;
- }
-
- public double getAtolerance() {
- return aTolerance;
- }
-
- public double getBtolerance() {
- return bTolerance;
- }
-}
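
For reference, the removed solver exposed a small, setter-based API: construct
LSMR, optionally adjust tolerances and limits, call solve(A, b), then query the
diagnostic accessors. Below is a minimal usage sketch based only on the
signatures visible in this diff; the 3x2 system, its values, and the class name
LsmrExample are illustrative assumptions.

    import org.apache.mahout.math.DenseMatrix;
    import org.apache.mahout.math.DenseVector;
    import org.apache.mahout.math.Matrix;
    import org.apache.mahout.math.Vector;
    import org.apache.mahout.math.solver.LSMR;

    public class LsmrExample {
      public static void main(String[] args) {
        // A small overdetermined system A*x = b (values are arbitrary).
        Matrix a = new DenseMatrix(new double[][] {{2, 0}, {0, 3}, {1, 1}});
        Vector b = new DenseVector(new double[] {2, 3, 2});

        LSMR solver = new LSMR();
        solver.setAtolerance(1.0e-6);   // estimated relative error in A (the default)
        solver.setBtolerance(1.0e-6);   // estimated relative error in b (the default)
        solver.setIterationLimit(100);  // would otherwise default to min(m, n)

        Vector x = solver.solve(a, b);

        System.out.println("x          = " + x);
        System.out.println("norm(r)    ~ " + solver.getResidualNorm());
        System.out.println("norm(A'r)  ~ " + solver.getNormalEquationResidual());
        System.out.println("iterations = " + solver.getIterationCount());
      }
    }

Both tolerances default to 1.0e-6 and the iteration limit defaults to min(m, n),
so all three setters above are optional.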
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/list/package-info.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/list/package-info.java b/math/src/main/java/org/apache/mahout/math/list/package-info.java
deleted file mode 100644
index 43b5c4b..0000000
--- a/math/src/main/java/org/apache/mahout/math/list/package-info.java
+++ /dev/null
@@ -1,144 +0,0 @@
-/**
- * <HTML>
- * <BODY>
- * Resizable lists holding objects or primitive data types such as <tt>int</tt>,
- * <tt>double</tt>, etc. For non-resizable lists (1-dimensional matrices) see
- * package <code>org.apache.mahout.math.matrix</code>.<p></p>
- * <h1><a name="Overview"></a>Getting Started</h1>
- * <h2>1. Overview</h2>
- * <p>The list package offers flexible object oriented abstractions modelling dynamically
- * resizing lists holding objects or primitive data types such as <tt>int</tt>,
- * <tt>double</tt>, etc. It is designed to be scalable in terms of performance
- * and memory requirements.</p>
- * <p>Features include: </p>
- * <p></p>
- * <ul>
- * <li>Lists operating on objects as well as all primitive data types such as <tt>int</tt>,
- * <tt>double</tt>, etc.
- * </li>
- * <li>Compact representations</li>
- * <li>A number of general purpose list operations including: adding, inserting,
- * removing, iterating, searching, sorting, extracting ranges and copying. All
- * operations are designed to perform well on mass data.
- * </li>
- * <li>Support for quick access to list elements. This is achieved by bounds-checking
- * and non-bounds-checking accessor methods as well as zero-copy transformations
- * to primitive arrays such as <tt>int[]</tt>, <tt>double[]</tt>, etc.
- * </li>
- * <li>Allows high-level algorithms to be used on primitive data types without any
- * space and time overhead. Operations on primitive arrays, Colt lists and JAL
- * algorithms can freely be mixed at zero copy overhead.
- * </li>
- * </ul>
- * <p>File-based I/O can be achieved through the standard Java built-in serialization
- * mechanism. All classes implement the {@link java.io.Serializable} interface.
- * However, the toolkit is entirely decoupled from advanced I/O. It provides data
- * structures and algorithms only.
- * <p> This toolkit borrows concepts and terminology from the Javasoft <a
- * href="http://www.javasoft.com/products/jdk/1.2/docs/guide/collections/index.html">
- * Collections framework</a> written by Josh Bloch and introduced in JDK 1.2.
- * <h2>2. Introduction</h2>
- * <p>Lists are fundamental to virtually any application. Large scale resizable lists
- * are used, for example, in scientific computations, simulations and database management
- * systems, to name just a few.</p>
- * <p>A list is a container holding elements that can be accessed via zero-based
- * indexes. Lists may be implemented in different ways (most commonly with arrays).
- * A resizable list automatically grows as elements are added. The lists of this
- * package do not automatically shrink. Shrinking needs to be triggered by explicitly
- * calling <tt>trimToSize()</tt> methods.</p>
- * <p><i>Growing policy</i>: A list implemented with arrays initially has a certain
- * <tt>initialCapacity</tt> - by default 10 elements, but customizable upon instance
- * construction. As elements are added, this capacity may no longer be sufficient.
- * When a list is automatically grown, its capacity is expanded to <tt>1.5*currentCapacity</tt>.
- * Thus, excessive resizing (involving copying) is avoided.</p>
- * <h4>Copying</h4>
- * <p>Any list can be copied. A copy is <i>equal</i> to the original but entirely
- * independent of the original. So changes in the copy are not reflected in the
- * original, and vice-versa.
- * <h2>3. Organization of this package</h2>
- * <p>Class naming follows the schema <tt>&lt;ElementType&gt;&lt;ImplementationTechnique&gt;List</tt>.
- * For example, we have a {@link org.apache.mahout.math.list.DoubleArrayList}, which is a list
- * holding <tt>double</tt> elements implemented with <tt>double</tt>[] arrays.
- * </p>
- * <p>The classes for lists of a given value type are derived from a common abstract
- * base class tagged <tt>Abstract&lt;ElementType&gt;</tt><tt>List</tt>. For example,
- * all lists operating on <tt>double</tt> elements are derived from
- * {@link org.apache.mahout.math.list.AbstractDoubleList},
- * which in turn is derived from an abstract base class tying together all lists
- * regardless of value type, {@link org.apache.mahout.math.list.AbstractList}. The abstract
- * base classes provide skeleton implementations for all but few methods. Experimental
- * data layouts (such as compressed, sparse, linked, etc.) can easily be implemented
- * and inherit a rich set of functionality. Have a look at the javadoc <a href="package-tree.html">tree
- * view</a> to get the broad picture.</p>
- * <h2>4. Example usage</h2>
- * <p>The following snippet fills a list, randomizes it, extracts the first half
- * of the elements, sums them up and prints the result. It is implemented entirely
- * with accessor methods.</p>
- * <table>
- * <td class="PRE">
- * <pre>
- * int s = 1000000;<br>AbstractDoubleList list = new DoubleArrayList();
- * for (int i=0; i&lt;s; i++) { list.add((double)i); }
- * list.shuffle();
- * AbstractDoubleList part = list.partFromTo(0,list.size()/2 - 1);
- * double sum = 0.0;
- * for (int i=0; i&lt;part.size(); i++) { sum += part.get(i); }
- * log.info(sum);
- * </pre>
- * </td>
- * </table>
- * <p> For efficiency, all classes provide back doors to enable getting/setting the
- * backing array directly. In this way, the high level operations of these classes
- * can be used where appropriate, and one can switch to <tt>[]</tt>-array index
- * notations where necessary. The key methods for this are <tt>public &lt;ElementType&gt;[]
- * elements()</tt> and <tt>public void elements(&lt;ElementType&gt;[])</tt>. The
- * former trustingly returns the array it internally keeps to store the elements.
- * Holding this array in hand, we can use the <tt>[]</tt>-array operator to
- * perform iteration over large lists without needing to copy the array or paying
- * the performance penalty introduced by accessor methods. Alternatively any JAL
- * algorithm (or other algorithm) can operate on the returned primitive array.
- * The latter method forces a list to internally hold a user provided array. Using
- * this approach one can avoid needing to copy the elements into the list.
- * <p>As a consequence, operations on primitive arrays, Colt lists and JAL algorithms
- * can freely be mixed at zero-copy overhead.
- * <p> Note that such special treatment certainly breaks encapsulation. This functionality
- * is provided for performance reasons only and should only be used when absolutely
- * necessary. Here is the above example in mixed notation:
- * <table>
- * <td class="PRE">
- * <pre>
- * int s = 1000000;<br>DoubleArrayList list = new DoubleArrayList(s); // list.size()==0, capacity==s
- * list.setSize(s); // list.size()==s<br>double[] values = list.elements();
- * // zero copy, values.length==s<br>for (int i=0; i&lt;s; i++) { values[i]=(double)i; }
- * list.shuffle();
- * double sum = 0.0;
- * int limit = values.length/2;
- * for (int i=0; i&lt;limit; i++) { sum += values[i]; }
- * log.info(sum);
- * </pre>
- * </td>
- * </table>
- * <p> Or even more compact using lists as algorithm objects:
- * <table>
- * <td class="PRE">
- * <pre>
- * int s = 1000000;<br>double[] values = new double[s];
- * for (int i=0; i&lt;s; i++) { values[i]=(double)i; }
- * new DoubleArrayList(values).shuffle(); // zero-copy, shuffle via back door
- * double sum = 0.0;
- * int limit = values.length/2;
- * for (int i=0; i&lt;limit; i++) { sum += values[i]; }
- * log.info(sum);
- * </pre>
- * </td>
- * </table>
- * <h2>5. Notes </h2>
- * <p>The quicksorts and mergesorts are the JDK 1.2 V1.26 algorithms, modified as
- * necessary to operate on the given data types.
- * </BODY>
- * </HTML>
- */
-package org.apache.mahout.math.list;
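
The package javadoc above centers on the zero-copy "back door" through
elements(). Here is a self-contained sketch of that pattern, distilled from
the snippets in the javadoc; the class name and list size are illustrative.

    import org.apache.mahout.math.list.DoubleArrayList;

    public class BackDoorExample {
      public static void main(String[] args) {
        int s = 1000000;
        DoubleArrayList list = new DoubleArrayList();
        for (int i = 0; i < s; i++) {
          list.add(i);                  // high-level accessor API
        }

        // Back door: zero-copy access to the backing array. The array may be
        // longer than list.size(); only the first size() slots are valid.
        double[] values = list.elements();
        double sum = 0.0;
        int limit = list.size() / 2;
        for (int i = 0; i < limit; i++) {
          sum += values[i];             // plain []-indexing, no accessor overhead
        }
        System.out.println(sum);
      }
    }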

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/map/HashFunctions.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/map/HashFunctions.java b/math/src/main/java/org/apache/mahout/math/map/HashFunctions.java
deleted file mode 100644
index b749307..0000000
--- a/math/src/main/java/org/apache/mahout/math/map/HashFunctions.java
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
-Copyright 1999 CERN - European Organization for Nuclear Research.
-Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
-is hereby granted without fee, provided that the above copyright notice appear in all copies and
-that both that copyright notice and this permission notice appear in supporting documentation.
-CERN makes no representations about the suitability of this software for any purpose.
-It is provided "as is" without expressed or implied warranty.
-*/
-package org.apache.mahout.math.map;
-
-
-/**
- * Provides various hash functions.
- */
-public final class HashFunctions {
-
- /**
- * Utility class pattern: all static members, no inheritance.
- */
- private HashFunctions() {
- }
-
- /**
- * Returns a hashcode for the specified value.
- *
- * @return a hash code value for the specified value.
- */
- public static int hash(char value) {
- return value;
- }
-
- /**
- * Returns a hashcode for the specified value.
- *
- * @return a hash code value for the specified value.
- */
- public static int hash(double value) {
- long bits = Double.doubleToLongBits(value);
- return (int) (bits ^ (bits >>> 32));
-
- //return (int) Double.doubleToLongBits(value*663608941.737);
- // this avoids excessive hashCollisions in the case values are of the form (1.0, 2.0, 3.0, ...)
- }
-
- /**
- * Returns a hashcode for the specified value.
- *
- * @return a hash code value for the specified value.
- */
- public static int hash(float value) {
- return Float.floatToIntBits(value * 663608941.737f);
- // this avoids excessive hashCollisions in the case values are of the form (1.0, 2.0, 3.0, ...)
- }
-
- /**
- * Returns a hashcode for the specified value.
- * The hashcode computation is similar to the last step
- * of MurMurHash3.
- *
- * @return a hash code value for the specified value.
- */
- public static int hash(int value) {
- int h = value;
- h ^= h >>> 16;
- h *= 0x85ebca6b;
- h ^= h >>> 13;
- h *= 0xc2b2ae35;
- h ^= h >>> 16;
- return h;
- }
-
- /**
- * Returns a hashcode for the specified value.
- *
- * @return a hash code value for the specified value.
- */
- public static int hash(long value) {
- return (int) (value ^ (value >> 32));
- /*
- value &= 0x7FFFFFFFFFFFFFFFL; // make it >=0 (0x7FFFFFFFFFFFFFFFL==Long.MAX_VALUE)
- int hashCode = 0;
- do hashCode = 31*hashCode + (int) (value%10);
- while ((value /= 10) > 0);
-
- return 28629151*hashCode; // spread even further; h*31^5
- */
- }
-
- /**
- * Returns a hashcode for the specified object.
- *
- * @return a hash code value for the specified object.
- */
- public static int hash(Object object) {
- return object == null ? 0 : object.hashCode();
- }
-
- /**
- * Returns a hashcode for the specified value.
- *
- * @return a hash code value for the specified value.
- */
- public static int hash(short value) {
- return value;
- }
-
- /**
- * Returns a hashcode for the specified value.
- *
- * @return a hash code value for the specified value.
- */
- public static int hash(boolean value) {
- return value ? 1231 : 1237;
- }
-}
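
As a quick illustration of the removed utility, the following sketch exercises
a few of the overloads above; the sample inputs are arbitrary.

    import org.apache.mahout.math.map.HashFunctions;

    public class HashDemo {
      public static void main(String[] args) {
        // Consecutive int keys are spread apart by the MurmurHash3-style
        // finalizer in hash(int).
        for (int key = 1; key <= 4; key++) {
          System.out.println("hash(" + key + ") = " + HashFunctions.hash(key));
        }
        // The double overload folds the 64 raw bits down to 32.
        System.out.println(HashFunctions.hash(3.14));
        // Booleans map to the classic 1231/1237 pair.
        System.out.println(HashFunctions.hash(true));
      }
    }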

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/map/OpenHashMap.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/map/OpenHashMap.java b/math/src/main/java/org/apache/mahout/math/map/OpenHashMap.java
deleted file mode 100644
index 0efca4b..0000000
--- a/math/src/main/java/org/apache/mahout/math/map/OpenHashMap.java
+++ /dev/null
@@ -1,652 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*
-Copyright © 1999 CERN - European Organization for Nuclear Research.
-Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
-is hereby granted without fee, provided that the above copyright notice appear in all copies and
-that both that copyright notice and this permission notice appear in supporting documentation.
-CERN makes no representations about the suitability of this software for any purpose.
-It is provided "as is" without expressed or implied warranty.
-*/
-package org.apache.mahout.math.map;
-
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-
-import org.apache.mahout.math.function.ObjectObjectProcedure;
-import org.apache.mahout.math.function.ObjectProcedure;
-import org.apache.mahout.math.set.AbstractSet;
-import org.apache.mahout.math.set.OpenHashSet;
-
-/**
- * Open hash map. This implements Map, but it does not respect several aspects of the Map contract
- * that impose the very sorts of performance penalties that this class exists to avoid.
- * {@link #entrySet}, {@link #values}, and {@link #keySet()} do <strong>not</strong> return
- * collections that share storage with the main map, and changes to those returned objects
- * are <strong>not</strong> reflected in the container.
- **/
-public class OpenHashMap<K,V> extends AbstractSet implements Map<K,V> {
- protected static final byte FREE = 0;
- protected static final byte FULL = 1;
- protected static final byte REMOVED = 2;
- protected static final Object NO_KEY_VALUE = null;
-
- /** The hash table keys. */
- protected Object[] table;
-
- /** The hash table values. */
- protected Object[] values;
-
- /** The state of each hash table entry (FREE, FULL, REMOVED). */
- protected byte[] state;
-
- /** The number of table entries in state==FREE. */
- protected int freeEntries;
-
-
- /** Constructs an empty map with default capacity and default load factors. */
- public OpenHashMap() {
- this(DEFAULT_CAPACITY);
- }
-
- /**
- * Constructs an empty map with the specified initial capacity and default load factors.
- *
- * @param initialCapacity the initial capacity of the map.
- * @throws IllegalArgumentException if the initial capacity is less than zero.
- */
- public OpenHashMap(int initialCapacity) {
- this(initialCapacity, DEFAULT_MIN_LOAD_FACTOR, DEFAULT_MAX_LOAD_FACTOR);
- }
-
- /**
- * Constructs an empty map with the specified initial capacity and the specified minimum and maximum load factor.
- *
- * @param initialCapacity the initial capacity.
- * @param minLoadFactor the minimum load factor.
- * @param maxLoadFactor the maximum load factor.
- * @throws IllegalArgumentException if <tt>initialCapacity < 0 || (minLoadFactor < 0.0 || minLoadFactor >= 1.0) ||
- * (maxLoadFactor <= 0.0 || maxLoadFactor >= 1.0) || (minLoadFactor >=
- * maxLoadFactor)</tt>.
- */
- public OpenHashMap(int initialCapacity, double minLoadFactor, double maxLoadFactor) {
- setUp(initialCapacity, minLoadFactor, maxLoadFactor);
- }
-
- /** Removes all (key,value) associations from the receiver. Implicitly calls <tt>trimToSize()</tt>. */
- @Override
- public void clear() {
- Arrays.fill(this.state, FREE);
- distinct = 0;
- freeEntries = table.length; // delta
- trimToSize();
- }
-
- /**
- * Returns a deep copy of the receiver.
- *
- * @return a deep copy of the receiver.
- */
- @Override
- @SuppressWarnings("unchecked")
- public Object clone() {
- OpenHashMap<K,V> copy = (OpenHashMap<K,V>) super.clone();
- copy.table = copy.table.clone();
- copy.values = copy.values.clone();
- copy.state = copy.state.clone();
- return copy;
- }
-
- /**
- * Returns <tt>true</tt> if the receiver contains the specified key.
- *
- * @return <tt>true</tt> if the receiver contains the specified key.
- */
- @SuppressWarnings("unchecked")
- @Override
- public boolean containsKey(Object key) {
- return indexOfKey((K)key) >= 0;
- }
-
- /**
- * Returns <tt>true</tt> if the receiver contains the specified value.
- *
- * @return <tt>true</tt> if the receiver contains the specified value.
- */
- @SuppressWarnings("unchecked")
- @Override
- public boolean containsValue(Object value) {
- return indexOfValue((V)value) >= 0;
- }
-
- /**
- * Ensures that the receiver can hold at least the specified number of associations without needing to allocate new
- * internal memory. If necessary, allocates new internal memory and increases the capacity of the receiver. <p> This
- * method never need be called; it is for performance tuning only. Calling this method before <tt>put()</tt>ing a
- * large number of associations boosts performance, because the receiver will grow only once instead of potentially
- * many times and hash collisions get less probable.
- *
- * @param minCapacity the desired minimum capacity.
- */
- @Override
- public void ensureCapacity(int minCapacity) {
- if (table.length < minCapacity) {
- int newCapacity = nextPrime(minCapacity);
- rehash(newCapacity);
- }
- }
-
- /**
- * Applies a procedure to each key of the receiver, if any. Note: Iterates over the keys in no particular order.
- * Subclasses can define a particular order, for example, "sorted by key". All methods which <i>can</i> be expressed
- * in terms of this method (most methods can) <i>must guarantee</i> to use the <i>same</i> order defined by this
- * method, even if it is no particular order. This is necessary so that, for example, methods <tt>keys</tt> and
- * <tt>values</tt> will yield association pairs, not two uncorrelated lists.
- *
- * @param procedure the procedure to be applied. Stops iteration if the procedure returns <tt>false</tt>, otherwise
- * continues.
- * @return <tt>false</tt> if the procedure stopped before all keys were iterated over, <tt>true</tt> otherwise.
- */
- @SuppressWarnings("unchecked")
- public boolean forEachKey(ObjectProcedure<K> procedure) {
- for (int i = table.length; i-- > 0;) {
- if (state[i] == FULL && !procedure.apply((K)table[i])) {
- return false;
- }
- }
- return true;
- }
-
- /**
- * Applies a procedure to each (key,value) pair of the receiver, if any. Iteration order is guaranteed to be
- * <i>identical</i> to the order used by method {@link #forEachKey(ObjectProcedure)}.
- *
- * @param procedure the procedure to be applied. Stops iteration if the procedure returns <tt>false</tt>, otherwise
- * continues.
- * @return <tt>false</tt> if the procedure stopped before all keys were iterated over, <tt>true</tt> otherwise.
- */
- @SuppressWarnings("unchecked")
- public boolean forEachPair(ObjectObjectProcedure<K,V> procedure) {
- for (int i = table.length; i-- > 0;) {
- if (state[i] == FULL && !procedure.apply((K)table[i], (V)values[i])) {
- return false;
- }
- }
- return true;
- }
-
- /**
- * Returns the value associated with the specified key. It is often a good idea to first check with {@link
- * #containsKey(Object)} whether the given key has a value associated or not, i.e. whether there exists an association
- * for the given key or not.
- *
- * @param key the key to be searched for.
- * @return the value associated with the specified key; <tt>0</tt> if no such key is present.
- */
- @SuppressWarnings("unchecked")
- @Override
- public V get(Object key) {
- int i = indexOfKey((K)key);
- if (i < 0) {
- return null;
- } //not contained
- return (V)values[i];
- }
-
- /**
- * @param key the key to be added to the receiver.
- * @return the index where the key would need to be inserted, if it is not already contained. Returns -index-1 if the
- * key is already contained at slot index. Therefore, if the returned index < 0, then it is already contained
- * at slot -index-1. If the returned index >= 0, then it is NOT already contained and should be inserted at
- * slot index.
- */
- protected int indexOfInsertion(K key) {
- Object[] tab = table;
- byte[] stat = state;
- int length = tab.length;
-
- int hash = key.hashCode() & 0x7FFFFFFF;
- int i = hash % length;
- int decrement = hash % (length - 2); // double hashing, see http://www.eece.unm.edu/faculty/heileman/hash/node4.html
- //int decrement = (hash / length) % length;
- if (decrement == 0) {
- decrement = 1;
- }
-
- // stop if we find a removed or free slot, or if we find the key itself
- // do NOT skip over removed slots (yes, open addressing is like that...)
- while (stat[i] == FULL && !equalsMindTheNull(key, tab[i])) {
- i -= decrement;
- //hashCollisions++;
- if (i < 0) {
- i += length;
- }
- }
-
- if (stat[i] == REMOVED) {
- // stop if we find a free slot, or if we find the key itself.
- // do skip over removed slots (yes, open addressing is like that...)
- // assertion: there is at least one FREE slot.
- int j = i;
- while (stat[i] != FREE && (stat[i] == REMOVED || tab[i] != key)) {
- i -= decrement;
- //hashCollisions++;
- if (i < 0) {
- i += length;
- }
- }
- if (stat[i] == FREE) {
- i = j;
- }
- }
-
-
- if (stat[i] == FULL) {
- // key already contained at slot i.
- // return a negative number identifying the slot.
- return -i - 1;
- }
- // not already contained, should be inserted at slot i.
- // return a number >= 0 identifying the slot.
- return i;
- }
-
- /**
- * @param key the key to be searched in the receiver.
- * @return the index where the key is contained in the receiver, returns -1 if the key was not found.
- */
- protected int indexOfKey(K key) {
- Object[] tab = table;
- byte[] stat = state;
- int length = tab.length;
-
- int hash = key.hashCode() & 0x7FFFFFFF;
- int i = hash % length;
- int decrement = hash % (length - 2); // double hashing, see http://www.eece.unm.edu/faculty/heileman/hash/node4.html
- //int decrement = (hash / length) % length;
- if (decrement == 0) {
- decrement = 1;
- }
-
- // stop if we find a free slot, or if we find the key itself.
- // do skip over removed slots (yes, open addressing is like that...)
- while (stat[i] != FREE && (stat[i] == REMOVED || !equalsMindTheNull(key, tab[i]))) {
- i -= decrement;
- //hashCollisions++;
- if (i < 0) {
- i += length;
- }
- }
-
- if (stat[i] == FREE) {
- return -1;
- } // not found
- return i; //found, return index where key is contained
- }
-
- /**
- * @param value the value to be searched in the receiver.
- * @return the index where the value is contained in the receiver, returns -1 if the value was not found.
- */
- protected int indexOfValue(V value) {
- Object[] val = values;
- byte[] stat = state;
-
- for (int i = stat.length; --i >= 0;) {
- if (stat[i] == FULL && equalsMindTheNull(val[i], value)) {
- return i;
- }
- }
-
- return -1; // not found
- }
-
- /**
- * Fills all keys contained in the receiver into the specified list. Fills the list, starting at index 0. After this
- * call returns the specified list has a new size that equals <tt>this.size()</tt>.
- * This method can be used
- * to iterate over the keys of the receiver.
- *
- * @param list the list to be filled, can have any size.
- */
- @SuppressWarnings("unchecked")
- public void keys(List<K> list) {
- list.clear();
-
-
- Object [] tab = table;
- byte[] stat = state;
-
- for (int i = tab.length; i-- > 0;) {
- if (stat[i] == FULL) {
- list.add((K)tab[i]);
- }
- }
- }
-
- /**
- * Associates the given key with the given value. Replaces any old <tt>(key,someOtherValue)</tt> association, if
- * existing.
- *
- * @param key the key the value shall be associated with.
- * @param value the value to be associated.
- * @return <tt>true</tt> if the receiver did not already contain such a key; <tt>false</tt> if the receiver did
- * already contain such a key - the new value has now replaced the formerly associated value.
- */
- @SuppressWarnings("unchecked")
- @Override
- public V put(K key, V value) {
- int i = indexOfInsertion(key);
- if (i < 0) { //already contained
- i = -i - 1;
- V previous = (V) this.values[i];
- this.values[i] = value;
- return previous;
- }
-
- if (this.distinct > this.highWaterMark) {
- int newCapacity = chooseGrowCapacity(this.distinct + 1, this.minLoadFactor, this.maxLoadFactor);
- rehash(newCapacity);
- return put(key, value);
- }
-
- this.table[i] = key;
- this.values[i] = value;
- if (this.state[i] == FREE) {
- this.freeEntries--;
- }
- this.state[i] = FULL;
- this.distinct++;
-
- if (this.freeEntries < 1) { //delta
- int newCapacity = chooseGrowCapacity(this.distinct + 1, this.minLoadFactor, this.maxLoadFactor);
- rehash(newCapacity);
- }
-
- return null;
- }
-
- /**
- * Rehashes the contents of the receiver into a new table with a smaller or larger capacity. This method is called
- * automatically when the number of keys in the receiver exceeds the high water mark or falls below the low water
- * mark.
- */
- @SuppressWarnings("unchecked")
- protected void rehash(int newCapacity) {
- int oldCapacity = table.length;
- //if (oldCapacity == newCapacity) return;
-
- Object[] oldTable = table;
- Object[] oldValues = values;
- byte[] oldState = state;
-
- Object[] newTable = new Object[newCapacity];
- Object[] newValues = new Object[newCapacity];
- byte[] newState = new byte[newCapacity];
-
- this.lowWaterMark = chooseLowWaterMark(newCapacity, this.minLoadFactor);
- this.highWaterMark = chooseHighWaterMark(newCapacity, this.maxLoadFactor);
-
- this.table = newTable;
- this.values = newValues;
- this.state = newState;
- this.freeEntries = newCapacity - this.distinct; // delta
-
- for (int i = oldCapacity; i-- > 0;) {
- if (oldState[i] == FULL) {
- Object element = oldTable[i];
- int index = indexOfInsertion((K)element);
- newTable[index] = element;
- newValues[index] = oldValues[i];
- newState[index] = FULL;
- }
- }
- }
-
- /**
- * Removes the given key with its associated element from the receiver, if present.
- *
- * @param key the key to be removed from the receiver.
- * @return <tt>true</tt> if the receiver contained the specified key, <tt>false</tt> otherwise.
- */
- @SuppressWarnings("unchecked")
- @Override
- public V remove(Object key) {
- int i = indexOfKey((K)key);
- if (i < 0) {
- return null;
- }
- // key not contained
- V removed = (V) values[i];
-
- this.state[i] = REMOVED;
- //this.values[i]=0; // delta
- this.distinct--;
-
- if (this.distinct < this.lowWaterMark) {
- int newCapacity = chooseShrinkCapacity(this.distinct, this.minLoadFactor, this.maxLoadFactor);
- rehash(newCapacity);
- }
-
- return removed;
- }
-
- /**
- * Initializes the receiver.
- *
- * @param initialCapacity the initial capacity of the receiver.
- * @param minLoadFactor the minLoadFactor of the receiver.
- * @param maxLoadFactor the maxLoadFactor of the receiver.
- * @throws IllegalArgumentException if <tt>initialCapacity < 0 || (minLoadFactor < 0.0 || minLoadFactor >= 1.0) ||
- * (maxLoadFactor <= 0.0 || maxLoadFactor >= 1.0) || (minLoadFactor >=
- * maxLoadFactor)</tt>.
- */
- @Override
- protected void setUp(int initialCapacity, double minLoadFactor, double maxLoadFactor) {
- int capacity = initialCapacity;
- super.setUp(capacity, minLoadFactor, maxLoadFactor);
- capacity = nextPrime(capacity);
- if (capacity == 0) {
- capacity = 1;
- } // open addressing needs at least one FREE slot at any time.
-
- this.table = new Object[capacity];
- this.values = new Object[capacity];
- this.state = new byte[capacity];
-
- // memory will be exhausted long before this pathological case happens, anyway.
- this.minLoadFactor = minLoadFactor;
- if (capacity == PrimeFinder.LARGEST_PRIME) {
- this.maxLoadFactor = 1.0;
- } else {
- this.maxLoadFactor = maxLoadFactor;
- }
-
- this.distinct = 0;
- this.freeEntries = capacity; // delta
-
- // lowWaterMark will be established upon first expansion.
- // establishing it now (upon instance construction) would immediately make the table shrink upon first put(...).
- // After all the idea of an "initialCapacity" implies violating lowWaterMarks when an object is young.
- // See ensureCapacity(...)
- this.lowWaterMark = 0;
- this.highWaterMark = chooseHighWaterMark(capacity, this.maxLoadFactor);
- }
-
- /**
- * Trims the capacity of the receiver to be the receiver's current size. Releases any superfluous internal memory. An
- * application can use this operation to minimize the storage of the receiver.
- */
- @Override
- public void trimToSize() {
- // * 1.2 because open addressing's performance exponentially degrades beyond that point
- // so that even rehashing the table can take very long
- int newCapacity = nextPrime((int) (1 + 1.2 * size()));
- if (table.length > newCapacity) {
- rehash(newCapacity);
- }
- }
-
- /**
- * Access for unit tests.
- * @param capacity
- * @param minLoadFactor
- * @param maxLoadFactor
- */
- void getInternalFactors(int[] capacity,
- double[] minLoadFactor,
- double[] maxLoadFactor) {
- capacity[0] = table.length;
- minLoadFactor[0] = this.minLoadFactor;
- maxLoadFactor[0] = this.maxLoadFactor;
- }
-
- private class MapEntry implements Map.Entry<K,V> {
- private final K key;
- private final V value;
-
- MapEntry(K key, V value) {
- this.key = key;
- this.value = value;
- }
-
- @Override
- public K getKey() {
- return key;
- }
-
- @Override
- public V getValue() {
- return value;
- }
-
- @Override
- public V setValue(V value) {
- throw new UnsupportedOperationException("Map.Entry.setValue not supported for OpenHashMap");
- }
-
- }
-
- /**
- * Allocate a set to contain Map.Entry objects for the pairs and return it.
- */
- @Override
- public Set<java.util.Map.Entry<K,V>> entrySet() {
- final Set<Entry<K, V>> entries = new OpenHashSet<>();
- forEachPair(new ObjectObjectProcedure<K,V>() {
- @Override
- public boolean apply(K key, V value) {
- entries.add(new MapEntry(key, value));
- return true;
- }
- });
- return entries;
- }
-
- /**
- * Allocate a set to contain keys and return it.
- * This violates the 'backing' provisions of the map interface.
- */
- @Override
- public Set<K> keySet() {
- final Set<K> keys = new OpenHashSet<>();
- forEachKey(new ObjectProcedure<K>() {
- @Override
- public boolean apply(K element) {
- keys.add(element);
- return true;
- }
- });
- return keys;
- }
-
- @Override
- public void putAll(Map<? extends K,? extends V> m) {
- for (Map.Entry<? extends K, ? extends V> e : m.entrySet()) {
- put(e.getKey(), e.getValue());
- }
- }
-
- /**
- * Allocate a list to contain the values and return it.
- * This violates the 'backing' provision of the Map interface.
- */
- @Override
- public Collection<V> values() {
- final List<V> valueList = new ArrayList<>();
- forEachPair(new ObjectObjectProcedure<K,V>() {
- @Override
- public boolean apply(K key, V value) {
- valueList.add(value);
- return true;
- }
- });
- return valueList;
- }
-
- @SuppressWarnings("unchecked")
- @Override
- public boolean equals(Object obj) {
- if (!(obj instanceof OpenHashMap)) {
- return false;
- }
- final OpenHashMap<K,V> o = (OpenHashMap<K,V>) obj;
- if (o.size() != size()) {
- return false;
- }
- final boolean[] equal = new boolean[1];
- equal[0] = true;
- forEachPair(new ObjectObjectProcedure<K,V>() {
- @Override
- public boolean apply(K key, V value) {
- Object ov = o.get(key);
- if (!value.equals(ov)) {
- equal[0] = false;
- return false;
- }
- return true;
- }
- });
- return equal[0];
- }
-
- @Override
- public String toString() {
- final StringBuilder sb = new StringBuilder();
- sb.append('{');
- forEachPair(new ObjectObjectProcedure<K,V>() {
- @Override
- public boolean apply(K key, V value) {
- sb.append('[');
- sb.append(key);
- sb.append(" -> ");
- sb.append(value);
- sb.append("] ");
- return true;
- }
- });
- sb.append('}');
- return sb.toString();
- }
-}
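
Because the class above deliberately returns detached (non-backing) views from
keySet(), values() and entrySet(), forEachPair is the native way to iterate.
Below is a minimal usage sketch based on the methods shown in this diff; the
keys and values are arbitrary.

    import org.apache.mahout.math.function.ObjectObjectProcedure;
    import org.apache.mahout.math.map.OpenHashMap;

    public class OpenHashMapExample {
      public static void main(String[] args) {
        OpenHashMap<String, Integer> map = new OpenHashMap<>();
        map.put("a", 1);
        map.put("b", 2);

        // forEachPair is the iteration primitive; entrySet()/keySet()/values()
        // are built on top of it and return copies, not live views.
        map.forEachPair(new ObjectObjectProcedure<String, Integer>() {
          @Override
          public boolean apply(String key, Integer value) {
            System.out.println(key + " -> " + value);
            return true; // keep iterating
          }
        });
      }
    }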

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/map/PrimeFinder.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/map/PrimeFinder.java b/math/src/main/java/org/apache/mahout/math/map/PrimeFinder.java
deleted file mode 100644
index b02611e..0000000
--- a/math/src/main/java/org/apache/mahout/math/map/PrimeFinder.java
+++ /dev/null
@@ -1,145 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.mahout.math.map;
-
-import java.util.Arrays;
-
-/**
- * Not of interest for users; only for implementors of hashtables.
- * Used to keep hash table capacities prime numbers.
- *
- * <p>Choosing prime numbers as hash table capacities is a good idea to keep them working fast,
- * particularly under hash table expansions.
- *
- * <p>However, JDK 1.2, JGL 3.1 and many other toolkits do nothing to keep capacities prime.
- * This class provides efficient means to choose prime capacities.
- *
- * <p>Choosing a prime is <tt>O(log 300)</tt> (binary search in a list of 300 int's).
- * Memory requirements: 1 KB static memory.
- *
- */
-public final class PrimeFinder {
-
- /** The largest prime this class can generate; currently equal to <tt>Integer.MAX_VALUE</tt>. */
- public static final int LARGEST_PRIME = Integer.MAX_VALUE; //yes, it is prime.
-
- /**
- * The prime number list consists of 11 chunks. Each chunk contains prime numbers. A chunk starts with a prime P1. The
- * next element is a prime P2. P2 is the smallest prime for which P2 >= 2*P1 holds. The next element is P3, for which
- * the same holds with respect to P2, and so on.
- *
- * Chunks are chosen such that for any desired capacity >= 1000 the list includes a prime number <= desired capacity *
- * 1.11 (11%). For any desired capacity >= 200 the list includes a prime number <= desired capacity * 1.16 (16%). For
- * any desired capacity >= 16 the list includes a prime number <= desired capacity * 1.21 (21%).
- *
- * Therefore, primes can be retrieved which are quite close to any desired capacity, which in turn avoids wasting
- * memory. For example, the list includes 1039,1117,1201,1277,1361,1439,1523,1597,1759,1907,2081. So if you need a
- * prime >= 1040, you will find a prime <= 1040*1.11=1154.
- *
- * Chunks are chosen such that they are optimized for a hashtable growthfactor of 2.0; If your hashtable has such a
- * growthfactor then, after initially "rounding to a prime" upon hashtable construction, it will later expand to prime
- * capacities such that there exist no better primes.
- *
- * In total these are about 32*10=320 numbers -> 1 KB of static memory needed. If you are stingy, then delete every
- * second or fourth chunk.
- */
-
- private static final int[] PRIME_CAPACITIES = {
- //chunk #0
- LARGEST_PRIME,
-
- //chunk #1
- 5, 11, 23, 47, 97, 197, 397, 797, 1597, 3203, 6421, 12853, 25717, 51437, 102877, 205759,
- 411527, 823117, 1646237, 3292489, 6584983, 13169977, 26339969, 52679969, 105359939,
- 210719881, 421439783, 842879579, 1685759167,
-
- //chunk #2
- 433, 877, 1759, 3527, 7057, 14143, 28289, 56591, 113189, 226379, 452759, 905551, 1811107,
- 3622219, 7244441, 14488931, 28977863, 57955739, 115911563, 231823147, 463646329, 927292699,
- 1854585413,
-
- //chunk #3
- 953, 1907, 3821, 7643, 15287, 30577, 61169, 122347, 244703, 489407, 978821, 1957651, 3915341,
- 7830701, 15661423, 31322867, 62645741, 125291483, 250582987, 501165979, 1002331963,
- 2004663929,
-
- //chunk #4
- 1039, 2081, 4177, 8363, 16729, 33461, 66923, 133853, 267713, 535481, 1070981, 2141977, 4283963,
- 8567929, 17135863, 34271747, 68543509, 137087021, 274174111, 548348231, 1096696463,
-
- //chunk #5
- 31, 67, 137, 277, 557, 1117, 2237, 4481, 8963, 17929, 35863, 71741, 143483, 286973, 573953,
- 1147921, 2295859, 4591721, 9183457, 18366923, 36733847, 73467739, 146935499, 293871013,
- 587742049, 1175484103,
-
- //chunk #6
- 599, 1201, 2411, 4831, 9677, 19373, 38747, 77509, 155027, 310081, 620171, 1240361, 2480729,
- 4961459, 9922933, 19845871, 39691759, 79383533, 158767069, 317534141, 635068283, 1270136683,
-
- //chunk #7
- 311, 631, 1277, 2557, 5119, 10243, 20507, 41017, 82037, 164089, 328213, 656429, 1312867,
- 2625761, 5251529, 10503061, 21006137, 42012281, 84024581, 168049163, 336098327, 672196673,
- 1344393353,
-
- //chunk #8
- 3, 7, 17, 37, 79, 163, 331, 673, 1361, 2729, 5471, 10949, 21911, 43853, 87719, 175447, 350899,
- 701819, 1403641, 2807303, 5614657, 11229331, 22458671, 44917381, 89834777, 179669557,
- 359339171, 718678369, 1437356741,
-
- //chunk #9
- 43, 89, 179, 359, 719, 1439, 2879, 5779, 11579, 23159, 46327, 92657, 185323, 370661, 741337,
- 1482707, 2965421, 5930887, 11861791, 23723597, 47447201, 94894427, 189788857, 379577741,
- 759155483, 1518310967,
-
- //chunk #10
- 379, 761, 1523, 3049, 6101, 12203, 24407, 48817, 97649, 195311, 390647, 781301, 1562611,
- 3125257, 6250537, 12501169, 25002389, 50004791, 100009607, 200019221, 400038451, 800076929,
- 1600153859
- };
-
-
- static { //initializer
- // The above prime numbers are formatted for human readability.
- // To find numbers fast, we sort them once and for all.
-
- Arrays.sort(PRIME_CAPACITIES);
- }
-
-  /** Makes this class non-instantiable, but still lets others inherit from it. */
- private PrimeFinder() {
- }
-
- /**
-   * Returns a prime number which is {@code >= desiredCapacity} and very close to {@code desiredCapacity}
-   * (within 11% if {@code desiredCapacity >= 1000}).
- *
- * @param desiredCapacity the capacity desired by the user.
- * @return the capacity which should be used for a hashtable.
- */
- public static int nextPrime(int desiredCapacity) {
- int i = java.util.Arrays.binarySearch(PRIME_CAPACITIES, desiredCapacity);
- if (i < 0) {
- // desired capacity not found, choose next prime greater than desired capacity
- i = -i - 1; // remember the semantics of binarySearch...
- }
- return PRIME_CAPACITIES[i];
- }
-
-}
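
A minimal usage sketch of the class removed above, assuming a caller with access
to this package: nextPrime() rounds a desired capacity up to a nearby prime from
PRIME_CAPACITIES.

    int desired = 1040;
    int capacity = PrimeFinder.nextPrime(desired);
    // 1040 is not in the table, so binarySearch yields an insertion point and
    // the next larger prime, 1117, is returned (1117 <= 1040 * 1.11 = 1154).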

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/map/QuickOpenIntIntHashMap.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/map/QuickOpenIntIntHashMap.java b/math/src/main/java/org/apache/mahout/math/map/QuickOpenIntIntHashMap.java
deleted file mode 100644
index 6a7cef8..0000000
--- a/math/src/main/java/org/apache/mahout/math/map/QuickOpenIntIntHashMap.java
+++ /dev/null
@@ -1,215 +0,0 @@
-/*
-Copyright © 1999 CERN - European Organization for Nuclear Research.
-Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
-is hereby granted without fee, provided that the above copyright notice appear in all copies and
-that both that copyright notice and this permission notice appear in supporting documentation.
-CERN makes no representations about the suitability of this software for any purpose.
-It is provided "as is" without expressed or implied warranty.
-*/
-package org.apache.mahout.math.map;
-
-/**
- * Status: Experimental; Do not use for production yet. Hash map holding (key,value) associations of type
- * <tt>(int-->int)</tt>; Automatically grows and shrinks as needed; Implemented using open addressing with double
- * hashing. First see the <a href="package-summary.html">package summary</a> and javadoc <a
- * href="package-tree.html">tree view</a> to get the broad picture.
- *
- * Implements open addressing with double hashing, using "Brent's variation". Brent's variation slows insertions down
- * a bit (not much) but reduces probes (collisions) for successful searches, in particular for large load factors. (It
- * does not improve unsuccessful searches.) See D. Knuth, Searching and Sorting, 3rd ed., p.533-545
- *
- * @author ***@cern.ch
- * @version 1.0, 09/24/99
- * @see java.util.HashMap
- */
-class QuickOpenIntIntHashMap extends OpenIntIntHashMap {
- //public int totalProbesSaved = 0; // benchmark only
-
- /** Constructs an empty map with default capacity and default load factors. */
- QuickOpenIntIntHashMap() {
- this(DEFAULT_CAPACITY);
- }
-
- /**
- * Constructs an empty map with the specified initial capacity and default load factors.
- *
- * @param initialCapacity the initial capacity of the map.
- * @throws IllegalArgumentException if the initial capacity is less than zero.
- */
- QuickOpenIntIntHashMap(int initialCapacity) {
- this(initialCapacity, DEFAULT_MIN_LOAD_FACTOR, DEFAULT_MAX_LOAD_FACTOR);
- }
-
- /**
- * Constructs an empty map with the specified initial capacity and the specified minimum and maximum load factor.
- *
- * @param initialCapacity the initial capacity.
- * @param minLoadFactor the minimum load factor.
- * @param maxLoadFactor the maximum load factor.
- * @throws IllegalArgumentException if <tt>initialCapacity < 0 || (minLoadFactor < 0.0 || minLoadFactor >= 1.0) ||
- * (maxLoadFactor <= 0.0 || maxLoadFactor >= 1.0) || (minLoadFactor >=
- * maxLoadFactor)</tt>.
- */
- QuickOpenIntIntHashMap(int initialCapacity, double minLoadFactor, double maxLoadFactor) {
- setUp(initialCapacity, minLoadFactor, maxLoadFactor);
- }
-
- /**
- * Associates the given key with the given value. Replaces any old <tt>(key,someOtherValue)</tt> association, if
- * existing.
- *
- * @param key the key the value shall be associated with.
- * @param value the value to be associated.
- * @return <tt>true</tt> if the receiver did not already contain such a key; <tt>false</tt> if the receiver did
- * already contain such a key - the new value has now replaced the formerly associated value.
- */
- @Override
- public boolean put(int key, int value) {
- /*
- This is open addressing with double hashing, using "Brent's variation".
-    Brent's variation slows insertions down a bit (not much) but reduces probes (collisions) for successful searches,
- in particular for large load factors.
- (It does not improve unsuccessful searches.)
- See D. Knuth, Searching and Sorting, 3rd ed., p.533-545
-
- h1(key) = hash % M
- h2(key) = decrement = Max(1, hash/M % M)
- M is prime = capacity = table.length
- probing positions are table[(h1-j*h2) % M] for j=0,1,...
-    (M and h2 could also be chosen differently, but h2 is required to be relatively prime to M.)
- */
-
- int[] tab = table;
- byte[] stat = state;
- int length = tab.length;
-
- int hash = HashFunctions.hash(key) & 0x7FFFFFFF;
- int i = hash % length;
- int decrement = (hash / length) % length;
- if (decrement == 0) {
- decrement = 1;
- }
-
- // stop if we find a removed or free slot, or if we find the key itself
- // do NOT skip over removed slots (yes, open addressing is like that...)
- //int comp = comparisons;
- int t = 0; // the number of probes
- int p0 = i; // the first position to probe
- while (stat[i] == FULL && tab[i] != key) {
- t++;
- i -= decrement;
- //hashCollisions++;
- if (i < 0) {
- i += length;
- }
- }
- if (stat[i] == FULL) {
- // key already contained at slot i.
- this.values[i] = value;
- return false;
- }
- // not already contained, should be inserted at slot i.
-
- if (this.distinct > this.highWaterMark) {
- int newCapacity = chooseGrowCapacity(this.distinct + 1, this.minLoadFactor, this.maxLoadFactor);
- rehash(newCapacity);
- return put(key, value);
- }
-
- /*
- Brent's variation does a local reorganization to reduce probes. It essentially means:
- We test whether it is possible to move the association we probed first (table[p0]) out of the way.
- If this is possible, it will reduce probes for the key to be inserted, since it takes its place;
- it gets hit earlier.
- However, future probes for the key that we move out of the way will increase.
-    Thus we only move it out of the way if we have a net gain, that is, if we save more probes than we lose.
-    For the first probe we save more than we lose if the number of probes we needed was >=2 (t>=2).
-    If the first probe cannot be moved out of the way, we try the next probe (p1).
-    Now we save more than we lose if t>=3.
- We repeat this until we find that we cannot gain or that we can indeed move p(x) out of the way.
-
- Note: Under the great majority of insertions t<=1, so the loop is entered very infrequently.
- */
- while (t > 1) {
- int key0 = tab[p0];
- hash = HashFunctions.hash(key0) & 0x7FFFFFFF;
- decrement = (hash / length) % length;
- if (decrement == 0) {
- decrement = 1;
- }
- int pc = p0 - decrement; // pc = (p0-j*decrement) % M, j=1,2,..
- if (pc < 0) {
- pc += length;
- }
-
- if (stat[pc] != FREE) { // not a free slot, continue searching for free slot to move to, or break.
- p0 = pc;
- t--;
- } else { // free or removed slot found, now move...
- tab[pc] = key0;
- stat[pc] = FULL;
- values[pc] = values[p0];
- i = p0; // prepare to insert: table[p0]=key
- t = 0; // break loop
- }
- }
-
- this.table[i] = key;
- this.values[i] = value;
- if (this.state[i] == FREE) {
- this.freeEntries--;
- }
- this.state[i] = FULL;
- this.distinct++;
-
- if (this.freeEntries < 1) { //delta
- int newCapacity = chooseGrowCapacity(this.distinct + 1, this.minLoadFactor, this.maxLoadFactor);
- rehash(newCapacity);
- }
-
- return true;
- }
-
- /**
- * Rehashes the contents of the receiver into a new table with a smaller or larger capacity. This method is called
- * automatically when the number of keys in the receiver exceeds the high water mark or falls below the low water
- * mark.
- */
- @Override
- protected void rehash(int newCapacity) {
- int oldCapacity = table.length;
- //if (oldCapacity == newCapacity) return;
-
- int[] oldTable = table;
- int[] oldValues = values;
- byte[] oldState = state;
-
- int[] newTable = new int[newCapacity];
- int[] newValues = new int[newCapacity];
- byte[] newState = new byte[newCapacity];
-
- this.lowWaterMark = chooseLowWaterMark(newCapacity, this.minLoadFactor);
- this.highWaterMark = chooseHighWaterMark(newCapacity, this.maxLoadFactor);
-
- this.table = newTable;
- this.values = newValues;
- this.state = newState;
- this.freeEntries = newCapacity - this.distinct; // delta
-
- int tmp = this.distinct;
-    this.distinct = Integer.MIN_VALUE; // switch off watermarks
- for (int i = oldCapacity; i-- > 0;) {
- if (oldState[i] == FULL) {
- put(oldTable[i], oldValues[i]);
- /*
- int element = oldTable[i];
- int index = indexOfInsertion(element);
- newTable[index]=element;
- newValues[index]=oldValues[i];
- newState[index]=FULL;
- */
- }
- }
- this.distinct = tmp;
- }
-}
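
A small self-contained sketch of the double-hashing probe sequence that put()
uses above (the helper name firstProbes is illustrative, not part of the class).
Because the table length is prime, any decrement in [1, length-1] is relatively
prime to it, so the sequence visits every slot.

    static int[] firstProbes(int hash, int length, int nProbes) {
      int h = hash & 0x7FFFFFFF;        // strip the sign bit, as in put()
      int i = h % length;               // h1: the first probe position
      int dec = (h / length) % length;  // h2: the step between probes
      if (dec == 0) {
        dec = 1;                        // h2 must lie in [1, length-1]
      }
      int[] probes = new int[nProbes];
      for (int j = 0; j < nProbes; j++) {
        probes[j] = i;
        i -= dec;
        if (i < 0) {
          i += length;                  // wrap around the table
        }
      }
      return probes;
    }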

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/map/package-info.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/map/package-info.java b/math/src/main/java/org/apache/mahout/math/map/package-info.java
deleted file mode 100644
index 9356f45..0000000
--- a/math/src/main/java/org/apache/mahout/math/map/package-info.java
+++ /dev/null
@@ -1,250 +0,0 @@
-/**
- * <HTML>
- * <BODY>
- * Automatically growing and shrinking maps holding objects or primitive
- * data types such as <tt>int</tt>, <tt>double</tt>, etc. Currently all maps are
- * based upon hashing.
- * <h2><a name="Overview"></a>1. Overview</h2>
- * <p>The map package offers flexible object oriented abstractions modelling automatically
- * resizing maps. It is designed to be scalable in terms of performance and memory
- * requirements.</p>
- * <p>Features include: </p>
- * <p></p>
- * <ul>
- * <li>Maps operating on objects as well as all primitive data types such as <code>int</code>,
- * <code>double</code>, etc.
- * </li>
- * <li>Compact representations</li>
- * <li>Support for quick access to associations</li>
- * <li>A number of general purpose map operations</li>
- * </ul>
- * <p>File-based I/O can be achieved through the standard Java built-in serialization
- * mechanism. All classes implement the {@link java.io.Serializable} interface.
- * However, the toolkit is entirely decoupled from advanced I/O. It provides data
- * structures and algorithms only.
- * <p> This toolkit borrows some terminology from the Javasoft <a
- * href="http://www.javasoft.com/products/jdk/1.2/docs/guide/collections/index.html">
- * Collections framework</a> written by Josh Bloch and introduced in JDK 1.2.
- * <h2>2. Introduction</h2>
- * <p>A map is an associative container that manages a set of (key,value) pairs.
- * It is useful for implementing a collection of one-to-one mappings. A (key,value)
- * pair is called an <i>association</i>. A value can be looked up via its key.
- * Associations can quickly be set, removed and retrieved. They are stored in a
- * hashing structure based on the hash code of their keys, which is obtained by
- * using a hash function. </p>
- * <p> A map can, for example, contain <tt>Name-->Location</tt> associations like
- * <tt>{("Pete", "Geneva"), ("Steve", "Paris"), ("Robert", "New York")}</tt> used
- * in address books or <tt>Index-->Value</tt> mappings like <tt>{(0, 100), (3,
- * 1000), (100000, 70)}</tt> representing sparse lists or matrices. For example
- * this could mean at index 0 we have a value of 100, at index 3 we have a value
- * of 1000, at index 100000 we have a value of 70, and at all other indexes we
- * have a value of, say, zero. Another example is a map of IP addresses to domain
- * names (DNS). Maps can also be useful to represent <i>multi sets</i>, that is,
- * sets where elements can occur more than once. For multi sets one would have
- * <tt>Value-->Frequency</tt> mappings like <tt>{(100, 1), (50, 1000), (101, 3)}</tt>
- * meaning element 100 occurs 1 time, element 50 occurs 1000 times, element 101
- * occurs 3 times. Further, maps can also manage <tt>ObjectIdentifier-->Object</tt>
- * mappings like <tt>{(12, obj1), (7, obj2), (10000, obj3), (9, obj4)}</tt> used
- * in Object Databases.
- * <p> A map cannot contain two or more <i>equal</i> keys; a key can map to at most
- * one value. However, more than one key can map to identical values. For primitive
- * data types "equality" of keys is defined as identity (operator <tt>==</tt>).
- * For maps using <tt>Object</tt> keys, the meaning of "equality" can be specified
- * by the user upon instance construction. It can either be defined to be identity
- * (operator <tt>==</tt>) or to be given by the method {@link java.lang.Object#equals(Object)}.
- * Associations of kind <tt>(AnyType,Object)</tt> can be of the form <tt>(AnyKey,null)
- * </tt>, i.e. values can be <tt>null</tt>.
- * <p> The classes of this package make no guarantees as to the order of the elements
- * returned by iterators; in particular, they do not guarantee that the order will
- * remain constant over time.
- * <h4>Copying</h4>
- * <p>Any map can be copied. A copy is <i>equal</i> to the original but entirely
- * independent of the original. So changes in the copy are not reflected in the
- * original, and vice-versa.
- * <h2>3. Package organization</h2>
- * <p>For most primitive data types and for objects there exists a separate map version.
- * All versions are just the same, except that they operate on different data types.
- * Colt includes two kinds of implementations for maps: The two different implementations
- * are tagged <b>Chained</b> and <b>Open</b>.
- * Note: Chained is no longer included; wherever it is mentioned, it is of historic interest only.</p>
- * <ul>
- * <li><b>Chained</b> uses extendible separate chaining with chains holding unsorted
- * dynamically linked collision lists.
- * <li><b>Open</b> uses extendible open addressing with double hashing.
- * </ul>
- * <p>Class naming follows the schema <tt>&lt;Implementation&gt;&lt;KeyType&gt;&lt;ValueType&gt;HashMap</tt>.
- * For example, a {@link org.apache.mahout.math.map.OpenIntDoubleHashMap} holds <tt>(int-->double)</tt>
- * associations and is implemented with open addressing. A {@link org.apache.mahout.math.map.OpenIntObjectHashMap}
- * holds <tt>(int-->Object)</tt> associations and is implemented with open addressing.
- * </p>
- * <p>The classes for maps of a given (key,value) type are derived from a common
- * abstract base class tagged <tt>Abstract&lt;KeyType&gt;&lt;ValueType&gt;</tt><tt>Map</tt>.
- * For example, all maps operating on <tt>(int-->double)</tt> associations are
- * derived from {@link org.apache.mahout.math.map.AbstractIntDoubleMap}, which in turn is derived
- * from an abstract base class tying together all maps regardless of association
- * type, {@link org.apache.mahout.math.set.AbstractSet}. The abstract base classes provide skeleton
- * implementations for all but a few methods. Experimental layouts (such as chaining,
- * open addressing, extensible hashing, red-black-trees, etc.) can easily be implemented
- * and inherit a rich set of functionality. Have a look at the javadoc <a href="package-tree.html">tree
- * view</a> to get the broad picture.</p>
- * <h2>4. Example usage</h2>
- * <TABLE>
- * <TD CLASS="PRE">
- * <PRE>
- * int[] keys = {0 , 3 , 100000, 9 };
- * double[] values = {100.0, 1000.0, 70.0 , 71.0};
- * AbstractIntDoubleMap map = new OpenIntDoubleHashMap();
- * // add several associations
- * for (int i=0; i &lt; keys.length; i++) map.put(keys[i], values[i]);
- * log.info("map="+map);
- * log.info("size="+map.size());
- * log.info(map.containsKey(3));
- * log.info("get(3)="+map.get(3));
- * log.info(map.containsKey(4));
- * log.info("get(4)="+map.get(4));
- * log.info(map.containsValue(71.0));
- * log.info("keyOf(71.0)="+map.keyOf(71.0));
- * // remove one association
- * map.removeKey(3);
- * log.info("\nmap="+map);
- * log.info(map.containsKey(3));
- * log.info("get(3)="+map.get(3));
- * log.info(map.containsValue(1000.0));
- * log.info("keyOf(1000.0)="+map.keyOf(1000.0));
- * // clear
- * map.clear();
- * log.info("\nmap="+map);
- * log.info("size="+map.size());
- * </PRE>
- * </TD>
- * </TABLE>
- * yields the following output
- * <TABLE>
- * <TD CLASS="PRE">
- * <PRE>
- * map=[0->100.0, 3->1000.0, 9->71.0, 100000->70.0]
- * size=4
- * true
- * get(3)=1000.0
- * false
- * get(4)=0.0
- * true
- * keyOf(71.0)=9
- * map=[0->100.0, 9->71.0, 100000->70.0]
- * false
- * get(3)=0.0
- * false
- * keyOf(1000.0)=-2147483648
- * map=[]
- * size=0
- * </PRE>
- * </TD>
- * </TABLE>
- * <h2> 5. Notes </h2>
- * <p>
- * Note that implementations are not synchronized.
- * <p>
- * Choosing efficient parameters for hash maps is not always easy.
- * However, since parameters determine efficiency and memory requirements, here is a quick guide on how to choose them.
- * If your use case does not heavily operate on hash maps but uses them just because they provide
- * convenient functionality, you can safely skip this section.
- * For those of you who care, read on.
- * <p>
- * There are three parameters that can be customized upon map construction: <tt>initialCapacity</tt>,
- * <tt>minLoadFactor</tt> and <tt>maxLoadFactor</tt>.
- * The more memory one can afford, the faster the hash map.
- * The hash map's capacity is the maximum number of associations that can be added without needing to allocate new
- * internal memory.
- * A larger capacity means faster adding, searching and removing.
- * The <tt>initialCapacity</tt> corresponds to the capacity used upon instance construction.
- * <p>
- * The <tt>loadFactor</tt> of a hash map measures the degree of "fullness".
- * It is given by the number of associations (<tt>size()</tt>)
- * divided by the hash map capacity <tt>(0.0 &lt;= loadFactor &lt;= 1.0)</tt>.
- * The more associations are added, the larger the loadFactor and the more hash map performance degrades.
- * Therefore, when the loadFactor exceeds a customizable threshold (<tt>maxLoadFactor</tt>), the hash map is
- * automatically grown.
- * In such a way performance degradation can be avoided.
- * Similarly, when the loadFactor falls below a customizable threshold (<tt>minLoadFactor</tt>), the hash map is
- * automatically shrunk.
- * In such a way excessive memory consumption can be avoided.
- * Automatic resizing (both growing and shrinking) obeys the following invariant:
- * <p>
- * <tt>capacity * minLoadFactor <= size() <= capacity * maxLoadFactor</tt>
- * <p> The term <tt>capacity * minLoadFactor</tt> is called the <i>low water mark</i>,
- * <tt>capacity * maxLoadFactor</tt> is called the <i>high water mark</i>. In other
- * words, the number of associations may vary within the water mark constraints.
- * When it goes out of range, the map is automatically resized and memory consumption
- * changes proportionally.
- * <ul>
- * <li>To tune for memory at the expense of performance, both increase <tt>minLoadFactor</tt> and
- * <tt>maxLoadFactor</tt>.
- * <li>To tune for performance at the expense of memory, both decrease <tt>minLoadFactor</tt> and
- * <tt>maxLoadFactor</tt>.
- * As a special case, set <tt>minLoadFactor=0</tt> to avoid any automatic shrinking.
- * </ul>
- * Resizing large hash maps can be time-consuming, <tt>O(size())</tt>, and should be avoided if possible (maintaining
- * primes is not the reason).
- * Unnecessary growing operations can be avoided if the number of associations is known before they are added, or can be
- * estimated.<p>
- * In such a case good parameters are as follows:
- * <p>
- * <i>For chaining:</i>
- * <br>Set the <tt>initialCapacity = 1.4*expectedSize</tt> or greater.
- * <br>Set the <tt>maxLoadFactor = 0.8</tt> or greater.
- * <p>
- * <i>For open addressing:</i>
- * <br>Set the <tt>initialCapacity = 2*expectedSize</tt> or greater. Alternatively call <tt>ensureCapacity(...)</tt>.
- * <br>Set the <tt>maxLoadFactor = 0.5</tt>.
- * <br>Never set <tt>maxLoadFactor &gt; 0.55</tt>; open addressing exponentially slows down beyond that point.
- * <p>
- * In this way the hash map will never need to grow and still stay fast.
- * It is never a good idea to set <tt>maxLoadFactor &lt; 0.1</tt>,
- * because the hash map would grow too often.
- * If it is entirely unknown how many associations the application will use,
- * the default constructor should be used. The map will grow and shrink as needed.
- * <p>
- * <b>Comparison of chaining and open addressing</b>
- * <p> Chaining is faster than open addressing, when assuming unconstrained memory
- * consumption. Open addressing is more space efficient than chaining, because
- * it does not create entry objects but uses primitive arrays which are considerably
- * smaller. Entry objects consume significant amounts of memory compared to the
- * information they actually hold. Open addressing also poses no problems to the
- * garbage collector. In contrast, chaining can create millions of entry objects
- * which are linked; a nightmare for any garbage collector. In addition, entry
- * object creation is a bit slow. <br>
- * Therefore, with the same amount of memory, or even less memory, hash maps with
- * larger capacity can be maintained under open addressing, which yields smaller
- * loadFactors, which in turn keeps performance competitive with chaining. In our
- * benchmarks, using significantly less memory, open addressing usually is not
- * more than 1.2-1.5 times slower than chaining.
- * <p><b>Further readings</b>:
- * <br>Knuth D., The Art of Computer Programming: Searching and Sorting, 3rd ed.
- * <br>Griswold W., Townsend G., The Design and Implementation of Dynamic Hashing for Sets and Tables in Icon,
- * Software - Practice and Experience, Vol. 23(4), 351-367 (April 1993).
- * <br>Larson P., Dynamic hash tables, Comm. of the ACM, 31, (4), 1988.
- * <p>
- * <b>Performance:</b>
- * <p>
- * Time complexity:
- * <br>The classes offer <i>expected</i> time complexity <tt>O(1)</tt> (i.e. constant time) for the basic operations
- * <tt>put</tt>, <tt>get</tt>, <tt>removeKey</tt>, <tt>containsKey</tt> and <tt>size</tt>,
- * assuming the hash function disperses the elements properly among the buckets.
- * Otherwise, pathological cases, although highly improbable, can occur, degrading performance to <tt>O(N)</tt> in the
- * worst case.
- * Operations <tt>containsValue</tt> and <tt>keyOf</tt> are <tt>O(N)</tt>.
- * <p>
- * Memory requirements for <i>open addressing</i>:
- * <br>worst case: <tt>memory [bytes] = (1/minLoadFactor) * size() * (1 + sizeOf(key) + sizeOf(value))</tt>.
- * <br>best case: <tt>memory [bytes] = (1/maxLoadFactor) * size() * (1 + sizeOf(key) + sizeOf(value))</tt>.
- * Where <tt>sizeOf(int) = 4</tt>, <tt>sizeOf(double) = 8</tt>, <tt>sizeOf(Object) = 4</tt>, etc.
- * Thus, an <tt>OpenIntIntHashMap</tt> with minLoadFactor=0.25 and maxLoadFactor=0.5 and 1000000 associations uses
- * between 17 MB and 34 MB.
- * The same map with 1000 associations uses between 17 and 34 KB.
- * <p>
- * </BODY>
- * </HTML>
- */
-package org.apache.mahout.math.map;
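
A short sketch of the open-addressing sizing advice above, assuming the
three-argument (initialCapacity, minLoadFactor, maxLoadFactor) constructor that
the hash maps in this package expose:

    int expectedSize = 1000000;
    // capacity 2 * expectedSize with maxLoadFactor 0.5: the map never grows
    OpenIntIntHashMap map = new OpenIntIntHashMap(2 * expectedSize, 0.25, 0.5);
    // memory per the formula above: between (1/0.5) * 1e6 * (1 + 4 + 4) = 18e6
    // and (1/0.25) * 1e6 * (1 + 4 + 4) = 36e6 bytes, i.e. the quoted 17-34 MB.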

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/package-info.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/package-info.java b/math/src/main/java/org/apache/mahout/math/package-info.java
deleted file mode 100644
index de664f0..0000000
--- a/math/src/main/java/org/apache/mahout/math/package-info.java
+++ /dev/null
@@ -1,4 +0,0 @@
-/**
- * Core base classes; Operations on primitive arrays such as sorting, partitioning and permuting.
- */
-package org.apache.mahout.math;

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/random/AbstractSamplerFunction.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/random/AbstractSamplerFunction.java b/math/src/main/java/org/apache/mahout/math/random/AbstractSamplerFunction.java
deleted file mode 100644
index d657fd9..0000000
--- a/math/src/main/java/org/apache/mahout/math/random/AbstractSamplerFunction.java
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.random;
-
-import org.apache.mahout.math.function.DoubleFunction;
-
-/**
- * This shim allows samplers to be used to initialize vectors.
- */
-public abstract class AbstractSamplerFunction extends DoubleFunction implements Sampler<Double> {
- /**
- * Apply the function to the argument and return the result
- *
- * @param ignored Ignored argument
- * @return A sample from this distribution.
- */
- @Override
- public double apply(double ignored) {
- return sample();
- }
-
- @Override
- public abstract Double sample();
-}
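
A minimal sketch of the shim in use, assuming Vector.assign(DoubleFunction)
applies the function element by element (as elsewhere in this module): any
sampler can fill a vector with random draws.

    Vector v = new DenseVector(100).assign(new AbstractSamplerFunction() {
      private final Random rnd = new Random(42); // fixed seed for the sketch
      @Override
      public Double sample() {
        return rnd.nextGaussian(); // unit normal draws
      }
    });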

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/random/ChineseRestaurant.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/random/ChineseRestaurant.java b/math/src/main/java/org/apache/mahout/math/random/ChineseRestaurant.java
deleted file mode 100644
index 8127b92..0000000
--- a/math/src/main/java/org/apache/mahout/math/random/ChineseRestaurant.java
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.random;
-
-import com.google.common.base.Preconditions;
-import org.apache.mahout.common.RandomUtils;
-import org.apache.mahout.math.list.DoubleArrayList;
-
-import java.util.Random;
-
-/**
- *
- * Generates samples from a generalized Chinese restaurant process (or Pitman-Yor process).
- *
- * The fraction of distinct values drawn exactly once will asymptotically approach the discount parameter
- * as the total number of draws T increases without bound. The number of unique values sampled will
- * increase as O(alpha * log T) if discount = 0 or O(alpha * T^discount) for discount > 0.
- */
-public final class ChineseRestaurant implements Sampler<Integer> {
-
- private final double alpha;
- private double weight = 0;
- private double discount = 0;
- private final DoubleArrayList weights = new DoubleArrayList();
- private final Random rand = RandomUtils.getRandom();
-
- /**
- * Constructs a Dirichlet process sampler. This is done by setting discount = 0.
- * @param alpha The strength parameter for the Dirichlet process.
- */
- public ChineseRestaurant(double alpha) {
- this(alpha, 0);
- }
-
- /**
- * Constructs a Pitman-Yor sampler.
- *
- * @param alpha The strength parameter that drives the number of unique values as a function of draws.
- * @param discount The discount parameter that drives the percentage of values that occur once in a large sample.
- */
- public ChineseRestaurant(double alpha, double discount) {
-    Preconditions.checkArgument(alpha > 0, "Strength parameter alpha must be greater than 0!");
- Preconditions.checkArgument(discount >= 0 && discount <= 1, "Must be: 0 <= discount <= 1");
- this.alpha = alpha;
- this.discount = discount;
- }
-
- @Override
- public Integer sample() {
- double u = rand.nextDouble() * (alpha + weight);
- for (int j = 0; j < weights.size(); j++) {
- // select existing options with probability (w_j - d) / (alpha + w)
- if (u < weights.get(j) - discount) {
- weights.set(j, weights.get(j) + 1);
- weight++;
- return j;
- } else {
- u -= weights.get(j) - discount;
- }
- }
-
- // if no existing item selected, pick new item with probability (alpha - d*t) / (alpha + w)
- // where t is number of pre-existing cases
- weights.add(1);
- weight++;
- return weights.size() - 1;
- }
-
- /**
- * @return the number of unique values that have been returned.
- */
- public int size() {
- return weights.size();
- }
-
- /**
- * @return the number draws so far.
- */
- public int count() {
- return (int) weight;
- }
-
- /**
- * @param j Which value to test.
- * @return The number of times that j has been returned so far.
- */
- public int count(int j) {
- Preconditions.checkArgument(j >= 0);
-
- if (j < weights.size()) {
- return (int) weights.get(j);
- } else {
- return 0;
- }
- }
-}
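
A brief usage sketch: with discount = 0 the sampler above behaves as a
Dirichlet process, so the number of distinct values should grow roughly as
alpha * log T.

    ChineseRestaurant crp = new ChineseRestaurant(10.0);
    for (int i = 0; i < 10000; i++) {
      crp.sample();
    }
    // size() counts distinct values, count() counts total draws
    System.out.println(crp.size() + " distinct values in " + crp.count() + " draws");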

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/random/Empirical.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/random/Empirical.java b/math/src/main/java/org/apache/mahout/math/random/Empirical.java
deleted file mode 100644
index 78bfec5..0000000
--- a/math/src/main/java/org/apache/mahout/math/random/Empirical.java
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.random;
-
-import com.google.common.base.Preconditions;
-import org.apache.mahout.common.RandomUtils;
-
-import java.util.Random;
-
-/**
- * Samples from an empirical cumulative distribution.
- */
-public final class Empirical extends AbstractSamplerFunction {
- private final Random gen;
- private final boolean exceedMinimum;
- private final boolean exceedMaximum;
-
- private final double[] x;
- private final double[] y;
- private final int n;
-
- /**
- * Sets up a sampler for a specified empirical cumulative distribution function. The distribution
- * can have optional exponential tails on either or both ends, but otherwise does a linear
- * interpolation between known points.
- *
- * @param exceedMinimum Should we generate samples less than the smallest quantile (i.e. generate a left tail)?
- * @param exceedMaximum Should we generate samples greater than the largest observed quantile (i.e. generate a right
- * tail)?
- * @param samples The number of samples observed to get the quantiles.
-   * @param ecdf          Alternating quantile/value pairs: a cumulative probability in the [0..1]
-   *                      range followed by the corresponding value. For instance, if you have the
-   *                      min, median and max of 1, 3, 10 you should pass 0.0, 1, 0.5, 3, 1.0, 10.
-   *                      Note that the list must include the 0-th (1.0-th) quantile if the left
-   *                      (right) tail is not allowed.
- */
- public Empirical(boolean exceedMinimum, boolean exceedMaximum, int samples, double... ecdf) {
- Preconditions.checkArgument(ecdf.length % 2 == 0, "ecdf must have an even count of values");
- Preconditions.checkArgument(samples >= 3, "Sample size must be >= 3");
-
- // if we can't exceed the observed bounds, then we have to be given the bounds.
- Preconditions.checkArgument(exceedMinimum || ecdf[0] == 0);
- Preconditions.checkArgument(exceedMaximum || ecdf[ecdf.length - 2] == 1);
-
- gen = RandomUtils.getRandom();
-
- n = ecdf.length / 2;
- x = new double[n];
- y = new double[n];
-
-    // track the previous (quantile, value) pair for the monotonicity checks below
-    double lastX = ecdf[0];
-    double lastY = ecdf[1];
- for (int i = 0; i < ecdf.length; i += 2) {
- // values have to be monotonic increasing
- Preconditions.checkArgument(i == 0 || ecdf[i + 1] > lastY);
- y[i / 2] = ecdf[i + 1];
- lastY = y[i / 2];
-
- // quantiles have to be in [0,1] and be monotonic increasing
- Preconditions.checkArgument(ecdf[i] >= 0 && ecdf[i] <= 1);
- Preconditions.checkArgument(i == 0 || ecdf[i] > lastX);
-
- x[i / 2] = ecdf[i];
- lastX = x[i / 2];
- }
-
- // squeeze a bit to allow for unobserved tails
- double x0 = exceedMinimum ? 0.5 / samples : 0;
- double x1 = 1 - (exceedMaximum ? 0.5 / samples : 0);
- for (int i = 0; i < n; i++) {
- x[i] = x[i] * (x1 - x0) + x0;
- }
-
- this.exceedMinimum = exceedMinimum;
- this.exceedMaximum = exceedMaximum;
- }
-
- @Override
- public Double sample() {
- return sample(gen.nextDouble());
- }
-
- public double sample(double u) {
- if (exceedMinimum && u < x[0]) {
- // generate from left tail
- if (u == 0) {
- u = 1.0e-16;
- }
- return y[0] + Math.log(u / x[0]) * x[0] * (y[1] - y[0]) / (x[1] - x[0]);
- } else if (exceedMaximum && u > x[n - 1]) {
- if (u == 1) {
- u = 1 - 1.0e-16;
- }
- // generate from right tail
- double dy = y[n - 1] - y[n - 2];
- double dx = x[n - 1] - x[n - 2];
- return y[n - 1] - Math.log((1 - u) / (1 - x[n - 1])) * (1 - x[n - 1]) * dy / dx;
- } else {
- // linear interpolation
- for (int i = 1; i < n; i++) {
- if (x[i] > u) {
- double dy = y[i] - y[i - 1];
- double dx = x[i] - x[i - 1];
- return y[i - 1] + (u - x[i - 1]) * dy / dx;
- }
- }
-      throw new RuntimeException(String.format("Can't happen (%.3f is not in [%.3f,%.3f])", u, x[0], x[n - 1]));
- }
- }
-}
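
A usage sketch with the min/median/max example from the javadoc above:
quantiles 0.0, 0.5, 1.0 at values 1, 3, 10, with both tails disabled and 100
observations assumed.

    Empirical dist = new Empirical(false, false, 100, 0.0, 1, 0.5, 3, 1.0, 10);
    double median = dist.sample(0.5); // deterministic probe, interpolates to 3.0
    double draw = dist.sample();      // random draw; lands in [1, 10] with no tails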

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/random/IndianBuffet.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/random/IndianBuffet.java b/math/src/main/java/org/apache/mahout/math/random/IndianBuffet.java
deleted file mode 100644
index 27b5d84..0000000
--- a/math/src/main/java/org/apache/mahout/math/random/IndianBuffet.java
+++ /dev/null
@@ -1,157 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.random;
-
-import com.google.common.base.CharMatcher;
-import com.google.common.base.Charsets;
-import com.google.common.base.Splitter;
-import com.google.common.collect.Iterables;
-import com.google.common.collect.Lists;
-import com.google.common.io.LineProcessor;
-import com.google.common.io.Resources;
-import org.apache.mahout.common.RandomUtils;
-
-import java.io.IOException;
-import java.util.List;
-import java.util.Random;
-
-/**
- * Samples a "document" from an IndianBuffet process.
- *
- * See http://mlg.eng.cam.ac.uk/zoubin/talks/turin09.pdf for details
- */
-public final class IndianBuffet<T> implements Sampler<List<T>> {
- private final List<Integer> count = Lists.newArrayList();
- private int documents = 0;
- private final double alpha;
- private WordFunction<T> converter = null;
- private final Random gen;
-
- public IndianBuffet(double alpha, WordFunction<T> converter) {
- this.alpha = alpha;
- this.converter = converter;
- gen = RandomUtils.getRandom();
- }
-
- public static IndianBuffet<Integer> createIntegerDocumentSampler(double alpha) {
- return new IndianBuffet<>(alpha, new IdentityConverter());
- }
-
- public static IndianBuffet<String> createTextDocumentSampler(double alpha) {
- return new IndianBuffet<>(alpha, new WordConverter());
- }
-
- @Override
- public List<T> sample() {
- List<T> r = Lists.newArrayList();
- if (documents == 0) {
- double n = new PoissonSampler(alpha).sample();
- for (int i = 0; i < n; i++) {
- r.add(converter.convert(i));
- count.add(1);
- }
- documents++;
- } else {
- documents++;
- int i = 0;
- for (double cnt : count) {
- if (gen.nextDouble() < cnt / documents) {
- r.add(converter.convert(i));
- count.set(i, count.get(i) + 1);
- }
- i++;
- }
- int newItems = new PoissonSampler(alpha / documents).sample().intValue();
- for (int j = 0; j < newItems; j++) {
- r.add(converter.convert(i + j));
- count.add(1);
- }
- }
- return r;
- }
-
- private interface WordFunction<T> {
- T convert(int i);
- }
-
- /**
- * Just converts to an integer.
- */
- public static class IdentityConverter implements WordFunction<Integer> {
- @Override
- public Integer convert(int i) {
- return i;
- }
- }
-
- /**
- * Converts to a string.
- */
- public static class StringConverter implements WordFunction<String> {
- @Override
- public String convert(int i) {
- return String.valueOf(i);
- }
- }
-
- /**
- * Converts to one of a list of common English words for reasonably small integers and converts
- * to a token like w_92463 for big integers.
- */
- public static final class WordConverter implements WordFunction<String> {
- private final Splitter onSpace = Splitter.on(CharMatcher.WHITESPACE).omitEmptyStrings().trimResults();
- private final List<String> words;
-
- public WordConverter() {
- try {
- words = Resources.readLines(Resources.getResource("words.txt"), Charsets.UTF_8,
- new LineProcessor<List<String>>() {
- private final List<String> theWords = Lists.newArrayList();
-
- @Override
- public boolean processLine(String line) {
- Iterables.addAll(theWords, onSpace.split(line));
- return true;
- }
-
- @Override
- public List<String> getResult() {
- return theWords;
- }
- });
- } catch (IOException e) {
- throw new ImpossibleException(e);
- }
- }
-
- @Override
- public String convert(int i) {
- if (i < words.size()) {
- return words.get(i);
- } else {
- return "w_" + i;
- }
- }
- }
-
- public static class ImpossibleException extends RuntimeException {
- public ImpossibleException(Throwable e) {
- super(e);
- }
- }
-}
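
A usage sketch: each sample() call yields one synthetic "document". Dishes
(integers) already seen recur with probability proportional to their
popularity, and a Poisson(alpha / n) number of new ones appears in document n.

    IndianBuffet<Integer> ibp = IndianBuffet.createIntegerDocumentSampler(5.0);
    for (int n = 0; n < 3; n++) {
      List<Integer> doc = ibp.sample(); // "words" shared across documents
      System.out.println(doc);
    }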

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/random/Missing.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/random/Missing.java b/math/src/main/java/org/apache/mahout/math/random/Missing.java
deleted file mode 100644
index 8141a71..0000000
--- a/math/src/main/java/org/apache/mahout/math/random/Missing.java
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.random;
-
-import java.util.Random;
-
-import org.apache.mahout.common.RandomUtils;
-
-/**
- * Models data with missing values. Note that all variables with the same fraction of missing
- * values will have the same sequence of missing values. Similarly, if two variables have
- * missing probabilities of p1 > p2, then all of the p2 missing values will also be missing for
- * p1.
- */
-public final class Missing<T> implements Sampler<T> {
- private final Random gen;
- private final double p;
- private final Sampler<T> delegate;
- private final T missingMarker;
-
- public Missing(int seed, double p, Sampler<T> delegate, T missingMarker) {
- this.p = p;
- this.delegate = delegate;
- this.missingMarker = missingMarker;
- gen = RandomUtils.getRandom(seed);
- }
-
- public Missing(double p, Sampler<T> delegate, T missingMarker) {
- this(1, p, delegate, missingMarker);
- }
-
- public Missing(double p, Sampler<T> delegate) {
- this(1, p, delegate, null);
- }
-
- @Override
- public T sample() {
- if (gen.nextDouble() >= p) {
- return delegate.sample();
- } else {
- return missingMarker;
- }
- }
-}
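
A usage sketch wrapping a delegate sampler so that roughly 10% of draws come
back as a missing-value marker (NaN here; Normal is another sampler removed in
this commit).

    Missing<Double> withGaps = new Missing<>(42, 0.1, new Normal(0, 1), Double.NaN);
    Double draw = withGaps.sample(); // Double.NaN with probability 0.1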

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/random/MultiNormal.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/random/MultiNormal.java b/math/src/main/java/org/apache/mahout/math/random/MultiNormal.java
deleted file mode 100644
index 748d4e8..0000000
--- a/math/src/main/java/org/apache/mahout/math/random/MultiNormal.java
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.random;
-
-import org.apache.mahout.common.RandomUtils;
-import org.apache.mahout.math.DenseVector;
-import org.apache.mahout.math.DiagonalMatrix;
-import org.apache.mahout.math.Matrix;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.function.DoubleFunction;
-
-import java.util.Random;
-
-/**
- * Samples from a multi-variate normal distribution.
- * <p/>
- * This is done by sampling from several independent unit normal distributions to get a vector u.
- * The sample value that is returned is then A u + m where A is derived from the covariance matrix
- * and m is the mean of the result.
- * <p/>
- * If \Sigma is the desired covariance matrix, then you can use any value of A such that A' A =
- * \Sigma. The Cholesky decomposition can be used to compute A if \Sigma is positive definite.
- * Slightly more expensive is to use the SVD U S V' = \Sigma and then set A = U \sqrt{S}.
- *
- * Useful special cases occur when \Sigma is diagonal so that A = \sqrt{\Sigma} or where \Sigma = r I.
- *
- * Another special case is where m = 0.
- */
-public class MultiNormal implements Sampler<Vector> {
- private final Random gen;
- private final int dimension;
- private final Matrix scale;
- private final Vector mean;
-
- /**
- * Constructs a sampler with diagonal scale matrix.
- * @param diagonal The diagonal elements of the scale matrix.
- */
- public MultiNormal(Vector diagonal) {
- this(new DiagonalMatrix(diagonal), null);
- }
-
- /**
- * Constructs a sampler with diagonal scale matrix and (potentially)
- * non-zero mean.
- * @param diagonal The scale matrix's principal diagonal.
- * @param mean The desired mean. Set to null if zero mean is desired.
- */
- public MultiNormal(Vector diagonal, Vector mean) {
- this(new DiagonalMatrix(diagonal), mean);
- }
-
- /**
- * Constructs a sampler with non-trivial scale matrix and mean.
- */
- public MultiNormal(Matrix a, Vector mean) {
- this(a, mean, a.columnSize());
- }
-
- public MultiNormal(int dimension) {
- this(null, null, dimension);
- }
-
- public MultiNormal(double radius, Vector mean) {
- this(new DiagonalMatrix(radius, mean.size()), mean);
- }
-
- private MultiNormal(Matrix scale, Vector mean, int dimension) {
- gen = RandomUtils.getRandom();
- this.dimension = dimension;
- this.scale = scale;
- this.mean = mean;
- }
-
- @Override
- public Vector sample() {
- Vector v = new DenseVector(dimension).assign(
- new DoubleFunction() {
- @Override
- public double apply(double ignored) {
- return gen.nextGaussian();
- }
- }
- );
- if (mean != null) {
- if (scale != null) {
- return scale.times(v).plus(mean);
- } else {
- return v.plus(mean);
- }
- } else {
- if (scale != null) {
- return scale.times(v);
- } else {
- return v;
- }
- }
- }
-
-  public Vector getMean() {
-    return mean;
-  }
-}
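
A usage sketch for the diagonal case described above: the diagonal entries of
the scale matrix are standard deviations, so this draws from a zero-mean
Gaussian with covariance diag(1, 4, 9).

    Vector sd = new DenseVector(new double[]{1, 2, 3});
    MultiNormal gaussian = new MultiNormal(sd); // zero mean, diagonal scale
    Vector sample = gaussian.sample(); // independent components with sd 1, 2, 3
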
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/random/Multinomial.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/random/Multinomial.java b/math/src/main/java/org/apache/mahout/math/random/Multinomial.java
deleted file mode 100644
index d79c32c..0000000
--- a/math/src/main/java/org/apache/mahout/math/random/Multinomial.java
+++ /dev/null
@@ -1,202 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.random;
-
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import java.util.Random;
-
-import com.google.common.base.Preconditions;
-import com.google.common.collect.AbstractIterator;
-import com.google.common.collect.Iterables;
-import com.google.common.collect.Lists;
-import com.google.common.collect.Maps;
-import com.google.common.collect.Multiset;
-import org.apache.mahout.common.RandomUtils;
-import org.apache.mahout.math.list.DoubleArrayList;
-
-/**
- * Multinomial sampler that allows updates to element probabilities. The basic idea is that sampling is
- * done by using a simple balanced tree. Probabilities are kept in the tree so that we can navigate to
- * any leaf in log N time. Updates are simple because we can just propagate them upwards.
- * <p/>
- * In order to facilitate access by value, we maintain an additional map from value to tree node.
- */
-public final class Multinomial<T> implements Sampler<T>, Iterable<T> {
-  // these lists use heap ordering. Thus, the root is at location 1, first-level children at 2 and 3, second-level
-  // children at 4 through 7, and so on.
- private final DoubleArrayList weight = new DoubleArrayList();
- private final List<T> values = Lists.newArrayList();
- private final Map<T, Integer> items = Maps.newHashMap();
- private Random rand = RandomUtils.getRandom();
-
- public Multinomial() {
- weight.add(0);
- values.add(null);
- }
-
- public Multinomial(Multiset<T> counts) {
- this();
- Preconditions.checkArgument(!counts.isEmpty(), "Need some data to build sampler");
- rand = RandomUtils.getRandom();
- for (T t : counts.elementSet()) {
- add(t, counts.count(t));
- }
- }
-
- public Multinomial(Iterable<WeightedThing<T>> things) {
- this();
- for (WeightedThing<T> thing : things) {
- add(thing.getValue(), thing.getWeight());
- }
- }
-
- public void add(T value, double w) {
- Preconditions.checkNotNull(value);
- Preconditions.checkArgument(!items.containsKey(value));
-
- int n = this.weight.size();
- if (n == 1) {
- weight.add(w);
- values.add(value);
- items.put(value, 1);
- } else {
- // parent comes down
- weight.add(weight.get(n / 2));
- values.add(values.get(n / 2));
- items.put(values.get(n / 2), n);
- n++;
-
- // new item goes in
- items.put(value, n);
- this.weight.add(w);
- values.add(value);
-
- // parents get incremented all the way to the root
- while (n > 1) {
- n /= 2;
- this.weight.set(n, this.weight.get(n) + w);
- }
- }
- }
-
- public double getWeight(T value) {
- if (items.containsKey(value)) {
- return weight.get(items.get(value));
- } else {
- return 0;
- }
- }
-
- public double getProbability(T value) {
- if (items.containsKey(value)) {
- return weight.get(items.get(value)) / weight.get(1);
- } else {
- return 0;
- }
- }
-
- public double getWeight() {
- if (weight.size() > 1) {
- return weight.get(1);
- } else {
- return 0;
- }
- }
-
- public void delete(T value) {
- set(value, 0);
- }
-
- public void set(T value, double newP) {
- Preconditions.checkArgument(items.containsKey(value));
- int n = items.get(value);
- if (newP <= 0) {
- // this makes the iterator not see such an element even though we leave a phantom in the tree
- // Leaving the phantom behind simplifies tree maintenance and testing, but isn't really necessary.
- items.remove(value);
- }
- double oldP = weight.get(n);
- while (n > 0) {
- weight.set(n, weight.get(n) - oldP + newP);
- n /= 2;
- }
- }
-
- @Override
- public T sample() {
- Preconditions.checkArgument(!weight.isEmpty());
- return sample(rand.nextDouble());
- }
-
- public T sample(double u) {
- u *= weight.get(1);
-
- int n = 1;
- while (2 * n < weight.size()) {
- // children are at 2n and 2n+1
- double left = weight.get(2 * n);
- if (u <= left) {
- n = 2 * n;
- } else {
- u -= left;
- n = 2 * n + 1;
- }
- }
- return values.get(n);
- }
-
- /**
- * Exposed for testing only. Returns a list of the leaf weights. These are in an
- * order such that probing just before and after the cumulative sum of these weights
- * will touch every element of the tree twice and thus will make it possible to test
- * every possible left/right decision in navigating the tree.
- */
- List<Double> getWeights() {
- List<Double> r = Lists.newArrayList();
- int i = Integer.highestOneBit(weight.size());
- while (i < weight.size()) {
- r.add(weight.get(i));
- i++;
- }
- i /= 2;
- while (i < Integer.highestOneBit(weight.size())) {
- r.add(weight.get(i));
- i++;
- }
- return r;
- }
-
- @Override
- public Iterator<T> iterator() {
- return new AbstractIterator<T>() {
- Iterator<T> valuesIterator = Iterables.skip(values, 1).iterator();
- @Override
- protected T computeNext() {
- while (valuesIterator.hasNext()) {
- T next = valuesIterator.next();
- if (items.containsKey(next)) {
- return next;
- }
- }
- return endOfData();
- }
- };
- }
-}
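
A usage sketch of the updatable sampler: weights live in the heap-ordered tree
described above, so both sampling and updates are O(log N).

    Multinomial<String> dist = new Multinomial<>();
    dist.add("a", 1);
    dist.add("b", 2);
    dist.add("c", 7);
    String draw = dist.sample();         // "c" with probability 7/10
    dist.set("c", 1);                    // the update propagates to the root
    double p = dist.getProbability("c"); // now 1 / (1 + 2 + 1) = 0.25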

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/random/Normal.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/random/Normal.java b/math/src/main/java/org/apache/mahout/math/random/Normal.java
deleted file mode 100644
index c162f26..0000000
--- a/math/src/main/java/org/apache/mahout/math/random/Normal.java
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.random;
-
-import org.apache.mahout.common.RandomUtils;
-
-import java.util.Random;
-
-public final class Normal extends AbstractSamplerFunction {
- private final Random rand = RandomUtils.getRandom();
- private double mean = 0;
- private double sd = 1;
-
- public Normal() {}
-
- public Normal(double mean, double sd) {
- this.mean = mean;
- this.sd = sd;
- }
-
- @Override
- public Double sample() {
- return rand.nextGaussian() * sd + mean;
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/random/PoissonSampler.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/random/PoissonSampler.java b/math/src/main/java/org/apache/mahout/math/random/PoissonSampler.java
deleted file mode 100644
index e4e49f8..0000000
--- a/math/src/main/java/org/apache/mahout/math/random/PoissonSampler.java
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.random;
-
-import com.google.common.collect.Lists;
-import org.apache.commons.math3.distribution.PoissonDistribution;
-import org.apache.mahout.common.RandomUtils;
-import org.apache.mahout.common.RandomWrapper;
-
-import java.util.List;
-
-/**
- * Samples from a Poisson distribution. Should probably not be used for lambda > 1000 or so.
- */
-public final class PoissonSampler extends AbstractSamplerFunction {
-
- private double limit;
- private Multinomial<Integer> partial;
- private final RandomWrapper gen;
- private final PoissonDistribution pd;
-
- public PoissonSampler(double lambda) {
- limit = 1;
- gen = RandomUtils.getRandom();
- pd = new PoissonDistribution(gen.getRandomGenerator(),
- lambda,
- PoissonDistribution.DEFAULT_EPSILON,
- PoissonDistribution.DEFAULT_MAX_ITERATIONS);
- }
-
- @Override
- public Double sample() {
- return sample(gen.nextDouble());
- }
-
- double sample(double u) {
- if (u < limit) {
- List<WeightedThing<Integer>> steps = Lists.newArrayList();
- limit = 1;
- int i = 0;
- while (u / 20 < limit) {
- double pdf = pd.probability(i);
- limit -= pdf;
- steps.add(new WeightedThing<>(i, pdf));
- i++;
- }
- steps.add(new WeightedThing<>(steps.size(), limit));
- partial = new Multinomial<>(steps);
- }
- return partial.sample(u);
- }
-}

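PoissonSampler above builds its Multinomial lookup table lazily: sample(u) extends the table of PMF steps whenever the uniform draw is small relative to the unexpanded tail mass. A usage sketch under the same classpath assumption (the demo class is illustrative):

    import org.apache.mahout.math.random.PoissonSampler;

    public class PoissonDemo {
      public static void main(String[] args) {
        PoissonSampler p = new PoissonSampler(3.0);  // lambda = 3
        int samples = 100000;
        int[] counts = new int[20];
        for (int i = 0; i < samples; i++) {
          int k = p.sample().intValue();
          if (k < counts.length) {
            counts[k]++;
          }
        }
        // Empirical frequencies should track exp(-3) * 3^k / k!.
        for (int k = 0; k < 8; k++) {
          System.out.printf("k=%d freq=%.4f%n", k, counts[k] / (double) samples);
        }
      }
    }
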
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/random/Sampler.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/random/Sampler.java b/math/src/main/java/org/apache/mahout/math/random/Sampler.java
deleted file mode 100644
index 51460fa..0000000
--- a/math/src/main/java/org/apache/mahout/math/random/Sampler.java
+++ /dev/null
@@ -1,25 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.random;
-
-/**
- * Samples from a generic type.
- */
-public interface Sampler<T> {
- T sample();
-}

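The whole contract for this package is that single sample() method. A trivial sketch of implementing it directly (LetterSampler is hypothetical, not a class from this codebase):

    import java.util.Random;
    import org.apache.mahout.math.random.Sampler;

    // Uniform sampler over a fixed alphabet, just to show the interface shape.
    public class LetterSampler implements Sampler<String> {
      private static final String[] LETTERS = {"a", "b", "c"};
      private final Random rand = new Random(42);

      @Override
      public String sample() {
        return LETTERS[rand.nextInt(LETTERS.length)];
      }
    }
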
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/random/WeightedThing.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/random/WeightedThing.java b/math/src/main/java/org/apache/mahout/math/random/WeightedThing.java
deleted file mode 100644
index 20f6df3..0000000
--- a/math/src/main/java/org/apache/mahout/math/random/WeightedThing.java
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.random;
-
-import com.google.common.base.Preconditions;
-import org.apache.mahout.common.RandomUtils;
-
-/**
- * Handy for creating multinomial distributions of things.
- */
-public final class WeightedThing<T> implements Comparable<WeightedThing<T>> {
- private double weight;
- private final T value;
-
- public WeightedThing(T thing, double weight) {
- this.value = Preconditions.checkNotNull(thing);
- this.weight = weight;
- }
-
- public WeightedThing(double weight) {
- this.value = null;
- this.weight = weight;
- }
-
- public T getValue() {
- return value;
- }
-
- public double getWeight() {
- return weight;
- }
-
- public void setWeight(double weight) {
- this.weight = weight;
- }
-
- @Override
- public int compareTo(WeightedThing<T> other) {
- return Double.compare(this.weight, other.weight);
- }
-
- @Override
- public boolean equals(Object o) {
- if (o instanceof WeightedThing) {
- @SuppressWarnings("unchecked")
- WeightedThing<T> other = (WeightedThing<T>) o;
- return weight == other.weight && value.equals(other.value);
- }
- return false;
- }
-
- @Override
- public int hashCode() {
- return 31 * RandomUtils.hashDouble(weight) + value.hashCode();
- }
-}

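WeightedThing pairs a value with a mutable weight and orders by weight, which is what Multinomial builds on. Note that the single-argument constructor above leaves value null, so equals() and hashCode() would then throw NullPointerException. A small sketch (the demo class is illustrative):

    import java.util.ArrayList;
    import java.util.Collections;
    import java.util.List;
    import org.apache.mahout.math.random.WeightedThing;

    public class WeightedThingDemo {
      public static void main(String[] args) {
        List<WeightedThing<String>> things = new ArrayList<>();
        things.add(new WeightedThing<>("common", 0.7));
        things.add(new WeightedThing<>("occasional", 0.2));
        things.add(new WeightedThing<>("rare", 0.1));
        Collections.sort(things);  // compareTo orders by ascending weight
        for (WeightedThing<String> t : things) {
          System.out.println(t.getValue() + " -> " + t.getWeight());
        }
      }
    }
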
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/set/AbstractSet.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/set/AbstractSet.java b/math/src/main/java/org/apache/mahout/math/set/AbstractSet.java
deleted file mode 100644
index 7691420..0000000
--- a/math/src/main/java/org/apache/mahout/math/set/AbstractSet.java
+++ /dev/null
@@ -1,188 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-/*
-Copyright 1999 CERN - European Organization for Nuclear Research.
-Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
-is hereby granted without fee, provided that the above copyright notice appear in all copies and
-that both that copyright notice and this permission notice appear in supporting documentation.
-CERN makes no representations about the suitability of this software for any purpose.
-It is provided "as is" without expressed or implied warranty.
-*/
-package org.apache.mahout.math.set;
-
-import org.apache.mahout.math.PersistentObject;
-import org.apache.mahout.math.map.PrimeFinder;
-
-public abstract class AbstractSet extends PersistentObject {
- //public static boolean debug = false; // debug only
-
- /** The number of distinct associations in the map; its "size()". */
- protected int distinct;
-
- /**
- * The table capacity c=table.length always satisfies the invariant <tt>c * minLoadFactor <= s <= c *
- * maxLoadFactor</tt>, where s=size() is the number of associations currently contained. The term "c * minLoadFactor"
- * is called the "lowWaterMark", "c * maxLoadFactor" is called the "highWaterMark". In other words, the table capacity
- * (and proportionally the memory used by this class) oscillates within these constraints. The terms are precomputed
- * and cached to avoid recalculating them each time put(..) or removeKey(...) is called.
- */
- protected int lowWaterMark;
- protected int highWaterMark;
-
- /** The minimum load factor for the hashtable. */
- protected double minLoadFactor;
-
- /** The maximum load factor for the hashtable. */
- protected double maxLoadFactor;
-
- // these are public access for unit tests.
- public static final int DEFAULT_CAPACITY = 277;
- public static final double DEFAULT_MIN_LOAD_FACTOR = 0.2;
- public static final double DEFAULT_MAX_LOAD_FACTOR = 0.5;
-
- /**
- * Chooses a new prime table capacity optimized for growing that (approximately) satisfies the invariant <tt>c *
- * minLoadFactor <= size <= c * maxLoadFactor</tt> and has at least one FREE slot for the given size.
- */
- protected int chooseGrowCapacity(int size, double minLoad, double maxLoad) {
- return nextPrime(Math.max(size + 1, (int) ((4 * size / (3 * minLoad + maxLoad)))));
- }
-
- /**
- * Returns new high water mark threshold based on current capacity and maxLoadFactor.
- *
- * @return int the new threshold.
- */
- protected int chooseHighWaterMark(int capacity, double maxLoad) {
- return Math.min(capacity - 2, (int) (capacity * maxLoad)); //makes sure there is always at least one FREE slot
- }
-
- /**
- * Returns new low water mark threshold based on current capacity and minLoadFactor.
- *
- * @return int the new threshold.
- */
- protected int chooseLowWaterMark(int capacity, double minLoad) {
- return (int) (capacity * minLoad);
- }
-
- /**
- * Chooses a new prime table capacity neither favoring shrinking nor growing, that (approximately) satisfies the
- * invariant <tt>c * minLoadFactor <= size <= c * maxLoadFactor</tt> and has at least one FREE slot for the given
- * size.
- */
- protected int chooseMeanCapacity(int size, double minLoad, double maxLoad) {
- return nextPrime(Math.max(size + 1, (int) ((2 * size / (minLoad + maxLoad)))));
- }
-
- /**
- * Chooses a new prime table capacity optimized for shrinking that (approximately) satisfies the invariant <tt>c *
- * minLoadFactor <= size <= c * maxLoadFactor</tt> and has at least one FREE slot for the given size.
- */
- protected int chooseShrinkCapacity(int size, double minLoad, double maxLoad) {
- return nextPrime(Math.max(size + 1, (int) ((4 * size / (minLoad + 3 * maxLoad)))));
- }
-
- /** Removes all (key,value) associations from the receiver. */
- public abstract void clear();
-
- /**
- * Ensures that the receiver can hold at least the specified number of elements without needing to allocate new
- * internal memory. If necessary, allocates new internal memory and increases the capacity of the receiver. <p> This
- * method never need be called; it is for performance tuning only. Calling this method before <tt>put()</tt>ing a
- * large number of associations boosts performance, because the receiver will grow only once instead of potentially
- * many times. <p> <b>This default implementation does nothing.</b> Override this method if necessary.
- *
- * @param minCapacity the desired minimum capacity.
- */
- public void ensureCapacity(int minCapacity) {
- }
-
- /**
- * Returns <tt>true</tt> if the receiver contains no (key,value) associations.
- *
- * @return <tt>true</tt> if the receiver contains no (key,value) associations.
- */
- public boolean isEmpty() {
- return distinct == 0;
- }
-
- /**
- * Returns a prime number which is <code>&gt;= desiredCapacity</code> and very close to <code>desiredCapacity</code>
- * (within 11% if <code>desiredCapacity &gt;= 1000</code>).
- *
- * @param desiredCapacity the capacity desired by the user.
- * @return the capacity which should be used for a hashtable.
- */
- protected int nextPrime(int desiredCapacity) {
- return PrimeFinder.nextPrime(desiredCapacity);
- }
-
- /**
- * Initializes the receiver. You will almost certainly need to override this method in subclasses to initialize the
- * hash table.
- *
- * @param initialCapacity the initial capacity of the receiver.
- * @param minLoadFactor the minLoadFactor of the receiver.
- * @param maxLoadFactor the maxLoadFactor of the receiver.
- * @throws IllegalArgumentException if <tt>initialCapacity < 0 || (minLoadFactor < 0.0 || minLoadFactor >= 1.0) ||
- * (maxLoadFactor <= 0.0 || maxLoadFactor >= 1.0) || (minLoadFactor >=
- * maxLoadFactor)</tt>.
- */
- protected void setUp(int initialCapacity, double minLoadFactor, double maxLoadFactor) {
- if (initialCapacity < 0) {
- throw new IllegalArgumentException("Initial Capacity must not be less than zero: " + initialCapacity);
- }
- if (minLoadFactor < 0.0 || minLoadFactor >= 1.0) {
- throw new IllegalArgumentException("Illegal minLoadFactor: " + minLoadFactor);
- }
- if (maxLoadFactor <= 0.0 || maxLoadFactor >= 1.0) {
- throw new IllegalArgumentException("Illegal maxLoadFactor: " + maxLoadFactor);
- }
- if (minLoadFactor >= maxLoadFactor) {
- throw new IllegalArgumentException(
- "Illegal minLoadFactor: " + minLoadFactor + " and maxLoadFactor: " + maxLoadFactor);
- }
- }
-
- /**
- * Returns the number of (key,value) associations currently contained.
- *
- * @return the number of (key,value) associations currently contained.
- */
- public int size() {
- return distinct;
- }
-
- /**
- * Trims the capacity of the receiver to be the receiver's current size. Releases any superfluous internal memory. An
- * application can use this operation to minimize the storage of the receiver. <p> This default implementation does
- * nothing. Override this method if necessary.
- */
- public void trimToSize() {
- }
-
- protected static boolean equalsMindTheNull(Object a, Object b) {
- if (a == null && b == null) {
- return true;
- }
- if (a == null || b == null) {
- return false;
- }
- return a.equals(b);
- }
-}

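The capacity choosers above are plain arithmetic around the invariant c * minLoadFactor <= size <= c * maxLoadFactor. A standalone sketch of the grow case with the default factors 0.2 and 0.5 (PrimeFinder is omitted here, so the final round-up to a prime is only noted in a comment):

    public class GrowCapacityDemo {
      // Mirrors the arithmetic of chooseGrowCapacity: target roughly
      // 4 * size / (3 * minLoad + maxLoad), but never below size + 1.
      static int rawGrowCapacity(int size, double minLoad, double maxLoad) {
        return Math.max(size + 1, (int) (4 * size / (3 * minLoad + maxLoad)));
      }

      public static void main(String[] args) {
        // 4 * 100 / (3 * 0.2 + 0.5) = 400 / 1.1, truncated to 363;
        // nextPrime would then round this up to 367.
        System.out.println(rawGrowCapacity(100, 0.2, 0.5));
      }
    }
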
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/set/HashUtils.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/set/HashUtils.java b/math/src/main/java/org/apache/mahout/math/set/HashUtils.java
deleted file mode 100644
index f5dfeb0..0000000
--- a/math/src/main/java/org/apache/mahout/math/set/HashUtils.java
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.set;
-
-/**
- * Computes hashes of primitive values. Providing these as statics allows the templated code
- * to compute hashes of sets.
- */
-public final class HashUtils {
-
- private HashUtils() {
- }
-
- public static int hash(byte x) {
- return x;
- }
-
- public static int hash(short x) {
- return x;
- }
-
- public static int hash(char x) {
- return x;
- }
-
- public static int hash(int x) {
- return x;
- }
-
- public static int hash(float x) {
- return Float.floatToIntBits(x) >>> 3 + Float.floatToIntBits((float) (Math.PI * x));
- }
-
- public static int hash(double x) {
- return hash(17 * Double.doubleToLongBits(x));
- }
-
- public static int hash(long x) {
- return (int) ((x * 11) >>> 32 ^ x);
- }
-}

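One subtlety in hash(float) above: Java's additive operator binds tighter than the shift, so the expression shifts the bits by (3 + Float.floatToIntBits(pi * x)) mod 32 rather than shifting by 3 and then adding. A self-contained snippet showing the two readings; the parenthesized form is presumably what was intended:

    public class FloatHashDemo {
      public static void main(String[] args) {
        float x = 1.0f;
        int bits = Float.floatToIntBits(x);
        int piBits = Float.floatToIntBits((float) (Math.PI * x));
        // As written in hash(float): the shift distance is (3 + piBits) mod 32.
        System.out.println(bits >>> 3 + piBits);
        // Shift by 3, then add -- the likely intent.
        System.out.println((bits >>> 3) + piBits);
      }
    }
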
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/set/OpenHashSet.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/set/OpenHashSet.java b/math/src/main/java/org/apache/mahout/math/set/OpenHashSet.java
deleted file mode 100644
index 285b5a5..0000000
--- a/math/src/main/java/org/apache/mahout/math/set/OpenHashSet.java
+++ /dev/null
@@ -1,548 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.mahout.math.set;
-
-import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Set;
-
-import org.apache.mahout.math.MurmurHash;
-import org.apache.mahout.math.function.ObjectProcedure;
-import org.apache.mahout.math.map.PrimeFinder;
-
-/**
- * Open hashing alternative to java.util.HashSet.
- **/
-public class OpenHashSet<T> extends AbstractSet implements Set<T> {
- protected static final byte FREE = 0;
- protected static final byte FULL = 1;
- protected static final byte REMOVED = 2;
- protected static final char NO_KEY_VALUE = 0;
-
- /** The hash table keys. */
- private Object[] table;
-
- /** The state of each hash table entry (FREE, FULL, REMOVED). */
- private byte[] state;
-
- /** The number of table entries in state==FREE. */
- private int freeEntries;
-
-
- /** Constructs an empty map with default capacity and default load factors. */
- public OpenHashSet() {
- this(DEFAULT_CAPACITY);
- }
-
- /**
- * Constructs an empty map with the specified initial capacity and default load factors.
- *
- * @param initialCapacity the initial capacity of the map.
- * @throws IllegalArgumentException if the initial capacity is less than zero.
- */
- public OpenHashSet(int initialCapacity) {
- this(initialCapacity, DEFAULT_MIN_LOAD_FACTOR, DEFAULT_MAX_LOAD_FACTOR);
- }
-
- /**
- * Constructs an empty map with the specified initial capacity and the specified minimum and maximum load factor.
- *
- * @param initialCapacity the initial capacity.
- * @param minLoadFactor the minimum load factor.
- * @param maxLoadFactor the maximum load factor.
- * @throws IllegalArgumentException if <tt>initialCapacity < 0 || (minLoadFactor < 0.0 || minLoadFactor >= 1.0) ||
- * (maxLoadFactor <= 0.0 || maxLoadFactor >= 1.0) || (minLoadFactor >=
- * maxLoadFactor)</tt>.
- */
- public OpenHashSet(int initialCapacity, double minLoadFactor, double maxLoadFactor) {
- setUp(initialCapacity, minLoadFactor, maxLoadFactor);
- }
-
- /** Removes all values associations from the receiver. Implicitly calls <tt>trimToSize()</tt>. */
- @Override
- public void clear() {
- Arrays.fill(this.state, 0, state.length - 1, FREE);
- distinct = 0;
- freeEntries = table.length; // delta
- trimToSize();
- }
-
- /**
- * Returns a deep copy of the receiver.
- *
- * @return a deep copy of the receiver.
- */
- @SuppressWarnings("unchecked")
- @Override
- public Object clone() {
- OpenHashSet<T> copy = (OpenHashSet<T>) super.clone();
- copy.table = copy.table.clone();
- copy.state = copy.state.clone();
- return copy;
- }
-
- /**
- * Returns <tt>true</tt> if the receiver contains the specified key.
- *
- * @return <tt>true</tt> if the receiver contains the specified key.
- */
- @Override
- @SuppressWarnings("unchecked")
- public boolean contains(Object key) {
- return indexOfKey((T)key) >= 0;
- }
-
- /**
- * Ensures that the receiver can hold at least the specified number of associations without needing to allocate new
- * internal memory. If necessary, allocates new internal memory and increases the capacity of the receiver. <p> This
- * method never need be called; it is for performance tuning only. Calling this method before <tt>add()</tt>ing a
- * large number of associations boosts performance, because the receiver will grow only once instead of potentially
- * many times and hash collisions get less probable.
- *
- * @param minCapacity the desired minimum capacity.
- */
- @Override
- public void ensureCapacity(int minCapacity) {
- if (table.length < minCapacity) {
- int newCapacity = nextPrime(minCapacity);
- rehash(newCapacity);
- }
- }
-
- /**
- * Applies a procedure to each key of the receiver, if any. Note: Iterates over the keys in no particular order.
- * Subclasses can define a particular order, for example, "sorted by key". All methods which <i>can</i> be expressed
- * in terms of this method (most methods can) <i>must guarantee</i> to use the <i>same</i> order defined by this
- * method, even if it is no particular order. This is necessary so that, for example, methods <tt>keys</tt> and
- * <tt>values</tt> will yield association pairs, not two uncorrelated lists.
- *
- * @param procedure the procedure to be applied. Stops iteration if the procedure returns <tt>false</tt>, otherwise
- * continues.
- * @return <tt>false</tt> if the procedure stopped before all keys were iterated over, <tt>true</tt> otherwise.
- */
- @SuppressWarnings("unchecked")
- public boolean forEachKey(ObjectProcedure<T> procedure) {
- for (int i = table.length; i-- > 0;) {
- if (state[i] == FULL) {
- if (!procedure.apply((T)table[i])) {
- return false;
- }
- }
- }
- return true;
- }
-
- /**
- * @param key the key to be added to the receiver.
- * @return the index where the key would need to be inserted, if it is not already contained. Returns -index-1 if the
- * key is already contained at slot index. Therefore, if the returned index < 0, then it is already contained
- * at slot -index-1. If the returned index >= 0, then it is NOT already contained and should be inserted at
- * slot index.
- */
- protected int indexOfInsertion(T key) {
- Object[] tab = table;
- byte[] stat = state;
- int length = tab.length;
-
- int hash = key.hashCode() & 0x7FFFFFFF;
- int i = hash % length;
- int decrement = hash % (length - 2); // double hashing, see http://www.eece.unm.edu/faculty/heileman/hash/node4.html
- //int decrement = (hash / length) % length;
- if (decrement == 0) {
- decrement = 1;
- }
-
- // stop if we find a removed or free slot, or if we find the key itself
- // do NOT skip over removed slots (yes, open addressing is like that...)
- while (stat[i] == FULL && tab[i] != key) {
- i -= decrement;
- //hashCollisions++;
- if (i < 0) {
- i += length;
- }
- }
-
- if (stat[i] == REMOVED) {
- // stop if we find a free slot, or if we find the key itself.
- // do skip over removed slots (yes, open addressing is like that...)
- // assertion: there is at least one FREE slot.
- int j = i;
- while (stat[i] != FREE && (stat[i] == REMOVED || tab[i] != key)) {
- i -= decrement;
- //hashCollisions++;
- if (i < 0) {
- i += length;
- }
- }
- if (stat[i] == FREE) {
- i = j;
- }
- }
-
-
- if (stat[i] == FULL) {
- // key already contained at slot i.
- // return a negative number identifying the slot.
- return -i - 1;
- }
- // not already contained, should be inserted at slot i.
- // return a number >= 0 identifying the slot.
- return i;
- }
-
- /**
- * @param key the key to be searched in the receiver.
- * @return the index where the key is contained in the receiver, returns -1 if the key was not found.
- */
- protected int indexOfKey(T key) {
- Object[] tab = table;
- byte[] stat = state;
- int length = tab.length;
-
- int hash = key.hashCode() & 0x7FFFFFFF;
- int i = hash % length;
- int decrement = hash % (length - 2); // double hashing, see http://www.eece.unm.edu/faculty/heileman/hash/node4.html
- //int decrement = (hash / length) % length;
- if (decrement == 0) {
- decrement = 1;
- }
-
- // stop if we find a free slot, or if we find the key itself.
- // do skip over removed slots (yes, open addressing is like that...)
- while (stat[i] != FREE && (stat[i] == REMOVED || (!key.equals(tab[i])))) {
- i -= decrement;
- //hashCollisions++;
- if (i < 0) {
- i += length;
- }
- }
-
- if (stat[i] == FREE) {
- return -1;
- } // not found
- return i; //found, return index where key is contained
- }
-
- /**
- * Fills all keys contained in the receiver into the specified list. Fills the list, starting at index 0. After this
- * call returns the specified list has a new size that equals <tt>this.size()</tt>.
- * This method can be used
- * to iterate over the keys of the receiver.
- *
- * @param list the list to be filled, can have any size.
- */
- @SuppressWarnings("unchecked")
- public void keys(List<T> list) {
- list.clear();
-
-
- Object [] tab = table;
- byte[] stat = state;
-
- for (int i = tab.length; i-- > 0;) {
- if (stat[i] == FULL) {
- list.add((T)tab[i]);
- }
- }
- }
-
- @SuppressWarnings("unchecked")
- @Override
- public boolean add(Object key) {
- int i = indexOfInsertion((T)key);
- if (i < 0) { //already contained
- return false;
- }
-
- if (this.distinct > this.highWaterMark) {
- int newCapacity = chooseGrowCapacity(this.distinct + 1, this.minLoadFactor, this.maxLoadFactor);
- rehash(newCapacity);
- return add(key);
- }
-
- this.table[i] = key;
- if (this.state[i] == FREE) {
- this.freeEntries--;
- }
- this.state[i] = FULL;
- this.distinct++;
-
- if (this.freeEntries < 1) { //delta
- int newCapacity = chooseGrowCapacity(this.distinct + 1, this.minLoadFactor, this.maxLoadFactor);
- rehash(newCapacity);
- return add(key);
- }
-
- return true;
- }
-
- /**
- * Rehashes the contents of the receiver into a new table with a smaller or larger capacity. This method is called
- * automatically when the number of keys in the receiver exceeds the high water mark or falls below the low water
- * mark.
- */
- @SuppressWarnings("unchecked")
- protected void rehash(int newCapacity) {
- int oldCapacity = table.length;
- //if (oldCapacity == newCapacity) return;
-
- Object[] oldTable = table;
- byte[] oldState = state;
-
- Object[] newTable = new Object[newCapacity];
- byte[] newState = new byte[newCapacity];
-
- this.lowWaterMark = chooseLowWaterMark(newCapacity, this.minLoadFactor);
- this.highWaterMark = chooseHighWaterMark(newCapacity, this.maxLoadFactor);
-
- this.table = newTable;
- this.state = newState;
- this.freeEntries = newCapacity - this.distinct; // delta
-
- for (int i = oldCapacity; i-- > 0;) {
- if (oldState[i] == FULL) {
- Object element = oldTable[i];
- int index = indexOfInsertion((T)element);
- newTable[index] = element;
- newState[index] = FULL;
- }
- }
- }
-
- /**
- * Removes the given key with its associated element from the receiver, if present.
- *
- * @param key the key to be removed from the receiver.
- * @return <tt>true</tt> if the receiver contained the specified key, <tt>false</tt> otherwise.
- */
- @SuppressWarnings("unchecked")
- @Override
- public boolean remove(Object key) {
- int i = indexOfKey((T)key);
- if (i < 0) {
- return false;
- } // key not contained
-
- this.state[i] = REMOVED;
- this.distinct--;
-
- if (this.distinct < this.lowWaterMark) {
- int newCapacity = chooseShrinkCapacity(this.distinct, this.minLoadFactor, this.maxLoadFactor);
- rehash(newCapacity);
- }
-
- return true;
- }
-
- /**
- * Initializes the receiver.
- *
- * @param initialCapacity the initial capacity of the receiver.
- * @param minLoadFactor the minLoadFactor of the receiver.
- * @param maxLoadFactor the maxLoadFactor of the receiver.
- * @throws IllegalArgumentException if <tt>initialCapacity < 0 || (minLoadFactor < 0.0 || minLoadFactor >= 1.0) ||
- * (maxLoadFactor <= 0.0 || maxLoadFactor >= 1.0) || (minLoadFactor >=
- * maxLoadFactor)</tt>.
- */
- @Override
- protected final void setUp(int initialCapacity, double minLoadFactor, double maxLoadFactor) {
- int capacity = initialCapacity;
- super.setUp(capacity, minLoadFactor, maxLoadFactor);
- capacity = nextPrime(capacity);
- if (capacity == 0) {
- capacity = 1;
- } // open addressing needs at least one FREE slot at any time.
-
- this.table = new Object[capacity];
- this.state = new byte[capacity];
-
- // memory will be exhausted long before this pathological case happens, anyway.
- this.minLoadFactor = minLoadFactor;
- if (capacity == PrimeFinder.LARGEST_PRIME) {
- this.maxLoadFactor = 1.0;
- } else {
- this.maxLoadFactor = maxLoadFactor;
- }
-
- this.distinct = 0;
- this.freeEntries = capacity; // delta
-
- // lowWaterMark will be established upon first expansion.
- // establishing it now (upon instance construction) would immediately make the table shrink upon first put(...).
- // After all, the idea of an "initialCapacity" implies violating lowWaterMarks when an object is young.
- // See ensureCapacity(...)
- this.lowWaterMark = 0;
- this.highWaterMark = chooseHighWaterMark(capacity, this.maxLoadFactor);
- }
-
- /**
- * Trims the capacity of the receiver to be the receiver's current size. Releases any superfluous internal memory. An
- * application can use this operation to minimize the storage of the receiver.
- */
- @Override
- public void trimToSize() {
- // * 1.2 because open addressing's performance exponentially degrades beyond that point
- // so that even rehashing the table can take very long
- int newCapacity = nextPrime((int) (1 + 1.2 * size()));
- if (table.length > newCapacity) {
- rehash(newCapacity);
- }
- }
-
- /**
- * Access for unit tests.
- * @param capacity
- * @param minLoadFactor
- * @param maxLoadFactor
- */
- void getInternalFactors(int[] capacity,
- double[] minLoadFactor,
- double[] maxLoadFactor) {
- capacity[0] = table.length;
- minLoadFactor[0] = this.minLoadFactor;
- maxLoadFactor[0] = this.maxLoadFactor;
- }
-
- @Override
- public boolean isEmpty() {
- return size() == 0;
- }
-
- /**
- * OpenHashSet instances are only equal to other OpenHashSet instances, not to
- * any other collection. In principle, we should check for and permit
- * equals on other Sets.
- */
- @Override
- @SuppressWarnings("unchecked")
- public boolean equals(Object obj) {
- if (obj == this) {
- return true;
- }
-
- if (!(obj instanceof OpenHashSet)) {
- return false;
- }
- final OpenHashSet<T> other = (OpenHashSet<T>) obj;
- if (other.size() != size()) {
- return false;
- }
-
- return forEachKey(new ObjectProcedure<T>() {
- @Override
- public boolean apply(T key) {
- return other.contains(key);
- }
- });
- }
-
- @Override
- public int hashCode() {
- ByteBuffer buf = ByteBuffer.allocate(size());
- for (int i = 0; i < table.length; i++) {
- Object v = table[i];
- if (state[i] == FULL) {
- buf.putInt(v.hashCode());
- }
- }
- return MurmurHash.hash(buf, this.getClass().getName().hashCode());
- }
-
- /**
- * Implement the standard Java Collections iterator. Note that 'remove' is silently
- * ineffectual here. This method is provided for convenience only.
- */
- @Override
- public Iterator<T> iterator() {
- List<T> keyList = new ArrayList<>();
- keys(keyList);
- return keyList.iterator();
- }
-
- @Override
- public Object[] toArray() {
- List<T> keyList = new ArrayList<>();
- keys(keyList);
- return keyList.toArray();
- }
-
- @Override
- public boolean addAll(Collection<? extends T> c) {
- boolean anyAdded = false;
- for (T o : c) {
- boolean added = add(o);
- anyAdded |= added;
- }
- return anyAdded;
- }
-
- @Override
- public boolean containsAll(Collection<?> c) {
- for (Object o : c) {
- if (!contains(o)) {
- return false;
- }
- }
- return true;
- }
-
- @Override
- public boolean removeAll(Collection<?> c) {
- boolean anyRemoved = false;
- for (Object o : c) {
- boolean removed = remove(o);
- anyRemoved |= removed;
- }
- return anyRemoved;
- }
-
- @Override
- public boolean retainAll(Collection<?> c) {
- final Collection<?> finalCollection = c;
- final boolean[] modified = new boolean[1];
- modified[0] = false;
- forEachKey(new ObjectProcedure<T>() {
- @Override
- public boolean apply(T element) {
- if (!finalCollection.contains(element)) {
- remove(element);
- modified[0] = true;
- }
- return true;
- }
- });
- return modified[0];
- }
-
- @Override
- public <T1> T1[] toArray(T1[] a) {
- return keys().toArray(a);
- }
-
- public List<T> keys() {
- List<T> keys = new ArrayList<>();
- keys(keys);
- return keys;
- }
-}

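OpenHashSet is a drop-in java.util.Set over open addressing with double hashing. Two caveats follow from the code above: the iterator walks a snapshot of the keys, so Iterator.remove() is a silent no-op, and hashCode() allocates size() bytes while writing four bytes per key, so it would overflow on any non-empty set. A usage sketch under the pre-removal classpath assumption:

    import java.util.Arrays;
    import org.apache.mahout.math.set.OpenHashSet;

    public class OpenHashSetDemo {
      public static void main(String[] args) {
        OpenHashSet<String> set = new OpenHashSet<>();
        set.addAll(Arrays.asList("alpha", "beta", "gamma", "beta"));
        System.out.println(set.size());            // 3 -- the duplicate collapses
        System.out.println(set.contains("beta"));  // true
        set.remove("alpha");
        for (String s : set) {                     // iterates a key snapshot
          System.out.println(s);
        }
      }
    }
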
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/solver/ConjugateGradientSolver.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/solver/ConjugateGradientSolver.java b/math/src/main/java/org/apache/mahout/math/solver/ConjugateGradientSolver.java
deleted file mode 100644
index 02bde9b..0000000
--- a/math/src/main/java/org/apache/mahout/math/solver/ConjugateGradientSolver.java
+++ /dev/null
@@ -1,213 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.solver;
-
-import org.apache.mahout.math.CardinalityException;
-import org.apache.mahout.math.DenseVector;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.VectorIterable;
-import org.apache.mahout.math.function.Functions;
-import org.apache.mahout.math.function.PlusMult;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * <p>Implementation of a conjugate gradient iterative solver for linear systems. Implements both
- * standard conjugate gradient and pre-conditioned conjugate gradient.
- *
- * <p>Conjugate gradient requires the matrix A in the linear system Ax = b to be symmetric and positive
- * definite. For convenience, this implementation could be extended relatively easily to handle the
- * case where the input matrix is non-symmetric, in which case the system A'Ax = b would be solved.
- * Because this requires only one pass through the matrix A, it is faster than explicitly computing A'A,
- * then passing the results to the solver.
- *
- * <p>For inputs that may be ill conditioned (often the case for highly sparse input), this solver
- * also accepts a parameter, lambda, which adds a scaled identity to the matrix A, solving the system
- * (A + lambda*I)x = b. This obviously changes the solution, but it will guarantee solvability. The
- * ridge regression approach to linear regression is a common use of this feature.
- *
- * <p>If only an approximate solution is required, the maximum number of iterations or the error threshold
- * may be specified to end the algorithm early at the expense of accuracy. When the matrix A is ill conditioned,
- * it may sometimes be necessary to increase the maximum number of iterations above the default of A.numCols()
- * due to numerical issues.
- *
- * <p>By default the solver will run a.numCols() iterations or until the residual falls below 1E-9.
- *
- * <p>For more information on the conjugate gradient algorithm, see Golub & van Loan, "Matrix Computations",
- * sections 10.2 and 10.3 or the <a href="http://en.wikipedia.org/wiki/Conjugate_gradient">conjugate gradient
- * wikipedia article</a>.
- */
-
-public class ConjugateGradientSolver {
-
- public static final double DEFAULT_MAX_ERROR = 1.0e-9;
-
- private static final Logger log = LoggerFactory.getLogger(ConjugateGradientSolver.class);
- private static final PlusMult PLUS_MULT = new PlusMult(1.0);
-
- private int iterations;
- private double residualNormSquared;
-
- public ConjugateGradientSolver() {
- this.iterations = 0;
- this.residualNormSquared = Double.NaN;
- }
-
- /**
- * Solves the system Ax = b with default termination criteria. A must be symmetric, square, and positive definite.
- * Only the squareness of a is checked, since testing for symmetry and positive definiteness is too expensive. If
- * an invalid matrix is specified, then the algorithm may not yield a valid result.
- *
- * @param a The linear operator A.
- * @param b The vector b.
- * @return The result x of solving the system.
- * @throws IllegalArgumentException if a is not square or if the size of b is not equal to the number of columns of a.
- *
- */
- public Vector solve(VectorIterable a, Vector b) {
- return solve(a, b, null, b.size() + 2, DEFAULT_MAX_ERROR);
- }
-
- /**
- * Solves the system Ax = b with default termination criteria using the specified preconditioner. A must be
- * symmetric, square, and positive definite. Only the squareness of a is checked, since testing for symmetry
- * and positive definiteness is too expensive. If an invalid matrix is specified, then the algorithm may not
- * yield a valid result.
- *
- * @param a The linear operator A.
- * @param b The vector b.
- * @param precond A preconditioner to use on A during the solution process.
- * @return The result x of solving the system.
- * @throws IllegalArgumentException if a is not square or if the size of b is not equal to the number of columns of a.
- *
- */
- public Vector solve(VectorIterable a, Vector b, Preconditioner precond) {
- return solve(a, b, precond, b.size() + 2, DEFAULT_MAX_ERROR);
- }
-
-
- /**
- * Solves the system Ax = b, where A is a linear operator and b is a vector. Uses the specified preconditioner
- * to improve numeric stability and possibly speed convergence. This version of solve() allows control over the
- * termination and iteration parameters.
- *
- * @param a The matrix A.
- * @param b The vector b.
- * @param preconditioner The preconditioner to apply.
- * @param maxIterations The maximum number of iterations to run.
- * @param maxError The maximum amount of residual error to tolerate. The algorithm will run until the residual falls
- * below this value or until maxIterations are completed.
- * @return The result x of solving the system.
- * @throws IllegalArgumentException if the matrix is not square, if the size of b is not equal to the number of
- * columns of A, if maxError is less than zero, or if maxIterations is not positive.
- */
-
- public Vector solve(VectorIterable a,
- Vector b,
- Preconditioner preconditioner,
- int maxIterations,
- double maxError) {
-
- if (a.numRows() != a.numCols()) {
- throw new IllegalArgumentException("Matrix must be square, symmetric and positive definite.");
- }
-
- if (a.numCols() != b.size()) {
- throw new CardinalityException(a.numCols(), b.size());
- }
-
- if (maxIterations <= 0) {
- throw new IllegalArgumentException("Max iterations must be positive.");
- }
-
- if (maxError < 0.0) {
- throw new IllegalArgumentException("Max error must be non-negative.");
- }
-
- Vector x = new DenseVector(b.size());
-
- iterations = 0;
- Vector residual = b.minus(a.times(x));
- residualNormSquared = residual.dot(residual);
-
- log.info("Conjugate gradient initial residual norm = {}", Math.sqrt(residualNormSquared));
- double previousConditionedNormSqr = 0.0;
- Vector updateDirection = null;
- while (Math.sqrt(residualNormSquared) > maxError && iterations < maxIterations) {
- Vector conditionedResidual;
- double conditionedNormSqr;
- if (preconditioner == null) {
- conditionedResidual = residual;
- conditionedNormSqr = residualNormSquared;
- } else {
- conditionedResidual = preconditioner.precondition(residual);
- conditionedNormSqr = residual.dot(conditionedResidual);
- }
-
- ++iterations;
-
- if (iterations == 1) {
- updateDirection = new DenseVector(conditionedResidual);
- } else {
- double beta = conditionedNormSqr / previousConditionedNormSqr;
-
- // updateDirection = residual + beta * updateDirection
- updateDirection.assign(Functions.MULT, beta);
- updateDirection.assign(conditionedResidual, Functions.PLUS);
- }
-
- Vector aTimesUpdate = a.times(updateDirection);
-
- double alpha = conditionedNormSqr / updateDirection.dot(aTimesUpdate);
-
- // x = x + alpha * updateDirection
- PLUS_MULT.setMultiplicator(alpha);
- x.assign(updateDirection, PLUS_MULT);
-
- // residual = residual - alpha * A * updateDirection
- PLUS_MULT.setMultiplicator(-alpha);
- residual.assign(aTimesUpdate, PLUS_MULT);
-
- previousConditionedNormSqr = conditionedNormSqr;
- residualNormSquared = residual.dot(residual);
-
- log.info("Conjugate gradient iteration {} residual norm = {}", iterations, Math.sqrt(residualNormSquared));
- }
- return x;
- }
-
- /**
- * Returns the number of iterations run once the solver is complete.
- *
- * @return The number of iterations run.
- */
- public int getIterations() {
- return iterations;
- }
-
- /**
- * Returns the norm of the residual at the completion of the solver. Usually this should be close to zero except in
- * the case of a non positive definite matrix A, which results in an unsolvable system, or for ill conditioned A, in
- * which case more iterations than the default may be needed.
- *
- * @return The norm of the residual in the solution.
- */
- public double getResidualNorm() {
- return Math.sqrt(residualNormSquared);
- }
-}

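A usage sketch for the solver on a small symmetric positive definite system; the solve() signatures are as in the file above, while the DenseMatrix double[][] constructor is assumed from mahout-math:

    import org.apache.mahout.math.DenseMatrix;
    import org.apache.mahout.math.DenseVector;
    import org.apache.mahout.math.Vector;
    import org.apache.mahout.math.solver.ConjugateGradientSolver;

    public class CgDemo {
      public static void main(String[] args) {
        // SPD system:  [4 1] x = [1]
        //              [1 3]     [2]   => x = (1/11, 7/11)
        DenseMatrix a = new DenseMatrix(new double[][] {{4, 1}, {1, 3}});
        Vector b = new DenseVector(new double[] {1, 2});

        ConjugateGradientSolver solver = new ConjugateGradientSolver();
        Vector x = solver.solve(a, b);  // defaults: b.size() + 2 iterations, 1e-9 residual

        System.out.println("x = " + x);
        System.out.println("iterations = " + solver.getIterations()
            + ", residual = " + solver.getResidualNorm());
      }
    }
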
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/solver/EigenDecomposition.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/solver/EigenDecomposition.java b/math/src/main/java/org/apache/mahout/math/solver/EigenDecomposition.java
deleted file mode 100644
index 871ba44..0000000
--- a/math/src/main/java/org/apache/mahout/math/solver/EigenDecomposition.java
+++ /dev/null
@@ -1,892 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Adapted from the public domain Jama code.
- */
-
-package org.apache.mahout.math.solver;
-
-import org.apache.mahout.math.DenseMatrix;
-import org.apache.mahout.math.DenseVector;
-import org.apache.mahout.math.Matrix;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.function.Functions;
-
-/**
- * Eigenvalues and eigenvectors of a real matrix.
- * <p/>
- * If A is symmetric, then A = V*D*V' where the eigenvalue matrix D is diagonal and the eigenvector
- * matrix V is orthogonal. I.e. A = V.times(D.times(V.transpose())) and V.times(V.transpose())
- * equals the identity matrix.
- * <p/>
- * If A is not symmetric, then the eigenvalue matrix D is block diagonal with the real eigenvalues
- * in 1-by-1 blocks and any complex eigenvalues, lambda + i*mu, in 2-by-2 blocks, [lambda, mu; -mu,
- * lambda]. The columns of V represent the eigenvectors in the sense that A*V = V*D, i.e.
- * A.times(V) equals V.times(D). The matrix V may be badly conditioned, or even singular, so the
- * validity of the equation A = V*D*inverse(V) depends upon V.cond().
- */
-public class EigenDecomposition {
-
- /** Row and column dimension (square matrix). */
- private final int n;
- /** Arrays for internal storage of eigenvalues. */
- private final Vector d;
- private final Vector e;
- /** Array for internal storage of eigenvectors. */
- private final Matrix v;
-
- public EigenDecomposition(Matrix x) {
- this(x, isSymmetric(x));
- }
-
- public EigenDecomposition(Matrix x, boolean isSymmetric) {
- n = x.columnSize();
- d = new DenseVector(n);
- e = new DenseVector(n);
- v = new DenseMatrix(n, n);
-
- if (isSymmetric) {
- v.assign(x);
-
- // Tridiagonalize.
- tred2();
-
- // Diagonalize.
- tql2();
-
- } else {
- // Reduce to Hessenberg form.
- // Reduce Hessenberg to real Schur form.
- hqr2(orthes(x));
- }
- }
-
- /**
- * Return the eigenvector matrix
- *
- * @return V
- */
- public Matrix getV() {
- return v.like().assign(v);
- }
-
- /**
- * Return the real parts of the eigenvalues
- */
- public Vector getRealEigenvalues() {
- return d;
- }
-
- /**
- * Return the imaginary parts of the eigenvalues
- */
- public Vector getImagEigenvalues() {
- return e;
- }
-
- /**
- * Return the block diagonal eigenvalue matrix
- *
- * @return D
- */
- public Matrix getD() {
- Matrix x = new DenseMatrix(n, n);
- x.assign(0);
- x.viewDiagonal().assign(d);
- for (int i = 0; i < n; i++) {
- double v = e.getQuick(i);
- if (v > 0) {
- x.setQuick(i, i + 1, v);
- } else if (v < 0) {
- x.setQuick(i, i - 1, v);
- }
- }
- return x;
- }
-
- // Symmetric Householder reduction to tridiagonal form.
- private void tred2() {
- // This is derived from the Algol procedures tred2 by
- // Bowdler, Martin, Reinsch, and Wilkinson, Handbook for
- // Auto. Comp., Vol.ii-Linear Algebra, and the corresponding
- // Fortran subroutine in EISPACK.
-
- d.assign(v.viewColumn(n - 1));
-
- // Householder reduction to tridiagonal form.
-
- for (int i = n - 1; i > 0; i--) {
-
- // Scale to avoid under/overflow.
-
- double scale = d.viewPart(0, i).norm(1);
- double h = 0.0;
-
-
- if (scale == 0.0) {
- e.setQuick(i, d.getQuick(i - 1));
- for (int j = 0; j < i; j++) {
- d.setQuick(j, v.getQuick(i - 1, j));
- v.setQuick(i, j, 0.0);
- v.setQuick(j, i, 0.0);
- }
- } else {
-
- // Generate Householder vector.
-
- for (int k = 0; k < i; k++) {
- d.setQuick(k, d.getQuick(k) / scale);
- h += d.getQuick(k) * d.getQuick(k);
- }
- double f = d.getQuick(i - 1);
- double g = Math.sqrt(h);
- if (f > 0) {
- g = -g;
- }
- e.setQuick(i, scale * g);
- h -= f * g;
- d.setQuick(i - 1, f - g);
- for (int j = 0; j < i; j++) {
- e.setQuick(j, 0.0);
- }
-
- // Apply similarity transformation to remaining columns.
-
- for (int j = 0; j < i; j++) {
- f = d.getQuick(j);
- v.setQuick(j, i, f);
- g = e.getQuick(j) + v.getQuick(j, j) * f;
- for (int k = j + 1; k <= i - 1; k++) {
- g += v.getQuick(k, j) * d.getQuick(k);
- e.setQuick(k, e.getQuick(k) + v.getQuick(k, j) * f);
- }
- e.setQuick(j, g);
- }
- f = 0.0;
- for (int j = 0; j < i; j++) {
- e.setQuick(j, e.getQuick(j) / h);
- f += e.getQuick(j) * d.getQuick(j);
- }
- double hh = f / (h + h);
- for (int j = 0; j < i; j++) {
- e.setQuick(j, e.getQuick(j) - hh * d.getQuick(j));
- }
- for (int j = 0; j < i; j++) {
- f = d.getQuick(j);
- g = e.getQuick(j);
- for (int k = j; k <= i - 1; k++) {
- v.setQuick(k, j, v.getQuick(k, j) - (f * e.getQuick(k) + g * d.getQuick(k)));
- }
- d.setQuick(j, v.getQuick(i - 1, j));
- v.setQuick(i, j, 0.0);
- }
- }
- d.setQuick(i, h);
- }
-
- // Accumulate transformations.
-
- for (int i = 0; i < n - 1; i++) {
- v.setQuick(n - 1, i, v.getQuick(i, i));
- v.setQuick(i, i, 1.0);
- double h = d.getQuick(i + 1);
- if (h != 0.0) {
- for (int k = 0; k <= i; k++) {
- d.setQuick(k, v.getQuick(k, i + 1) / h);
- }
- for (int j = 0; j <= i; j++) {
- double g = 0.0;
- for (int k = 0; k <= i; k++) {
- g += v.getQuick(k, i + 1) * v.getQuick(k, j);
- }
- for (int k = 0; k <= i; k++) {
- v.setQuick(k, j, v.getQuick(k, j) - g * d.getQuick(k));
- }
- }
- }
- for (int k = 0; k <= i; k++) {
- v.setQuick(k, i + 1, 0.0);
- }
- }
- d.assign(v.viewRow(n - 1));
- v.viewRow(n - 1).assign(0);
- v.setQuick(n - 1, n - 1, 1.0);
- e.setQuick(0, 0.0);
- }
-
- // Symmetric tridiagonal QL algorithm.
- private void tql2() {
-
- // This is derived from the Algol procedures tql2, by
- // Bowdler, Martin, Reinsch, and Wilkinson, Handbook for
- // Auto. Comp., Vol.ii-Linear Algebra, and the corresponding
- // Fortran subroutine in EISPACK.
-
- e.viewPart(0, n - 1).assign(e.viewPart(1, n - 1));
- e.setQuick(n - 1, 0.0);
-
- double f = 0.0;
- double tst1 = 0.0;
- double eps = Math.pow(2.0, -52.0);
- for (int l = 0; l < n; l++) {
-
- // Find small subdiagonal element
-
- tst1 = Math.max(tst1, Math.abs(d.getQuick(l)) + Math.abs(e.getQuick(l)));
- int m = l;
- while (m < n) {
- if (Math.abs(e.getQuick(m)) <= eps * tst1) {
- break;
- }
- m++;
- }
-
- // If m == l, d.getQuick(l) is an eigenvalue,
- // otherwise, iterate.
-
- if (m > l) {
- do {
- // Compute implicit shift
-
- double g = d.getQuick(l);
- double p = (d.getQuick(l + 1) - g) / (2.0 * e.getQuick(l));
- double r = Math.hypot(p, 1.0);
- if (p < 0) {
- r = -r;
- }
- d.setQuick(l, e.getQuick(l) / (p + r));
- d.setQuick(l + 1, e.getQuick(l) * (p + r));
- double dl1 = d.getQuick(l + 1);
- double h = g - d.getQuick(l);
- for (int i = l + 2; i < n; i++) {
- d.setQuick(i, d.getQuick(i) - h);
- }
- f += h;
-
- // Implicit QL transformation.
-
- p = d.getQuick(m);
- double c = 1.0;
- double c2 = c;
- double c3 = c;
- double el1 = e.getQuick(l + 1);
- double s = 0.0;
- double s2 = 0.0;
- for (int i = m - 1; i >= l; i--) {
- c3 = c2;
- c2 = c;
- s2 = s;
- g = c * e.getQuick(i);
- h = c * p;
- r = Math.hypot(p, e.getQuick(i));
- e.setQuick(i + 1, s * r);
- s = e.getQuick(i) / r;
- c = p / r;
- p = c * d.getQuick(i) - s * g;
- d.setQuick(i + 1, h + s * (c * g + s * d.getQuick(i)));
-
- // Accumulate transformation.
-
- for (int k = 0; k < n; k++) {
- h = v.getQuick(k, i + 1);
- v.setQuick(k, i + 1, s * v.getQuick(k, i) + c * h);
- v.setQuick(k, i, c * v.getQuick(k, i) - s * h);
- }
- }
- p = -s * s2 * c3 * el1 * e.getQuick(l) / dl1;
- e.setQuick(l, s * p);
- d.setQuick(l, c * p);
-
- // Check for convergence.
-
- } while (Math.abs(e.getQuick(l)) > eps * tst1);
- }
- d.setQuick(l, d.getQuick(l) + f);
- e.setQuick(l, 0.0);
- }
-
- // Sort eigenvalues and corresponding vectors.
-
- for (int i = 0; i < n - 1; i++) {
- int k = i;
- double p = d.getQuick(i);
- for (int j = i + 1; j < n; j++) {
- if (d.getQuick(j) > p) {
- k = j;
- p = d.getQuick(j);
- }
- }
- if (k != i) {
- d.setQuick(k, d.getQuick(i));
- d.setQuick(i, p);
- for (int j = 0; j < n; j++) {
- p = v.getQuick(j, i);
- v.setQuick(j, i, v.getQuick(j, k));
- v.setQuick(j, k, p);
- }
- }
- }
- }
-
- // Nonsymmetric reduction to Hessenberg form.
- private Matrix orthes(Matrix x) {
- // Working storage for nonsymmetric algorithm.
- Vector ort = new DenseVector(n);
- Matrix hessenBerg = new DenseMatrix(n, n).assign(x);
-
- // This is derived from the Algol procedures orthes and ortran,
- // by Martin and Wilkinson, Handbook for Auto. Comp.,
- // Vol.ii-Linear Algebra, and the corresponding
- // Fortran subroutines in EISPACK.
-
- int low = 0;
- int high = n - 1;
-
- for (int m = low + 1; m <= high - 1; m++) {
-
- // Scale column.
-
- Vector hColumn = hessenBerg.viewColumn(m - 1).viewPart(m, high - m + 1);
- double scale = hColumn.norm(1);
-
- if (scale != 0.0) {
- // Compute Householder transformation.
-
- ort.viewPart(m, high - m + 1).assign(hColumn, Functions.plusMult(1 / scale));
- double h = ort.viewPart(m, high - m + 1).getLengthSquared();
-
- double g = Math.sqrt(h);
- if (ort.getQuick(m) > 0) {
- g = -g;
- }
- h -= ort.getQuick(m) * g;
- ort.setQuick(m, ort.getQuick(m) - g);
-
- // Apply Householder similarity transformation
- // H = (I-u*u'/h)*H*(I-u*u')/h)
-
- Vector ortPiece = ort.viewPart(m, high - m + 1);
- for (int j = m; j < n; j++) {
- double f = ortPiece.dot(hessenBerg.viewColumn(j).viewPart(m, high - m + 1)) / h;
- hessenBerg.viewColumn(j).viewPart(m, high - m + 1).assign(ortPiece, Functions.plusMult(-f));
- }
-
- for (int i = 0; i <= high; i++) {
- double f = ortPiece.dot(hessenBerg.viewRow(i).viewPart(m, high - m + 1)) / h;
- hessenBerg.viewRow(i).viewPart(m, high - m + 1).assign(ortPiece, Functions.plusMult(-f));
- }
- ort.setQuick(m, scale * ort.getQuick(m));
- hessenBerg.setQuick(m, m - 1, scale * g);
- }
- }
-
- // Accumulate transformations (Algol's ortran).
-
- v.assign(0);
- v.viewDiagonal().assign(1);
-
- for (int m = high - 1; m >= low + 1; m--) {
- if (hessenBerg.getQuick(m, m - 1) != 0.0) {
- ort.viewPart(m + 1, high - m).assign(hessenBerg.viewColumn(m - 1).viewPart(m + 1, high - m));
- for (int j = m; j <= high; j++) {
- double g = ort.viewPart(m, high - m + 1).dot(v.viewColumn(j).viewPart(m, high - m + 1));
- // Double division avoids possible underflow
- g = g / ort.getQuick(m) / hessenBerg.getQuick(m, m - 1);
- v.viewColumn(j).viewPart(m, high - m + 1).assign(ort.viewPart(m, high - m + 1), Functions.plusMult(g));
- }
- }
- }
- return hessenBerg;
- }
-
-
- // Complex scalar division.
- private double cdivr;
- private double cdivi;
-
- private void cdiv(double xr, double xi, double yr, double yi) {
- double r;
- double d;
- if (Math.abs(yr) > Math.abs(yi)) {
- r = yi / yr;
- d = yr + r * yi;
- cdivr = (xr + r * xi) / d;
- cdivi = (xi - r * xr) / d;
- } else {
- r = yr / yi;
- d = yi + r * yr;
- cdivr = (r * xr + xi) / d;
- cdivi = (r * xi - xr) / d;
- }
- }
-
-
- // Nonsymmetric reduction from Hessenberg to real Schur form.
-
- private void hqr2(Matrix h) {
-
- // This is derived from the Algol procedure hqr2,
- // by Martin and Wilkinson, Handbook for Auto. Comp.,
- // Vol.ii-Linear Algebra, and the corresponding
- // Fortran subroutine in EISPACK.
-
- // Initialize
-
- int nn = this.n;
- int n = nn - 1;
- int low = 0;
- int high = nn - 1;
- double eps = Math.pow(2.0, -52.0);
- double exshift = 0.0;
- double p = 0;
- double q = 0;
- double r = 0;
- double s = 0;
- double z = 0;
- double w;
- double x;
- double y;
-
- // Store roots isolated by balanc and compute matrix norm
-
- double norm = h.aggregate(Functions.PLUS, Functions.ABS);
-
- // Outer loop over eigenvalue index
-
- int iter = 0;
- while (n >= low) {
-
- // Look for single small sub-diagonal element
-
- int l = n;
- while (l > low) {
- s = Math.abs(h.getQuick(l - 1, l - 1)) + Math.abs(h.getQuick(l, l));
- if (s == 0.0) {
- s = norm;
- }
- if (Math.abs(h.getQuick(l, l - 1)) < eps * s) {
- break;
- }
- l--;
- }
-
- // Check for convergence
-
- if (l == n) {
- // One root found
- h.setQuick(n, n, h.getQuick(n, n) + exshift);
- d.setQuick(n, h.getQuick(n, n));
- e.setQuick(n, 0.0);
- n--;
- iter = 0;
-
-
- } else if (l == n - 1) {
- // Two roots found
- w = h.getQuick(n, n - 1) * h.getQuick(n - 1, n);
- p = (h.getQuick(n - 1, n - 1) - h.getQuick(n, n)) / 2.0;
- q = p * p + w;
- z = Math.sqrt(Math.abs(q));
- h.setQuick(n, n, h.getQuick(n, n) + exshift);
- h.setQuick(n - 1, n - 1, h.getQuick(n - 1, n - 1) + exshift);
- x = h.getQuick(n, n);
-
- // Real pair
- if (q >= 0) {
- if (p >= 0) {
- z = p + z;
- } else {
- z = p - z;
- }
- d.setQuick(n - 1, x + z);
- d.setQuick(n, d.getQuick(n - 1));
- if (z != 0.0) {
- d.setQuick(n, x - w / z);
- }
- e.setQuick(n - 1, 0.0);
- e.setQuick(n, 0.0);
- x = h.getQuick(n, n - 1);
- s = Math.abs(x) + Math.abs(z);
- p = x / s;
- q = z / s;
- r = Math.sqrt(p * p + q * q);
- p /= r;
- q /= r;
-
- // Row modification
-
- for (int j = n - 1; j < nn; j++) {
- z = h.getQuick(n - 1, j);
- h.setQuick(n - 1, j, q * z + p * h.getQuick(n, j));
- h.setQuick(n, j, q * h.getQuick(n, j) - p * z);
- }
-
- // Column modification
-
- for (int i = 0; i <= n; i++) {
- z = h.getQuick(i, n - 1);
- h.setQuick(i, n - 1, q * z + p * h.getQuick(i, n));
- h.setQuick(i, n, q * h.getQuick(i, n) - p * z);
- }
-
- // Accumulate transformations
-
- for (int i = low; i <= high; i++) {
- z = v.getQuick(i, n - 1);
- v.setQuick(i, n - 1, q * z + p * v.getQuick(i, n));
- v.setQuick(i, n, q * v.getQuick(i, n) - p * z);
- }
-
- // Complex pair
-
- } else {
- d.setQuick(n - 1, x + p);
- d.setQuick(n, x + p);
- e.setQuick(n - 1, z);
- e.setQuick(n, -z);
- }
- n -= 2;
- iter = 0;
-
- // No convergence yet
-
- } else {
-
- // Form shift
-
- x = h.getQuick(n, n);
- y = 0.0;
- w = 0.0;
- if (l < n) {
- y = h.getQuick(n - 1, n - 1);
- w = h.getQuick(n, n - 1) * h.getQuick(n - 1, n);
- }
-
- // Wilkinson's original ad hoc shift
-
- if (iter == 10) {
- exshift += x;
- for (int i = low; i <= n; i++) {
- h.setQuick(i, i, x);
- }
- s = Math.abs(h.getQuick(n, n - 1)) + Math.abs(h.getQuick(n - 1, n - 2));
- x = y = 0.75 * s;
- w = -0.4375 * s * s;
- }
-
- // MATLAB's new ad hoc shift
-
- if (iter == 30) {
- s = (y - x) / 2.0;
- s = s * s + w;
- if (s > 0) {
- s = Math.sqrt(s);
- if (y < x) {
- s = -s;
- }
- s = x - w / ((y - x) / 2.0 + s);
- for (int i = low; i <= n; i++) {
- h.setQuick(i, i, h.getQuick(i, i) - s);
- }
- exshift += s;
- x = y = w = 0.964;
- }
- }
-
- iter++; // (Could check iteration count here.)
-
- // Look for two consecutive small sub-diagonal elements
-
- int m = n - 2;
- while (m >= l) {
- z = h.getQuick(m, m);
- r = x - z;
- s = y - z;
- p = (r * s - w) / h.getQuick(m + 1, m) + h.getQuick(m, m + 1);
- q = h.getQuick(m + 1, m + 1) - z - r - s;
- r = h.getQuick(m + 2, m + 1);
- s = Math.abs(p) + Math.abs(q) + Math.abs(r);
- p /= s;
- q /= s;
- r /= s;
- if (m == l) {
- break;
- }
- double hmag = Math.abs(h.getQuick(m - 1, m - 1)) + Math.abs(h.getQuick(m + 1, m + 1));
- double threshold = eps * Math.abs(p) * (Math.abs(z) + hmag);
- if (Math.abs(h.getQuick(m, m - 1)) * (Math.abs(q) + Math.abs(r)) < threshold) {
- break;
- }
- m--;
- }
-
- for (int i = m + 2; i <= n; i++) {
- h.setQuick(i, i - 2, 0.0);
- if (i > m + 2) {
- h.setQuick(i, i - 3, 0.0);
- }
- }
-
- // Double QR step involving rows l:n and columns m:n
-
- for (int k = m; k <= n - 1; k++) {
- boolean notlast = k != n - 1;
- if (k != m) {
- p = h.getQuick(k, k - 1);
- q = h.getQuick(k + 1, k - 1);
- r = notlast ? h.getQuick(k + 2, k - 1) : 0.0;
- x = Math.abs(p) + Math.abs(q) + Math.abs(r);
- if (x != 0.0) {
- p /= x;
- q /= x;
- r /= x;
- }
- }
- if (x == 0.0) {
- break;
- }
- s = Math.sqrt(p * p + q * q + r * r);
- if (p < 0) {
- s = -s;
- }
- if (s != 0) {
- if (k != m) {
- h.setQuick(k, k - 1, -s * x);
- } else if (l != m) {
- h.setQuick(k, k - 1, -h.getQuick(k, k - 1));
- }
- p += s;
- x = p / s;
- y = q / s;
- z = r / s;
- q /= p;
- r /= p;
-
- // Row modification
-
- for (int j = k; j < nn; j++) {
- p = h.getQuick(k, j) + q * h.getQuick(k + 1, j);
- if (notlast) {
- p += r * h.getQuick(k + 2, j);
- h.setQuick(k + 2, j, h.getQuick(k + 2, j) - p * z);
- }
- h.setQuick(k, j, h.getQuick(k, j) - p * x);
- h.setQuick(k + 1, j, h.getQuick(k + 1, j) - p * y);
- }
-
- // Column modification
-
- for (int i = 0; i <= Math.min(n, k + 3); i++) {
- p = x * h.getQuick(i, k) + y * h.getQuick(i, k + 1);
- if (notlast) {
- p += z * h.getQuick(i, k + 2);
- h.setQuick(i, k + 2, h.getQuick(i, k + 2) - p * r);
- }
- h.setQuick(i, k, h.getQuick(i, k) - p);
- h.setQuick(i, k + 1, h.getQuick(i, k + 1) - p * q);
- }
-
- // Accumulate transformations
-
- for (int i = low; i <= high; i++) {
- p = x * v.getQuick(i, k) + y * v.getQuick(i, k + 1);
- if (notlast) {
- p += z * v.getQuick(i, k + 2);
- v.setQuick(i, k + 2, v.getQuick(i, k + 2) - p * r);
- }
- v.setQuick(i, k, v.getQuick(i, k) - p);
- v.setQuick(i, k + 1, v.getQuick(i, k + 1) - p * q);
- }
- } // (s != 0)
- } // k loop
- } // check convergence
- } // while (n >= low)
-
- // Backsubstitute to find vectors of upper triangular form
-
- if (norm == 0.0) {
- return;
- }
-
- for (n = nn - 1; n >= 0; n--) {
- p = d.getQuick(n);
- q = e.getQuick(n);
-
- // Real vector
-
- double t;
- if (q == 0) {
- int l = n;
- h.setQuick(n, n, 1.0);
- for (int i = n - 1; i >= 0; i--) {
- w = h.getQuick(i, i) - p;
- r = 0.0;
- for (int j = l; j <= n; j++) {
- r += h.getQuick(i, j) * h.getQuick(j, n);
- }
- if (e.getQuick(i) < 0.0) {
- z = w;
- s = r;
- } else {
- l = i;
- if (e.getQuick(i) == 0.0) {
- if (w == 0.0) {
- h.setQuick(i, n, -r / (eps * norm));
- } else {
- h.setQuick(i, n, -r / w);
- }
-
- // Solve real equations
-
- } else {
- x = h.getQuick(i, i + 1);
- y = h.getQuick(i + 1, i);
- q = (d.getQuick(i) - p) * (d.getQuick(i) - p) + e.getQuick(i) * e.getQuick(i);
- t = (x * s - z * r) / q;
- h.setQuick(i, n, t);
- if (Math.abs(x) > Math.abs(z)) {
- h.setQuick(i + 1, n, (-r - w * t) / x);
- } else {
- h.setQuick(i + 1, n, (-s - y * t) / z);
- }
- }
-
- // Overflow control
-
- t = Math.abs(h.getQuick(i, n));
- if (eps * t * t > 1) {
- for (int j = i; j <= n; j++) {
- h.setQuick(j, n, h.getQuick(j, n) / t);
- }
- }
- }
- }
-
- // Complex vector
-
- } else if (q < 0) {
- int l = n - 1;
-
- // Last vector component imaginary so matrix is triangular
-
- if (Math.abs(h.getQuick(n, n - 1)) > Math.abs(h.getQuick(n - 1, n))) {
- h.setQuick(n - 1, n - 1, q / h.getQuick(n, n - 1));
- h.setQuick(n - 1, n, -(h.getQuick(n, n) - p) / h.getQuick(n, n - 1));
- } else {
- cdiv(0.0, -h.getQuick(n - 1, n), h.getQuick(n - 1, n - 1) - p, q);
- h.setQuick(n - 1, n - 1, cdivr);
- h.setQuick(n - 1, n, cdivi);
- }
- h.setQuick(n, n - 1, 0.0);
- h.setQuick(n, n, 1.0);
- for (int i = n - 2; i >= 0; i--) {
- double ra = 0.0;
- double sa = 0.0;
- for (int j = l; j <= n; j++) {
- ra += h.getQuick(i, j) * h.getQuick(j, n - 1);
- sa += h.getQuick(i, j) * h.getQuick(j, n);
- }
- w = h.getQuick(i, i) - p;
-
- if (e.getQuick(i) < 0.0) {
- z = w;
- r = ra;
- s = sa;
- } else {
- l = i;
- if (e.getQuick(i) == 0) {
- cdiv(-ra, -sa, w, q);
- h.setQuick(i, n - 1, cdivr);
- h.setQuick(i, n, cdivi);
- } else {
-
- // Solve complex equations
-
- x = h.getQuick(i, i + 1);
- y = h.getQuick(i + 1, i);
- double vr = (d.getQuick(i) - p) * (d.getQuick(i) - p) + e.getQuick(i) * e.getQuick(i) - q * q;
- double vi = (d.getQuick(i) - p) * 2.0 * q;
- if (vr == 0.0 && vi == 0.0) {
- double hmag = Math.abs(x) + Math.abs(y);
- vr = eps * norm * (Math.abs(w) + Math.abs(q) + hmag + Math.abs(z));
- }
- cdiv(x * r - z * ra + q * sa, x * s - z * sa - q * ra, vr, vi);
- h.setQuick(i, n - 1, cdivr);
- h.setQuick(i, n, cdivi);
- if (Math.abs(x) > (Math.abs(z) + Math.abs(q))) {
- h.setQuick(i + 1, n - 1, (-ra - w * h.getQuick(i, n - 1) + q * h.getQuick(i, n)) / x);
- h.setQuick(i + 1, n, (-sa - w * h.getQuick(i, n) - q * h.getQuick(i, n - 1)) / x);
- } else {
- cdiv(-r - y * h.getQuick(i, n - 1), -s - y * h.getQuick(i, n), z, q);
- h.setQuick(i + 1, n - 1, cdivr);
- h.setQuick(i + 1, n, cdivi);
- }
- }
-
- // Overflow control
-
- t = Math.max(Math.abs(h.getQuick(i, n - 1)), Math.abs(h.getQuick(i, n)));
- if (eps * t * t > 1) {
- for (int j = i; j <= n; j++) {
- h.setQuick(j, n - 1, h.getQuick(j, n - 1) / t);
- h.setQuick(j, n, h.getQuick(j, n) / t);
- }
- }
- }
- }
- }
- }
-
- // Vectors of isolated roots
-
- for (int i = 0; i < nn; i++) {
- if (i < low || i > high) {
- for (int j = i; j < nn; j++) {
- v.setQuick(i, j, h.getQuick(i, j));
- }
- }
- }
-
- // Back transformation to get eigenvectors of original matrix
-
- for (int j = nn - 1; j >= low; j--) {
- for (int i = low; i <= high; i++) {
- z = 0.0;
- for (int k = low; k <= Math.min(j, high); k++) {
- z += v.getQuick(i, k) * h.getQuick(k, j);
- }
- v.setQuick(i, j, z);
- }
- }
- }
-
- private static boolean isSymmetric(Matrix a) {
- int n = a.columnSize();
-
- boolean isSymmetric = true;
- for (int j = 0; (j < n) && isSymmetric; j++) {
- for (int i = 0; (i < n) && isSymmetric; i++) {
- isSymmetric = a.getQuick(i, j) == a.getQuick(j, i);
- }
- }
- return isSymmetric;
- }
-}
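
The isSymmetric check above compares matrix entries with exact floating-point
equality, which can misclassify matrices that are symmetric only up to rounding
error. A minimal standalone sketch of a tolerance-based variant (a hypothetical
helper on plain 2-D arrays, not part of the deleted class):

    final class SymmetryCheck {
      // True if a is symmetric up to a relative tolerance eps.
      static boolean isApproximatelySymmetric(double[][] a, double eps) {
        int n = a.length;
        for (int i = 0; i < n; i++) {
          for (int j = i + 1; j < n; j++) {
            double scale = Math.max(Math.max(Math.abs(a[i][j]), Math.abs(a[j][i])), 1.0);
            if (Math.abs(a[i][j] - a[j][i]) > eps * scale) {
              return false;
            }
          }
        }
        return true;
      }
    }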

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/solver/JacobiConditioner.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/solver/JacobiConditioner.java b/math/src/main/java/org/apache/mahout/math/solver/JacobiConditioner.java
deleted file mode 100644
index 7524564..0000000
--- a/math/src/main/java/org/apache/mahout/math/solver/JacobiConditioner.java
+++ /dev/null
@@ -1,47 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.solver;
-
-import org.apache.mahout.math.DenseVector;
-import org.apache.mahout.math.Matrix;
-import org.apache.mahout.math.Vector;
-
-/**
- * Implements the Jacobi preconditioner for a matrix A. This is defined as inv(diag(A)).
- */
-public final class JacobiConditioner implements Preconditioner {
-
- private final DenseVector inverseDiagonal;
-
- public JacobiConditioner(Matrix a) {
- if (a.numCols() != a.numRows()) {
- throw new IllegalArgumentException("Matrix must be square.");
- }
-
- inverseDiagonal = new DenseVector(a.numCols());
- for (int i = 0; i < a.numCols(); ++i) {
- inverseDiagonal.setQuick(i, 1.0 / a.getQuick(i, i));
- }
- }
-
- @Override
- public Vector precondition(Vector v) {
- return v.times(inverseDiagonal);
- }
-
-}
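
As the class above shows, applying a Jacobi preconditioner reduces to an
element-wise scale by the reciprocal diagonal of A. A minimal plain-array
sketch of the same idea (hypothetical names, assuming a nonzero diagonal):

    final class JacobiSketch {
      // Applies inv(diag(a)) to v, i.e. divides each component by a's diagonal.
      static double[] precondition(double[][] a, double[] v) {
        double[] result = new double[v.length];
        for (int i = 0; i < v.length; i++) {
          result[i] = v[i] / a[i][i]; // assumes a[i][i] != 0
        }
        return result;
      }
    }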
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/jet/math/Polynomial.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/jet/math/Polynomial.java b/math/src/main/java/org/apache/mahout/math/jet/math/Polynomial.java
deleted file mode 100644
index 723e7d0..0000000
--- a/math/src/main/java/org/apache/mahout/math/jet/math/Polynomial.java
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
-Copyright 1999 CERN - European Organization for Nuclear Research.
-Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
-is hereby granted without fee, provided that the above copyright notice appear in all copies and
-that both that copyright notice and this permission notice appear in supporting documentation.
-CERN makes no representations about the suitability of this software for any purpose.
-It is provided "as is" without expressed or implied warranty.
-*/
-package org.apache.mahout.math.jet.math;
-
-/**
- * Polynomial functions.
- */
-public final class Polynomial {
-
- private Polynomial() {
- }
-
- /**
- * Evaluates the given polynomial of degree <tt>N</tt> at <tt>x</tt>, assuming the leading coefficient
- * (the coefficient of the degree-<tt>N</tt> term) is 1.0. Otherwise the same as <tt>polevl()</tt>.
- * <pre>
- * 2 N
- * y = C + C x + C x +...+ C x
- * 0 1 2 N
- *
- * where C = 1 and hence is omitted from the array.
- * N
- *
- * Coefficients are stored in reverse order:
- *
- * coef[0] = C , ..., coef[N-1] = C .
- * N-1 0
- *
- * Calling arguments are otherwise the same as polevl().
- * </pre>
- * In the interest of speed, there are no checks for out of bounds arithmetic.
- *
- * @param x argument to the polynomial.
- * @param coef the coefficients of the polynomial.
- * @param N the degree of the polynomial.
- */
- public static double p1evl(double x, double[] coef, int N) {
-
- double ans = x + coef[0];
-
- for (int i = 1; i < N; i++) {
- ans = ans * x + coef[i];
- }
-
- return ans;
- }
-
- /**
- * Evaluates the given polynomial of degree <tt>N</tt> at <tt>x</tt>.
- * <pre>
- * 2 N
- * y = C + C x + C x +...+ C x
- * 0 1 2 N
- *
- * Coefficients are stored in reverse order:
- *
- * coef[0] = C , ..., coef[N] = C .
- * N 0
- * </pre>
- * In the interest of speed, there are no checks for out of bounds arithmetic.
- *
- * @param x argument to the polynomial.
- * @param coef the coefficients of the polynomial.
- * @param N the degree of the polynomial.
- */
- public static double polevl(double x, double[] coef, int N) {
- double ans = coef[0];
-
- for (int i = 1; i <= N; i++) {
- ans = ans * x + coef[i];
- }
-
- return ans;
- }
-}
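
Both methods above are Horner-scheme evaluations with coefficients stored
highest degree first. A standalone check of the same recurrence (hypothetical
helper): for y = 2x^2 + 3x + 5 we store coef = {2, 3, 5} with N = 2, and
horner(2.0, coef, 2) gives 2*4 + 3*2 + 5 = 19.

    final class HornerSketch {
      // Mirrors polevl: coef[0] is the highest-degree coefficient.
      static double horner(double x, double[] coef, int n) {
        double ans = coef[0];
        for (int i = 1; i <= n; i++) {
          ans = ans * x + coef[i];
        }
        return ans;
      }

      public static void main(String[] args) {
        System.out.println(horner(2.0, new double[] {2, 3, 5}, 2)); // prints 19.0
      }
    }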

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/jet/math/package-info.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/jet/math/package-info.java b/math/src/main/java/org/apache/mahout/math/jet/math/package-info.java
deleted file mode 100644
index 3cda850..0000000
--- a/math/src/main/java/org/apache/mahout/math/jet/math/package-info.java
+++ /dev/null
@@ -1,5 +0,0 @@
-/**
- * Tools for basic and advanced mathematics: arithmetic and algebra, polynomials and Chebyshev series, Bessel and Airy
- * functions, Function Objects for generic function evaluation, etc.
- */
-package org.apache.mahout.math.jet.math;

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/jet/random/AbstractContinousDistribution.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/jet/random/AbstractContinousDistribution.java b/math/src/main/java/org/apache/mahout/math/jet/random/AbstractContinousDistribution.java
deleted file mode 100644
index 8ca03d0..0000000
--- a/math/src/main/java/org/apache/mahout/math/jet/random/AbstractContinousDistribution.java
+++ /dev/null
@@ -1,51 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-/*
-Copyright 1999 CERN - European Organization for Nuclear Research.
-Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
-is hereby granted without fee, provided that the above copyright notice appear in all copies and
-that both that copyright notice and this permission notice appear in supporting documentation.
-CERN makes no representations about the suitability of this software for any purpose.
-It is provided "as is" without expressed or implied warranty.
-*/
-package org.apache.mahout.math.jet.random;
-
-/**
- * Abstract base class for all continuous distributions. Continuous distributions have
- * probability density and cumulative distribution functions.
- *
- */
-public abstract class AbstractContinousDistribution extends AbstractDistribution {
- public double cdf(double x) {
- throw new UnsupportedOperationException("Can't compute pdf for " + this.getClass().getName());
- }
-
- public double pdf(double x) {
- throw new UnsupportedOperationException("Can't compute pdf for " + this.getClass().getName());
- }
-
- /**
- * @return A random number from the distribution; returns <tt>(int) Math.round(nextDouble())</tt>.
- * Override this method if necessary.
- */
- @Override
- public int nextInt() {
- return (int) Math.round(nextDouble());
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/jet/random/AbstractDiscreteDistribution.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/jet/random/AbstractDiscreteDistribution.java b/math/src/main/java/org/apache/mahout/math/jet/random/AbstractDiscreteDistribution.java
deleted file mode 100644
index d93d76c..0000000
--- a/math/src/main/java/org/apache/mahout/math/jet/random/AbstractDiscreteDistribution.java
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
-Copyright 1999 CERN - European Organization for Nuclear Research.
-Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
-is hereby granted without fee, provided that the above copyright notice appear in all copies and
-that both that copyright notice and this permission notice appear in supporting documentation.
-CERN makes no representations about the suitability of this software for any purpose.
-It is provided "as is" without expressed or implied warranty.
-*/
-package org.apache.mahout.math.jet.random;
-
-/**
- * Abstract base class for all discrete distributions.
- *
- */
-public abstract class AbstractDiscreteDistribution extends AbstractDistribution {
-
- /** Makes this class non-instantiable, but still lets others inherit from it. */
- protected AbstractDiscreteDistribution() {
- }
-
- /** Returns a random number from the distribution; returns <tt>(double) nextInt()</tt>. */
- @Override
- public double nextDouble() {
- return nextInt();
- }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/jet/random/AbstractDistribution.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/jet/random/AbstractDistribution.java b/math/src/main/java/org/apache/mahout/math/jet/random/AbstractDistribution.java
deleted file mode 100644
index 8e9cb0e..0000000
--- a/math/src/main/java/org/apache/mahout/math/jet/random/AbstractDistribution.java
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
-Copyright © 1999 CERN - European Organization for Nuclear Research.
-Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
-is hereby granted without fee, provided that the above copyright notice appear in all copies and
-that both that copyright notice and this permission notice appear in supporting documentation.
-CERN makes no representations about the suitability of this software for any purpose.
-It is provided "as is" without expressed or implied warranty.
-*/
-package org.apache.mahout.math.jet.random;
-
-import java.util.Random;
-
-import org.apache.mahout.math.function.DoubleFunction;
-import org.apache.mahout.math.function.IntFunction;
-
-public abstract class AbstractDistribution extends DoubleFunction implements IntFunction {
-
- private Random randomGenerator;
-
- /** Makes this class non-instantiable, but still lets others inherit from it. */
- protected AbstractDistribution() {
- }
-
- protected Random getRandomGenerator() {
- return randomGenerator;
- }
-
- protected double randomDouble() {
- return randomGenerator.nextDouble();
- }
-
- /**
- * Equivalent to <tt>nextDouble()</tt>. This has the effect that distributions can now be used as function objects,
- * returning a random number upon function evaluation.
- */
- @Override
- public double apply(double dummy) {
- return nextDouble();
- }
-
- /**
- * Equivalent to <tt>nextInt()</tt>. This has the effect that distributions can now be used as function objects,
- * returning a random number upon function evaluation.
- */
- @Override
- public int apply(int dummy) {
- return nextInt();
- }
-
- /**
- * Returns a random number from the distribution.
- * @return A new sample from this distribution.
- */
- public abstract double nextDouble();
-
- /**
- * @return A random number from the distribution; returns <tt>(int) Math.round(nextDouble())</tt>.
- * Override this method if necessary.
- */
- public abstract int nextInt();
-
- /**
- * Sets the uniform random generator internally used.
- * @param randomGenerator the new PRNG
- */
- public void setRandomGenerator(Random randomGenerator) {
- this.randomGenerator = randomGenerator;
- }
-}
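
Because AbstractDistribution extends DoubleFunction, a distribution can be
handed to any mahout-math API that expects a function object; each evaluation
then yields a fresh sample. A hedged sketch, assuming Vector.assign(DoubleFunction)
behaves as elsewhere in mahout-math:

    import java.util.Random;

    import org.apache.mahout.math.DenseVector;
    import org.apache.mahout.math.Vector;
    import org.apache.mahout.math.jet.random.Exponential;

    public class FillWithSamples {
      public static void main(String[] args) {
        // apply(double) delegates to nextDouble(), so assign() fills the
        // vector with independent exponential draws.
        Vector samples = new DenseVector(1000);
        samples.assign(new Exponential(2.0, new Random(42)));
        System.out.println(samples.zSum() / samples.size()); // roughly 0.5
      }
    }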

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/jet/random/Exponential.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/jet/random/Exponential.java b/math/src/main/java/org/apache/mahout/math/jet/random/Exponential.java
deleted file mode 100644
index 06472c2..0000000
--- a/math/src/main/java/org/apache/mahout/math/jet/random/Exponential.java
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
-Copyright © 1999 CERN - European Organization for Nuclear Research.
-Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
-is hereby granted without fee, provided that the above copyright notice appear in all copies and
-that both that copyright notice and this permission notice appear in supporting documentation.
-CERN makes no representations about the suitability of this software for any purpose.
-It is provided "as is" without expressed or implied warranty.
-*/
-package org.apache.mahout.math.jet.random;
-
-import java.util.Locale;
-import java.util.Random;
-
-public class Exponential extends AbstractContinousDistribution {
- // rate parameter for the distribution. Mean is 1/lambda.
- private double lambda;
-
- /**
- * Provides a negative exponential distribution given a rate parameter lambda and an underlying
- * random number generator. The mean of this distribution will be equal to 1/lambda.
- *
- * @param lambda The rate parameter of the distribution.
- * @param randomGenerator The PRNG that is used to generate values.
- */
- public Exponential(double lambda, Random randomGenerator) {
- setRandomGenerator(randomGenerator);
- this.lambda = lambda;
- }
-
- /**
- * Returns the cumulative distribution function.
- * @param x The point at which the cumulative distribution function is to be evaluated.
- * @return The integral from -infinity to x of the PDF, also known as the cumulative distribution
- * function.
- */
- @Override
- public double cdf(double x) {
- if (x <= 0.0) {
- return 0.0;
- }
- return 1.0 - Math.exp(-x * lambda);
- }
-
- /**
- * Returns a random number from the distribution.
- */
- @Override
- public double nextDouble() {
- return -Math.log1p(-randomDouble()) / lambda;
- }
-
- /**
- * Returns the value of the probability density function at a particular point.
- * @param x The point at which the probability density function is to be evaluated.
- * @return The value of the probability density function at the specified point.
- */
- @Override
- public double pdf(double x) {
- if (x < 0.0) {
- return 0.0;
- }
- return lambda * Math.exp(-x * lambda);
- }
-
- /**
- * Sets the rate parameter.
- * @param lambda The new value of the rate parameter.
- */
- public void setState(double lambda) {
- this.lambda = lambda;
- }
-
- /**
- * Returns a String representation of the receiver.
- */
- @Override
- public String toString() {
- return String.format(Locale.ENGLISH, "%s(%.4f)", this.getClass().getName(), lambda);
- }
-
-}
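
nextDouble() above is inverse-CDF sampling: if U is uniform on (0,1), then
-log(1 - U)/lambda is exponential with rate lambda. A quick standalone sanity
sketch; the empirical mean should come out close to 1/lambda:

    import java.util.Random;

    public class ExponentialSanity {
      public static void main(String[] args) {
        Random rng = new Random(1);
        double lambda = 2.0;
        double sum = 0.0;
        int n = 100000;
        for (int i = 0; i < n; i++) {
          // Math.log1p(-u) == log(1 - u), better conditioned for small u.
          sum += -Math.log1p(-rng.nextDouble()) / lambda;
        }
        System.out.println(sum / n); // expect roughly 0.5 = 1/lambda
      }
    }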

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/jet/random/Gamma.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/jet/random/Gamma.java b/math/src/main/java/org/apache/mahout/math/jet/random/Gamma.java
deleted file mode 100644
index 914157b..0000000
--- a/math/src/main/java/org/apache/mahout/math/jet/random/Gamma.java
+++ /dev/null
@@ -1,302 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
-Copyright © 1999 CERN - European Organization for Nuclear Research.
-Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
-is hereby granted without fee, provided that the above copyright notice appear in all copies and
-that both that copyright notice and this permission notice appear in supporting documentation.
-CERN makes no representations about the suitability of this software for any purpose.
-It is provided "as is" without expressed or implied warranty.
-*/
-package org.apache.mahout.math.jet.random;
-
-import org.apache.mahout.math.jet.stat.Probability;
-
-import java.util.Random;
-
-public class Gamma extends AbstractContinousDistribution {
- // shape
- private final double alpha;
-
- // rate
- private final double rate;
-
- /**
- * Constructs a Gamma distribution with a given shape (alpha) and rate (beta).
- *
- * @param alpha The shape parameter.
- * @param rate The rate parameter.
- * @param randomGenerator The random number generator that generates bits for us.
- * @throws IllegalArgumentException if <tt>alpha &lt;= 0.0 || rate &lt;= 0.0</tt>.
- */
- public Gamma(double alpha, double rate, Random randomGenerator) {
- this.alpha = alpha;
- this.rate = rate;
- setRandomGenerator(randomGenerator);
- }
-
- /**
- * Returns the cumulative distribution function.
- * @param x The point at which the cumulative distribution function is evaluated.
- */
- @Override
- public double cdf(double x) {
- return Probability.gamma(alpha, rate, x);
- }
-
- /** Returns a random number from the distribution. */
- @Override
- public double nextDouble() {
- return nextDouble(alpha, rate);
- }
-
- /** Returns a random number from the distribution; bypasses the internal state.
- *
- * Gamma distribution - acceptance rejection combined with acceptance complement.
- * Samples the standard gamma distribution with shape parameter a &gt; 0 using
- * acceptance rejection (gs) for a &lt; 1 and acceptance complement (gd) for a &gt;= 1.
- *
- * References:
- * J.H. Ahrens, U. Dieter (1974): Computer methods for sampling from gamma, beta,
- * Poisson and binomial distributions, Computing 12, 223-246.
- * J.H. Ahrens, U. Dieter (1982): Generating gamma variates by a modified
- * rejection technique, Communications of the ACM 25, 47-54.
- *
- * @param alpha Shape parameter.
- * @param rate Rate parameter (=1/scale).
- * @return A gamma distributed sample.
- */
- public double nextDouble(double alpha, double rate) {
- if (alpha <= 0.0) {
- throw new IllegalArgumentException();
- }
- if (rate <= 0.0) {
- throw new IllegalArgumentException();
- }
-
- double gds;
- double b = 0.0;
- if (alpha < 1.0) { // CASE A: Acceptance rejection algorithm gs
- b = 1.0 + 0.36788794412 * alpha; // Step 1
- while (true) {
- double p = b * randomDouble();
- if (p <= 1.0) { // Step 2. Case gds <= 1
- gds = Math.exp(Math.log(p) / alpha);
- if (Math.log(randomDouble()) <= -gds) {
- return gds / rate;
- }
- } else { // Step 3. Case gds > 1
- gds = -Math.log((b - p) / alpha);
- if (Math.log(randomDouble()) <= (alpha - 1.0) * Math.log(gds)) {
- return gds / rate;
- }
- }
- }
- } else { // CASE B: Acceptance complement algorithm gd (gaussian distribution, box muller transformation)
- double ss = 0.0;
- double s = 0.0;
- double d = 0.0;
- if (alpha != -1.0) { // Step 1. Preparations
- ss = alpha - 0.5;
- s = Math.sqrt(ss);
- d = 5.656854249 - 12.0 * s;
- }
- // Step 2. Normal deviate
- double v12;
- double v1;
- do {
- v1 = 2.0 * randomDouble() - 1.0;
- double v2 = 2.0 * randomDouble() - 1.0;
- v12 = v1 * v1 + v2 * v2;
- } while (v12 > 1.0);
- double t = v1 * Math.sqrt(-2.0 * Math.log(v12) / v12);
- double x = s + 0.5 * t;
- gds = x * x;
- if (t >= 0.0) {
- return gds / rate;
- } // Immediate acceptance
-
- double u = randomDouble();
- if (d * u <= t * t * t) {
- return gds / rate;
- } // Squeeze acceptance
-
- double q0 = 0.0;
- double si = 0.0;
- double c = 0.0;
- if (alpha != -1.0) { // Step 4. Set-up for hat case
- double r = 1.0 / alpha;
- double q9 = 0.0001710320;
- double q8 = -0.0004701849;
- double q7 = 0.0006053049;
- double q6 = 0.0003340332;
- double q5 = -0.0003349403;
- double q4 = 0.0015746717;
- double q3 = 0.0079849875;
- double q2 = 0.0208333723;
- double q1 = 0.0416666664;
- q0 = ((((((((q9 * r + q8) * r + q7) * r + q6) * r + q5) * r + q4) * r + q3) * r + q2) * r + q1) * r;
- if (alpha > 3.686) {
- if (alpha > 13.022) {
- b = 1.77;
- si = 0.75;
- c = 0.1515 / s;
- } else {
- b = 1.654 + 0.0076 * ss;
- si = 1.68 / s + 0.275;
- c = 0.062 / s + 0.024;
- }
- } else {
- b = 0.463 + s - 0.178 * ss;
- si = 1.235;
- c = 0.195 / s - 0.079 + 0.016 * s;
- }
- }
- double v;
- double q;
- double a9 = 0.104089866;
- double a8 = -0.112750886;
- double a7 = 0.110368310;
- double a6 = -0.124385581;
- double a5 = 0.142873973;
- double a4 = -0.166677482;
- double a3 = 0.199999867;
- double a2 = -0.249999949;
- double a1 = 0.333333333;
- if (x > 0.0) { // Step 5. Calculation of q
- v = t / (s + s); // Step 6.
- if (Math.abs(v) > 0.25) {
- q = q0 - s * t + 0.25 * t * t + (ss + ss) * Math.log1p(v);
- } else {
- q = q0 + 0.5 * t * t * ((((((((a9 * v + a8) * v + a7) * v + a6)
- * v + a5) * v + a4) * v + a3) * v + a2) * v + a1) * v;
- } // Step 7. Quotient acceptance
- if (Math.log1p(-u) <= q) {
- return gds / rate;
- }
- }
-
- double e7 = 0.000247453;
- double e6 = 0.001353826;
- double e5 = 0.008345522;
- double e4 = 0.041664508;
- double e3 = 0.166666848;
- double e2 = 0.499999994;
- double e1 = 1.000000000;
- while (true) { // Step 8. Double exponential deviate t
- double sign_u;
- double e;
- do {
- e = -Math.log(randomDouble());
- u = randomDouble();
- u = u + u - 1.0;
- sign_u = u > 0 ? 1.0 : -1.0;
- t = b + e * si * sign_u;
- } while (t <= -0.71874483771719); // Step 9. Rejection of t
- v = t / (s + s); // Step 10. New q(t)
- if (Math.abs(v) > 0.25) {
- q = q0 - s * t + 0.25 * t * t + (ss + ss) * Math.log1p(v);
- } else {
- q = q0 + 0.5 * t * t * ((((((((a9 * v + a8) * v + a7) * v + a6)
- * v + a5) * v + a4) * v + a3) * v + a2) * v + a1) * v;
- }
- if (q <= 0.0) {
- continue;
- } // Step 11.
- double w;
- if (q > 0.5) {
- w = Math.exp(q) - 1.0;
- } else {
- w = ((((((e7 * q + e6) * q + e5) * q + e4) * q + e3) * q + e2) * q + e1) * q;
- } // Step 12. Hat acceptance
- if (c * u * sign_u <= w * Math.exp(e - 0.5 * t * t)) {
- x = s + 0.5 * t;
- return x * x / rate;
- }
- }
- }
- }
-
- /** Returns the probability density function.
- * @param x Where to compute the density function.
- *
- * @return The value of the gamma density at x.
- */
- @Override
- public double pdf(double x) {
- if (x < 0) {
- throw new IllegalArgumentException();
- }
- if (x == 0) {
- if (alpha == 1.0) {
- return rate;
- } else if (alpha < 1) {
- return Double.POSITIVE_INFINITY;
- } else {
- return 0;
- }
- }
- if (alpha == 1.0) {
- return rate * Math.exp(-x * rate);
- }
- return rate * Math.exp((alpha - 1.0) * Math.log(x * rate) - x * rate - logGamma(alpha));
- }
-
- @Override
- public String toString() {
- return this.getClass().getName() + '(' + rate + ',' + alpha + ')';
- }
-
- /** Returns a quick approximation of <tt>log(gamma(x))</tt>. */
- public static double logGamma(double x) {
-
- if (x <= 0.0 /* || x > 1.3e19 */) {
- return -999;
- }
-
- double z;
- for (z = 1.0; x < 11.0; x++) {
- z *= x;
- }
-
- double r = 1.0 / (x * x);
- double c6 = -1.9175269175269175e-03;
- double c5 = 8.4175084175084175e-04;
- double c4 = -5.9523809523809524e-04;
- double c3 = 7.9365079365079365e-04;
- double c2 = -2.7777777777777777e-03;
- double c1 = 8.3333333333333333e-02;
- double g = c1 + r * (c2 + r * (c3 + r * (c4 + r * (c5 + r * c6))));
- double c0 = 9.1893853320467274e-01;
- g = (x - 0.5) * Math.log(x) - x + c0 + g / x;
- if (z == 1.0) {
- return g;
- }
- return g - Math.log(z);
- }
-
-}
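
The logGamma approximation above can be spot-checked against exact factorials,
since Gamma(n) = (n-1)!. For instance, logGamma(5) should be close to log(24):

    public class LogGammaCheck {
      public static void main(String[] args) {
        // Gamma(5) = 4! = 24, so the two values should agree to several digits.
        double approx = org.apache.mahout.math.jet.random.Gamma.logGamma(5.0);
        double exact = Math.log(24.0);
        System.out.println(approx + " vs " + exact);
      }
    }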

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/jet/random/NegativeBinomial.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/jet/random/NegativeBinomial.java b/math/src/main/java/org/apache/mahout/math/jet/random/NegativeBinomial.java
deleted file mode 100644
index 1e577eb..0000000
--- a/math/src/main/java/org/apache/mahout/math/jet/random/NegativeBinomial.java
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
-Copyright © 1999 CERN - European Organization for Nuclear Research.
-Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
-is hereby granted without fee, provided that the above copyright notice appear in all copies and
-that both that copyright notice and this permission notice appear in supporting documentation.
-CERN makes no representations about the suitability of this software for any purpose.
-It is provided "as is" without expressed or implied warranty.
-*/
-package org.apache.mahout.math.jet.random;
-
-import org.apache.mahout.math.jet.math.Arithmetic;
-import org.apache.mahout.math.jet.stat.Probability;
-
-import java.util.Random;
-
-/** Mostly deprecated until unit tests are in place; until then, this class/interface is unsupported. */
-public final class NegativeBinomial extends AbstractDiscreteDistribution {
-
- private final int r;
- private final double p;
-
- private final Gamma gamma;
- private final Poisson poisson;
-
- /**
- * Constructs a negative binomial distribution, which describes the probability of seeing
- * a particular number of failures (k) before a fixed number of successes (r) is reached,
- * where each trial succeeds with probability (p).
- *
- * @param r the required number of positive trials.
- * @param p the probability of success.
- * @param randomGenerator a uniform random number generator.
- */
- public NegativeBinomial(int r, double p, Random randomGenerator) {
- setRandomGenerator(randomGenerator);
- this.r = r;
- this.p = p;
- this.gamma = new Gamma(r, 1, randomGenerator);
- this.poisson = new Poisson(0.0, randomGenerator);
- }
-
- /**
- * Returns the cumulative distribution function.
- */
- public double cdf(int k) {
- return Probability.negativeBinomial(k, r, p);
- }
-
- /**
- * Returns the probability mass function.
- */
- public double pdf(int k) {
- return Arithmetic.binomial(k + r - 1, r - 1) * Math.pow(p, r) * Math.pow(1.0 - p, k);
- }
-
- @Override
- public int nextInt() {
- return nextInt(r, p);
- }
-
- /**
- * Returns a sample from this distribution. The value returned will
- * be the number of negative samples required before achieving r
- * positive samples. Each successive sample is taken independently
- * from a Bernoulli process with probability p of success.
- *
- * The algorithm used is taken from J.H. Ahrens, U. Dieter (1974):
- * Computer methods for sampling from gamma, beta, Poisson and
- * binomial distributions, Computing 12, 223--246.
- *
- * This algorithm is essentially the same as described at
- * http://en.wikipedia.org/wiki/Negative_binomial_distribution#Gamma.E2.80.93Poisson_mixture
- * except that the notion of positive and negative outcomes is uniformly
- * inverted. Because the inversion is complete and consistent, this
- * definition is effectively identical to that defined on wikipedia.
- */
- public int nextInt(int r, double p) {
- return this.poisson.nextInt(gamma.nextDouble(r, p / (1.0 - p)));
- }
-
- /**
- * Returns a String representation of the receiver.
- */
- @Override
- public String toString() {
- return this.getClass().getName() + '(' + r + ',' + p + ')';
- }
-
-}
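
A short usage sketch of the gamma-Poisson mixture described above; each draw
is a Poisson count whose mean is itself gamma-distributed:

    import java.util.Random;

    import org.apache.mahout.math.jet.random.NegativeBinomial;

    public class NegativeBinomialExample {
      public static void main(String[] args) {
        // Failures observed before the 5th success, with success
        // probability p = 0.3 on each trial.
        NegativeBinomial nb = new NegativeBinomial(5, 0.3, new Random(7));
        for (int i = 0; i < 5; i++) {
          System.out.println(nb.nextInt());
        }
      }
    }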

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/jet/random/Normal.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/jet/random/Normal.java b/math/src/main/java/org/apache/mahout/math/jet/random/Normal.java
deleted file mode 100644
index 7ceac22..0000000
--- a/math/src/main/java/org/apache/mahout/math/jet/random/Normal.java
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
-Copyright © 1999 CERN - European Organization for Nuclear Research.
-Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
-is hereby granted without fee, provided that the above copyright notice appear in all copies and
-that both that copyright notice and this permission notice appear in supporting documentation.
-CERN makes no representations about the suitability of this software for any purpose.
-It is provided "as is" without expressed or implied warranty.
-*/
-package org.apache.mahout.math.jet.random;
-
-import org.apache.mahout.math.jet.stat.Probability;
-
-import java.util.Locale;
-import java.util.Random;
-
-/**
- * Implements a normal distribution with the specified mean and standard deviation.
- */
-public class Normal extends AbstractContinousDistribution {
-
- private double mean;
- private double variance;
- private double standardDeviation;
-
- private double cache; // cache for the polar Box-Muller algorithm
- private boolean cacheFilled; // Box-Muller
-
- private double normalizer; // performance cache
-
- /**
- * @param mean The mean of the resulting distribution.
- * @param standardDeviation The standard deviation of the distribution.
- * @param randomGenerator The random number generator to use. This can be null if you don't
- * need to generate any numbers.
- */
- public Normal(double mean, double standardDeviation, Random randomGenerator) {
- setRandomGenerator(randomGenerator);
- setState(mean, standardDeviation);
- }
-
- /**
- * Returns the cumulative distribution function.
- */
- @Override
- public double cdf(double x) {
- return Probability.normal(mean, variance, x);
- }
-
- /** Returns the probability density function. */
- @Override
- public double pdf(double x) {
- double diff = x - mean;
- return normalizer * Math.exp(-(diff * diff) / (2.0 * variance));
- }
-
- /**
- * Returns a random number from the distribution.
- */
- @Override
- public double nextDouble() {
- // Uses polar Box-Muller transformation.
- if (cacheFilled) {
- cacheFilled = false;
- return cache;
- }
-
- double x;
- double y;
- double r;
- do {
- x = 2.0 * randomDouble() - 1.0;
- y = 2.0 * randomDouble() - 1.0;
- r = x * x + y * y;
- } while (r >= 1.0);
-
- double z = Math.sqrt(-2.0 * Math.log(r) / r);
- cache = this.mean + this.standardDeviation * x * z;
- cacheFilled = true;
- return this.mean + this.standardDeviation * y * z;
- }
-
- /** Sets the uniform random generator internally used. */
- @Override
- public final void setRandomGenerator(Random randomGenerator) {
- super.setRandomGenerator(randomGenerator);
- this.cacheFilled = false;
- }
-
- /**
- * Sets the mean and standard deviation.
- * @param mean The new value for the mean.
- * @param standardDeviation The new value for the standard deviation.
- */
- public final void setState(double mean, double standardDeviation) {
- if (mean != this.mean || standardDeviation != this.standardDeviation) {
- this.mean = mean;
- this.standardDeviation = standardDeviation;
- this.variance = standardDeviation * standardDeviation;
- this.cacheFilled = false;
-
- this.normalizer = 1.0 / Math.sqrt(2.0 * Math.PI * variance);
- }
- }
-
- /** Returns a String representation of the receiver. */
- @Override
- public String toString() {
- return String.format(Locale.ENGLISH, "%s(m=%f, sd=%f)", this.getClass().getName(), mean, standardDeviation);
- }
-}
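
A standalone sketch of the polar Box-Muller step used in nextDouble() above.
Each accepted (x, y) pair yields two independent N(0,1) deviates, which is
exactly why the class caches one of them; the r == 0 guard is a small
hardening not present in the original.

    import java.util.Random;

    public class PolarBoxMuller {
      // Returns two independent standard normal deviates.
      static double[] polarPair(Random rng) {
        double x;
        double y;
        double r;
        do {
          x = 2.0 * rng.nextDouble() - 1.0;
          y = 2.0 * rng.nextDouble() - 1.0;
          r = x * x + y * y;
        } while (r >= 1.0 || r == 0.0); // reject points outside (or at the centre of) the unit disc
        double z = Math.sqrt(-2.0 * Math.log(r) / r);
        return new double[] {x * z, y * z};
      }
    }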

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/jet/random/Poisson.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/jet/random/Poisson.java b/math/src/main/java/org/apache/mahout/math/jet/random/Poisson.java
deleted file mode 100644
index 497691e..0000000
--- a/math/src/main/java/org/apache/mahout/math/jet/random/Poisson.java
+++ /dev/null
@@ -1,296 +0,0 @@
-/*
-Copyright © 1999 CERN - European Organization for Nuclear Research.
-Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
-is hereby granted without fee, provided that the above copyright notice appear in all copies and
-that both that copyright notice and this permission notice appear in supporting documentation.
-CERN makes no representations about the suitability of this software for any purpose.
-It is provided "as is" without expressed or implied warranty.
-*/
-package org.apache.mahout.math.jet.random;
-
-import org.apache.mahout.math.jet.math.Arithmetic;
-
-import java.util.Random;
-
-/** Partially deprecated until unit tests are in place; until then, this class/interface is unsupported. */
-public final class Poisson extends AbstractDiscreteDistribution {
-
- private final double mean;
-
- // precomputed and cached values (for performance only)
- // cache for < SWITCH_MEAN
- private double myOld = -1.0;
- private double p;
- private double q;
- private double p0;
- private final double[] pp = new double[36];
- private int llll;
-
- // cache for >= SWITCH_MEAN
- private double myLast = -1.0;
- private double ll;
- private int k2;
- private int k4;
- private int k1;
- private int k5;
- private double dl;
- private double dr;
- private double r1;
- private double r2;
- private double r4;
- private double r5;
- private double lr;
- private double lMy;
- private double cPm;
- private double f1;
- private double f2;
- private double f4;
- private double f5;
- private double p1;
- private double p2;
- private double p3;
- private double p4;
- private double p5;
- private double p6;
-
- // cache for both;
-
-
- private static final double MEAN_MAX = Integer.MAX_VALUE;
- // for all means larger than that, we don't try to compute a Poisson deviate, but return the mean.
- private static final double SWITCH_MEAN = 10.0; // switch from method A to method B
-
-
- /** Constructs a poisson distribution. Example: mean=1.0. */
- public Poisson(double mean, Random randomGenerator) {
- setRandomGenerator(randomGenerator);
- this.mean = mean;
- }
-
- private static double f(int k, double lNu, double cPm) {
- return Math.exp(k * lNu - Arithmetic.logFactorial(k) - cPm);
- }
-
- @Override
- public int nextInt() {
- return nextInt(mean);
- }
-
- /** Returns a random number from the distribution; bypasses the internal state. */
- public int nextInt(double theMean) {
- /*
- * Poisson Distribution - Patchwork Rejection/Inversion
- *
- * For parameter my < 10, tabulated inversion is applied.
- * For my >= 10, patchwork rejection is employed: the area below the
- * histogram function f(x) is rearranged in its body by certain point
- * reflections. Within a large center interval, variates are sampled
- * efficiently by rejection from uniform hats. Rectangular immediate
- * acceptance regions speed up the generation. The remaining tails
- * are covered by exponential functions.
- */
- Random gen = getRandomGenerator();
-
- int m;
- if (theMean < SWITCH_MEAN) { // CASE B: Inversion- start new table and calculate p0
- if (theMean != myOld) {
- myOld = theMean;
- llll = 0;
- p = Math.exp(-theMean);
- q = p;
- p0 = p;
- //for (k=pp.length; --k >=0;) pp[k] = 0;
- }
- m = theMean > 1.0 ? (int) theMean : 1;
- while (true) {
- double u = gen.nextDouble();
- int k = 0;
- if (u <= p0) {
- return k;
- }
- if (llll != 0) { // Step T. Table comparison
- int i = u > 0.458 ? Math.min(llll, m) : 1;
- for (k = i; k <= llll; k++) {
- if (u <= pp[k]) {
- return k;
- }
- }
- if (llll == 35) {
- continue;
- }
- }
- for (k = llll + 1; k <= 35; k++) { // Step C. Creation of new prob.
- p *= theMean / k;
- q += p;
- pp[k] = q;
- if (u <= q) {
- llll = k;
- return k;
- }
- }
- llll = 35;
- }
- // end my < SWITCH_MEAN
- } else if (theMean < MEAN_MAX) { // CASE A: acceptance complement
-
- m = (int) theMean;
- if (theMean != myLast) { // set-up
- myLast = theMean;
-
- // approximate deviation of reflection points k2, k4 from my - 1/2
- double Ds = Math.sqrt(theMean + 0.25);
-
- // mode m, reflection points k2 and k4, and points k1 and k5, which
- // delimit the centre region of h(x)
- k2 = (int) Math.ceil(theMean - 0.5 - Ds);
- k4 = (int) (theMean - 0.5 + Ds);
- k1 = k2 + k2 - m + 1;
- k5 = k4 + k4 - m;
-
- // range width of the critical left and right centre region
- dl = k2 - k1;
- dr = k5 - k4;
-
- // recurrence constants r(k) = p(k)/p(k-1) at k = k1, k2, k4+1, k5+1
- r1 = theMean / k1;
- r2 = theMean / k2;
- r4 = theMean / (k4 + 1);
- r5 = theMean / (k5 + 1);
-
- // reciprocal values of the scale parameters of expon. tail envelopes
- ll = Math.log(r1); // expon. tail left
- lr = -Math.log(r5); // expon. tail right
-
- // Poisson constants, necessary for computing function values f(k)
- lMy = Math.log(theMean);
- cPm = m * lMy - Arithmetic.logFactorial(m);
-
- // function values f(k) = p(k)/p(m) at k = k2, k4, k1, k5
- f2 = f(k2, lMy, cPm);
- f4 = f(k4, lMy, cPm);
- f1 = f(k1, lMy, cPm);
- f5 = f(k5, lMy, cPm);
-
- // area of the two centre and the two exponential tail regions
- // area of the two immediate acceptance regions between k2, k4
- p1 = f2 * (dl + 1.0); // immed. left
- p2 = f2 * dl + p1; // centre left
- p3 = f4 * (dr + 1.0) + p2; // immed. right
- p4 = f4 * dr + p3; // centre right
- p5 = f1 / ll + p4; // expon. tail left
- p6 = f5 / lr + p5; // expon. tail right
- } // end set-up
-
- while (true) {
- // generate uniform number U -- U(0, p6)
- // case distinction corresponding to U
- double W;
- double V;
- double U;
- int Y;
- int X;
- int Dk;
- if ((U = gen.nextDouble() * p6) < p2) { // centre left
-
- // immediate acceptance region R2 = [k2, m) *[0, f2), X = k2, ... m -1
- if ((V = U - p1) < 0.0) {
- return k2 + (int) (U / f2);
- }
- // immediate acceptance region R1 = [k1, k2)*[0, f1), X = k1, ... k2-1
- if ((W = V / dl) < f1) {
- return k1 + (int) (V / f1);
- }
-
- // computation of candidate X < k2, and its counterpart Y > k2
- // either squeeze-acceptance of X or acceptance-rejection of Y
- Dk = gen.nextInt((int) dl) + 1;
- if (W <= f2 - Dk * (f2 - f2 / r2)) { // quick accept of
- return k2 - Dk; // X = k2 - Dk
- }
- if ((V = f2 + f2 - W) < 1.0) { // quick reject of Y
- Y = k2 + Dk;
- if (V <= f2 + Dk * (1.0 - f2) / (dl + 1.0)) { // quick accept of
- return Y; // Y = k2 + Dk
- }
- if (V <= f(Y, lMy, cPm)) {
- return Y;
- } // final accept of Y
- }
- X = k2 - Dk;
- } else if (U < p4) { // centre right
- // immediate acceptance region R3 = [m, k4+1)*[0, f4), X = m, ... k4
- if ((V = U - p3) < 0.0) {
- return k4 - (int) ((U - p2) / f4);
- }
- // immediate acceptance region R4 = [k4+1, k5+1)*[0, f5)
- if ((W = V / dr) < f5) {
- return k5 - (int) (V / f5);
- }
-
- // computation of candidate X > k4, and its counterpart Y < k4
- // either squeeze-acceptance of X or acceptance-rejection of Y
- Dk = gen.nextInt((int) dr) + 1;
- if (W <= f4 - Dk * (f4 - f4 * r4)) { // quick accept of
- return k4 + Dk; // X = k4 + Dk
- }
- if ((V = f4 + f4 - W) < 1.0) { // quick reject of Y
- Y = k4 - Dk;
- if (V <= f4 + Dk * (1.0 - f4) / dr) { // quick accept of
- return Y; // Y = k4 - Dk
- }
- if (V <= f(Y, lMy, cPm)) {
- return Y;
- } // final accept of Y
- }
- X = k4 + Dk;
- } else {
- W = gen.nextDouble();
- if (U < p5) { // expon. tail left
- Dk = (int) (1.0 - Math.log(W) / ll);
- if ((X = k1 - Dk) < 0) {
- continue;
- } // 0 <= X <= k1 - 1
- W *= (U - p4) * ll; // W -- U(0, h(x))
- if (W <= f1 - Dk * (f1 - f1 / r1)) {
- return X;
- } // quick accept of X
- } else { // expon. tail right
- Dk = (int) (1.0 - Math.log(W) / lr);
- X = k5 + Dk; // X >= k5 + 1
- W *= (U - p5) * lr; // W -- U(0, h(x))
- if (W <= f5 - Dk * (f5 - f5 * r5)) {
- return X;
- } // quick accept of X
- }
- }
-
- // acceptance-rejection test of candidate X from the original area
- // test, whether W <= f(k), with W = U*h(x) and U -- U(0, 1)
- // log f(X) = (X - m)*log(my) - log X! + log m!
- if (Math.log(W) <= X * lMy - Arithmetic.logFactorial(X) - cPm) {
- return X;
- }
- }
- } else { // mean is too large
- return (int) theMean;
- }
- }
-
-}
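
A brief usage sketch: with mean 4.0 the sampler takes the tabulated-inversion
branch (mean < SWITCH_MEAN), while larger means exercise the patchwork
rejection set-up instead.

    import java.util.Random;

    import org.apache.mahout.math.jet.random.Poisson;

    public class PoissonExample {
      public static void main(String[] args) {
        Poisson poisson = new Poisson(4.0, new Random(3));
        long sum = 0;
        int n = 100000;
        for (int i = 0; i < n; i++) {
          sum += poisson.nextInt();
        }
        System.out.println((double) sum / n); // expect roughly 4.0
      }
    }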

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/jet/random/Uniform.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/jet/random/Uniform.java b/math/src/main/java/org/apache/mahout/math/jet/random/Uniform.java
deleted file mode 100644
index 32c8b90..0000000
--- a/math/src/main/java/org/apache/mahout/math/jet/random/Uniform.java
+++ /dev/null
@@ -1,164 +0,0 @@
-/*
-Copyright © 1999 CERN - European Organization for Nuclear Research.
-Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
-is hereby granted without fee, provided that the above copyright notice appear in all copies and
-that both that copyright notice and this permission notice appear in supporting documentation.
-CERN makes no representations about the suitability of this software for any purpose.
-It is provided "as is" without expressed or implied warranty.
-*/
-package org.apache.mahout.math.jet.random;
-
-import org.apache.mahout.common.RandomUtils;
-
-import java.util.Random;
-
-public class Uniform extends AbstractContinousDistribution {
-
- private double min;
- private double max;
-
- /**
- * Constructs a uniform distribution with the given minimum and maximum, using a {@link
- * org.apache.mahout.math.jet.random.engine.MersenneTwister} seeded with the given seed.
- */
- public Uniform(double min, double max, int seed) {
- this(min, max, RandomUtils.getRandom(seed));
- }
-
- /** Constructs a uniform distribution with the given minimum and maximum. */
- public Uniform(double min, double max, Random randomGenerator) {
- setRandomGenerator(randomGenerator);
- setState(min, max);
- }
-
- /** Constructs a uniform distribution with <tt>min=0.0</tt> and <tt>max=1.0</tt>. */
- public Uniform(Random randomGenerator) {
- this(0, 1, randomGenerator);
- }
-
- /** Returns the cumulative distribution function (assuming a continuous uniform distribution). */
- @Override
- public double cdf(double x) {
- if (x <= min) {
- return 0.0;
- }
- if (x >= max) {
- return 1.0;
- }
- return (x - min) / (max - min);
- }
-
- /** Returns a uniformly distributed random <tt>boolean</tt>. */
- public boolean nextBoolean() {
- return randomDouble() > 0.5;
- }
-
- /**
- * Returns a uniformly distributed random number in the open interval <tt>(min,max)</tt> (excluding <tt>min</tt> and
- * <tt>max</tt>).
- */
- @Override
- public double nextDouble() {
- return min + (max - min) * randomDouble();
- }
-
- /**
- * Returns a uniformly distributed random number in the open interval <tt>(from,to)</tt> (excluding <tt>from</tt> and
- * <tt>to</tt>). Pre conditions: <tt>from &lt;= to</tt>.
- */
- public double nextDoubleFromTo(double from, double to) {
- return from + (to - from) * randomDouble();
- }
-
- /**
- * Returns a uniformly distributed random number in the open interval <tt>(from,to)</tt> (excluding <tt>from</tt> and
- * <tt>to</tt>). Pre conditions: <tt>from &lt;= to</tt>.
- */
- public float nextFloatFromTo(float from, float to) {
- return (float) nextDoubleFromTo(from, to);
- }
-
- /**
- * Returns a uniformly distributed random number in the closed interval
- * <tt>[from,to]</tt> (including <tt>from</tt>
- * and <tt>to</tt>). Pre conditions: <tt>from &lt;= to</tt>.
- */
- public int nextIntFromTo(int from, int to) {
- return (int) (from + (long) ((1L + to - from) * randomDouble()));
- }
-
- /**
- * Returns a uniformly distributed random number in the closed interval <tt>[from,to]</tt> (including <tt>from</tt>
- * and <tt>to</tt>). Pre conditions: <tt>from &lt;= to</tt>.
- */
- public long nextLongFromTo(long from, long to) {
- /* Doing this correctly turns out to be trickier than expected: the code must
- avoid overflows and underflows and treat cases like from=-1, to=1 right.
- The following would NOT solve the problem: return (long) (Doubles.randomFromTo(from,to));
-
- Rounding avoids the asymmetric behaviour of casts from double to long
- ((long) -0.7 == 0 and (long) 0.7 == 0), and checking for overflow and
- underflow is also necessary.
- */
-
- // first the most likely and also the fastest case.
- if (from >= 0 && to < Long.MAX_VALUE) {
- return from + (long) nextDoubleFromTo(0.0, to - from + 1);
- }
-
- // would we get a numeric overflow?
- // if not, we can still handle the case rather efficient.
- double diff = (double) to - (double) from + 1.0;
- if (diff <= Long.MAX_VALUE) {
- return from + (long) nextDoubleFromTo(0.0, diff);
- }
-
- // now the pathologic boundary cases.
- // they are handled rather slow.
- long random;
- if (from == Long.MIN_VALUE) {
- if (to == Long.MAX_VALUE) {
- //return Math.round(nextDoubleFromTo(from,to));
- int i1 = nextIntFromTo(Integer.MIN_VALUE, Integer.MAX_VALUE);
- int i2 = nextIntFromTo(Integer.MIN_VALUE, Integer.MAX_VALUE);
- return ((i1 & 0xFFFFFFFFL) << 32) | (i2 & 0xFFFFFFFFL);
- }
- random = Math.round(nextDoubleFromTo(Long.MIN_VALUE, to + 1));
- if (random > to) {
- random = Long.MIN_VALUE;
- }
- } else {
- random = Math.round(nextDoubleFromTo(from - 1, to));
- if (random < from) {
- random = to;
- }
- }
- return random;
- }
-
- /** Returns the probability density function (assuming a continuous uniform distribution). */
- @Override
- public double pdf(double x) {
- if (x <= min || x >= max) {
- return 0.0;
- }
- return 1.0 / (max - min);
- }
-
- /** Sets the internal state. */
- public void setState(double min, double max) {
- if (max < min) {
- setState(max, min);
- return;
- }
- this.min = min;
- this.max = max;
- }
-
-
- /** Returns a String representation of the receiver. */
- @Override
- public String toString() {
- return this.getClass().getName() + '(' + min + ',' + max + ')';
- }
-}
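
The closed-interval trick in nextIntFromTo above deserves a note: scaling a
uniform double in [0,1) by (1L + to - from) in long arithmetic makes both
endpoints reachable and avoids overflow when to - from approaches
Integer.MAX_VALUE. A standalone rendering (hypothetical helper):

    import java.util.Random;

    public class UniformIntSketch {
      // Uniform draw from the closed interval [from, to].
      static int uniformIntFromTo(Random rng, int from, int to) {
        return (int) (from + (long) ((1L + to - from) * rng.nextDouble()));
      }
    }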

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/jet/random/engine/MersenneTwister.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/jet/random/engine/MersenneTwister.java b/math/src/main/java/org/apache/mahout/math/jet/random/engine/MersenneTwister.java
deleted file mode 100644
index 8bca895..0000000
--- a/math/src/main/java/org/apache/mahout/math/jet/random/engine/MersenneTwister.java
+++ /dev/null
@@ -1,275 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
-Copyright 1999 CERN - European Organization for Nuclear Research.
-Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
-is hereby granted without fee, provided that the above copyright notice appear in all copies and
-that both that copyright notice and this permission notice appear in supporting documentation.
-CERN makes no representations about the suitability of this software for any purpose.
-It is provided "as is" without expressed or implied warranty.
-*/
-
-package org.apache.mahout.math.jet.random.engine;
-
-import java.util.Date;
-/**
- MersenneTwister (MT19937) is one of the strongest uniform pseudo-random number generators
- known so far; at the same time it is quick.
- Produces uniformly distributed <tt>int</tt>'s and <tt>long</tt>'s in the closed intervals
- <tt>[Integer.MIN_VALUE,Integer.MAX_VALUE]</tt> and <tt>[Long.MIN_VALUE,Long.MAX_VALUE]</tt>, respectively,
- as well as <tt>float</tt>'s and <tt>double</tt>'s in the open unit intervals <tt>(0.0f,1.0f)</tt>
- and <tt>(0.0,1.0)</tt>, respectively.
- The seed can be any 32-bit integer except <tt>0</tt>. Shawn J. Cokus commented that perhaps the
- seed should preferably be odd.
- <p>
- <b>Quality:</b> MersenneTwister is designed to pass the k-distribution test. It has an
- astronomically large period of 2<sup>19937</sup>-1 (approximately 10<sup>6001</sup>) and 623-dimensional
- equidistribution up to 32-bit accuracy.
- It passes many stringent statistical tests, including the
- <A HREF="http://stat.fsu.edu/~geo/diehard.html">diehard</A> test of G. Marsaglia
- and the load test of P. Hellekalek and S. Wegenkittl.
- <p>
- <b>Performance:</b> Its speed is comparable to other modern generators (in particular,
- as fast as <tt>java.util.Random.nextFloat()</tt>).
- 2.5 million calls to <tt>raw()</tt> per second (Pentium Pro 200 MHz, JDK 1.2, NT).
- Be aware, however, that there is a non-negligible amount of overhead required to initialize the data
- structures used by a MersenneTwister. Code like
- {@code
- double sum = 0.0;
- for (int i=0; i<100000; ++i) {
- RandomElement twister = new MersenneTwister(new Date());
- sum += twister.raw();
- }
- }
- will be wildly inefficient. Consider using
- {@code
- double sum = 0.0;
- RandomElement twister = new MersenneTwister(new Date());
- for (int i=0; i<100000; ++i) {
- sum += twister.raw();
- }
- }
- instead. This allows the cost of constructing the MersenneTwister object
- to be borne only once, rather than once for each iteration in the loop.
- <p>
- <b>Implementation:</b> After M. Matsumoto and T. Nishimura,
- "Mersenne Twister: A 623-Dimensionally Equidistributed Uniform Pseudo-Random Number Generator",
- ACM Transactions on Modeling and Computer Simulation,
- Vol. 8, No. 1, January 1998, pp 3--30.
- <dl>
- <dt>More info on <a HREF="http://www.math.keio.ac.jp/~matumoto/eindex.html"> Masumoto's homepage</a>.</dt>
- <dt>More info on <a HREF="http://www.ncsa.uiuc.edu/Apps/CMP/RNG/www-rng.html"> Pseudo-random number
- generators is on the Web</a>.</dt>
- <dt>Yet <a HREF="http://nhse.npac.syr.edu/random"> some more info</a>.</dt>
- <p>
- The correctness of this implementation has been verified against the published output sequence
- <a href="http://www.math.keio.ac.jp/~nisimura/random/real2/mt19937-2.out">mt19937-2.out</a> of the C-implementation
- <a href="http://www.math.keio.ac.jp/~nisimura/random/real2/mt19937-2.c">mt19937-2.c</a>.
- (Call <tt>test(1000)</tt> to print the sequence).
- <dt>
- Note that this implementation is <b>not synchronized</b>.</dt>
- </dl>
- <p>
- <b>Details:</b> MersenneTwister is designed with consideration of the flaws of various existing generators in mind.
- It is an improved version of TT800, a very successful generator.
- MersenneTwister is based on linear recurrences modulo 2.
- Such generators are very fast, have extremely long periods, and appear quite robust.
- MersenneTwister produces 32-bit numbers, and every <tt>k</tt>-dimensional vector of such
- numbers appears the same number of times as <tt>k</tt> successive values over the
- period length, for each <tt>k &lt;= 623</tt> (except for the zero vector, which appears one time less).
- If one looks at only the first <tt>n &lt;= 16</tt> bits of each number, then the property holds
- for even larger <tt>k</tt>, as shown in the following table (taken from the publication cited above):
- <table width="75%" border="1" cellspacing="0" cellpadding="0" summary="property table">
- <tr>
- <td width="2%" align="center"> <div>n</div> </td>
- <td width="6%" align="center"> <div>1</div> </td>
- <td width="5%" align="center"> <div>2</div> </td>
- <td width="5%" align="center"> <div>3</div> </td>
- <td width="5%" align="center"> <div>4</div> </td>
- <td width="5%" align="center"> <div>5</div> </td>
- <td width="5%" align="center"> <div>6</div> </td>
- <td width="5%" align="center"> <div>7</div> </td>
- <td width="5%" align="center"> <div>8</div> </td>
- <td width="5%" align="center"> <div>9</div> </td>
- <td width="5%" align="center"> <div>10</div> </td>
- <td width="5%" align="center"> <div>11</div> </td>
- <td width="10%" align="center"> <div>12 .. 16</div> </td>
- <td width="10%" align="center"> <div>17 .. 32</div> </td>
- </tr>
- <tr>
- <td width="2%" align="center"> <div>k</div> </td>
- <td width="6%" align="center"> <div>19937</div> </td>
- <td width="5%" align="center"> <div>9968</div> </td>
- <td width="5%" align="center"> <div>6240</div> </td>
- <td width="5%" align="center"> <div>4984</div> </td>
- <td width="5%" align="center"> <div>3738</div> </td>
- <td width="5%" align="center"> <div>3115</div> </td>
- <td width="5%" align="center"> <div>2493</div> </td>
- <td width="5%" align="center"> <div>2492</div> </td>
- <td width="5%" align="center"> <div>1869</div> </td>
- <td width="5%" align="center"> <div>1869</div> </td>
- <td width="5%" align="center"> <div>1248</div> </td>
- <td width="10%" align="center"> <div>1246</div> </td>
- <td width="10%" align="center"> <div>623</div> </td>
- </tr>
- </table>
- <p>
- MersenneTwister generates random numbers in batches of 624 numbers at a time, so
- the caching and pipelining of modern systems is exploited.
- The generator is implemented to generate the output by using the fastest arithmetic
- operations only: 32-bit additions and bit operations (no division, no multiplication, no mod).
- These operations generate sequences of 32 random bits (<tt>int</tt>'s).
- <tt>long</tt>'s are formed by concatenating two 32 bit <tt>int</tt>'s.
- <tt>float</tt>'s are formed by dividing the interval <tt>[0.0,1.0]</tt> into 2<sup>32</sup>
- subintervals, then randomly choosing one subinterval.
- <tt>double</tt>'s are formed by dividing the interval <tt>[0.0,1.0]</tt> into 2<sup>64</sup>
- subintervals, then randomly choosing one subinterval.
- <p>
- @author ***@cern.ch
- @version 1.0, 09/24/99
- @see java.util.Random
- */
-public final class MersenneTwister extends RandomEngine {
-
- /* Period parameters */
- private static final int N = 624;
- private static final int M = 397;
- private static final int MATRIX_A = 0x9908b0df; /* constant vector a */
- private static final int UPPER_MASK = 0x80000000; /* most significant w-r bits */
- private static final int LOWER_MASK = 0x7fffffff; /* least significant r bits */
-
- /* for tempering */
- private static final int TEMPERING_MASK_B = 0x9d2c5680;
- private static final int TEMPERING_MASK_C = 0xefc60000;
-
- private static final int MAG0 = 0x0;
- private static final int MAG1 = MATRIX_A;
- //private static final int[] mag01=new int[] {0x0, MATRIX_A};
- /* mag01[x] = x * MATRIX_A for x=0,1 */
-
- private static final int DEFAULT_SEED = 4357;
-
- private int mti;
- private final int[] mt = new int[N]; /* set initial seeds: N = 624 words */
-
- /**
- * Constructs and returns a random number generator with a default seed, which is a <b>constant</b>. Thus using this
- * constructor will yield generators that always produce exactly the same sequence. This method is mainly intended to
- * ease testing and debugging.
- */
- public MersenneTwister() {
- this(DEFAULT_SEED);
- }
-
- /** Constructs and returns a random number generator with the given seed.
- * @param seed A number that is used to initialize the internal state of the generator.
- */
- public MersenneTwister(int seed) {
- setSeed(seed);
- }
-
- /**
- * Constructs and returns a random number generator seeded with the given date.
- *
- * @param d typically <tt>new Date()</tt>
- */
- public MersenneTwister(Date d) {
- this((int) d.getTime());
- }
-
- /** Generates N words at one time */
- void nextBlock() {
- int y;
- int kk;
-
- for (kk = 0; kk < N - M; kk++) {
- y = (mt[kk] & UPPER_MASK) | (mt[kk + 1] & LOWER_MASK);
- mt[kk] = mt[kk + M] ^ (y >>> 1) ^ ((y & 0x1) == 0 ? MAG0 : MAG1);
- }
- for (; kk < N - 1; kk++) {
- y = (mt[kk] & UPPER_MASK) | (mt[kk + 1] & LOWER_MASK);
- mt[kk] = mt[kk + (M - N)] ^ (y >>> 1) ^ ((y & 0x1) == 0 ? MAG0 : MAG1);
- }
- y = (mt[N - 1] & UPPER_MASK) | (mt[0] & LOWER_MASK);
- mt[N - 1] = mt[M - 1] ^ (y >>> 1) ^ ((y & 0x1) == 0 ? MAG0 : MAG1);
-
- this.mti = 0;
- }
-
- /**
- * Returns a 32 bit uniformly distributed random number in the closed interval
- * <tt>[Integer.MIN_VALUE,Integer.MAX_VALUE]</tt>
- * (including <tt>Integer.MIN_VALUE</tt> and <tt>Integer.MAX_VALUE</tt>).
- */
- @Override
- public int nextInt() {
- /* Each single bit including the sign bit will be random */
- if (mti == N) {
- nextBlock();
- } // generate N ints at one time
-
- int y = mt[mti++];
- y ^= y >>> 11; // y ^= TEMPERING_SHIFT_U(y );
- y ^= (y << 7) & TEMPERING_MASK_B; // y ^= TEMPERING_SHIFT_S(y) & TEMPERING_MASK_B;
- y ^= (y << 15) & TEMPERING_MASK_C; // y ^= TEMPERING_SHIFT_T(y) & TEMPERING_MASK_C;
- // y &= 0xffffffff; //you may delete this line if word size = 32
- y ^= y >>> 18; // y ^= TEMPERING_SHIFT_L(y);
-
- return y;
- }
-
- /** Sets the receiver's seed. This method resets the receiver's entire internal state.
- * @param seed An integer that is used to reset the internal state of the generator */
- void setSeed(int seed) {
- mt[0] = seed;
- for (int i = 1; i < N; i++) {
- mt[i] = 1812433253 * (mt[i - 1] ^ (mt[i - 1] >>> 30)) + i; // unsigned shift, matching the reference C code
- /* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. */
- /* In the previous versions, MSBs of the seed affect */
- /* only MSBs of the array mt[]. */
- /* 2002/01/09 modified by Makoto Matsumoto */
- //mt[i] &= 0xffffffff;
- /* for >32 bit machines */
- }
- //log.info("init done");
- mti = N;
- }
-
- /**
- * Sets the receiver's seed in a fashion compatible with the
- * reference C implementation. See
- * http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/VERSIONS/C-LANG/980409/mt19937int.c
- *
- * This method isn't as good as the default method due to poor distribution of the
- * resulting states.
- *
- * @param seed An integer that is used to reset the internal state in the same way as
- * done in the 1999 reference implementation. Should only be used for testing, not
- * actual coding.
- */
- void setReferenceSeed(int seed) {
- for (int i = 0; i < N; i++) {
- mt[i] = seed & 0xffff0000;
- seed = 69069 * seed + 1;
- mt[i] |= (seed & 0xffff0000) >>> 16;
- seed = 69069 * seed + 1;
- }
- //log.info("init done");
- mti = N;
- }
-}
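(Following the construction-cost advice in the class comment above: a minimal usage sketch that seeds once and reuses the instance. It relies only on the public surface shown in this file, the int-seed constructor and nextInt(); the harness itself is hypothetical.)

    // Seed once; the 624-word state array is then filled a single time.
    MersenneTwister twister = new MersenneTwister(4357); // fixed seed => reproducible stream
    long mix = 0;
    for (int i = 0; i < 100000; i++) {
      mix ^= twister.nextInt(); // cheap per call; constructing a fresh generator here would be wasteful
    }
    System.out.println(Long.toHexString(mix));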

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/jet/random/engine/RandomEngine.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/jet/random/engine/RandomEngine.java b/math/src/main/java/org/apache/mahout/math/jet/random/engine/RandomEngine.java
deleted file mode 100644
index f832b1d..0000000
--- a/math/src/main/java/org/apache/mahout/math/jet/random/engine/RandomEngine.java
+++ /dev/null
@@ -1,169 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-/*
-Copyright © 1999 CERN - European Organization for Nuclear Research.
-Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
-is hereby granted without fee, provided that the above copyright notice appear in all copies and
-that both that copyright notice and this permission notice appear in supporting documentation.
-CERN makes no representations about the suitability of this software for any purpose.
-It is provided "as is" without expressed or implied warranty.
-*/
-package org.apache.mahout.math.jet.random.engine;
-
-import org.apache.mahout.math.function.DoubleFunction;
-import org.apache.mahout.math.function.IntFunction;
-
-/**
- * Abstract base class for uniform pseudo-random number generating engines.
- * <p>
- * Most probability distributions are obtained by using a <b>uniform</b> pseudo-random number generation engine
- * followed by a transformation to the desired distribution.
- * Thus, subclasses of this class are at the core of computational statistics, simulations, Monte Carlo methods, etc.
- * <p>
- * Subclasses produce uniformly distributed <tt>int</tt>'s and <tt>long</tt>'s in the closed intervals
- * <tt>[Integer.MIN_VALUE,Integer.MAX_VALUE]</tt> and <tt>[Long.MIN_VALUE,Long.MAX_VALUE]</tt>, respectively,
- * as well as <tt>float</tt>'s and <tt>double</tt>'s in the open unit intervals <tt>(0.0f,1.0f)</tt> and
- * <tt>(0.0,1.0)</tt>, respectively.
- * <p>
- * Subclasses need to override one single method only: <tt>nextInt()</tt>.
- * All other methods generating different data types or ranges are usually layered upon <tt>nextInt()</tt>.
- * <tt>long</tt>'s are formed by concatenating two 32 bit <tt>int</tt>'s.
- * <tt>float</tt>'s are formed by dividing the interval <tt>[0.0f,1.0f]</tt> into 2<sup>32</sup> subintervals,
- * then randomly choosing one subinterval.
- * <tt>double</tt>'s are formed by dividing the interval <tt>[0.0,1.0]</tt> into 2<sup>64</sup> subintervals,
- * then randomly choosing one subinterval.
- * <p>
- * Note that this implementation is <b>not synchronized</b>.
- *
- * @see MersenneTwister
- * @see java.util.Random
- */
-public abstract class RandomEngine extends DoubleFunction implements IntFunction {
-
- /**
- * Equivalent to <tt>raw()</tt>. This has the effect that random engines can now be used as function objects,
- * returning a random number upon function evaluation.
- */
- @Override
- public double apply(double dummy) {
- return raw();
- }
-
- /**
- * Equivalent to <tt>nextInt()</tt>. This has the effect that random engines can now be used as function objects,
- * returning a random number upon function evaluation.
- */
- @Override
- public int apply(int dummy) {
- return nextInt();
- }
-
- /**
- * @return a 64 bit uniformly distributed random number in the open unit interval {@code (0.0,1.0)} (excluding
- * 0.0 and 1.0).
- */
- public double nextDouble() {
- double nextDouble;
-
- do {
- // -9.223372036854776E18 == (double) Long.MIN_VALUE
- // 5.421010862427522E-20 == 1 / Math.pow(2,64) == 1 / ((double) Long.MAX_VALUE - (double) Long.MIN_VALUE);
- nextDouble = (nextLong() - -9.223372036854776E18) * 5.421010862427522E-20;
- }
- // catch loss of precision of long --> double conversion
- while (!(nextDouble > 0.0 && nextDouble < 1.0));
-
- // --> in (0.0,1.0)
- return nextDouble;
-
- /*
- nextLong == Long.MAX_VALUE --> 1.0
- nextLong == Long.MIN_VALUE --> 0.0
- nextLong == Long.MAX_VALUE-1 --> 1.0
- nextLong == Long.MAX_VALUE-100000L --> 0.9999999999999946
- nextLong == Long.MIN_VALUE+1 --> 0.0
- nextLong == Long.MIN_VALUE-100000L --> 0.9999999999999946
- nextLong == 1L --> 0.5
- nextLong == -1L --> 0.5
- nextLong == 2L --> 0.5
- nextLong == -2L --> 0.5
- nextLong == 2L+100000L --> 0.5000000000000054
- nextLong == -2L-100000L --> 0.49999999999999456
- */
- }
-
- /**
- * @return a 32 bit uniformly distributed random number in the open unit interval {@code (0.0f, 1.0f)} (excluding
- * 0.0f and 1.0f).
- */
- public float nextFloat() {
- // catch loss of precision of double --> float conversion which could result in a value == 1.0F
- float nextFloat;
- do {
- nextFloat = (float) raw();
- }
- while (nextFloat >= 1.0f);
-
- // --> in (0.0f,1.0f), since raw() is strictly positive and values rounding to 1.0f are rejected
- return nextFloat;
- }
-
- /**
- * @return a 32 bit uniformly distributed random number in the closed interval
- * <tt>[Integer.MIN_VALUE,Integer.MAX_VALUE]</tt>
- * (including <tt>Integer.MIN_VALUE</tt> and <tt>Integer.MAX_VALUE</tt>);
- */
- public abstract int nextInt();
-
- /**
- * @return a 64 bit uniformly distributed random number in the closed interval
- * <tt>[Long.MIN_VALUE,Long.MAX_VALUE]</tt>
- * (including <tt>Long.MIN_VALUE</tt> and <tt>Long.MAX_VALUE</tt>).
- */
- public long nextLong() {
- // concatenate two 32-bit strings into one 64-bit string
- return ((nextInt() & 0xFFFFFFFFL) << 32) | (nextInt() & 0xFFFFFFFFL);
- }
-
- /**
- * @return a 32 bit uniformly distributed random number in the open unit interval {@code (0.0, 1.0)} (excluding
- * 0.0 and 1.0).
- */
- public double raw() {
- int nextInt;
- do { // accept anything but zero
- nextInt = nextInt(); // in [Integer.MIN_VALUE,Integer.MAX_VALUE]-interval
- } while (nextInt == 0);
-
- // transform to (0.0,1.0)-interval
- // 2.3283064365386963E-10 == 1.0 / Math.pow(2,32)
- return (nextInt & 0xFFFFFFFFL) * 2.3283064365386963E-10;
-
- /*
- nextInt == Integer.MAX_VALUE --> 0.49999999976716936
- nextInt == Integer.MIN_VALUE --> 0.5
- nextInt == Integer.MAX_VALUE-1 --> 0.4999999995343387
- nextInt == Integer.MIN_VALUE+1 --> 0.5000000002328306
- nextInt == 1 --> 2.3283064365386963E-10
- nextInt == -1 --> 0.9999999997671694
- nextInt == 2 --> 4.6566128730773926E-10
- nextInt == -2 --> 0.9999999995343387
- */
- }
-}
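(As the class comment says, a subclass only has to supply nextInt(); raw(), nextDouble(), nextFloat() and nextLong() all layer on it. A minimal sketch of that contract, wrapping java.util.Random purely for illustration; this is not a generator anyone should prefer over MersenneTwister.)

    import java.util.Random;

    /** Illustrative engine: one override, everything else inherited. */
    public final class JavaUtilEngine extends RandomEngine {
      private final Random delegate = new Random(4357);

      @Override
      public int nextInt() {
        // Uniform over [Integer.MIN_VALUE, Integer.MAX_VALUE], as the contract requires.
        return delegate.nextInt();
      }
    }

    // Usage: every derived stream comes from the single nextInt() override.
    //   RandomEngine engine = new JavaUtilEngine();
    //   double u = engine.raw();      // open interval (0.0, 1.0)
    //   long l = engine.nextLong();   // two ints concatenated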

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/jet/random/engine/package-info.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/jet/random/engine/package-info.java b/math/src/main/java/org/apache/mahout/math/jet/random/engine/package-info.java
deleted file mode 100644
index e092010..0000000
--- a/math/src/main/java/org/apache/mahout/math/jet/random/engine/package-info.java
+++ /dev/null
@@ -1,7 +0,0 @@
-/**
- * Engines generating strong uniformly distributed pseudo-random numbers,
- * needed by all JET probability distributions, since they rely on uniform random numbers
- * to generate deviates from their own distributions.
- * Thus, the classes of this package are at the core of computational statistics, simulations, Monte Carlo methods, etc.
- */
-package org.apache.mahout.math.jet.random.engine;
r***@apache.org
2018-06-27 14:51:34 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/function/Functions.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/function/Functions.java b/math/src/main/java/org/apache/mahout/math/function/Functions.java
deleted file mode 100644
index f08c328..0000000
--- a/math/src/main/java/org/apache/mahout/math/function/Functions.java
+++ /dev/null
@@ -1,1730 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
-Copyright 1999 CERN - European Organization for Nuclear Research.
-Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
-is hereby granted without fee, provided that the above copyright notice appear in all copies and
-that both that copyright notice and this permission notice appear in supporting documentation.
-CERN makes no representations about the suitability of this software for any purpose.
-It is provided "as is" without expressed or implied warranty.
-*/
-
-package org.apache.mahout.math.function;
-
-
-import com.google.common.base.Preconditions;
-import org.apache.mahout.math.jet.random.engine.MersenneTwister;
-
-import java.util.Date;
-
-
-/**
- * Function objects to be passed to generic methods. Contains the functions of {@link java.lang.Math} as function
- * objects, as well as a few more basic functions. <p>Function objects make it convenient to express arbitrary functions
- * in a generic manner. Essentially, a function object is an object that can perform a function on some arguments. It
- * has a minimal interface: a method <tt>apply</tt> that takes the arguments, computes something and returns some result
- * value. Function objects are comparable to function pointers in C used for call-backs. <p>Unary functions are of type
- * {@link org.apache.mahout.math.function.DoubleFunction}, binary functions of type {@link
- * org.apache.mahout.math.function.DoubleDoubleFunction}. All can be retrieved via <tt>public static final</tt>
- * variables named after the function. Unary predicates are of type
- * {@link DoubleProcedure},
- * binary predicates of type {@link org.apache.mahout.math.function.DoubleDoubleProcedure}. All can be retrieved via
- * <tt>public static final</tt> variables named <tt>isXXX</tt>.
- *
- * <p> Binary functions and predicates also exist as unary functions with the second argument being fixed to a constant.
- * These are generated and retrieved via factory methods (again with the same name as the function). Example: <ul>
- * <li><tt>Functions.pow</tt> gives the function <tt>a<sup>b</sup></tt>. <li><tt>Functions.pow.apply(2,3)==8</tt>.
- * <li><tt>Functions.pow(3)</tt> gives the function <tt>a<sup>3</sup></tt>. <li><tt>Functions.pow(3).apply(2)==8</tt>.
- * </ul> More generally, any binary function can be made a unary function by fixing either the first or the second
- * argument. See methods {@link #bindArg1(org.apache.mahout.math.function.DoubleDoubleFunction ,double)} and {@link
- * #bindArg2(org.apache.mahout.math.function.DoubleDoubleFunction ,double)}. The order of arguments can
- * be swapped so that the first argument becomes the
- * second and vice-versa. See method {@link #swapArgs(org.apache.mahout.math.function.DoubleDoubleFunction)}.
- * Example: <ul> <li><tt>Functions.pow</tt>
- * gives the function <tt>a<sup>b</sup></tt>. <li><tt>Functions.bindArg2(Functions.pow,3)</tt> gives the function
- * <tt>x<sup>3</sup></tt>. <li><tt>Functions.bindArg1(Functions.pow,3)</tt> gives the function <tt>3<sup>x</sup></tt>.
- * <li><tt>Functions.swapArgs(Functions.pow)</tt> gives the function <tt>b<sup>a</sup></tt>. </ul> <p> Even more
- * generally, functions can be chained (composed, assembled). Assume we have two unary functions <tt>g</tt> and
- * <tt>h</tt>. The unary function <tt>g(h(a))</tt> applying both in sequence can be generated via {@link
- * #chain(org.apache.mahout.math.function.DoubleFunction , org.apache.mahout.math.function.DoubleFunction)}:
- * <ul> <li><tt>Functions.chain(g,h);</tt> </ul> Assume further we have a binary
- * function <tt>f</tt>. The binary function <tt>g(f(a,b))</tt> can be generated via {@link
- * #chain(org.apache.mahout.math.function.DoubleFunction , org.apache.mahout.math.function.DoubleDoubleFunction)}:
- * <ul> <li><tt>Functions.chain(g,f);</tt> </ul> The binary function
- * <tt>f(g(a),h(b))</tt> can be generated via
- * {@link #chain(org.apache.mahout.math.function.DoubleDoubleFunction , org.apache.mahout.math.function.DoubleFunction ,
- * org.apache.mahout.math.function.DoubleFunction)}: <ul>
- * <li><tt>Functions.chain(f,g,h);</tt> </ul> Arbitrarily complex functions can be composed from these building blocks.
- * For example <tt>sin(a) + cos<sup>2</sup>(b)</tt> can be specified as follows: <ul>
- * <li><tt>chain(plus,sin,chain(square,cos));</tt> </ul> or, of course, as
- * <pre>
- * new DoubleDoubleFunction() {
- * &nbsp;&nbsp;&nbsp;public final double apply(double a, double b) { return Math.sin(a) + Math.pow(Math.cos(b),2); }
- * }
- * </pre>
- * <p> For a concrete comparison of the two styles, try this: <table> <tr><td class="PRE">
- * <pre>
- * // should yield 1.4399560356056456 in all cases
- * double a = 0.5;
- * double b = 0.2;
- * double v = Math.sin(a) + Math.pow(Math.cos(b),2);
- * log.info(v);
- * Functions F = Functions.functions;
- * DoubleDoubleFunction f = F.chain(F.plus,F.sin,F.chain(F.square,F.cos));
- * log.info(f.apply(a,b));
- * DoubleDoubleFunction g = new DoubleDoubleFunction() {
- * &nbsp;&nbsp;&nbsp;public double apply(double a, double b) { return Math.sin(a) + Math.pow(Math.cos(b),2); }
- * };
- * log.info(g.apply(a,b));
- * </pre>
- * </td></tr></table>
- *
- * <p> <H3>Performance</H3>
- *
- * Surprisingly, with modern non-adaptive JITs such as SunJDK 1.2.2 (java -classic) there seems to be no or only a moderate
- * performance penalty in using function objects in a loop compared with traditional code in a loop. For complex nested function
- * objects (e.g. <tt>F.chain(F.abs,F.chain(F.plus,F.sin,F.chain(F.square,F.cos)))</tt>) the penalty is zero; for trivial
- * functions (e.g. <tt>F.plus</tt>) the penalty is often acceptable. <center> <table border cellpadding="3"
- * cellspacing="0" align="center">
- * <tr valign="middle" bgcolor="#33CC66" align="center"> <td nowrap colspan="7">
- * <font size="+2">Iteration Performance [million function evaluations per second]</font><br> <font size="-1">Pentium
- * Pro 200 MHz, SunJDK 1.2.2, NT, java -classic, </font></td> </tr>
- * <tr valign="middle" bgcolor="#66CCFF" align="center"> <td nowrap bgcolor="#FF9966" rowspan="2">&nbsp;</td> <td bgcolor="#FF9966" colspan="2"> <p> 30000000
- * iterations</p> </td> <td bgcolor="#FF9966" colspan="2"> 3000000 iterations (10 times less)</td> <td bgcolor="#FF9966"
- * colspan="2">&nbsp;</td> </tr>
- * <tr valign="middle" bgcolor="#66CCFF" align="center"> <td nowrap bgcolor="#FF9966">
- * <tt>F.plus</tt></td> <td bgcolor="#FF9966"><tt>a+b</tt></td> <td bgcolor="#FF9966">
- * <tt>F.chain(F.abs,F.chain(F.plus,F.sin,F.chain(F.square,F.cos)))</tt></td> <td bgcolor="#FF9966">
- * <tt>Math.abs(Math.sin(a) + Math.pow(Math.cos(b),2))</tt></td> <td bgcolor="#FF9966">&nbsp;</td> <td
- * bgcolor="#FF9966">&nbsp;</td> </tr>
- * <tr valign="middle" bgcolor="#66CCFF" align="center"> <td nowrap
- * bgcolor="#FF9966">&nbsp;</td> <td nowrap>10.8</td> <td nowrap>29.6</td> <td nowrap>0.43</td> <td nowrap>0.35</td> <td
- * nowrap>&nbsp;</td> <td nowrap>&nbsp;</td> </tr>
- * </table></center>
- */
-public final class Functions {
-
- /*
- * <H3>Unary functions</H3>
- */
- /** Function that returns <tt>Math.abs(a)</tt>. */
- public static final DoubleFunction ABS = new DoubleFunction() {
- @Override
- public double apply(double a) {
- return Math.abs(a);
- }
- };
-
- /** Function that returns <tt>Math.acos(a)</tt>. */
- public static final DoubleFunction ACOS = new DoubleFunction() {
- @Override
- public double apply(double a) {
- return Math.acos(a);
- }
- };
-
- /** Function that returns <tt>Math.asin(a)</tt>. */
- public static final DoubleFunction ASIN = new DoubleFunction() {
- @Override
- public double apply(double a) {
- return Math.asin(a);
- }
- };
-
- /** Function that returns <tt>Math.atan(a)</tt>. */
- public static final DoubleFunction ATAN = new DoubleFunction() {
- @Override
- public double apply(double a) {
- return Math.atan(a);
- }
- };
-
- /** Function that returns <tt>Math.ceil(a)</tt>. */
- public static final DoubleFunction CEIL = new DoubleFunction() {
-
- @Override
- public double apply(double a) {
- return Math.ceil(a);
- }
- };
-
- /** Function that returns <tt>Math.cos(a)</tt>. */
- public static final DoubleFunction COS = new DoubleFunction() {
-
- @Override
- public double apply(double a) {
- return Math.cos(a);
- }
- };
-
- /** Function that returns <tt>Math.exp(a)</tt>. */
- public static final DoubleFunction EXP = new DoubleFunction() {
-
- @Override
- public double apply(double a) {
- return Math.exp(a);
- }
- };
-
- /** Function that returns <tt>Math.floor(a)</tt>. */
- public static final DoubleFunction FLOOR = new DoubleFunction() {
-
- @Override
- public double apply(double a) {
- return Math.floor(a);
- }
- };
-
- /** Function that returns its argument. */
- public static final DoubleFunction IDENTITY = new DoubleFunction() {
-
- @Override
- public double apply(double a) {
- return a;
- }
- };
-
- /** Function that returns <tt>1.0 / a</tt>. */
- public static final DoubleFunction INV = new DoubleFunction() {
-
- @Override
- public double apply(double a) {
- return 1.0 / a;
- }
- };
-
- /** Function that returns <tt>Math.log(a)</tt>. */
- public static final DoubleFunction LOGARITHM = new DoubleFunction() {
-
- @Override
- public double apply(double a) {
- return Math.log(a);
- }
- };
-
- /** Function that returns <tt>Math.log(a) / Math.log(2)</tt>. */
- public static final DoubleFunction LOG2 = new DoubleFunction() {
-
- @Override
- public double apply(double a) {
- return Math.log(a) * 1.4426950408889634;
- }
- };
-
- /** Function that returns <tt>-a</tt>. */
- public static final DoubleFunction NEGATE = new DoubleFunction() {
-
- @Override
- public double apply(double a) {
- return -a;
- }
- };
-
- /** Function that returns <tt>Math.rint(a)</tt>. */
- public static final DoubleFunction RINT = new DoubleFunction() {
-
- @Override
- public double apply(double a) {
- return Math.rint(a);
- }
- };
-
- /**
- * Function that returns {@code a < 0 ? -1 : a > 0 ? 1 : 0}.
- */
- public static final DoubleFunction SIGN = new DoubleFunction() {
-
- @Override
- public double apply(double a) {
- return a < 0 ? -1 : a > 0 ? 1 : 0;
- }
- };
-
- /** Function that returns <tt>Math.sin(a)</tt>. */
- public static final DoubleFunction SIN = new DoubleFunction() {
-
- @Override
- public double apply(double a) {
- return Math.sin(a);
- }
- };
-
- /** Function that returns <tt>Math.sqrt(a)</tt>. */
- public static final DoubleFunction SQRT = new DoubleFunction() {
-
- @Override
- public double apply(double a) {
- return Math.sqrt(a);
- }
- };
-
- /** Function that returns <tt>a * a</tt>. */
- public static final DoubleFunction SQUARE = new DoubleFunction() {
-
- @Override
- public double apply(double a) {
- return a * a;
- }
- };
-
- /** Function that returns <tt>1 / (1 + exp(-a))</tt>. */
- public static final DoubleFunction SIGMOID = new DoubleFunction() {
- @Override
- public double apply(double a) {
- return 1.0 / (1.0 + Math.exp(-a));
- }
- };
-
- /** Function that returns <tt>a * (1 - a)</tt>, i.e. the sigmoid gradient when <tt>a</tt> is already a sigmoid output. */
- public static final DoubleFunction SIGMOIDGRADIENT = new DoubleFunction() {
- @Override
- public double apply(double a) {
- return a * (1.0 - a);
- }
- };
-
- /** Function that returns <tt>Math.tan(a)</tt>. */
- public static final DoubleFunction TAN = new DoubleFunction() {
-
- @Override
- public double apply(double a) {
- return Math.tan(a);
- }
- };
-
- /*
- * <H3>Binary functions</H3>
- */
-
- /** Function that returns <tt>Math.atan2(a,b)</tt>. */
- public static final DoubleDoubleFunction ATAN2 = new DoubleDoubleFunction() {
-
- @Override
- public double apply(double a, double b) {
- return Math.atan2(a, b);
- }
- };
-
- /**
- * Function that returns <tt>a &lt; b ? -1 : a &gt; b ? 1 : 0</tt>.
- */
- public static final DoubleDoubleFunction COMPARE = new DoubleDoubleFunction() {
-
- @Override
- public double apply(double a, double b) {
- return a < b ? -1 : a > b ? 1 : 0;
- }
- };
-
- /** Function that returns <tt>a / b</tt>. */
- public static final DoubleDoubleFunction DIV = new DoubleDoubleFunction() {
-
- @Override
- public double apply(double a, double b) {
- return a / b;
- }
-
- /**
- * x / 0 = infinity or undefined depending on x
- * @return true iff f(x, 0) = x for any x
- */
- @Override
- public boolean isLikeRightPlus() {
- return false;
- }
-
- /**
- * 0 / y = 0 unless y = 0
- * @return true iff f(0, y) = 0 for any y
- */
- @Override
- public boolean isLikeLeftMult() {
- return false;
- }
-
- /**
- * x / 0 = infinity or undefined depending on x
- * @return true iff f(x, 0) = 0 for any x
- */
- @Override
- public boolean isLikeRightMult() {
- return false;
- }
-
- /**
- * x / y != y / x
- * @return true iff f(x, y) = f(y, x) for any x, y
- */
- @Override
- public boolean isCommutative() {
- return false;
- }
-
- /**
- * x / (y / z) = x * z / y
- * (x / y) / z = x / (y * z)
- * @return true iff f(x, f(y, z)) = f(f(x, y), z) for any x, y, z
- */
- @Override
- public boolean isAssociative() {
- return false;
- }
-
- };
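(The five predicates overridden above, isLikeRightPlus, isLikeLeftMult, isLikeRightMult, isCommutative and isAssociative, exist so vector code can skip work; for instance, when f(0, y) = 0 an operation can iterate only the non-zeros of the left operand. A small hedged sanity check of what DIV's flags assert, assuming this Functions class is on the classpath; the printed values are standard IEEE double behaviour.)

    DoubleDoubleFunction div = Functions.DIV;
    System.out.println(div.apply(6, 3));     // 2.0
    System.out.println(div.apply(6, 0));     // Infinity -> isLikeRightPlus() is false
    System.out.println(div.apply(0, 0));     // NaN      -> isLikeLeftMult() is false
    System.out.println(div.isCommutative()); // false: 6/3 != 3/6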
-
- /** Function that returns <tt>a == b ? 1 : 0</tt>. */
- public static final DoubleDoubleFunction EQUALS = new DoubleDoubleFunction() {
-
- @Override
- public double apply(double a, double b) {
- return a == b ? 1 : 0;
- }
-
- /**
- * x = y iff y = x
- * @return true iff f(x, y) = f(y, x) for any x, y
- */
- @Override
- public boolean isCommutative() {
- return true;
- }
- };
-
- /**
- * Function that returns <tt>a &gt; b ? 1 : 0</tt>.
- */
- public static final DoubleDoubleFunction GREATER = new DoubleDoubleFunction() {
-
- @Override
- public double apply(double a, double b) {
- return a > b ? 1 : 0;
- }
- };
-
- /** Function that returns <tt>Math.IEEEremainder(a,b)</tt>. */
- public static final DoubleDoubleFunction IEEE_REMAINDER = new DoubleDoubleFunction() {
-
- @Override
- public double apply(double a, double b) {
- return Math.IEEEremainder(a, b);
- }
- };
-
- /** Function that returns <tt>a == b</tt>. */
- public static final DoubleDoubleProcedure IS_EQUAL = new DoubleDoubleProcedure() {
-
- @Override
- public boolean apply(double a, double b) {
- return a == b;
- }
- };
-
- /**
- * Function that returns {@code a < b}.
- */
- public static final DoubleDoubleProcedure IS_LESS = new DoubleDoubleProcedure() {
-
- @Override
- public boolean apply(double a, double b) {
- return a < b;
- }
- };
-
- /**
- * Function that returns {@code a > b}.
- */
- public static final DoubleDoubleProcedure IS_GREATER = new DoubleDoubleProcedure() {
-
- @Override
- public boolean apply(double a, double b) {
- return a > b;
- }
- };
-
- /**
- * Function that returns <tt>a &lt; b ? 1 : 0</tt>.
- */
- public static final DoubleDoubleFunction LESS = new DoubleDoubleFunction() {
-
- @Override
- public double apply(double a, double b) {
- return a < b ? 1 : 0;
- }
- };
-
- /** Function that returns <tt>Math.log(a) / Math.log(b)</tt>. */
- public static final DoubleDoubleFunction LG = new DoubleDoubleFunction() {
-
- @Override
- public double apply(double a, double b) {
- return Math.log(a) / Math.log(b);
- }
- };
-
- /** Function that returns <tt>Math.max(a,b)</tt>. */
- public static final DoubleDoubleFunction MAX = new DoubleDoubleFunction() {
-
- @Override
- public double apply(double a, double b) {
- return Math.max(a, b);
- }
-
- /**
- * max(x, 0) = x or 0 depending on the sign of x
- * @return true iff f(x, 0) = x for any x
- */
- @Override
- public boolean isLikeRightPlus() {
- return false;
- }
-
- /**
- * max(0, y) = y or 0 depending on the sign of y
- * @return true iff f(0, y) = 0 for any y
- */
- @Override
- public boolean isLikeLeftMult() {
- return false;
- }
-
- /**
- * max(x, 0) = x or 0 depending on the sign of x
- * @return true iff f(x, 0) = 0 for any x
- */
- @Override
- public boolean isLikeRightMult() {
- return false;
- }
-
- /**
- * max(x, max(y, z)) = max(max(x, y), z)
- * @return true iff f(x, f(y, z)) = f(f(x, y), z) for any x, y, z
- */
- @Override
- public boolean isAssociative() {
- return true;
- }
-
- /**
- * max(x, y) = max(y, x)
- * @return true iff f(x, y) = f(y, x) for any x, y
- */
- @Override
- public boolean isCommutative() {
- return true;
- }
- };
-
- public static final DoubleDoubleFunction MAX_ABS = new DoubleDoubleFunction() {
-
- @Override
- public double apply(double a, double b) {
- return Math.max(Math.abs(a), Math.abs(b));
- }
-
- /**
- * max(|x|, 0) = |x|
- * @return true iff f(x, 0) = x for any x
- */
- @Override
- public boolean isLikeRightPlus() {
- return true;
- }
-
- /**
- * max(0, |y|) = |y|
- * @return true iff f(0, y) = 0 for any y
- */
- @Override
- public boolean isLikeLeftMult() {
- return false;
- }
-
- /**
- * max(|x|, 0) = |x|
- * @return true iff f(x, 0) = 0 for any x
- */
- @Override
- public boolean isLikeRightMult() {
- return false;
- }
-
- /**
- * max(|x|, max(|y|, |z|)) = max(max(|x|, |y|), |z|)
- * @return true iff f(x, f(y, z)) = f(f(x, y), z) for any x, y, z
- */
- @Override
- public boolean isAssociative() {
- return true;
- }
-
- /**
- * max(|x|, |y|) = max(|y|, |x|)
- * @return true iff f(x, y) = f(y, x) for any x, y
- */
- @Override
- public boolean isCommutative() {
- return true;
- }
- };
-
- /** Function that returns <tt>Math.min(a,b)</tt>. */
- public static final DoubleDoubleFunction MIN = new DoubleDoubleFunction() {
-
- @Override
- public double apply(double a, double b) {
- return Math.min(a, b);
- }
-
- /**
- * min(x, 0) = x or 0 depending on the sign of x
- * @return true iff f(x, 0) = x for any x
- */
- @Override
- public boolean isLikeRightPlus() {
- return false;
- }
-
- /**
- * min(0, y) = y or 0 depending on the sign of y
- * @return true iff f(0, y) = 0 for any y
- */
- @Override
- public boolean isLikeLeftMult() {
- return false;
- }
-
- /**
- * min(x, 0) = x or 0 depending on the sign of x
- * @return true iff f(x, 0) = 0 for any x
- */
- @Override
- public boolean isLikeRightMult() {
- return false;
- }
-
- /**
- * min(x, min(y, z)) = min(min(x, y), z)
- * @return true iff f(x, f(y, z)) = f(f(x, y), z) for any x, y, z
- */
- @Override
- public boolean isAssociative() {
- return true;
- }
-
- /**
- * min(x, y) = min(y, x)
- * @return true iff f(x, y) = f(y, x) for any x, y
- */
- @Override
- public boolean isCommutative() {
- return true;
- }
- };
-
- /** Function that returns <tt>a - b</tt>. */
- public static final DoubleDoubleFunction MINUS = plusMult(-1);
-
- public static final DoubleDoubleFunction MINUS_SQUARED = new DoubleDoubleFunction() {
- @Override
- public double apply(double x, double y) {
- return (x - y) * (x - y);
- }
-
- /**
- * (x - 0)^2 = x^2 != x
- * @return true iff f(x, 0) = x for any x
- */
- @Override
- public boolean isLikeRightPlus() {
- return false;
- }
-
- /**
- * (0 - y)^2 != 0
- * @return true iff f(0, y) = 0 for any y
- */
- @Override
- public boolean isLikeLeftMult() {
- return false;
- }
-
- /**
- * (x - 0)^2 = x^2 != 0 in general
- * @return true iff f(x, 0) = 0 for any x
- */
- @Override
- public boolean isLikeRightMult() {
- return false;
- }
-
- /**
- * (x - y)^2 = (y - x)^2
- * @return true iff f(x, y) = f(y, x) for any x, y
- */
- @Override
- public boolean isCommutative() {
- return true;
- }
-
- /**
- * (x - (y - z)^2)^2 != ((x - y)^2 - z)^2
- * @return true iff f(x, f(y, z)) = f(f(x, y), z) for any x, y, z
- */
- @Override
- public boolean isAssociative() {
- return false;
- }
- };
-
- /** Function that returns <tt>a % b</tt>. */
- public static final DoubleDoubleFunction MOD = new DoubleDoubleFunction() {
-
- @Override
- public double apply(double a, double b) {
- return a % b;
- }
- };
-
- /** Function that returns <tt>a * b</tt>. */
- public static final DoubleDoubleFunction MULT = new TimesFunction();
-
- /** Function that returns <tt>a + b</tt>. */
- public static final DoubleDoubleFunction PLUS = plusMult(1);
-
- /** Function that returns <tt>Math.abs(a) + Math.abs(b)</tt>. */
- public static final DoubleDoubleFunction PLUS_ABS = new DoubleDoubleFunction() {
-
- @Override
- public double apply(double a, double b) {
- return Math.abs(a) + Math.abs(b);
- }
-
- /**
- * abs(x) + abs(0) = abs(x) != x
- * @return true iff f(x, 0) = x for any x
- */
- @Override
- public boolean isLikeRightPlus() {
- return false;
- }
-
- /**
- * abs(0) + abs(y) = abs(y) != 0 unless y = 0
- * @return true iff f(0, y) = 0 for any y
- */
- @Override
- public boolean isLikeLeftMult() {
- return false;
- }
-
- /**
- * abs(x) + abs(0) = abs(x) != 0 unless x = 0
- * @return true iff f(x, 0) = 0 for any x
- */
- @Override
- public boolean isLikeRightMult() {
- return false;
- }
-
- /**
- * abs(x) + abs(abs(y) + abs(z)) = abs(x) + abs(y) + abs(z)
- * abs(abs(x) + abs(y)) + abs(z) = abs(x) + abs(y) + abs(z)
- * @return true iff f(x, f(y, z)) = f(f(x, y), z) for any x, y, z
- */
- @Override
- public boolean isAssociative() {
- return true;
- }
-
- /**
- * abs(x) + abs(y) = abs(y) + abs(x)
- * @return true iff f(x, y) = f(y, x) for any x, y
- */
- @Override
- public boolean isCommutative() {
- return true;
- }
- };
-
- public static final DoubleDoubleFunction MINUS_ABS = new DoubleDoubleFunction() {
- @Override
- public double apply(double x, double y) {
- return Math.abs(x - y);
- }
-
- /**
- * |x - 0| = |x|
- * @return true iff f(x, 0) = x for any x
- */
- @Override
- public boolean isLikeRightPlus() {
- return false;
- }
-
- /**
- * |0 - y| = |y|
- * @return true iff f(0, y) = 0 for any y
- */
- @Override
- public boolean isLikeLeftMult() {
- return false;
- }
-
- /**
- * |x - 0| = |x|
- * @return true iff f(x, 0) = 0 for any x
- */
- @Override
- public boolean isLikeRightMult() {
- return false;
- }
-
- /**
- * |x - y| = |y - x|
- * @return true iff f(x, y) = f(y, x) for any x, y
- */
- @Override
- public boolean isCommutative() {
- return true;
- }
-
- /**
- * |x - |y - z|| != ||x - y| - z| (|5 - |4 - 3|| = 4; ||5 - 4| - 3| = |1 - 3| = 2)
- * @return true iff f(x, f(y, z)) = f(f(x, y), z) for any x, y, z
- */
- @Override
- public boolean isAssociative() {
- return false;
- }
- };
-
- /** Function that returns <tt>Math.pow(a,b)</tt>. */
- public static final DoubleDoubleFunction POW = new DoubleDoubleFunction() {
-
- @Override
- public double apply(double a, double b) {
- return Math.pow(a, b);
- }
-
- /**
- * x^0 = 1 != x in general (Math.pow(0, 0) is 1 as well)
- * @return true iff f(x, 0) = x for any x
- */
- @Override
- public boolean isLikeRightPlus() {
- return false;
- }
-
- /**
- * 0^y = 0 for positive y, but 0^0 = 1 and 0^y = Infinity for negative y
- * @return true iff f(0, y) = 0 for any y
- */
- @Override
- public boolean isLikeLeftMult() {
- return false;
- }
-
- /**
- * x^0 = 1 for any x (even x = 0)
- * @return true iff f(x, 0) = 0 for any x
- */
- @Override
- public boolean isLikeRightMult() {
- return false;
- }
-
- /**
- * x^y != y^x (2^3 != 3^2)
- * @return true iff f(x, y) = f(y, x) for any x, y
- */
- @Override
- public boolean isCommutative() {
- return false;
- }
-
- /**
- * x^(y^z) != (x^y)^z ((2^3)^4 = 8^4 = 2^12 != 2^(3^4) = 2^81)
- * @return true iff f(x, f(y, z)) = f(f(x, y), z) for any x, y, z
- */
- @Override
- public boolean isAssociative() {
- return false;
- }
- };
-
- public static final DoubleDoubleFunction SECOND = new DoubleDoubleFunction() {
- @Override
- public double apply(double x, double y) {
- return y;
- }
-
- /**
- * f(x, 0) = 0 != x in general
- * @return true iff f(x, 0) = x for any x
- */
- @Override
- public boolean isLikeRightPlus() {
- return false;
- }
-
- /**
- * f(0, y) = y for any y
- * @return true iff f(0, y) = 0 for any y
- */
- @Override
- public boolean isLikeLeftMult() {
- return false;
- }
-
- /**
- * f(x, 0) = 0 for any x
- * @return true iff f(x, 0) = 0 for any x
- */
- @Override
- public boolean isLikeRightMult() {
- return true;
- }
-
- /**
- * f(x, y) = y != x = f(y, x) for any x, y unless x = y
- * @return true iff f(x, y) = f(y, x) for any x, y
- */
- @Override
- public boolean isCommutative() {
- return false;
- }
-
- /**
- * f(x, f(y, z)) = f(x, z) = z
- * f(f(x, y), z) = z
- * @return true iff f(x, f(y, z)) = f(f(x, y), z) for any x, y, z
- */
- @Override
- public boolean isAssociative() {
- return true;
- }
- };
-
- /**
- * This function is specifically designed to be used when assigning a vector to one that is all zeros (created
- * by like()). It enables iteration only through the nonzeros of the right hand side by declaring isLikeRightPlus
- * to be true. This is NOT generally true for SECOND (hence the other function above).
- */
- public static final DoubleDoubleFunction SECOND_LEFT_ZERO = new DoubleDoubleFunction() {
- @Override
- public double apply(double x, double y) {
- Preconditions.checkArgument(x == 0, "This special version of SECOND needs x == 0");
- return y;
- }
-
- /**
- * f(x, 0) = 0 for any x; we're only assigning to left hand sides that are strictly 0
- * @return true iff f(x, 0) = x for any x
- */
- @Override
- public boolean isLikeRightPlus() {
- return true;
- }
-
- /**
- * f(0, y) = y for any y
- * @return true iff f(0, y) = 0 for any y
- */
- @Override
- public boolean isLikeLeftMult() {
- return false;
- }
-
- /**
- * f(x, 0) = 0 for any x
- * @return true iff f(x, 0) = 0 for any x
- */
- @Override
- public boolean isLikeRightMult() {
- return true;
- }
-
- /**
- * f(x, y) = y != x = f(y, x) for any x, y unless x = y
- * @return true iff f(x, y) = f(y, x) for any x, y
- */
- @Override
- public boolean isCommutative() {
- return false;
- }
-
- /**
- * f(x, f(y, z)) = f(x, z) = z
- * f(f(x, y), z) = z
- * @return true iff f(x, f(y, z)) = f(f(x, y), z) for any x, y, z
- */
- @Override
- public boolean isAssociative() {
- return true;
- }
- };
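(The comment above is the crux: when the destination was just created by like() and is therefore all zeros, declaring isLikeRightPlus lets an assign visit only the non-zeros of the right-hand side. A brief illustration, again assuming this class on the classpath.)

    DoubleDoubleFunction copy = Functions.SECOND_LEFT_ZERO;
    System.out.println(copy.apply(0, 7));       // 7.0: the right-hand value wins
    System.out.println(copy.isLikeRightPlus()); // true, because the left side is guaranteed 0
    // copy.apply(1, 7) would throw IllegalArgumentException: the left side must be 0.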
- public static final DoubleDoubleFunction MULT_SQUARE_LEFT = new DoubleDoubleFunction() {
- @Override
- public double apply(double x, double y) {
- return x * x * y;
- }
-
- /**
- * x * x * 0 = 0
- * @return true iff f(x, 0) = x for any x
- */
- @Override
- public boolean isLikeRightPlus() {
- return false;
- }
-
- /**
- * 0 * 0 * y = 0
- * @return true iff f(0, y) = 0 for any y
- */
- @Override
- public boolean isLikeLeftMult() {
- return true;
- }
-
- /**
- * x * x * 0 = 0
- * @return true iff f(x, 0) = 0 for any x
- */
- @Override
- public boolean isLikeRightMult() {
- return true;
- }
-
- /**
- * x * x * y != y * y * x
- * @return true iff f(x, y) = f(y, x) for any x, y
- */
- @Override
- public boolean isCommutative() {
- return false;
- }
-
- /**
- * f(x, f(y, z)) = x^2 * (y^2 * z) != (x^2 * y)^2 * z = f(f(x, y), z)
- * @return true iff f(x, f(y, z)) = f(f(x, y), z) for any x, y, z
- */
- @Override
- public boolean isAssociative() {
- return false;
- }
- };
-
- public static final DoubleDoubleFunction MULT_RIGHT_PLUS1 = new DoubleDoubleFunction() {
-
- /**
- * Apply the function to the arguments and return the result
- *
- * @param x a double for the first argument
- * @param y a double for the second argument
- * @return the result of applying the function
- */
- @Override
- public double apply(double x, double y) {
- return x * (y + 1);
- }
-
- /**
- * x * 1 = x
- * @return true iff f(x, 0) = x for any x
- */
- @Override
- public boolean isLikeRightPlus() {
- return true;
- }
-
- /**
- * 0 * y = 0
- * @return true iff f(0, y) = 0 for any y
- */
- @Override
- public boolean isLikeLeftMult() {
- return true;
- }
-
- /**
- * x * 1 = x != 0
- * @return true iff f(x, 0) = 0 for any x
- */
- @Override
- public boolean isLikeRightMult() {
- return false;
- }
-
- /**
- * x * (y + 1) != y * (x + 1)
- * @return true iff f(x, y) = f(y, x) for any x, y
- */
- @Override
- public boolean isCommutative() {
- return false;
- }
-
- /**
- * @return true iff f(x, f(y, z)) = f(f(x, y), z) for any x, y, z
- */
- @Override
- public boolean isAssociative() {
- return false;
- }
- };
-
- public static DoubleDoubleFunction reweigh(final double wx, final double wy) {
- final double tw = wx + wy;
- return new DoubleDoubleFunction() {
- @Override
- public double apply(double x, double y) {
- return (wx * x + wy * y) / tw;
- }
-
- /**
- * f(x, 0) = wx * x / tw = x iff wx = tw, i.e. iff wy = 0 (not the case in practical use)
- * @return true iff f(x, 0) = x for any x
- */
- @Override
- public boolean isLikeRightPlus() {
- return wx == tw;
- }
-
- /**
- * f(0, y) = wy * y / tw = 0 iff y = 0
- * @return true iff f(0, y) = 0 for any y
- */
- @Override
- public boolean isLikeLeftMult() {
- return false;
- }
-
- /**
- * f(x, 0) = wx * x / tw = 0 iff x = 0
- * @return true iff f(x, 0) = 0 for any x
- */
- @Override
- public boolean isLikeRightMult() {
- return false;
- }
-
- /**
- * wx * x + wy * y = wx * y + wy * x iff wx = wy
- * @return true iff f(x, y) = f(y, x) for any x, y
- */
- @Override
- public boolean isCommutative() {
- return wx == wy;
- }
-
- /**
- * @return true iff f(x, f(y, z)) = f(f(x, y), z) for any x, y, z
- */
- @Override
- public boolean isAssociative() {
- return false;
- }
- };
- }
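(Numerically, reweigh is just a weighted average with weights wx and wy; a quick illustrative reading:)

    DoubleDoubleFunction blend = Functions.reweigh(1, 3);
    System.out.println(blend.apply(0, 4)); // (1*0 + 3*4) / (1 + 3) = 3.0
    System.out.println(blend.apply(2, 2)); // equal inputs pass through: 2.0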
-
- private Functions() {
- }
-
- /**
- * Constructs a function that returns {@code (from<=a && a<=to) ? 1 : 0}.
- * <tt>a</tt> is a variable, <tt>from</tt> and <tt>to</tt> are fixed.
- */
- public static DoubleFunction between(final double from, final double to) {
- return new DoubleFunction() {
-
- @Override
- public double apply(double a) {
- return from <= a && a <= to ? 1 : 0;
- }
- };
- }
-
- /**
- * Constructs a unary function from a binary function with the first operand (argument) fixed to the given constant
- * <tt>c</tt>. The second operand is variable (free).
- *
- * @param function a binary function taking operands in the form <tt>function.apply(c,var)</tt>.
- * @return the unary function <tt>function(c,var)</tt>.
- */
- public static DoubleFunction bindArg1(final DoubleDoubleFunction function, final double c) {
- return new DoubleFunction() {
-
- @Override
- public double apply(double var) {
- return function.apply(c, var);
- }
- };
- }
-
- /**
- * Constructs a unary function from a binary function with the second operand (argument) fixed to the given constant
- * <tt>c</tt>. The first operand is variable (free).
- *
- * @param function a binary function taking operands in the form <tt>function.apply(var,c)</tt>.
- * @return the unary function <tt>function(var,c)</tt>.
- */
- public static DoubleFunction bindArg2(final DoubleDoubleFunction function, final double c) {
- return new DoubleFunction() {
-
- @Override
- public double apply(double var) {
- return function.apply(var, c);
- }
- };
- }
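(These two factories mirror the POW examples in the class comment; a minimal sketch using only members defined in this file:)

    DoubleFunction cube   = Functions.bindArg2(Functions.POW, 3); // x -> x^3
    DoubleFunction threeX = Functions.bindArg1(Functions.POW, 3); // x -> 3^x
    System.out.println(cube.apply(2));   // 8.0
    System.out.println(threeX.apply(2)); // 9.0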
-
- /**
- * Constructs the function <tt>f( g(a), h(b) )</tt>.
- *
- * @param f a binary function.
- * @param g a unary function.
- * @param h a unary function.
- * @return the binary function <tt>f( g(a), h(b) )</tt>.
- */
- public static DoubleDoubleFunction chain(final DoubleDoubleFunction f, final DoubleFunction g,
- final DoubleFunction h) {
- return new DoubleDoubleFunction() {
-
- @Override
- public double apply(double a, double b) {
- return f.apply(g.apply(a), h.apply(b));
- }
-
- /**
- * fc(x, 0) = f(g(x), h(0)) = f(g(x), 0) = g(x) = x if h(0) = 0 and f isLikeRightPlus and g(x) = x
- * Impossible to check whether g(x) = x for any x, so we return false.
- * @return true iff f(x, 0) = x for any x
- */
- @Override
- public boolean isLikeRightPlus() {
- return false;
- }
-
- /**
- * fc(0, y) = f(g(0), h(y)) = f(0, h(y)) = 0 if g(0) = 0 and f isLikeLeftMult
- * @return true iff f(0, y) = 0 for any y
- */
- @Override
- public boolean isLikeLeftMult() {
- return g.apply(0) == 0 && f.isLikeLeftMult();
- }
-
- /**
- * fc(x, 0) = f(g(x), h(0)) = f(g(x), 0) = 0 if h(0) = 0 and f isLikeRightMult
- * @return true iff f(x, 0) = 0 for any x
- */
- @Override
- public boolean isLikeRightMult() {
- return h.apply(0) == 0 && f.isLikeRightMult();
- }
-
- /**
- * fc(x, y) = f(g(x), h(y)) = f(h(y), g(x))
- * fc(y, x) = f(g(y), h(x)) = f(h(x), g(y))
- * Either g(x) = g(y) for any x, y and h(x) = h(y) for any x, y or g = h and f isCommutative.
- * Can only check if g = h (reference equality, assuming they're both the same static function in
- * this file) and f isCommutative. There are however other scenarios when this might happen that are NOT
- * covered by this definition.
- * @return true iff f(x, y) = f(y, x) for any x, y
- */
- @Override
- public boolean isCommutative() {
- return g.equals(h) && f.isCommutative();
- }
-
- /**
- * fc(x, fc(y, z)) = f(g(x), h(f(g(y), h(z))))
- * fc(fc(x, y), z) = f(g(f(g(x), h(y))), h(z))
- * Impossible to check.
- * @return true iff f(x, f(y, z)) = f(f(x, y), z) for any x, y, z
- */
- @Override
- public boolean isAssociative() {
- return false;
- }
- };
- }
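(Putting the chain overloads together, the sin(a) + cos^2(b) example from the class comment reads as follows with this file's static members:)

    DoubleDoubleFunction f =
        Functions.chain(Functions.PLUS, Functions.SIN, Functions.chain(Functions.SQUARE, Functions.COS));
    System.out.println(f.apply(0.5, 0.2)); // ~1.4399560356056456, matching the hand-written version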
-
- /**
- * Constructs the function <tt>g( h(a,b) )</tt>.
- *
- * @param g a unary function.
- * @param h a binary function.
- * @return the binary function <tt>g( h(a,b) )</tt>.
- */
- public static DoubleDoubleFunction chain(final DoubleFunction g, final DoubleDoubleFunction h) {
- return new DoubleDoubleFunction() {
-
- @Override
- public double apply(double a, double b) {
- return g.apply(h.apply(a, b));
- }
-
- /**
- * g(h(x, 0)) = g(x) = x for any x iff g(x) = x and h isLikeRightPlus
- * Impossible to check.
- * @return true iff f(x, 0) = x for any x
- */
- @Override
- public boolean isLikeRightPlus() {
- return false;
- }
-
- /**
- * g(h(0, y)) = g(0) = 0 for any y iff g(0) = 0 and h isLikeLeftMult
- * @return true iff f(0, y) = 0 for any y
- */
- @Override
- public boolean isLikeLeftMult() {
- return !g.isDensifying() && h.isLikeLeftMult();
- }
-
- /**
- * g(h(x, 0)) = g(0) = 0 for any x iff g(0) = 0 and h isLikeRightMult
- * @return true iff f(x, 0) = 0 for any x
- */
- @Override
- public boolean isLikeRightMult() {
- return !g.isDensifying() && h.isLikeRightMult();
- }
-
- /**
- * fc(x, y) = g(h(x, y)) = g(h(y, x)) = fc(y, x) iff h isCommutative
- * @return true iff f(x, y) = f(y, x) for any x, y
- */
- @Override
- public boolean isCommutative() {
- return h.isCommutative();
- }
-
- /**
- * fc(x, fc(y, z)) = g(h(x, g(h(y, z)))
- * fc(fc(x, y), z) = g(h(g(h(x, y)), z))
- * Impossible to check.
- * @return true iff f(x, f(y, z)) = f(f(x, y), z) for any x, y, z
- */
- @Override
- public boolean isAssociative() {
- return false;
- }
- };
- }
-
- /**
- * Constructs the function <tt>g( h(a) )</tt>.
- *
- * @param g a unary function.
- * @param h a unary function.
- * @return the unary function <tt>g( h(a) )</tt>.
- */
- public static DoubleFunction chain(final DoubleFunction g, final DoubleFunction h) {
- return new DoubleFunction() {
-
- @Override
- public double apply(double a) {
- return g.apply(h.apply(a));
- }
- };
- }
-
- /**
- * Constructs the function <tt>g( h(a) )</tt>.
- *
- * @param g a unary function.
- * @param h an {@link IntIntFunction}.
- * @return the unary function <tt>g( h(a) )</tt>.
- */
- public static IntIntFunction chain(final DoubleFunction g, final IntIntFunction h) {
- return new IntIntFunction() {
-
- @Override
- public double apply(int first, int second) {
- return g.apply(h.apply(first, second));
- }
- };
- }
-
-
- /**
- * Constructs a function that returns {@code a < b ? -1 : a > b ? 1 : 0}. <tt>a</tt> is a variable, <tt>b</tt> is
- * fixed.
- */
- public static DoubleFunction compare(final double b) {
- return new DoubleFunction() {
-
- @Override
- public double apply(double a) {
- return a < b ? -1 : a > b ? 1 : 0;
- }
- };
- }
-
- /** Constructs a function that returns the constant <tt>c</tt>. */
- public static DoubleFunction constant(final double c) {
- return new DoubleFunction() {
-
- @Override
- public double apply(double a) {
- return c;
- }
- };
- }
-
-
- /** Constructs a function that returns <tt>a / b</tt>. <tt>a</tt> is a variable, <tt>b</tt> is fixed. */
- public static DoubleFunction div(double b) {
- return mult(1 / b);
- }
-
- /** Constructs a function that returns <tt>a == b ? 1 : 0</tt>. <tt>a</tt> is a variable, <tt>b</tt> is fixed. */
- public static DoubleFunction equals(final double b) {
- return new DoubleFunction() {
-
- @Override
- public double apply(double a) {
- return a == b ? 1 : 0;
- }
- };
- }
-
- /** Constructs a function that returns <tt>a != b ? 1 : 0</tt>. <tt>a</tt> is a variable, <tt>b</tt> is fixed. */
- public static DoubleFunction notEqual(final double b) {
- return new DoubleFunction() {
-
- @Override
- public double apply(double a) {
- return a != b ? 1 : 0;
- }
- };
- }
-
- /**
- * Constructs a function that returns <tt>a &gt; b ? 1 : 0</tt>. <tt>a</tt>
- * is a variable, <tt>b</tt> is fixed.
- */
- public static DoubleFunction greater(final double b) {
- return new DoubleFunction() {
-
- @Override
- public double apply(double a) {
- return a > b ? 1 : 0;
- }
- };
- }
-
- /**
- * Constructs a function that returns <tt>Math.IEEEremainder(a,b)</tt>. <tt>a</tt> is a variable, <tt>b</tt> is
- * fixed.
- */
- public static DoubleFunction mathIEEEremainder(final double b) {
- return new DoubleFunction() {
-
- @Override
- public double apply(double a) {
- return Math.IEEEremainder(a, b);
- }
- };
- }
-
- /**
- * Constructs a function that returns {@code from<=a && a<=to}. <tt>a</tt>
- * is a variable, <tt>from</tt> and
- * <tt>to</tt> are fixed.
- *
- * Note that DoubleProcedure is generated code, so this reference looks invalid unless the
- * generated sources are visible.
- */
- public static DoubleProcedure isBetween(final double from, final double to) {
- return new DoubleProcedure() {
-
- @Override
- public boolean apply(double a) {
- return from <= a && a <= to;
- }
- };
- }
-
- /**
- * Constructs a function that returns <tt>a == b</tt>. <tt>a</tt> is a
- * variable, <tt>b</tt> is fixed.
- */
- public static DoubleProcedure isEqual(final double b) {
- return new DoubleProcedure() {
-
- @Override
- public boolean apply(double a) {
- return a == b;
- }
- };
- }
-
- /**
- * Constructs a function that returns <tt>a &gt; b</tt>. <tt>a</tt> is a
- * variable, <tt>b</tt> is fixed.
- */
- public static DoubleProcedure isGreater(final double b) {
- return new DoubleProcedure() {
-
- @Override
- public boolean apply(double a) {
- return a > b;
- }
- };
- }
-
- /**
- * Constructs a function that returns {@code a < b}. <tt>a</tt> is a
- * variable, <tt>b</tt> is fixed.
- */
- public static DoubleProcedure isLess(final double b) {
- return new DoubleProcedure() {
-
- @Override
- public boolean apply(double a) {
- return a < b;
- }
- };
- }
-
- /**
- * Constructs a function that returns <tt>a &lt; b ? 1 : 0</tt>. <tt>a</tt> is a
- * variable, <tt>b</tt> is fixed.
- */
- public static DoubleFunction less(final double b) {
- return new DoubleFunction() {
-
- @Override
- public double apply(double a) {
- return a < b ? 1 : 0;
- }
- };
- }
-
- /**
- * Constructs a function that returns <tt>Math.log(a) / Math.log(b)</tt>.
- * <tt>a</tt> is a variable, <tt>b</tt> is fixed.
- */
- public static DoubleFunction lg(final double b) {
- return new DoubleFunction() {
- private final double logInv = 1 / Math.log(b); // cached for speed
-
-
- @Override
- public double apply(double a) {
- return Math.log(a) * logInv;
- }
- };
- }
-
- /** Constructs a function that returns <tt>Math.max(a,b)</tt>. <tt>a</tt> is a variable, <tt>b</tt> is fixed. */
- public static DoubleFunction max(final double b) {
- return new DoubleFunction() {
-
- @Override
- public double apply(double a) {
- return Math.max(a, b);
- }
- };
- }
-
- /** Constructs a function that returns <tt>Math.min(a,b)</tt>. <tt>a</tt> is a variable, <tt>b</tt> is fixed. */
- public static DoubleFunction min(final double b) {
- return new DoubleFunction() {
-
- @Override
- public double apply(double a) {
- return Math.min(a, b);
- }
- };
- }
-
- /** Constructs a function that returns <tt>a - b</tt>. <tt>a</tt> is a variable, <tt>b</tt> is fixed. */
- public static DoubleFunction minus(double b) {
- return plus(-b);
- }
-
- /**
- * Constructs a function that returns <tt>a - b*constant</tt>. <tt>a</tt> and <tt>b</tt> are variables,
- * <tt>constant</tt> is fixed.
- */
- public static DoubleDoubleFunction minusMult(double constant) {
- return plusMult(-constant);
- }
-
- /** Constructs a function that returns <tt>a % b</tt>. <tt>a</tt> is a variable, <tt>b</tt> is fixed. */
- public static DoubleFunction mod(final double b) {
- return new DoubleFunction() {
-
- @Override
- public double apply(double a) {
- return a % b;
- }
- };
- }
-
- /** Constructs a function that returns <tt>a * b</tt>. <tt>a</tt> is a variable, <tt>b</tt> is fixed. */
- public static DoubleFunction mult(double b) {
- return new Mult(b);
- /*
- return new DoubleFunction() {
- public final double apply(double a) { return a * b; }
- };
- */
- }
-
- /** Constructs a function that returns <tt>a + b</tt>. <tt>a</tt> is a variable, <tt>b</tt> is fixed. */
- public static DoubleFunction plus(final double b) {
- return new DoubleFunction() {
-
- @Override
- public double apply(double a) {
- return a + b;
- }
- };
- }
-
- /**
- * Constructs a function that returns <tt>a + b*constant</tt>. <tt>a</tt> and <tt>b</tt> are variables,
- * <tt>constant</tt> is fixed.
- */
- public static DoubleDoubleFunction plusMult(double constant) {
- return new PlusMult(constant);
- }
-
- /** Constructs a function that returns <tt>Math.pow(a,b)</tt>. <tt>a</tt> is a variable, <tt>b</tt> is fixed. */
- public static DoubleFunction pow(final double b) {
- return new DoubleFunction() {
-
- @Override
- public double apply(double a) {
- if (b == 2) {
- return a * a;
- } else {
- return Math.pow(a, b);
- }
- }
- };
- }
-
- /**
- * Constructs a function that returns a new uniform random number in the open unit interval {@code (0.0,1.0)}
- * (excluding 0.0 and 1.0). Currently the engine is {@link MersenneTwister} and is
- * seeded with the current time. <p> Note that any random engine derived from {@link
- * org.apache.mahout.math.jet.random.engine.RandomEngine} and any random distribution derived from {@link
- * org.apache.mahout.math.jet.random.AbstractDistribution} are function objects, because they implement the proper
- * interfaces. Thus, if you are not happy with the default, just pass your favourite random generator to function
- * evaluating methods.
- */
- public static DoubleFunction random() {
- return new MersenneTwister(new Date());
- }
-
- /**
- * Constructs a function that returns the number rounded to the given precision;
- * <tt>Math.rint(a/precision)*precision</tt>. Examples:
- * {@code
- * precision = 0.01 rounds 0.012 --> 0.01, 0.018 --> 0.02
- * precision = 10 rounds 123 --> 120 , 127 --> 130
- * }
- */
- public static DoubleFunction round(final double precision) {
- return new DoubleFunction() {
- @Override
- public double apply(double a) {
- return Math.rint(a / precision) * precision;
- }
- };
- }
-
- /**
- * Constructs a function that returns <tt>function.apply(b,a)</tt>, i.e. applies the function with the first operand
- * as second operand and the second operand as first operand.
- *
- * @param function a function taking operands in the form <tt>function.apply(a,b)</tt>.
- * @return the binary function <tt>function(b,a)</tt>.
- */
- public static DoubleDoubleFunction swapArgs(final DoubleDoubleFunction function) {
- return new DoubleDoubleFunction() {
- @Override
- public double apply(double a, double b) {
- return function.apply(b, a);
- }
- };
- }
-
- public static DoubleDoubleFunction minusAbsPow(final double exponent) {
- return new DoubleDoubleFunction() {
- @Override
- public double apply(double x, double y) {
- return Math.pow(Math.abs(x - y), exponent);
- }
-
- /**
- * |x - 0|^p = |x|^p != x unless x > 0 and p = 1
- * @return true iff f(x, 0) = x for any x
- */
- @Override
- public boolean isLikeRightPlus() {
- return false;
- }
-
- /**
- * |0 - y|^p = |y|^p
- * @return true iff f(0, y) = 0 for any y
- */
- @Override
- public boolean isLikeLeftMult() {
- return false;
- }
-
- /**
- * |x - 0|^p = |x|^p
- * @return true iff f(x, 0) = 0 for any x
- */
- @Override
- public boolean isLikeRightMult() {
- return false;
- }
-
- /**
- * |x - y|^p = |y - x|^p
- * @return true iff f(x, y) = f(y, x) for any x, y
- */
- @Override
- public boolean isCommutative() {
- return true;
- }
-
- /**
- * |x - |y - z|^p|^p != ||x - y|^p - z|^p
- * @return true iff f(x, f(y, z)) = f(f(x, y), z) for any x, y, z
- */
- @Override
- public boolean isAssociative() {
- return false;
- }
- };
- }
-}
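
The factory methods above compose. A minimal usage sketch, assuming the enclosing class of this
deleted file is org.apache.mahout.math.function.Functions (its chain/plus/mult signatures are
visible above; the FunctionsDemo wrapper is hypothetical, for illustration only):

    import org.apache.mahout.math.function.DoubleFunction;
    import org.apache.mahout.math.function.Functions;

    public class FunctionsDemo {
      public static void main(String[] args) {
        // Build f(a) = 3 * a + 1 out of the factory methods deleted above.
        DoubleFunction scale = Functions.mult(3.0);        // a -> 3 * a
        DoubleFunction shift = Functions.plus(1.0);        // a -> a + 1
        DoubleFunction f = Functions.chain(shift, scale);  // a -> shift(scale(a))
        System.out.println(f.apply(2.0));                  // prints 7.0
      }
    }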

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/function/IntFunction.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/function/IntFunction.java b/math/src/main/java/org/apache/mahout/math/function/IntFunction.java
deleted file mode 100644
index b91fe18..0000000
--- a/math/src/main/java/org/apache/mahout/math/function/IntFunction.java
+++ /dev/null
@@ -1,41 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
-Copyright 1999 CERN - European Organization for Nuclear Research.
-Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
-is hereby granted without fee, provided that the above copyright notice appear in all copies and
-that both that copyright notice and this permission notice appear in supporting documentation.
-CERN makes no representations about the suitability of this software for any purpose.
-It is provided "as is" without expressed or implied warranty.
-*/
-
-package org.apache.mahout.math.function;
-
-/**
- * Interface that represents a function object: a function that takes a single argument and returns a single value.
- */
-public interface IntFunction {
-
- /**
- * Applies a function to an argument.
- *
- * @param argument argument passed to the function.
- * @return the result of the function.
- */
- int apply(int argument);
-}
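
Implementing the deleted interface is a one-liner. A hedged fragment (assuming the interface is
imported) with a hypothetical halving function:

    IntFunction half = new IntFunction() {
      @Override
      public int apply(int argument) {
        return argument / 2; // integer division
      }
    };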

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/function/IntIntDoubleFunction.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/function/IntIntDoubleFunction.java b/math/src/main/java/org/apache/mahout/math/function/IntIntDoubleFunction.java
deleted file mode 100644
index b08f08b..0000000
--- a/math/src/main/java/org/apache/mahout/math/function/IntIntDoubleFunction.java
+++ /dev/null
@@ -1,43 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
-Copyright 1999 CERN - European Organization for Nuclear Research.
-Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
-is hereby granted without fee, provided that the above copyright notice appear in all copies and
-that both that copyright notice and this permission notice appear in supporting documentation.
-CERN makes no representations about the suitability of this software for any purpose.
-It is provided "as is" without expressed or implied warranty.
-*/
-
-package org.apache.mahout.math.function;
-
-/**
- * Interface that represents a function object: a function that takes three arguments (two ints and a double) and returns a double.
- */
-public interface IntIntDoubleFunction {
-
- /**
- * Applies a function to three arguments.
- *
- * @param first first argument passed to the function.
- * @param second second argument passed to the function.
- * @param third third argument passed to the function.
- * @return the result of the function.
- */
- double apply(int first, int second, double third);
-}
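
This (row, column, value) shape is the natural visitor for matrix cells. A hedged sketch of a
hypothetical accumulator; the iteration method that would drive it is assumed, not part of this
interface:

    final double[] total = {0.0};
    IntIntDoubleFunction sumVisitor = new IntIntDoubleFunction() {
      @Override
      public double apply(int first, int second, double third) {
        total[0] += third;  // accumulate every visited cell value
        return third;       // returning the value unchanged leaves the cell as-is
      }
    };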

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/function/IntIntFunction.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/function/IntIntFunction.java b/math/src/main/java/org/apache/mahout/math/function/IntIntFunction.java
deleted file mode 100644
index f08bb28..0000000
--- a/math/src/main/java/org/apache/mahout/math/function/IntIntFunction.java
+++ /dev/null
@@ -1,25 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.function;
-
-/**
- * A function that takes two integer arguments and returns a double.
- */
-public interface IntIntFunction {
- double apply(int first, int second);
-}
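
A common use of this shape is as a cell generator for matrix assignment. A hypothetical
Kronecker-delta generator, for illustration only:

    IntIntFunction identityCells = new IntIntFunction() {
      @Override
      public double apply(int first, int second) {
        return first == second ? 1.0 : 0.0; // 1 on the diagonal, 0 elsewhere
      }
    };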

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/function/Mult.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/function/Mult.java b/math/src/main/java/org/apache/mahout/math/function/Mult.java
deleted file mode 100644
index 9bbc5ec..0000000
--- a/math/src/main/java/org/apache/mahout/math/function/Mult.java
+++ /dev/null
@@ -1,71 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
-Copyright 1999 CERN - European Organization for Nuclear Research.
-Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
-is hereby granted without fee, provided that the above copyright notice appear in all copies and
-that both that copyright notice and this permission notice appear in supporting documentation.
-CERN makes no representations about the suitability of this software for any purpose.
-It is provided "as is" without expressed or implied warranty.
-*/
-
-package org.apache.mahout.math.function;
-
-/**
- * Only for performance tuning of compute intensive linear algebraic computations.
- * Constructs functions that return one of
- * <ul>
- * <li><tt>a * constant</tt>
- * <li><tt>a / constant</tt>
- * </ul>
- * <tt>a</tt> is variable, <tt>constant</tt> is fixed, but for performance reasons publicly accessible.
- * Intended to be passed to <tt>matrix.assign(function)</tt> methods.
- */
-
-public final class Mult extends DoubleFunction {
-
- private double multiplicator;
-
- Mult(double multiplicator) {
- this.multiplicator = multiplicator;
- }
-
- /** Returns the result of the function evaluation. */
- @Override
- public double apply(double a) {
- return a * multiplicator;
- }
-
- /** <tt>a / constant</tt>. */
- public static Mult div(double constant) {
- return mult(1 / constant);
- }
-
- /** <tt>a * constant</tt>. */
- public static Mult mult(double constant) {
- return new Mult(constant);
- }
-
- public double getMultiplicator() {
- return multiplicator;
- }
-
- public void setMultiplicator(double multiplicator) {
- this.multiplicator = multiplicator;
- }
-}
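
The mutable multiplicator is the point of this class: one instance can be reused across passes
instead of allocating a fresh function object per scalar. A minimal sketch using only the members
shown above:

    Mult scaler = Mult.mult(2.0);    // a -> 2 * a
    double y = scaler.apply(3.0);    // 6.0
    scaler.setMultiplicator(0.5);    // reuse the same object, e.g. once per iteration
    double z = scaler.apply(3.0);    // 1.5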

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/function/ObjectObjectProcedure.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/function/ObjectObjectProcedure.java b/math/src/main/java/org/apache/mahout/math/function/ObjectObjectProcedure.java
deleted file mode 100644
index 46ad8d0..0000000
--- a/math/src/main/java/org/apache/mahout/math/function/ObjectObjectProcedure.java
+++ /dev/null
@@ -1,40 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.mahout.math.function;
-
-/**
- * Interface that represents a procedure object:
- * a procedure that takes two arguments and returns a 'continue' flag.
- */
-public interface ObjectObjectProcedure<K,V> {
-
- /**
- * Applies the procedure to a key-value pair. The returned boolean flag can inform the object calling the
- * procedure.
- *
- * <p>Example: forEach() methods often use procedure objects. To signal to a forEach() method whether iteration should
- * continue normally or terminate (because for example a matching element has been found), a procedure can return
- * <tt>false</tt> to indicate termination and <tt>true</tt> to indicate continuation.
- *
- * @param key key passed to the procedure.
- * @param value value passed to the procedure.
- * @return a flag to inform the object calling the procedure.
- */
- boolean apply(K key, V value);
-}
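
A hedged sketch of the forEach protocol the Javadoc describes; the forEachPair driver below is
hypothetical, not part of this interface:

    static <K, V> void forEachPair(java.util.Map<K, V> map, ObjectObjectProcedure<K, V> proc) {
      for (java.util.Map.Entry<K, V> e : map.entrySet()) {
        if (!proc.apply(e.getKey(), e.getValue())) {
          return; // the procedure returned false: terminate the iteration
        }
      }
    }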

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/function/ObjectProcedure.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/function/ObjectProcedure.java b/math/src/main/java/org/apache/mahout/math/function/ObjectProcedure.java
deleted file mode 100644
index 8c1b1c8..0000000
--- a/math/src/main/java/org/apache/mahout/math/function/ObjectProcedure.java
+++ /dev/null
@@ -1,47 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
-Copyright 1999 CERN - European Organization for Nuclear Research.
-Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
-is hereby granted without fee, provided that the above copyright notice appear in all copies and
-that both that copyright notice and this permission notice appear in supporting documentation.
-CERN makes no representations about the suitability of this software for any purpose.
-It is provided "as is" without expressed or implied warranty.
-*/
-
-
-package org.apache.mahout.math.function;
-
-/**
- * Interface that represents a procedure object: a procedure that takes a single argument and returns a 'continue' flag.
- */
-public interface ObjectProcedure<T> {
-
- /**
- * Applies a procedure to an argument. The returned boolean flag can inform the object calling the
- * procedure.
- *
- * <p>Example: forEach() methods often use procedure objects. To signal to a forEach() method whether iteration should
- * continue normally or terminate (because for example a matching element has been found), a procedure can return
- * <tt>false</tt> to indicate termination and <tt>true</tt> to indicate continuation.
- *
- * @param element element passed to the procedure.
- * @return a flag to inform the object calling the procedure.
- */
- boolean apply(T element);
-}
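
For example, a procedure that asks a hypothetical forEach() driver to stop at the first null
element:

    ObjectProcedure<String> stopAtNull = new ObjectProcedure<String>() {
      @Override
      public boolean apply(String element) {
        return element != null; // false signals the caller to terminate
      }
    };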

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/function/PlusMult.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/function/PlusMult.java b/math/src/main/java/org/apache/mahout/math/function/PlusMult.java
deleted file mode 100644
index ff99a70..0000000
--- a/math/src/main/java/org/apache/mahout/math/function/PlusMult.java
+++ /dev/null
@@ -1,123 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
-Copyright 1999 CERN - European Organization for Nuclear Research.
-Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
-is hereby granted without fee, provided that the above copyright notice appear in all copies and
-that both that copyright notice and this permission notice appear in supporting documentation.
-CERN makes no representations about the suitability of this software for any purpose.
-It is provided "as is" without expressed or implied warranty.
-*/
-
-package org.apache.mahout.math.function;
-
-import org.apache.mahout.math.jet.math.Constants;
-
-/**
- * Only for performance tuning of compute intensive linear algebraic computations.
- * Constructs functions that return one of
- * <ul>
- * <li><tt>a + b*constant</tt>
- * <li><tt>a - b*constant</tt>
- * <li><tt>a + b/constant</tt>
- * <li><tt>a - b/constant</tt>
- * </ul>
- * <tt>a</tt> and <tt>b</tt> are variables, <tt>constant</tt> is fixed, but for performance reasons publicly accessible.
- * Intended to be passed to <tt>matrix.assign(otherMatrix,function)</tt> methods.
- */
-
-public final class PlusMult extends DoubleDoubleFunction {
-
- private double multiplicator;
-
- public PlusMult(double multiplicator) {
- this.multiplicator = multiplicator;
- }
-
- /** Returns the result of the function evaluation. */
- @Override
- public double apply(double a, double b) {
- return a + b * multiplicator;
- }
-
- /** <tt>a - b*constant</tt>. */
- public static PlusMult minusMult(double constant) {
- return new PlusMult(-constant);
- }
-
- /** <tt>a + b*constant</tt>. */
- public static PlusMult plusMult(double constant) {
- return new PlusMult(constant);
- }
-
- public double getMultiplicator() {
- return multiplicator;
- }
-
- /**
- * x + 0 * c = x
- * @return true iff f(x, 0) = x for any x
- */
- @Override
- public boolean isLikeRightPlus() {
- return true;
- }
-
- /**
- * 0 + y * c = y * c != 0
- * @return true iff f(0, y) = 0 for any y
- */
- @Override
- public boolean isLikeLeftMult() {
- return false;
- }
-
- /**
- * x + 0 * c = x != 0
- * @return true iff f(x, 0) = 0 for any x
- */
- @Override
- public boolean isLikeRightMult() {
- return false;
- }
-
- /**
- * x + y * c = y + x * c iff c = 1
- * @return true iff f(x, y) = f(y, x) for any x, y
- */
- @Override
- public boolean isCommutative() {
- return Math.abs(multiplicator - 1.0) < Constants.EPSILON;
- }
-
- /**
- * f(x, f(y, z)) = x + c * (y + c * z) = x + c * y + c^2 * z
- * f(f(x, y), z) = (x + c * y) + c * z = x + c * y + c * z
- * true only for c = 0 or c = 1
- * @return true iff f(x, f(y, z)) = f(f(x, y), z) for any x, y, z
- */
- @Override
- public boolean isAssociative() {
- return Math.abs(multiplicator - 0.0) < Constants.EPSILON
- || Math.abs(multiplicator - 1.0) < Constants.EPSILON;
- }
-
- public void setMultiplicator(double multiplicator) {
- this.multiplicator = multiplicator;
- }
-}
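
PlusMult is the scalar kernel of an axpy-style update (this = this + that * c). A minimal sketch
of the algebraic flags in action, using only the members shown above:

    PlusMult axpy = PlusMult.plusMult(2.0);
    double r = axpy.apply(1.0, 3.0);                       // 1 + 3 * 2 = 7.0
    boolean c1 = axpy.isCommutative();                     // false: x + 2y != y + 2x in general
    boolean c2 = PlusMult.plusMult(1.0).isCommutative();   // true: plain addition
    boolean a1 = PlusMult.minusMult(1.0).isAssociative();  // false: subtraction is not associative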

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/function/SquareRootFunction.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/function/SquareRootFunction.java b/math/src/main/java/org/apache/mahout/math/function/SquareRootFunction.java
deleted file mode 100644
index 5eebea0..0000000
--- a/math/src/main/java/org/apache/mahout/math/function/SquareRootFunction.java
+++ /dev/null
@@ -1,26 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.function;
-
-public final class SquareRootFunction extends DoubleFunction {
-
- @Override
- public double apply(double arg1) {
- return Math.sqrt(arg1);
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/function/TimesFunction.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/function/TimesFunction.java b/math/src/main/java/org/apache/mahout/math/function/TimesFunction.java
deleted file mode 100644
index e4e27b4..0000000
--- a/math/src/main/java/org/apache/mahout/math/function/TimesFunction.java
+++ /dev/null
@@ -1,77 +0,0 @@
-/* Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.mahout.math.function;
-
-public final class TimesFunction extends DoubleDoubleFunction {
-
- /**
- * Computes the product of two numbers.
- *
- * @param x first argument
- * @param y second argument
- * @return the product
- */
- @Override
- public double apply(double x, double y) {
- return x * y;
- }
-
- /**
- * x * 0 = y only if y = 0
- * @return true iff f(x, 0) = x for any x
- */
- @Override
- public boolean isLikeRightPlus() {
- return false;
- }
-
- /**
- * 0 * y = 0 for any y
- * @return true iff f(0, y) = 0 for any y
- */
- @Override
- public boolean isLikeLeftMult() {
- return true;
- }
-
- /**
- * x * 0 = 0 for any x
- * @return true iff f(x, 0) = 0 for any x
- */
- @Override
- public boolean isLikeRightMult() {
- return true;
- }
-
- /**
- * x * y = y * x for any x, y
- * @return true iff f(x, y) = f(y, x) for any x, y
- */
- @Override
- public boolean isCommutative() {
- return true;
- }
-
- /**
- * x * (y * z) = (x * y) * z for any x, y, z
- * @return true iff f(x, f(y, z)) = f(f(x, y), z) for any x, y, z
- */
- @Override
- public boolean isAssociative() {
- return true;
- }
-}
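
These flags exist so that vector and matrix assign() implementations can skip work on sparse
operands. A hedged sketch of the dispatch they enable; the surrounding assign machinery is
assumed, not shown here:

    DoubleDoubleFunction f = new TimesFunction();
    if (f.isLikeLeftMult() && f.isLikeRightMult()) {
      // f(x, 0) = f(0, y) = 0: the result is non-zero only where BOTH operands are
      // non-zero, so a sparse implementation may intersect the two non-zero index
      // sets instead of visiting every position.
    }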

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/function/VectorFunction.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/function/VectorFunction.java b/math/src/main/java/org/apache/mahout/math/function/VectorFunction.java
deleted file mode 100644
index 3b5af77..0000000
--- a/math/src/main/java/org/apache/mahout/math/function/VectorFunction.java
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.function;
-
-import org.apache.mahout.math.Vector;
-
-/**
- * Defines a function of a vector that returns a double.
- */
-public interface VectorFunction {
- double apply(Vector f);
-}
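
Typical instances reduce a whole row or column to a scalar, e.g. for aggregate-rows style
operations. A minimal sketch assuming Vector.zSum(), the element sum in Mahout's Vector API:

    VectorFunction rowSum = new VectorFunction() {
      @Override
      public double apply(Vector v) {
        return v.zSum(); // sum of all elements of the vector
      }
    };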

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/function/package-info.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/function/package-info.java b/math/src/main/java/org/apache/mahout/math/function/package-info.java
deleted file mode 100644
index 47ceace..0000000
--- a/math/src/main/java/org/apache/mahout/math/function/package-info.java
+++ /dev/null
@@ -1,4 +0,0 @@
-/**
- * Core interfaces for functions, comparisons and procedures on objects and primitive data types.
- */
-package org.apache.mahout.math.function;

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/jet/math/Arithmetic.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/jet/math/Arithmetic.java b/math/src/main/java/org/apache/mahout/math/jet/math/Arithmetic.java
deleted file mode 100644
index 83d512b..0000000
--- a/math/src/main/java/org/apache/mahout/math/jet/math/Arithmetic.java
+++ /dev/null
@@ -1,328 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-/*
-Copyright 1999 CERN - European Organization for Nuclear Research.
-Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
-is hereby granted without fee, provided that the above copyright notice appear in all copies and
-that both that copyright notice and this permission notice appear in supporting documentation.
-CERN makes no representations about the suitability of this software for any purpose.
-It is provided "as is" without expressed or implied warranty.
-*/
-package org.apache.mahout.math.jet.math;
-
-/**
- * Arithmetic functions.
- */
-public final class Arithmetic {
-
- // for method logFactorial(...)
- // log(k!) for k = 0, ..., 29
- private static final double[] LOG_FACTORIAL_TABLE = {
- 0.00000000000000000, 0.00000000000000000, 0.69314718055994531,
- 1.79175946922805500, 3.17805383034794562, 4.78749174278204599,
- 6.57925121201010100, 8.52516136106541430, 10.60460290274525023,
- 12.80182748008146961, 15.10441257307551530, 17.50230784587388584,
- 19.98721449566188615, 22.55216385312342289, 25.19122118273868150,
- 27.89927138384089157, 30.67186010608067280, 33.50507345013688888,
- 36.39544520803305358, 39.33988418719949404, 42.33561646075348503,
- 45.38013889847690803, 48.47118135183522388, 51.60667556776437357,
- 54.78472939811231919, 58.00360522298051994, 61.26170176100200198,
- 64.55753862700633106, 67.88974313718153498, 71.25703896716800901
- };
-
- // k! for k = 0, ..., 20
- private static final long[] FACTORIAL_TABLE = {
- 1L,
- 1L,
- 2L,
- 6L,
- 24L,
- 120L,
- 720L,
- 5040L,
- 40320L,
- 362880L,
- 3628800L,
- 39916800L,
- 479001600L,
- 6227020800L,
- 87178291200L,
- 1307674368000L,
- 20922789888000L,
- 355687428096000L,
- 6402373705728000L,
- 121645100408832000L,
- 2432902008176640000L
- };
-
- // k! for k = 21, ..., 170
- private static final double[] LARGE_FACTORIAL_TABLE = {
- 5.109094217170944E19,
- 1.1240007277776077E21,
- 2.585201673888498E22,
- 6.204484017332394E23,
- 1.5511210043330984E25,
- 4.032914611266057E26,
- 1.0888869450418352E28,
- 3.048883446117138E29,
- 8.841761993739701E30,
- 2.652528598121911E32,
- 8.222838654177924E33,
- 2.6313083693369355E35,
- 8.68331761881189E36,
- 2.952327990396041E38,
- 1.0333147966386144E40,
- 3.719933267899013E41,
- 1.3763753091226346E43,
- 5.23022617466601E44,
- 2.0397882081197447E46,
- 8.15915283247898E47,
- 3.34525266131638E49,
- 1.4050061177528801E51,
- 6.041526306337384E52,
- 2.6582715747884495E54,
- 1.196222208654802E56,
- 5.502622159812089E57,
- 2.5862324151116827E59,
- 1.2413915592536068E61,
- 6.082818640342679E62,
- 3.0414093201713376E64,
- 1.5511187532873816E66,
- 8.06581751709439E67,
- 4.274883284060024E69,
- 2.308436973392413E71,
- 1.2696403353658264E73,
- 7.109985878048632E74,
- 4.052691950487723E76,
- 2.350561331282879E78,
- 1.386831185456898E80,
- 8.32098711274139E81,
- 5.075802138772246E83,
- 3.146997326038794E85,
- 1.9826083154044396E87,
- 1.2688693218588414E89,
- 8.247650592082472E90,
- 5.443449390774432E92,
- 3.6471110918188705E94,
- 2.48003554243683E96,
- 1.7112245242814127E98,
- 1.1978571669969892E100,
- 8.504785885678624E101,
- 6.123445837688612E103,
- 4.470115461512686E105,
- 3.307885441519387E107,
- 2.4809140811395404E109,
- 1.8854947016660506E111,
- 1.451830920282859E113,
- 1.1324281178206295E115,
- 8.94618213078298E116,
- 7.15694570462638E118,
- 5.797126020747369E120,
- 4.7536433370128435E122,
- 3.94552396972066E124,
- 3.314240134565354E126,
- 2.8171041143805494E128,
- 2.4227095383672744E130,
- 2.107757298379527E132,
- 1.854826422573984E134,
- 1.6507955160908465E136,
- 1.4857159644817605E138,
- 1.3520015276784033E140,
- 1.2438414054641305E142,
- 1.156772507081641E144,
- 1.0873661566567426E146,
- 1.0329978488239061E148,
- 9.916779348709491E149,
- 9.619275968248216E151,
- 9.426890448883248E153,
- 9.332621544394415E155,
- 9.332621544394418E157,
- 9.42594775983836E159,
- 9.614466715035125E161,
- 9.902900716486178E163,
- 1.0299016745145631E166,
- 1.0813967582402912E168,
- 1.1462805637347086E170,
- 1.2265202031961373E172,
- 1.324641819451829E174,
- 1.4438595832024942E176,
- 1.5882455415227423E178,
- 1.7629525510902457E180,
- 1.974506857221075E182,
- 2.2311927486598138E184,
- 2.543559733472186E186,
- 2.925093693493014E188,
- 3.393108684451899E190,
- 3.96993716080872E192,
- 4.6845258497542896E194,
- 5.574585761207606E196,
- 6.689502913449135E198,
- 8.094298525273444E200,
- 9.875044200833601E202,
- 1.2146304367025332E205,
- 1.506141741511141E207,
- 1.882677176888926E209,
- 2.3721732428800483E211,
- 3.0126600184576624E213,
- 3.856204823625808E215,
- 4.974504222477287E217,
- 6.466855489220473E219,
- 8.471580690878813E221,
- 1.1182486511960037E224,
- 1.4872707060906847E226,
- 1.99294274616152E228,
- 2.690472707318049E230,
- 3.6590428819525483E232,
- 5.0128887482749884E234,
- 6.917786472619482E236,
- 9.615723196941089E238,
- 1.3462012475717523E241,
- 1.8981437590761713E243,
- 2.6953641378881633E245,
- 3.8543707171800694E247,
- 5.550293832739308E249,
- 8.047926057471989E251,
- 1.1749972043909107E254,
- 1.72724589045464E256,
- 2.5563239178728637E258,
- 3.8089226376305687E260,
- 5.7133839564458575E262,
- 8.627209774233244E264,
- 1.3113358856834527E267,
- 2.0063439050956838E269,
- 3.0897696138473515E271,
- 4.789142901463393E273,
- 7.471062926282892E275,
- 1.1729568794264134E278,
- 1.8532718694937346E280,
- 2.946702272495036E282,
- 4.714723635992061E284,
- 7.590705053947223E286,
- 1.2296942187394494E289,
- 2.0044015765453032E291,
- 3.287218585534299E293,
- 5.423910666131583E295,
- 9.003691705778434E297,
- 1.5036165148649983E300,
- 2.5260757449731988E302,
- 4.2690680090047056E304,
- 7.257415615308004E306
- };
-
- private Arithmetic() {
- }
-
- /**
- * Efficiently returns the binomial coefficient, often also referred to as "n over k" or "n choose k". The binomial
- * coefficient is defined as <ul>
- * <li><tt>k&lt;0</tt>: <tt>0</tt>.</li>
- * <li><tt>k==0 || k==n</tt>: <tt>1</tt>.</li>
- * <li><tt>k==1 || k==n-1</tt>: <tt>n</tt>.</li>
- * <li>else: <tt>(n * n-1 * ... * n-k+1 ) / ( 1 * 2 * ... * k )</tt>.</li>
- * </ul>
- *
- * @return the binomial coefficient.
- */
- public static double binomial(long n, long k) {
- if (k < 0) {
- return 0;
- }
- if (k == 0 || k == n) {
- return 1;
- }
- if (k == 1 || k == n - 1) {
- return n;
- }
-
- // try quick version and see whether we get numeric overflows.
- // factorial(..) is O(1); requires no loop; only a table lookup.
- if (n > k) {
- int max = FACTORIAL_TABLE.length + LARGE_FACTORIAL_TABLE.length;
- if (n < max) { // if (n! < inf && k! < inf)
- double nFactorial = factorial((int) n);
- double kFactorial = factorial((int) k);
- double nMinusKFactorial = factorial((int) (n - k));
- double nk = nMinusKFactorial * kFactorial;
- if (nk != Double.POSITIVE_INFINITY) { // no numeric overflow?
- // now this is completely safe and accurate
- return nFactorial / nk;
- }
- }
- if (k > n / 2) {
- k = n - k;
- } // quicker
- }
-
- // binomial(n,k) = (n * n-1 * ... * n-k+1 ) / ( 1 * 2 * ... * k )
- long a = n - k + 1;
- long b = 1;
- double binomial = 1;
- for (long i = k; i-- > 0;) {
- binomial *= (double) a++ / b++;
- }
- return binomial;
- }
-
- /**
- * Instantly returns the factorial <tt>k!</tt>.
- *
- * @param k must hold <tt>k &gt;= 0</tt>.
- */
- private static double factorial(int k) {
- if (k < 0) {
- throw new IllegalArgumentException();
- }
-
- int length1 = FACTORIAL_TABLE.length;
- if (k < length1) {
- return FACTORIAL_TABLE[k];
- }
-
- int length2 = LARGE_FACTORIAL_TABLE.length;
- if (k < length1 + length2) {
- return LARGE_FACTORIAL_TABLE[k - length1];
- } else {
- return Double.POSITIVE_INFINITY;
- }
- }
-
- /**
- * Returns <tt>log(k!)</tt>. Tries to avoid overflows. For {@code k<30} simply
- * looks up a table in <tt>O(1)</tt>. For {@code k>=30} uses Stirling's
- * approximation.
- *
- * @param k must hold <tt>k &gt;= 0</tt>.
- */
- public static double logFactorial(int k) {
- if (k >= 30) {
-
- double r = 1.0 / k;
- double rr = r * r;
- double c7 = -5.95238095238095238e-04;
- double c5 = 7.93650793650793651e-04;
- double c3 = -2.77777777777777778e-03;
- double c1 = 8.33333333333333333e-02;
- double c0 = 9.18938533204672742e-01;
- return (k + 0.5) * Math.log(k) - k + c0 + r * (c1 + rr * (c3 + rr * (c5 + rr * c7)));
- } else {
- return LOG_FACTORIAL_TABLE[k];
- }
- }
-
-}
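
A minimal usage sketch: binomial() resolves small arguments through the factorial tables, and
logFactorial() switches to the Stirling-series branch for k >= 30, per the code above:

    double hands = Arithmetic.binomial(52, 5);  // ~2598960.0, via the factorial tables
    double lf = Arithmetic.logFactorial(1000);  // k >= 30: Stirling-series approximation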

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/jet/math/Constants.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/jet/math/Constants.java b/math/src/main/java/org/apache/mahout/math/jet/math/Constants.java
deleted file mode 100644
index b99340d..0000000
--- a/math/src/main/java/org/apache/mahout/math/jet/math/Constants.java
+++ /dev/null
@@ -1,49 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*
-Copyright 1999 CERN - European Organization for Nuclear Research.
-Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
-is hereby granted without fee, provided that the above copyright notice appear in all copies and
-that both that copyright notice and this permission notice appear in supporting documentation.
-CERN makes no representations about the suitability of this software for any purpose.
-It is provided "as is" without expressed or implied warranty.
-*/
-package org.apache.mahout.math.jet.math;
-
-/**
- * Defines some useful constants.
- */
-public final class Constants {
-
- public static final double MACHEP = 1.11022302462515654042E-16;
- public static final double MAXLOG = 7.09782712893383996732E2;
- public static final double MINLOG = -7.451332191019412076235E2;
- public static final double MAXGAM = 171.624376956302725;
- public static final double SQTPI = 2.50662827463100050242E0;
- public static final double LOGPI = 1.14472988584940017414;
-
- public static final double BIG = 4.503599627370496e15;
- public static final double BIG_INVERSE = 2.22044604925031308085e-16;
-
- public static final double EPSILON = 1.0E-6;
-
- private Constants() {
- }
-}
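
EPSILON is the loose tolerance the function classes deleted in this commit use for coefficient
checks (e.g. PlusMult.isCommutative). A minimal sketch of the idiom:

    double c = 1.0 + 1.0e-9;
    boolean effectivelyOne = Math.abs(c - 1.0) < Constants.EPSILON; // true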
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/jet/random/sampling/RandomSampler.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/jet/random/sampling/RandomSampler.java b/math/src/main/java/org/apache/mahout/math/jet/random/sampling/RandomSampler.java
deleted file mode 100644
index 6804547..0000000
--- a/math/src/main/java/org/apache/mahout/math/jet/random/sampling/RandomSampler.java
+++ /dev/null
@@ -1,503 +0,0 @@
-/*
-Copyright 1999 CERN - European Organization for Nuclear Research.
-Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
-is hereby granted without fee, provided that the above copyright notice appear in all copies and
-that both that copyright notice and this permission notice appear in supporting documentation.
-CERN makes no representations about the suitability of this software for any purpose.
-It is provided "as is" without expressed or implied warranty.
-*/
-package org.apache.mahout.math.jet.random.sampling;
-
-import org.apache.mahout.common.RandomUtils;
-
-import java.util.Random;
-
-/**
- * Space and time efficiently computes a sorted <i>Simple Random Sample Without Replacement
- * (SRSWOR)</i>, that is, a sorted set of <tt>n</tt> random numbers from an interval of <tt>N</tt> numbers;
- * Example: Computing <tt>n=3</tt> random numbers from the interval <tt>[1,50]</tt> may yield
- * the sorted random set <tt>(7,13,47)</tt>.
- * Since we are talking about a set (sampling without replacement), no element will occur more than once.
- * Each number from the <tt>N</tt> numbers has the same probability of being included in the <tt>n</tt> chosen numbers.
- *
- * <p><b>Problem:</b> This class solves problems including the following: <i>
- * Suppose we have a file containing 10^12 objects.
- * We would like to take a truly random subset of 10^6 objects and do something with it,
- * for example, compute the sum over some instance field, or whatever.
- * How do we choose the subset? In particular, how do we avoid multiple equal elements?
- * How do we do this quickly and without consuming excessive memory?
- * How do we avoid slowly jumping back and forth within the file? </i>
- *
- * <p><b>Sorted Simple Random Sample Without Replacement (SRSWOR):</b>
- * What are the exact semantics of this class? What is a SRSWOR? In which sense exactly is a returned set "random"?
- * It is random in the sense that each number from the <tt>N</tt> numbers has the
- * same probability of being included in the <tt>n</tt> chosen numbers.
- * For those who think in implementations rather than abstract interfaces:
- * <i>Suppose we have an empty list.
- * We pick a random number between 1 and 10^12 and add it to the list only if it was not
- * already picked before, i.e. if it is not already contained in the list.
- * We then do the same thing again and again until we have eventually collected 10^6 distinct numbers.
- * Now we sort the set ascending and return it.</i>
- * <dl>
- * <dt>It is exactly in this sense that this class returns "random" sets.
- * <b>Note, however, that the implementation of this class uses a technique orders of magnitude
- * better (both in time and space) than the one outlined above.</b></dt></dl>
- *
- * <p><b>Performance:</b> Space requirements are zero. Running time is <tt>O(n)</tt> on average,
- * <tt>O(N)</tt> in the worst case.
- * <h2>Performance (200 MHz Pentium Pro, JDK 1.2, NT)</h2>
- * <center>
- * <table border="1" summary="performance table">
- * <tr>
- * <td align="center" width="20%">n</td>
- * <td align="center" width="20%">N</td>
- * <td align="center" width="20%">Speed [seconds]</td>
- * </tr>
- * <tr>
- * <td align="center" width="20%">10<sup>3</sup></td>
- * <td align="center" width="20%">1.2*10<sup>3</sup></td>
- * <td align="center" width="20">0.0014</td>
- * </tr>
- * <tr>
- * <td align="center" width="20%">10<sup>3</sup></td>
- * <td align="center" width="20%">10<sup>7</sup></td>
- * <td align="center" width="20">0.006</td>
- * </tr>
- * <tr>
- * <td align="center" width="20%">10<sup>5</sup></td>
- * <td align="center" width="20%">10<sup>7</sup></td>
- * <td align="center" width="20">0.7</td>
- * </tr>
- * <tr>
- * <td align="center" width="20%">9.0*10<sup>6</sup></td>
- * <td align="center" width="20%">10<sup>7</sup></td>
- * <td align="center" width="20">8.5</td>
- * </tr>
- * <tr>
- * <td align="center" width="20%">9.9*10<sup>6</sup></td>
- * <td align="center" width="20%">10<sup>7</sup></td>
- * <td align="center" width="20">2.0 (samples more than 95%)</td>
- * </tr>
- * <tr>
- * <td align="center" width="20%">10<sup>4</sup></td>
- * <td align="center" width="20%">10<sup>12</sup></td>
- * <td align="center" width="20">0.07</td>
- * </tr>
- * <tr>
- * <td align="center" width="20%">10<sup>7</sup></td>
- * <td align="center" width="20%">10<sup>12</sup></td>
- * <td align="center" width="20">60</td>
- * </tr>
- * </table>
- * </center>
- *
- * <p><b>Scalability:</b> This random sampler is designed to be scalable. In iterator style,
- * it is able to compute and deliver sorted random sets stepwise in units called <i>blocks</i>.
- * Example: Computing <tt>n=9</tt> random numbers from the interval <tt>[1,50]</tt> in
- * 3 blocks may yield the blocks <tt>(7,13,14), (27,37,42), (45,46,49)</tt>.
- * (The maximum of a block is guaranteed to be less than the minimum of its successor block.
- * Every block is sorted ascending. No element will ever occur twice, both within a block and among blocks.)
- * A block can be computed and retrieved with method <tt>nextBlock</tt>.
- * Successive calls to method <tt>nextBlock</tt> will deliver as many random numbers as required.
- *
- * <p>Computing and retrieving samples in blocks is useful if you need very many random
- * numbers that cannot be stored in main memory at the same time.
- * For example, if you want to compute 10^10 such numbers you can do this by computing
- * them in blocks of, say, 500 elements each.
- * You then need only space to keep one block of 500 elements (i.e. 4 KB).
- * When you are finished processing the first 500 elements you call <tt>nextBlock</tt> to
- * fill the next 500 elements into the block, process them, and so on.
- * If you have the time and need, by using such blocks you can compute random sets
- * up to <tt>n=10^19</tt> random numbers.
- *
- * <p>If you do not need the block feature, you can also directly call
- * the static methods of this class without needing to construct a <tt>RandomSampler</tt> instance first.
- *
- * <p><b>Random number generation:</b> By default uses <tt>MersenneTwister</tt>, a very
- * strong random number generator, much better than <tt>java.util.Random</tt>.
- * You can also use other strong random number generators of Paul Houle's RngPack package.
- * For example, <tt>Ranecu</tt>, <tt>Ranmar</tt> and <tt>Ranlux</tt> are strong, well-analyzed,
- * research-grade pseudo-random number generators with known periods.
- *
- * <p><b>Implementation:</b> after J.S. Vitter, An Efficient Algorithm for Sequential Random Sampling,
- * ACM Transactions on Mathematical Software, Vol 13, 1987.
- * Paper available <A HREF="http://www.cs.duke.edu/~jsv"> here</A>.
- */
-public final class RandomSampler {
-
- private RandomSampler() {
- }
-
- /**
- * Efficiently computes a sorted random set of <tt>count</tt> elements from the interval <tt>[low,low+N-1]</tt>. Since
- * we are talking about a random set, no element will occur more than once.
- *
- * <p>Running time is <tt>O(count)</tt>, on average. Space requirements are zero.
- *
- * <p>Numbers are filled into the specified array starting at index <tt>fromIndex</tt> to the right. The array is
- * returned sorted ascending in the range filled with numbers.
- *
- * @param n the total number of elements to choose (must be &gt;= 0).
- * @param N the interval to choose random numbers from is <tt>[low,low+N-1]</tt>.
- * @param count the number of elements to be filled into <tt>values</tt> by this call (must be &gt;= 0 and
- * &lt;=<tt>n</tt>). Normally, you will set <tt>count=n</tt>.
- * @param low the interval to choose random numbers from is <tt>[low,low+N-1]</tt>. Hint: If
- * <tt>low==0</tt>, then draws random numbers from the interval <tt>[0,N-1]</tt>.
- * @param values the array into which the random numbers are to be filled; must have a length <tt>&gt;=
- * count+fromIndex</tt>.
- * @param fromIndex the first index within <tt>values</tt> to be filled with numbers (inclusive).
- * @param randomGenerator a random number generator.
- */
- private static void rejectMethodD(long n, long N, int count, long low, long[] values, int fromIndex,
- Random randomGenerator) {
- /* This algorithm is applicable if a large percentage (90%..100%) of N shall be sampled.
- In such cases it is more efficient than sampleMethodA() and sampleMethodD().
- The idea is that it is more efficient to express
- sample(n,N,count) in terms of reject(N-n,N,count)
- and then invert the result.
- For example, sampling 99% turns into sampling 1% plus inversion.
-
- This algorithm is the same as method sampleMethodD(...) with the exception that sampled elements are rejected,
- and not sampled elements included in the result set.
- */
- n = N - n; // IMPORTANT !!!
-
- //long threshold;
- long chosen = -1 + low;
-
- //long negalphainv =
- // -13; //tuning parameter, determines when to switch from method D to method A. Dependent on programming
- // language, platform, etc.
-
- double nreal = n;
- double ninv = 1.0 / nreal;
- double Nreal = N;
- double Vprime = Math.exp(Math.log(randomGenerator.nextDouble()) * ninv);
- long qu1 = -n + 1 + N;
- double qu1real = -nreal + 1.0 + Nreal;
- //threshold = -negalphainv * n;
-
- long S;
- while (n > 1 && count > 0) { //&& threshold<N) {
- double nmin1inv = 1.0 / (-1.0 + nreal);
- double negSreal;
- while (true) {
- double X;
- while (true) { // step D2: generate U and X
- X = Nreal * (-Vprime + 1.0);
- S = (long) X;
- if (S < qu1) {
- break;
- }
- Vprime = Math.exp(Math.log(randomGenerator.nextDouble()) * ninv);
- }
- double U = randomGenerator.nextDouble();
- negSreal = -S;
-
- //step D3: Accept?
- double y1 = Math.exp(Math.log(U * Nreal / qu1real) * nmin1inv);
- Vprime = y1 * (-X / Nreal + 1.0) * qu1real / (negSreal + qu1real);
- if (Vprime <= 1.0) {
- break;
- } //break inner loop
-
- //step D4: Accept?
- double top = -1.0 + Nreal;
- long limit;
- double bottom;
- if (n - 1 > S) {
- bottom = -nreal + Nreal;
- limit = -S + N;
- } else {
- bottom = -1.0 + negSreal + Nreal;
- limit = qu1;
- }
- double y2 = 1.0;
- for (long t = N - 1; t >= limit; t--) {
- y2 *= top / bottom;
- top--;
- bottom--;
- }
- if (Nreal / (-X + Nreal) >= y1 * Math.exp(Math.log(y2) * nmin1inv)) {
- // accept !
- Vprime = Math.exp(Math.log(randomGenerator.nextDouble()) * nmin1inv);
- break; //break inner loop
- }
- Vprime = Math.exp(Math.log(randomGenerator.nextDouble()) * ninv);
- }
-
- //step D5: reject the (S+1)st record !
- int iter = count; //int iter = (int) (Math.min(S,count));
- if (S < iter) {
- iter = (int) S;
- }
-
- count -= iter;
- while (--iter >= 0) {
- values[fromIndex++] = ++chosen;
- }
- chosen++;
-
- N -= S + 1;
- Nreal = negSreal - 1.0 + Nreal;
- n--;
- nreal--;
- ninv = nmin1inv;
- qu1 = -S + qu1;
- qu1real = negSreal + qu1real;
- //threshold += negalphainv;
- } //end while
-
-
- if (count > 0) { //special case n==1
- //reject the (S+1)st record !
- S = (long) (N * Vprime);
-
- int iter = count; //int iter = (int) (Math.min(S,count));
- if (S < iter) {
- iter = (int) S;
- }
-
- count -= iter;
- while (--iter >= 0) {
- values[fromIndex++] = ++chosen;
- }
-
- chosen++;
-
- // fill the rest
- while (--count >= 0) {
- values[fromIndex++] = ++chosen;
- }
- }
- }
-
- /**
- * Efficiently computes a sorted random set of <tt>count</tt> elements from the interval <tt>[low,low+N-1]</tt>. Since
- * we are talking about a random set, no element will occur more than once.
- *
- * <p>Running time is <tt>O(count)</tt>, on average. Space requirements are zero.
- *
- * <p>Numbers are filled into the specified array starting at index <tt>fromIndex</tt> to the right. The array is
- * returned sorted ascending in the range filled with numbers.
- *
- * <p><b>Random number generation:</b> By default uses <tt>MersenneTwister</tt>, a very strong random number
- * generator, much better than <tt>java.util.Random</tt>. You can also use other strong random number generators of
- * Paul Houle's RngPack package. For example, <tt>Ranecu</tt>, <tt>Ranmar</tt> and <tt>Ranlux</tt> are strong well
- * analyzed research grade pseudo-random number generators with known periods.
- *
- * @param n the total number of elements to choose (must be <tt>n &gt;= 0</tt> and <tt>n &lt;= N</tt>).
- * @param N the interval to choose random numbers from is <tt>[low,low+N-1]</tt>.
- * @param count the number of elements to be filled into <tt>values</tt> by this call (must be &gt;= 0 and
- * &lt;=<tt>n</tt>). Normally, you will set <tt>count=n</tt>.
- * @param low the interval to choose random numbers from is <tt>[low,low+N-1]</tt>. Hint: If
- * <tt>low==0</tt>, then draws random numbers from the interval <tt>[0,N-1]</tt>.
- * @param values the array into which the random numbers are to be filled; must have a length <tt>&gt;=
- * count+fromIndex</tt>.
- * @param fromIndex the first index within <tt>values</tt> to be filled with numbers (inclusive).
- * @param randomGenerator a random number generator. Set this parameter to <tt>null</tt> to use the default random
- * number generator.
- */
- public static void sample(long n, long N, int count, long low, long[] values, int fromIndex,
- Random randomGenerator) {
- if (n <= 0 || count <= 0) {
- return;
- }
- if (count > n) {
- throw new IllegalArgumentException("count must not be greater than n");
- }
- if (randomGenerator == null) {
- randomGenerator = RandomUtils.getRandom();
- }
-
- if (count == N) { // rare case treated quickly
- long val = low;
- int limit = fromIndex + count;
- for (int i = fromIndex; i < limit; i++) {
- values[i] = val++;
- }
- return;
- }
-
- if (n < N * 0.95) { // || Math.min(count,N-n)>maxTmpMemoryAllowed) {
- sampleMethodD(n, N, count, low, values, fromIndex, randomGenerator);
- } else { // More than 95% of all numbers shall be sampled.
- rejectMethodD(n, N, count, low, values, fromIndex, randomGenerator);
- }
-
-
- }
-
- /**
- * Computes a sorted random set of <tt>count</tt> elements from the interval <tt>[low,low+N-1]</tt>. Since we are
- * talking about a random set, no element will occur more than once.
- *
- * <p>Running time is <tt>O(N)</tt>, on average. Space requirements are zero.
- *
- * <p>Numbers are filled into the specified array starting at index <tt>fromIndex</tt> to the right. The array is
- * returned sorted ascending in the range filled with numbers.
- *
- * @param n the total number of elements to choose (must be &gt;= 0).
- * @param N the interval to choose random numbers from is <tt>[low,low+N-1]</tt>.
- * @param count the number of elements to be filled into <tt>values</tt> by this call (must be &gt;= 0 and
- * &lt;=<tt>n</tt>). Normally, you will set <tt>count=n</tt>.
- * @param low the interval to choose random numbers from is <tt>[low,low+N-1]</tt>. Hint: If
- * <tt>low==0</tt>, then draws random numbers from the interval <tt>[0,N-1]</tt>.
- * @param values the array into which the random numbers are to be filled; must have a length <tt>&gt;=
- * count+fromIndex</tt>.
- * @param fromIndex the first index within <tt>values</tt> to be filled with numbers (inclusive).
- * @param randomGenerator a random number generator.
- */
- private static void sampleMethodA(long n, long N, int count, long low, long[] values, int fromIndex,
- Random randomGenerator) {
- long chosen = -1 + low;
-
- double top = N - n;
- double Nreal = N;
- long S;
- while (n >= 2 && count > 0) {
- double V = randomGenerator.nextDouble();
- S = 0;
- double quot = top / Nreal;
- while (quot > V) {
- S++;
- top--;
- Nreal--;
- quot *= top / Nreal;
- }
- chosen += S + 1;
- values[fromIndex++] = chosen;
- count--;
- Nreal--;
- n--;
- }
-
- if (count > 0) {
- // special case n==1
- S = (long) (Math.round(Nreal) * randomGenerator.nextDouble());
- chosen += S + 1;
- values[fromIndex] = chosen;
- }
- }
-
- /**
- * Efficiently computes a sorted random set of <tt>count</tt> elements from the interval <tt>[low,low+N-1]</tt>. Since
- * we are talking about a random set, no element will occur more than once.
- *
- * <p>Running time is <tt>O(count)</tt>, on average. Space requirements are zero.
- *
- * <p>Numbers are filled into the specified array starting at index <tt>fromIndex</tt> to the right. The array is
- * returned sorted ascending in the range filled with numbers.
- *
- * @param n the total number of elements to choose (must be &gt;= 0).
- * @param N the interval to choose random numbers from is <tt>[low,low+N-1]</tt>.
- * @param count the number of elements to be filled into <tt>values</tt> by this call (must be &gt;= 0 and
- * &lt;=<tt>n</tt>). Normally, you will set <tt>count=n</tt>.
- * @param low the interval to choose random numbers from is <tt>[low,low+N-1]</tt>. Hint: If
- * <tt>low==0</tt>, then draws random numbers from the interval <tt>[0,N-1]</tt>.
- * @param values the array into which the random numbers are to be filled; must have a length <tt>&gt;=
- * count+fromIndex</tt>.
- * @param fromIndex the first index within <tt>values</tt> to be filled with numbers (inclusive).
- * @param randomGenerator a random number generator.
- */
- private static void sampleMethodD(long n, long N, int count, long low, long[] values, int fromIndex,
- Random randomGenerator) {
- long chosen = -1 + low;
-
- double nreal = n;
- double ninv = 1.0 / nreal;
- double Nreal = N;
- double vprime = Math.exp(Math.log(randomGenerator.nextDouble()) * ninv);
- long qu1 = -n + 1 + N;
- double qu1real = -nreal + 1.0 + Nreal;
- long negalphainv = -13;
- //tuning parameter, determines when to switch from method D to method A. Dependent on programming
- // language, platform, etc.
- long threshold = -negalphainv * n;
-
- long S;
- while (n > 1 && count > 0 && threshold < N) {
- double nmin1inv = 1.0 / (-1.0 + nreal);
- double negSreal;
- while (true) {
- double X;
- while (true) { // step D2: generate U and X
- X = Nreal * (-vprime + 1.0);
- S = (long) X;
- if (S < qu1) {
- break;
- }
- vprime = Math.exp(Math.log(randomGenerator.nextDouble()) * ninv);
- }
- double U = randomGenerator.nextDouble();
- negSreal = -S;
-
- //step D3: Accept?
- double y1 = Math.exp(Math.log(U * Nreal / qu1real) * nmin1inv);
- vprime = y1 * (-X / Nreal + 1.0) * qu1real / (negSreal + qu1real);
- if (vprime <= 1.0) {
- break;
- } //break inner loop
-
- //step D4: Accept?
- double top = -1.0 + Nreal;
- long limit;
- double bottom;
- if (n - 1 > S) {
- bottom = -nreal + Nreal;
- limit = -S + N;
- } else {
- bottom = -1.0 + negSreal + Nreal;
- limit = qu1;
- }
- double y2 = 1.0;
- for (long t = N - 1; t >= limit; t--) {
- y2 *= top / bottom;
- top--;
- bottom--;
- }
- if (Nreal / (-X + Nreal) >= y1 * Math.exp(Math.log(y2) * nmin1inv)) {
- // accept !
- vprime = Math.exp(Math.log(randomGenerator.nextDouble()) * nmin1inv);
- break; //break inner loop
- }
- vprime = Math.exp(Math.log(randomGenerator.nextDouble()) * ninv);
- }
-
- //step D5: select the (S+1)st record !
- chosen += S + 1;
- values[fromIndex++] = chosen;
- /*
- // invert
- for (int iter=0; iter<S && count > 0; iter++) {
- values[fromIndex++] = ++chosen;
- count--;
- }
- chosen++;
- */
- count--;
-
- N -= S + 1;
- Nreal = negSreal - 1.0 + Nreal;
- n--;
- nreal--;
- ninv = nmin1inv;
- qu1 = -S + qu1;
- qu1real = negSreal + qu1real;
- threshold += negalphainv;
- } //end while
-
-
- if (count > 0) {
- if (n > 1) { //faster to use method A to finish the sampling
- sampleMethodA(n, N, count, chosen + 1, values, fromIndex, randomGenerator);
- } else {
- //special case n==1
- S = (long) (N * vprime);
- chosen += S + 1;
- values[fromIndex++] = chosen;
- }
- }
- }
-
-}

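For reference, a minimal usage sketch of the public sample(...) entry point deleted above. It assumes the enclosing class is org.apache.mahout.math.jet.random.sampling.RandomSampler (the class header falls outside this hunk); the signature and the null-generator fallback come from the javadoc in the diff. Note sample() dispatches to sampleMethodD() below the 95% density threshold and to rejectMethodD() above it.

    import java.util.Arrays;
    import org.apache.mahout.math.jet.random.sampling.RandomSampler; // assumed enclosing class

    public class SampleSketch {
      public static void main(String[] args) {
        // Draw n = 5 distinct, ascending numbers from [100, 100 + 1000 - 1].
        long[] values = new long[5];
        // count = n fills all five slots; null falls back to RandomUtils.getRandom().
        RandomSampler.sample(5, 1000, 5, 100, values, 0, null);
        System.out.println(Arrays.toString(values)); // e.g. [137, 402, 588, 776, 931]
      }
    }
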
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/jet/stat/Gamma.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/jet/stat/Gamma.java b/math/src/main/java/org/apache/mahout/math/jet/stat/Gamma.java
deleted file mode 100644
index 3ab61a6..0000000
--- a/math/src/main/java/org/apache/mahout/math/jet/stat/Gamma.java
+++ /dev/null
@@ -1,681 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
-Copyright 1999 CERN - European Organization for Nuclear Research.
-Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
-is hereby granted without fee, provided that the above copyright notice appear in all copies and
-that both that copyright notice and this permission notice appear in supporting documentation.
-CERN makes no representations about the suitability of this software for any purpose.
-It is provided "as is" without expressed or implied warranty.
-*/
-package org.apache.mahout.math.jet.stat;
-
-import org.apache.mahout.math.jet.math.Constants;
-import org.apache.mahout.math.jet.math.Polynomial;
-
-/** Partially deprecated until unit tests are in place. Until this time, this class/interface is unsupported. */
-public final class Gamma {
-
- private static final double MAXSTIR = 143.01608;
-
- private Gamma() {
- }
-
- /**
- * Returns the beta function of the arguments.
- * <pre>
- * - -
- * | (a) | (b)
- * beta( a, b ) = -----------.
- * -
- * | (a+b)
- * </pre>
- * @param alpha
- * @param beta
- * @return The beta function for given values of alpha and beta.
- */
- public static double beta(double alpha, double beta) {
- double y;
- if (alpha < 40 && beta < 40) {
- y = gamma(alpha + beta);
- if (y == 0.0) {
- return 1.0;
- }
-
- if (alpha > beta) {
- y = gamma(alpha) / y;
- y *= gamma(beta);
- } else {
- y = gamma(beta) / y;
- y *= gamma(alpha);
- }
- } else {
- y = Math.exp(logGamma(alpha) + logGamma(beta) - logGamma(alpha + beta));
- }
-
- return y;
- }
-
- /** Returns the Gamma function of the argument. */
- public static double gamma(double x) {
-
- double[] pCoefficient = {
- 1.60119522476751861407E-4,
- 1.19135147006586384913E-3,
- 1.04213797561761569935E-2,
- 4.76367800457137231464E-2,
- 2.07448227648435975150E-1,
- 4.94214826801497100753E-1,
- 9.99999999999999996796E-1
- };
- double[] qCoefficient = {
- -2.31581873324120129819E-5,
- 5.39605580493303397842E-4,
- -4.45641913851797240494E-3,
- 1.18139785222060435552E-2,
- 3.58236398605498653373E-2,
- -2.34591795718243348568E-1,
- 7.14304917030273074085E-2,
- 1.00000000000000000320E0
- };
-//double MAXGAM = 171.624376956302725;
-//double LOGPI = 1.14472988584940017414;
-
- double p;
- double z;
-
- double q = Math.abs(x);
-
- if (q > 33.0) {
- if (x < 0.0) {
- p = Math.floor(q);
- if (p == q) {
- throw new ArithmeticException("gamma: overflow");
- }
- //int i = (int) p;
- z = q - p;
- if (z > 0.5) {
- p += 1.0;
- z = q - p;
- }
- z = q * Math.sin(Math.PI * z);
- if (z == 0.0) {
- throw new ArithmeticException("gamma: overflow");
- }
- z = Math.abs(z);
- z = Math.PI / (z * stirlingFormula(q));
-
- return -z;
- } else {
- return stirlingFormula(x);
- }
- }
-
- z = 1.0;
- while (x >= 3.0) {
- x -= 1.0;
- z *= x;
- }
-
- while (x < 0.0) {
- if (x == 0.0) {
- throw new ArithmeticException("gamma: singular");
- }
- if (x > -1.0e-9) {
- return z / ((1.0 + 0.5772156649015329 * x) * x);
- }
- z /= x;
- x += 1.0;
- }
-
- while (x < 2.0) {
- if (x == 0.0) {
- throw new ArithmeticException("gamma: singular");
- }
- if (x < 1.0e-9) {
- return z / ((1.0 + 0.5772156649015329 * x) * x);
- }
- z /= x;
- x += 1.0;
- }
-
- if ((x == 2.0) || (x == 3.0)) {
- return z;
- }
-
- x -= 2.0;
- p = Polynomial.polevl(x, pCoefficient, 6);
- q = Polynomial.polevl(x, qCoefficient, 7);
- return z * p / q;
-
- }
-
- /**
- * Returns the regularized Incomplete Beta Function evaluated from zero to <tt>xx</tt>; formerly named <tt>ibeta</tt>.
- *
- * See http://en.wikipedia.org/wiki/Incomplete_beta_function#Incomplete_beta_function
- *
- * @param alpha the alpha parameter of the beta distribution.
- * @param beta the beta parameter of the beta distribution.
- * @param xx the integration end point.
- */
- public static double incompleteBeta(double alpha, double beta, double xx) {
-
- if (alpha <= 0.0) {
- throw new ArithmeticException("incompleteBeta: Domain error! alpha must be > 0, but was " + alpha);
- }
-
- if (beta <= 0.0) {
- throw new ArithmeticException("incompleteBeta: Domain error! beta must be > 0, but was " + beta);
- }
-
- if (xx <= 0.0) {
- return 0.0;
- }
-
- if (xx >= 1.0) {
- return 1.0;
- }
-
- double t;
- if ((beta * xx) <= 1.0 && xx <= 0.95) {
- t = powerSeries(alpha, beta, xx);
- return t;
- }
-
- double w = 1.0 - xx;
-
- /* Reverse a and b if x is greater than the mean. */
- double xc;
- double x;
- double b;
- double a;
- boolean flag = false;
- if (xx > (alpha / (alpha + beta))) {
- flag = true;
- a = beta;
- b = alpha;
- xc = xx;
- x = w;
- } else {
- a = alpha;
- b = beta;
- xc = w;
- x = xx;
- }
-
- if (flag && (b * x) <= 1.0 && x <= 0.95) {
- t = powerSeries(a, b, x);
- t = t <= Constants.MACHEP ? 1.0 - Constants.MACHEP : 1.0 - t;
- return t;
- }
-
- /* Choose expansion for better convergence. */
- double y = x * (a + b - 2.0) - (a - 1.0);
- w = y < 0.0 ? incompleteBetaFraction1(a, b, x) : incompleteBetaFraction2(a, b, x) / xc;
-
- /* Multiply w by the factor
- a b _ _ _
- x (1-x) | (a+b) / ( a | (a) | (b) ) . */
-
- y = a * Math.log(x);
- t = b * Math.log(xc);
- if ((a + b) < Constants.MAXGAM && Math.abs(y) < Constants.MAXLOG && Math.abs(t) < Constants.MAXLOG) {
- t = Math.pow(xc, b);
- t *= Math.pow(x, a);
- t /= a;
- t *= w;
- t *= gamma(a + b) / (gamma(a) * gamma(b));
- if (flag) {
- t = t <= Constants.MACHEP ? 1.0 - Constants.MACHEP : 1.0 - t;
- }
- return t;
- }
- /* Resort to logarithms. */
- y += t + logGamma(a + b) - logGamma(a) - logGamma(b);
- y += Math.log(w / a);
- t = y < Constants.MINLOG ? 0.0 : Math.exp(y);
-
- if (flag) {
- t = t <= Constants.MACHEP ? 1.0 - Constants.MACHEP : 1.0 - t;
- }
- return t;
- }
-
- /** Continued fraction expansion #1 for incomplete beta integral; formerly named <tt>incbcf</tt>. */
- static double incompleteBetaFraction1(double a, double b, double x) {
-
- double k1 = a;
- double k2 = a + b;
- double k3 = a;
- double k4 = a + 1.0;
- double k5 = 1.0;
- double k6 = b - 1.0;
- double k7 = k4;
- double k8 = a + 2.0;
-
- double pkm2 = 0.0;
- double qkm2 = 1.0;
- double pkm1 = 1.0;
- double qkm1 = 1.0;
- double ans = 1.0;
- double r = 1.0;
- int n = 0;
- double thresh = 3.0 * Constants.MACHEP;
- do {
- double xk = -(x * k1 * k2) / (k3 * k4);
- double pk = pkm1 + pkm2 * xk;
- double qk = qkm1 + qkm2 * xk;
- pkm2 = pkm1;
- pkm1 = pk;
- qkm2 = qkm1;
- qkm1 = qk;
-
- xk = (x * k5 * k6) / (k7 * k8);
- pk = pkm1 + pkm2 * xk;
- qk = qkm1 + qkm2 * xk;
- pkm2 = pkm1;
- pkm1 = pk;
- qkm2 = qkm1;
- qkm1 = qk;
-
- if (qk != 0) {
- r = pk / qk;
- }
- double t;
- if (r != 0) {
- t = Math.abs((ans - r) / r);
- ans = r;
- } else {
- t = 1.0;
- }
-
- if (t < thresh) {
- return ans;
- }
-
- k1 += 1.0;
- k2 += 1.0;
- k3 += 2.0;
- k4 += 2.0;
- k5 += 1.0;
- k6 -= 1.0;
- k7 += 2.0;
- k8 += 2.0;
-
- if ((Math.abs(qk) + Math.abs(pk)) > Constants.BIG) {
- pkm2 *= Constants.BIG_INVERSE;
- pkm1 *= Constants.BIG_INVERSE;
- qkm2 *= Constants.BIG_INVERSE;
- qkm1 *= Constants.BIG_INVERSE;
- }
- if ((Math.abs(qk) < Constants.BIG_INVERSE) || (Math.abs(pk) < Constants.BIG_INVERSE)) {
- pkm2 *= Constants.BIG;
- pkm1 *= Constants.BIG;
- qkm2 *= Constants.BIG;
- qkm1 *= Constants.BIG;
- }
- } while (++n < 300);
-
- return ans;
- }
-
- /** Continued fraction expansion #2 for incomplete beta integral; formerly named <tt>incbd</tt>. */
- static double incompleteBetaFraction2(double a, double b, double x) {
-
- double k1 = a;
- double k2 = b - 1.0;
- double k3 = a;
- double k4 = a + 1.0;
- double k5 = 1.0;
- double k6 = a + b;
- double k7 = a + 1.0;
- double k8 = a + 2.0;
-
- double pkm2 = 0.0;
- double qkm2 = 1.0;
- double pkm1 = 1.0;
- double qkm1 = 1.0;
- double z = x / (1.0 - x);
- double ans = 1.0;
- double r = 1.0;
- int n = 0;
- double thresh = 3.0 * Constants.MACHEP;
- do {
- double xk = -(z * k1 * k2) / (k3 * k4);
- double pk = pkm1 + pkm2 * xk;
- double qk = qkm1 + qkm2 * xk;
- pkm2 = pkm1;
- pkm1 = pk;
- qkm2 = qkm1;
- qkm1 = qk;
-
- xk = (z * k5 * k6) / (k7 * k8);
- pk = pkm1 + pkm2 * xk;
- qk = qkm1 + qkm2 * xk;
- pkm2 = pkm1;
- pkm1 = pk;
- qkm2 = qkm1;
- qkm1 = qk;
-
- if (qk != 0) {
- r = pk / qk;
- }
- double t;
- if (r != 0) {
- t = Math.abs((ans - r) / r);
- ans = r;
- } else {
- t = 1.0;
- }
-
- if (t < thresh) {
- return ans;
- }
-
- k1 += 1.0;
- k2 -= 1.0;
- k3 += 2.0;
- k4 += 2.0;
- k5 += 1.0;
- k6 += 1.0;
- k7 += 2.0;
- k8 += 2.0;
-
- if ((Math.abs(qk) + Math.abs(pk)) > Constants.BIG) {
- pkm2 *= Constants.BIG_INVERSE;
- pkm1 *= Constants.BIG_INVERSE;
- qkm2 *= Constants.BIG_INVERSE;
- qkm1 *= Constants.BIG_INVERSE;
- }
- if ((Math.abs(qk) < Constants.BIG_INVERSE) || (Math.abs(pk) < Constants.BIG_INVERSE)) {
- pkm2 *= Constants.BIG;
- pkm1 *= Constants.BIG;
- qkm2 *= Constants.BIG;
- qkm1 *= Constants.BIG;
- }
- } while (++n < 300);
-
- return ans;
- }
-
- /**
- * Returns the Incomplete Gamma function; formerly named <tt>igamma</tt>.
- *
- * @param alpha the shape parameter of the gamma distribution.
- * @param x the integration end point.
- * @return The value of the unnormalized incomplete gamma function.
- */
- public static double incompleteGamma(double alpha, double x) {
- if (x <= 0 || alpha <= 0) {
- return 0.0;
- }
-
- if (x > 1.0 && x > alpha) {
- return 1.0 - incompleteGammaComplement(alpha, x);
- }
-
- /* Compute x**a * exp(-x) / gamma(a) */
- double ax = alpha * Math.log(x) - x - logGamma(alpha);
- if (ax < -Constants.MAXLOG) {
- return 0.0;
- }
-
- ax = Math.exp(ax);
-
- /* power series */
- double r = alpha;
- double c = 1.0;
- double ans = 1.0;
-
- do {
- r += 1.0;
- c *= x / r;
- ans += c;
- }
- while (c / ans > Constants.MACHEP);
-
- return ans * ax / alpha;
-
- }
-
- /**
- * Returns the Complemented Incomplete Gamma function; formerly named <tt>igamc</tt>.
- *
- * @param alpha the shape parameter of the gamma distribution.
- * @param x the integration start point.
- */
- public static double incompleteGammaComplement(double alpha, double x) {
-
- if (x <= 0 || alpha <= 0) {
- return 1.0;
- }
-
- if (x < 1.0 || x < alpha) {
- return 1.0 - incompleteGamma(alpha, x);
- }
-
- double ax = alpha * Math.log(x) - x - logGamma(alpha);
- if (ax < -Constants.MAXLOG) {
- return 0.0;
- }
-
- ax = Math.exp(ax);
-
- /* continued fraction */
- double y = 1.0 - alpha;
- double z = x + y + 1.0;
- double c = 0.0;
- double pkm2 = 1.0;
- double qkm2 = x;
- double pkm1 = x + 1.0;
- double qkm1 = z * x;
- double ans = pkm1 / qkm1;
-
- double t;
- do {
- c += 1.0;
- y += 1.0;
- z += 2.0;
- double yc = y * c;
- double pk = pkm1 * z - pkm2 * yc;
- double qk = qkm1 * z - qkm2 * yc;
- if (qk != 0) {
- double r = pk / qk;
- t = Math.abs((ans - r) / r);
- ans = r;
- } else {
- t = 1.0;
- }
-
- pkm2 = pkm1;
- pkm1 = pk;
- qkm2 = qkm1;
- qkm1 = qk;
- if (Math.abs(pk) > Constants.BIG) {
- pkm2 *= Constants.BIG_INVERSE;
- pkm1 *= Constants.BIG_INVERSE;
- qkm2 *= Constants.BIG_INVERSE;
- qkm1 *= Constants.BIG_INVERSE;
- }
- } while (t > Constants.MACHEP);
-
- return ans * ax;
- }
-
- /** Returns the natural logarithm of the gamma function; formerly named <tt>lgamma</tt>. */
- public static double logGamma(double x) {
- double p;
- double q;
- double z;
-
- double[] aCoefficient = {
- 8.11614167470508450300E-4,
- -5.95061904284301438324E-4,
- 7.93650340457716943945E-4,
- -2.77777777730099687205E-3,
- 8.33333333333331927722E-2
- };
- double[] bCoefficient = {
- -1.37825152569120859100E3,
- -3.88016315134637840924E4,
- -3.31612992738871184744E5,
- -1.16237097492762307383E6,
- -1.72173700820839662146E6,
- -8.53555664245765465627E5
- };
- double[] cCoefficient = {
- /* 1.00000000000000000000E0, */
- -3.51815701436523470549E2,
- -1.70642106651881159223E4,
- -2.20528590553854454839E5,
- -1.13933444367982507207E6,
- -2.53252307177582951285E6,
- -2.01889141433532773231E6
- };
-
- if (x < -34.0) {
- q = -x;
- double w = logGamma(q);
- p = Math.floor(q);
- if (p == q) {
- throw new ArithmeticException("lgam: Overflow");
- }
- z = q - p;
- if (z > 0.5) {
- p += 1.0;
- z = p - q;
- }
- z = q * Math.sin(Math.PI * z);
- if (z == 0.0) {
- throw new ArithmeticException("lgamma: Overflow");
- }
- z = Constants.LOGPI - Math.log(z) - w;
- return z;
- }
-
- if (x < 13.0) {
- z = 1.0;
- while (x >= 3.0) {
- x -= 1.0;
- z *= x;
- }
- while (x < 2.0) {
- if (x == 0.0) {
- throw new ArithmeticException("lgamma: Overflow");
- }
- z /= x;
- x += 1.0;
- }
- if (z < 0.0) {
- z = -z;
- }
- if (x == 2.0) {
- return Math.log(z);
- }
- x -= 2.0;
- p = x * Polynomial.polevl(x, bCoefficient, 5) / Polynomial.p1evl(x, cCoefficient, 6);
- return Math.log(z) + p;
- }
-
- if (x > 2.556348e305) {
- throw new ArithmeticException("lgamma: Overflow");
- }
-
- q = (x - 0.5) * Math.log(x) - x + 0.91893853320467274178;
- //if ( x > 1.0e8 ) return( q );
- if (x > 1.0e8) {
- return q;
- }
-
- p = 1.0 / (x * x);
- if (x >= 1000.0) {
- q += ((7.9365079365079365079365e-4 * p
- - 2.7777777777777777777778e-3) * p
- + 0.0833333333333333333333) / x;
- } else {
- q += Polynomial.polevl(p, aCoefficient, 4) / x;
- }
- return q;
- }
-
- /**
- * Power series for incomplete beta integral; formerly named <tt>pseries</tt>. Use when b*x is small and x not too
- * close to 1.
- */
- private static double powerSeries(double a, double b, double x) {
-
- double ai = 1.0 / a;
- double u = (1.0 - b) * x;
- double v = u / (a + 1.0);
- double t1 = v;
- double t = u;
- double n = 2.0;
- double s = 0.0;
- double z = Constants.MACHEP * ai;
- while (Math.abs(v) > z) {
- u = (n - b) * x / n;
- t *= u;
- v = t / (a + n);
- s += v;
- n += 1.0;
- }
- s += t1;
- s += ai;
-
- u = a * Math.log(x);
- if ((a + b) < Constants.MAXGAM && Math.abs(u) < Constants.MAXLOG) {
- t = gamma(a + b) / (gamma(a) * gamma(b));
- s *= t * Math.pow(x, a);
- } else {
- t = logGamma(a + b) - logGamma(a) - logGamma(b) + u + Math.log(s);
- s = t < Constants.MINLOG ? 0.0 : Math.exp(t);
- }
- return s;
- }
-
- /**
- * Returns the Gamma function computed by Stirling's formula; formerly named <tt>stirf</tt>. The polynomial STIR is
- * valid for 33 <= x <= 172.
- */
- static double stirlingFormula(double x) {
- double[] coefficients = {
- 7.87311395793093628397E-4,
- -2.29549961613378126380E-4,
- -2.68132617805781232825E-3,
- 3.47222221605458667310E-3,
- 8.33333333333482257126E-2,
- };
-
- double w = 1.0 / x;
- double y = Math.exp(x);
-
- w = 1.0 + w * Polynomial.polevl(w, coefficients, 4);
-
- if (x > MAXSTIR) {
- /* Avoid overflow in Math.pow() */
- double v = Math.pow(x, 0.5 * x - 0.25);
- y = v * (v / y);
- } else {
- y = Math.pow(x, x - 0.5) / y;
- }
- y = Constants.SQTPI * y * w;
- return y;
- }
-}

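A short sketch exercising the Gamma API removed above; every method name and signature is taken from this hunk, and the identities follow from the javadoc (beta via logGamma for large arguments, incompleteGamma plus its complement summing to one):

    import org.apache.mahout.math.jet.stat.Gamma;

    public class GammaSketch {
      public static void main(String[] args) {
        // beta(a, b) = exp(logGamma(a) + logGamma(b) - logGamma(a + b))
        double direct = Gamma.beta(2.5, 3.5);
        double viaLog = Math.exp(Gamma.logGamma(2.5) + Gamma.logGamma(3.5) - Gamma.logGamma(6.0));
        System.out.println(direct + " ~= " + viaLog);

        // The regularized incomplete beta behaves as a CDF in xx.
        System.out.println(Gamma.incompleteBeta(2.0, 3.0, 0.0)); // 0.0
        System.out.println(Gamma.incompleteBeta(2.0, 3.0, 1.0)); // 1.0

        // incompleteGamma and incompleteGammaComplement are complementary.
        double p = Gamma.incompleteGamma(3.0, 2.5);
        double q = Gamma.incompleteGammaComplement(3.0, 2.5);
        System.out.println(p + q); // ~= 1.0
      }
    }
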
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/jet/stat/Probability.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/jet/stat/Probability.java b/math/src/main/java/org/apache/mahout/math/jet/stat/Probability.java
deleted file mode 100644
index bcd1a86..0000000
--- a/math/src/main/java/org/apache/mahout/math/jet/stat/Probability.java
+++ /dev/null
@@ -1,203 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
-Copyright 1999 CERN - European Organization for Nuclear Research.
-Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
-is hereby granted without fee, provided that the above copyright notice appear in all copies and
-that both that copyright notice and this permission notice appear in supporting documentation.
-CERN makes no representations about the suitability of this software for any purpose.
-It is provided "as is" without expressed or implied warranty.
-*/
-package org.apache.mahout.math.jet.stat;
-
-import org.apache.mahout.math.jet.random.Normal;
-
-/** Partially deprecated until unit tests are in place. Until this time, this class/interface is unsupported. */
-public final class Probability {
-
- private static final Normal UNIT_NORMAL = new Normal(0, 1, null);
-
- private Probability() {
- }
-
- /**
- * Returns the area from zero to <tt>x</tt> under the beta density function.
- * <pre>
- * x
- * - -
- * | (a+b) | | a-1 b-1
- * P(x) = ---------- | t (1-t) dt
- * - - | |
- * | (a) | (b) -
- * 0
- * </pre>
- * This function is identical to the incomplete beta integral function <tt>Gamma.incompleteBeta(a, b, x)</tt>.
- *
- * The complemented function is
- *
- * <tt>1 - P(1-x) = Gamma.incompleteBeta( b, a, x )</tt>;
- */
- public static double beta(double a, double b, double x) {
- return Gamma.incompleteBeta(a, b, x);
- }
-
- /**
- * Returns the integral from zero to <tt>x</tt> of the gamma probability density function.
- * <pre>
- *
- * alpha - x
- * beta | alpha-1 -beta t
- * y = --------- | t e dt
- * - |
- * | (alpha) - 0
- * </pre>
- * The incomplete gamma integral is used, according to the relation
- *
- * <tt>y = Gamma.incompleteGamma( alpha, beta*x )</tt>.
- *
- * See http://en.wikipedia.org/wiki/Gamma_distribution#Probability_density_function
- *
- * @param alpha the shape parameter of the gamma distribution.
- * @param beta the rate parameter of the gamma distribution.
- * @param x integration end point.
- */
- public static double gamma(double alpha, double beta, double x) {
- if (x < 0.0) {
- return 0.0;
- }
- return Gamma.incompleteGamma(alpha, beta * x);
- }
-
- /**
- * Returns the sum of the terms <tt>0</tt> through <tt>k</tt> of the Negative Binomial Distribution.
- * {@code
- * k
- * -- ( n+j-1 ) n j
- * > ( ) p (1-p)
- * -- ( j )
- * j=0
- * }
- * In a sequence of Bernoulli trials, this is the probability that <tt>k</tt> or fewer failures precede the
- * <tt>n</tt>-th success. <p> The terms are not computed individually; instead the incomplete beta integral is
- * employed, according to the formula <p> <tt>y = negativeBinomial( k, n, p ) = Gamma.incompleteBeta( n, k+1, p
- * )</tt>.
- *
- * All arguments must be positive.
- *
- * @param k end term.
- * @param n the number of trials.
- * @param p the probability of success (must be in <tt>(0.0,1.0)</tt>).
- */
- public static double negativeBinomial(int k, int n, double p) {
- if (p < 0.0 || p > 1.0) {
- throw new IllegalArgumentException();
- }
- if (k < 0) {
- return 0.0;
- }
-
- return Gamma.incompleteBeta(n, k + 1, p);
- }
-
- /**
- * Returns the area under the Normal (Gaussian) probability density function, integrated from minus infinity to
- * <tt>x</tt> (assumes mean is zero, variance is one).
- * {@code
- * x
- * -
- * 1 | | 2
- * normal(x) = --------- | exp( - t /2 ) dt
- * sqrt(2pi) | |
- * -
- * -inf.
- *
- * = ( 1 + erf(z) ) / 2
- * = erfc(-z) / 2
- * }
- * where <tt>z = x/sqrt(2)</tt>.
- * <p>
- * The implementation does not evaluate the error function directly; it uses approximation 26.2.17
- * from Abramowitz and Stegun (see http://www.math.sfu.ca/~cbm/aands/page_932.htm
- * and http://en.wikipedia.org/wiki/Normal_distribution#Numerical_approximations_of_the_normal_cdf).
- */
-
- public static double normal(double a) {
- if (a < 0) {
- return 1 - normal(-a);
- }
- double b0 = 0.2316419;
- double b1 = 0.319381530;
- double b2 = -0.356563782;
- double b3 = 1.781477937;
- double b4 = -1.821255978;
- double b5 = 1.330274429;
- double t = 1 / (1 + b0 * a);
- return 1 - UNIT_NORMAL.pdf(a) * t * (b1 + t * (b2 + t * (b3 + t * (b4 + t * b5))));
- }
-
- /**
- * Returns the area under the Normal (Gaussian) probability density function, integrated from minus infinity to
- * <tt>x</tt>.
- * {@code
- * x
- * -
- * 1 | | 2
- * normal(x) = --------- | exp( - (t-mean) / 2v ) dt
- * sqrt(2pi*v)| |
- * -
- * -inf.
- *
- * }
- * where <tt>v = variance</tt>. Computation delegates to <tt>normal((x-mean)/sqrt(v))</tt>.
- *
- * @param mean the mean of the normal distribution.
- * @param variance the variance of the normal distribution.
- * @param x the integration limit.
- */
- public static double normal(double mean, double variance, double x) {
- return normal((x - mean) / Math.sqrt(variance));
- }
-
- /**
- * Returns the sum of the first <tt>k</tt> terms of the Poisson distribution.
- * <pre>
- * k j
- * -- -m m
- * > e --
- * -- j!
- * j=0
- * </pre>
- * The terms are not summed directly; instead the incomplete gamma integral is employed, according to the relation <p>
- * <tt>y = poisson( k, m ) = Gamma.incompleteGammaComplement( k+1, m )</tt>.
- *
- * The arguments must both be positive.
- *
- * @param k number of terms.
- * @param mean the mean of the poisson distribution.
- */
- public static double poisson(int k, double mean) {
- if (mean < 0) {
- throw new IllegalArgumentException();
- }
- if (k < 0) {
- return 0.0;
- }
- return Gamma.incompleteGammaComplement(k + 1, mean);
- }
-
-}

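And a companion sketch for the Probability wrappers deleted above, again restricted to methods visible in this hunk; the poisson/incompleteGammaComplement identity is stated in the javadoc itself:

    import org.apache.mahout.math.jet.stat.Gamma;
    import org.apache.mahout.math.jet.stat.Probability;

    public class ProbabilitySketch {
      public static void main(String[] args) {
        // Standard normal CDF via the Abramowitz & Stegun 26.2.17 approximation.
        System.out.println(Probability.normal(0.0));  // 0.5
        System.out.println(Probability.normal(1.96)); // ~0.975

        // The three-argument form standardizes: normal((x - mean) / sqrt(variance)).
        System.out.println(Probability.normal(10.0, 4.0, 13.92)); // ~0.975

        // poisson(k, m) == Gamma.incompleteGammaComplement(k + 1, m)
        System.out.println(Probability.poisson(3, 2.0));
        System.out.println(Gamma.incompleteGammaComplement(4.0, 2.0)); // same value
      }
    }
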
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/jet/stat/package-info.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/jet/stat/package-info.java b/math/src/main/java/org/apache/mahout/math/jet/stat/package-info.java
deleted file mode 100644
index 1d4d7bd..0000000
--- a/math/src/main/java/org/apache/mahout/math/jet/stat/package-info.java
+++ /dev/null
@@ -1,5 +0,0 @@
-/**
- * Tools for basic and advanced statistics: Estimators, Gamma functions, Beta functions, Probabilities,
- * Special integrals, etc.
- */
-package org.apache.mahout.math.jet.stat;

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/list/AbstractList.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/list/AbstractList.java b/math/src/main/java/org/apache/mahout/math/list/AbstractList.java
deleted file mode 100644
index c672f40..0000000
--- a/math/src/main/java/org/apache/mahout/math/list/AbstractList.java
+++ /dev/null
@@ -1,247 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-/*
-Copyright 1999 CERN - European Organization for Nuclear Research.
-Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
-is hereby granted without fee, provided that the above copyright notice appear in all copies and
-that both that copyright notice and this permission notice appear in supporting documentation.
-CERN makes no representations about the suitability of this software for any purpose.
-It is provided "as is" without expressed or implied warranty.
-*/
-package org.apache.mahout.math.list;
-
-import org.apache.mahout.math.PersistentObject;
-
-/**
- * Abstract base class for resizable lists holding objects or primitive data types such as
- * {@code int}, {@code float}, etc.
- * First see the <a href="package-summary.html">package summary</a> and javadoc
- * <a href="package-tree.html">tree view</a> to get the broad picture.
- * <p>
- * <b>Note that this implementation is not synchronized.</b>
- *
- * @author ***@cern.ch
- * @version 1.0, 09/24/99
- * @see java.util.ArrayList
- * @see java.util.Vector
- * @see java.util.Arrays
- */
-public abstract class AbstractList extends PersistentObject {
-
- public abstract int size();
-
- public boolean isEmpty() {
- return size() == 0;
- }
-
- /**
- * Inserts <tt>length</tt> dummy elements before the specified position into the receiver. Shifts the element
- * currently at that position (if any) and any subsequent elements to the right. <b>This method must set the new size
- * to be <tt>size()+length</tt></b>.
- *
- * @param index index before which to insert dummy elements (must be in [0,size]).
- * @param length number of dummy elements to be inserted.
- * @throws IndexOutOfBoundsException if <tt>index &lt; 0 || index &gt; size()</tt>.
- */
- protected abstract void beforeInsertDummies(int index, int length);
-
- /** Checks if the given index is in range. */
- protected static void checkRange(int index, int theSize) {
- if (index >= theSize || index < 0) {
- throw new IndexOutOfBoundsException("Index: " + index + ", Size: " + theSize);
- }
- }
-
- /**
- * Checks if the given range is within the contained array's bounds.
- *
- * @throws IndexOutOfBoundsException if <tt>(from&lt;0 || from&gt;to || to&gt;=size()) && to!=from-1</tt>.
- */
- protected static void checkRangeFromTo(int from, int to, int theSize) {
- if (to == from - 1) {
- return;
- }
- if (from < 0 || from > to || to >= theSize) {
- throw new IndexOutOfBoundsException("from: " + from + ", to: " + to + ", size=" + theSize);
- }
- }
-
- /**
- * Removes all elements from the receiver. The receiver will be empty after this call returns, but keep its current
- * capacity.
- */
- public void clear() {
- removeFromTo(0, size() - 1);
- }
-
- /**
- * Sorts the receiver into ascending order. This sort is guaranteed to be <i>stable</i>: equal elements will not be
- * reordered as a result of the sort.<p>
- *
- * The sorting algorithm is a modified mergesort (in which the merge is omitted if the highest element in the low
- * sublist is less than the lowest element in the high sublist). This algorithm offers guaranteed n*log(n)
- * performance, and can approach linear performance on nearly sorted lists.
- *
- * <p><b>You should never call this method unless you are sure that this particular sorting algorithm is the right one
- * for your data set.</b> It is generally better to call <tt>sort()</tt> or <tt>sortFromTo(...)</tt> instead, because
- * those methods automatically choose the best sorting algorithm.
- */
- public final void mergeSort() {
- mergeSortFromTo(0, size() - 1);
- }
-
- /**
- * Sorts the receiver into ascending order. This sort is guaranteed to be <i>stable</i>: equal elements will not be
- * reordered as a result of the sort.<p>
- *
- * The sorting algorithm is a modified mergesort (in which the merge is omitted if the highest element in the low
- * sublist is less than the lowest element in the high sublist). This algorithm offers guaranteed n*log(n)
- * performance, and can approach linear performance on nearly sorted lists.
- *
- * <p><b>You should never call this method unless you are sure that this particular sorting algorithm is the right one
- * for your data set.</b> It is generally better to call <tt>sort()</tt> or <tt>sortFromTo(...)</tt> instead, because
- * those methods automatically choose the best sorting algorithm.
- *
- * @param from the index of the first element (inclusive) to be sorted.
- * @param to the index of the last element (inclusive) to be sorted.
- * @throws IndexOutOfBoundsException if <tt>(from&lt;0 || from&gt;to || to&gt;=size()) && to!=from-1</tt>.
- */
- public abstract void mergeSortFromTo(int from, int to);
-
- /**
- * Sorts the receiver into ascending order. The sorting algorithm is a tuned quicksort, adapted from Jon L. Bentley
- * and M. Douglas McIlroy's "Engineering a Sort Function", Software-Practice and Experience, Vol. 23(11) P. 1249-1265
- * (November 1993). This algorithm offers n*log(n) performance on many data sets that cause other quicksorts to
- * degrade to quadratic performance.
- *
- * <p><b>You should never call this method unless you are sure that this particular sorting algorithm is the right one
- * for your data set.</b> It is generally better to call <tt>sort()</tt> or <tt>sortFromTo(...)</tt> instead, because
- * those methods automatically choose the best sorting algorithm.
- */
- public final void quickSort() {
- quickSortFromTo(0, size() - 1);
- }
-
- /**
- * Sorts the specified range of the receiver into ascending order. The sorting algorithm is a tuned quicksort,
- * adapted from Jon L. Bentley and M. Douglas McIlroy's "Engineering a Sort Function", Software-Practice and
- * Experience, Vol. 23(11) P. 1249-1265 (November 1993). This algorithm offers n*log(n) performance on many data sets
- * that cause other quicksorts to degrade to quadratic performance.
- *
- * <p><b>You should never call this method unless you are sure that this particular sorting algorithm is the right one
- * for your data set.</b> It is generally better to call <tt>sort()</tt> or <tt>sortFromTo(...)</tt> instead, because
- * those methods automatically choose the best sorting algorithm.
- *
- * @param from the index of the first element (inclusive) to be sorted.
- * @param to the index of the last element (inclusive) to be sorted.
- * @throws IndexOutOfBoundsException if <tt>(from&lt;0 || from&gt;to || to&gt;=size()) && to!=from-1</tt>.
- */
- public abstract void quickSortFromTo(int from, int to);
-
- /**
- * Removes the element at the specified position from the receiver. Shifts any subsequent elements to the left.
- *
- * @param index the index of the element to be removed.
- * @throws IndexOutOfBoundsException if <tt>index &lt; 0 || index &gt;= size()</tt>.
- */
- public void remove(int index) {
- removeFromTo(index, index);
- }
-
- /**
- * Removes from the receiver all elements whose index is between <code>fromIndex</code>, inclusive, and
- * <code>toIndex</code>, inclusive. Shifts any succeeding elements to the left (reduces their index). This call
- * shortens the list by <tt>(toIndex - fromIndex + 1)</tt> elements.
- *
- * @param fromIndex index of first element to be removed.
- * @param toIndex index of last element to be removed.
- * @throws IndexOutOfBoundsException if <tt>(from&lt;0 || from&gt;to || to&gt;=size()) && to!=from-1</tt>.
- */
- public abstract void removeFromTo(int fromIndex, int toIndex);
-
- /** Reverses the elements of the receiver. Last becomes first, second last becomes second first, and so on. */
- public abstract void reverse();
-
- /**
- * Sets the size of the receiver. If the new size is greater than the current size, new null or zero items are added
- * to the end of the receiver. If the new size is less than the current size, all components at index newSize and
- * greater are discarded. This method does not release any superfluous internal memory. Use method
- * <tt>trimToSize</tt> to release superfluous internal memory.
- *
- * @param newSize the new size of the receiver.
- * @throws IndexOutOfBoundsException if <tt>newSize &lt; 0</tt>.
- */
- public void setSize(int newSize) {
- if (newSize < 0) {
- throw new IndexOutOfBoundsException("newSize:" + newSize);
- }
-
- int currentSize = size();
- if (newSize != currentSize) {
- if (newSize > currentSize) {
- beforeInsertDummies(currentSize, newSize - currentSize);
- } else if (newSize < currentSize) {
- removeFromTo(newSize, currentSize - 1);
- }
- }
- }
-
- /**
- * Sorts the receiver into ascending order.
- *
- * The sorting algorithm is dynamically chosen according to the characteristics of the data set.
- *
- * This implementation simply calls <tt>sortFromTo(...)</tt>. Override <tt>sortFromTo(...)</tt> if you can determine
- * which sort is most appropriate for the given data set.
- */
- public final void sort() {
- sortFromTo(0, size() - 1);
- }
-
- /**
- * Sorts the specified range of the receiver into ascending order.
- *
- * The sorting algorithm is dynamically chosen according to the characteristics of the data set. This default
- * implementation simply calls quickSort. Override this method if you can determine which sort is most appropriate for
- * the given data set.
- *
- * @param from the index of the first element (inclusive) to be sorted.
- * @param to the index of the last element (inclusive) to be sorted.
- * @throws IndexOutOfBoundsException if <tt>(from&lt;0 || from&gt;to || to&gt;=size()) && to!=from-1</tt>.
- */
- public void sortFromTo(int from, int to) {
- quickSortFromTo(from, to);
- }
-
- /**
- * Trims the capacity of the receiver to be the receiver's current size. Releases any superfluous internal memory. An
- * application can use this operation to minimize the storage of the receiver. <p> This default implementation does
- * nothing. Override this method in space efficient implementations.
- */
- public void trimToSize() {
- }
-}

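To make the AbstractList contract above concrete, a sketch against one of the generated primitive subclasses in the same package (assumed here to be IntArrayList, whose file is outside this hunk); setSize, sort, clear, and isEmpty all come from the base class shown above:

    import org.apache.mahout.math.list.IntArrayList; // assumed concrete subclass

    public class ListContractSketch {
      public static void main(String[] args) {
        IntArrayList list = new IntArrayList();
        list.add(3);
        list.add(1);
        list.add(2);

        list.setSize(5); // grows with zero items: [3, 1, 2, 0, 0]
        list.setSize(3); // truncates back to [3, 1, 2]; capacity is untouched

        list.sort();     // delegates to sortFromTo(), which defaults to quicksort

        list.clear();    // empty afterwards, but keeps its current capacity
        System.out.println(list.isEmpty()); // true
      }
    }
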
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/list/AbstractObjectList.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/list/AbstractObjectList.java b/math/src/main/java/org/apache/mahout/math/list/AbstractObjectList.java
deleted file mode 100644
index a1a5899..0000000
--- a/math/src/main/java/org/apache/mahout/math/list/AbstractObjectList.java
+++ /dev/null
@@ -1,80 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-/*
-Copyright 1999 CERN - European Organization for Nuclear Research.
-Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
-is hereby granted without fee, provided that the above copyright notice appear in all copies and
-that both that copyright notice and this permission notice appear in supporting documentation.
-CERN makes no representations about the suitability of this software for any purpose.
-It is provided "as is" without expressed or implied warranty.
-*/
-package org.apache.mahout.math.list;
-
-import java.util.Collection;
-
-/**
- Abstract base class for resizable lists holding objects or primitive data types such as <code>int</code>,
- <code>float</code>, etc. First see the <a href="package-summary.html">package summary</a> and
- javadoc <a href="package-tree.html">tree view</a> to get the broad picture.
- <p>
- <b>Note that this implementation is not synchronized.</b>
-
- @author ***@cern.ch
- @version 1.0, 09/24/99
- @see java.util.ArrayList
- @see java.util.Vector
- @see java.util.Arrays
- */
-public abstract class AbstractObjectList<T> extends AbstractList {
-
- /**
- * Appends all of the elements of the specified Collection to the receiver.
- *
- * @throws ClassCastException if an element in the collection is not of the same parameter type as the receiver.
- */
- public void addAllOf(Collection<T> collection) {
- this.beforeInsertAllOf(size(), collection);
- }
-
- /**
- * Inserts all elements of the specified collection before the specified position into the receiver. Shifts the
- * element currently at that position (if any) and any subsequent elements to the right (increases their indices).
- *
- * @param index index before which to insert first element from the specified collection.
- * @param collection the collection to be inserted
- * @throws ClassCastException if an element in the collection is not of the same parameter type as the
- * receiver.
- * @throws IndexOutOfBoundsException if <tt>index &lt; 0 || index &gt; size()</tt>.
- */
- public void beforeInsertAllOf(int index, Collection<T> collection) {
- this.beforeInsertDummies(index, collection.size());
- this.replaceFromWith(index, collection);
- }
-
- /**
- * Replaces the part of the receiver starting at <code>from</code> (inclusive) with all the elements of the specified
- * collection. Does not alter the size of the receiver. Replaces exactly <tt>Math.max(0,Math.min(size()-from,
- * other.size()))</tt> elements.
- *
- * @param from the index at which to copy the first element from the specified collection.
- * @param other Collection to replace part of the receiver
- * @throws IndexOutOfBoundsException if <tt>from &lt; 0 || from &gt;= size()</tt>.
- */
- public abstract void replaceFromWith(int from, Collection<T> other);
-}

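beforeInsertAllOf() above is the one piece of non-trivial logic in this class: it opens a gap with beforeInsertDummies() and then overwrites the gap via replaceFromWith(). A sketch using the ObjectArrayList subclass whose diff follows below, restricted to methods visible in this commit:

    import java.util.Arrays;
    import org.apache.mahout.math.list.ObjectArrayList;

    public class InsertSketch {
      public static void main(String[] args) {
        ObjectArrayList<String> list = new ObjectArrayList<>();
        list.add("a");
        list.add("d");

        // Opens a two-element gap at index 1, then fills it from the collection.
        list.beforeInsertAllOf(1, Arrays.asList("b", "c"));

        for (int i = 0; i < list.size(); i++) {
          System.out.print(list.get(i) + " "); // prints: a b c d
        }
      }
    }
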
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/list/ObjectArrayList.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/list/ObjectArrayList.java b/math/src/main/java/org/apache/mahout/math/list/ObjectArrayList.java
deleted file mode 100644
index c41141f..0000000
--- a/math/src/main/java/org/apache/mahout/math/list/ObjectArrayList.java
+++ /dev/null
@@ -1,419 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-/*
-Copyright 1999 CERN - European Organization for Nuclear Research.
-Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
-is hereby granted without fee, provided that the above copyright notice appear in all copies and
-that both that copyright notice and this permission notice appear in supporting documentation.
-CERN makes no representations about the suitability of this software for any purpose.
-It is provided "as is" without expressed or implied warranty.
-*/
-package org.apache.mahout.math.list;
-
-import org.apache.mahout.math.function.ObjectProcedure;
-
-import java.util.Collection;
-
-/**
- Resizable list holding elements of type <code>T</code>; implemented with arrays.
-*/
-
-public class ObjectArrayList<T> extends AbstractObjectList<T> {
-
- /**
- * The array buffer into which the elements of the list are stored. The capacity of the list is the length of this
- * array buffer.
- */
- private Object[] elements;
- private int size;
-
- /** Constructs an empty list. */
- public ObjectArrayList() {
- this(10);
- }
-
- /**
- * Constructs a list containing the specified elements. The initial size and capacity of the list is the length of the
- * array.
- *
- * <b>WARNING:</b> For efficiency reasons and to keep memory usage low, <b>the array is not copied</b>. So if
- * subsequently you modify the specified array directly via the [] operator, be sure you know what you're doing.
- *
- * @param elements the array backing the constructed list.
- */
- public ObjectArrayList(T[] elements) {
- elements(elements);
- }
-
- /**
- * Constructs an empty list with the specified initial capacity.
- *
- * @param initialCapacity the number of elements the receiver can hold without auto-expanding itself by allocating new
- * internal memory.
- */
- @SuppressWarnings("unchecked")
- public ObjectArrayList(int initialCapacity) {
- elements = new Object[initialCapacity];
- size = 0;
- }
-
- /**
- * Appends the specified element to the end of this list.
- *
- * @param element element to be appended to this list.
- */
- public void add(T element) {
- // overridden for performance only.
- if (size == elements.length) {
- ensureCapacity(size + 1);
- }
- elements[size++] = element;
- }
-
- /**
- * Inserts the specified element before the specified position into the receiver. Shifts the element currently at that
- * position (if any) and any subsequent elements to the right.
- *
- * @param index index before which the specified element is to be inserted (must be in [0,size]).
- * @param element element to be inserted.
- * @throws IndexOutOfBoundsException index is out of range (<tt>index &lt; 0 || index &gt; size()</tt>).
- */
- public void beforeInsert(int index, T element) {
- // overridden for performance only.
- if (size == index) {
- add(element);
- return;
- }
- if (index > size || index < 0) {
- throw new IndexOutOfBoundsException("Index: " + index + ", Size: " + size);
- }
- ensureCapacity(size + 1);
- System.arraycopy(elements, index, elements, index + 1, size - index);
- elements[index] = element;
- size++;
- }
-
-
- /**
- * Returns a copy of the receiver. The backing array is cloned, but the elements themselves are not copied.
- *
- * @return a copy of the receiver.
- */
- @SuppressWarnings("unchecked")
- @Override
- public Object clone() {
- // overridden for performance only.
- return new ObjectArrayList<>((T[]) elements.clone());
- }
-
- /**
- * Returns a copy of the receiver; uses <code>clone()</code> and casts the result.
- *
- * @return a copy of the receiver.
- */
- @SuppressWarnings("unchecked")
- public ObjectArrayList<T> copy() {
- return (ObjectArrayList<T>) clone();
- }
-
- /**
- * Returns the elements currently stored, including invalid elements between size and capacity, if any.
- *
- * <b>WARNING:</b> For efficiency reasons and to keep memory usage low, <b>the array is not copied</b>. So if
- * subsequently you modify the returned array directly via the [] operator, be sure you know what you're doing.
- *
- * @return the elements currently stored.
- */
- @SuppressWarnings("unchecked")
- public <Q> Q[] elements() {
- return (Q[])elements;
- }
-
- /**
- * Sets the receiver's elements to be the specified array (not a copy of it).
- *
- * The size and capacity of the list is the length of the array. <b>WARNING:</b> For efficiency reasons and to keep
- * memory usage low, <b>the array is not copied</b>. So if subsequently you modify the specified array directly via
- * the [] operator, be sure you know what you're doing.
- *
- * @param elements the new elements to be stored.
- */
- public void elements(T[] elements) {
- this.elements = elements;
- this.size = elements.length;
- }
-
- /**
- * Ensures that the receiver can hold at least the specified number of elements without needing to allocate new
- * internal memory. If necessary, allocates new internal memory and increases the capacity of the receiver.
- *
- * @param minCapacity the desired minimum capacity.
- */
- public void ensureCapacity(int minCapacity) {
- elements = org.apache.mahout.math.Arrays.ensureCapacity(elements, minCapacity);
- }
-
- /**
- * Compares the specified Object with the receiver. Returns true if and only if the specified Object is also an
- * ArrayList of the same type, both Lists have the same size, and all corresponding pairs of elements in the two Lists
- * are identical. In other words, two Lists are defined to be equal if they contain the same elements in the same
- * order.
- *
- * @param otherObj the Object to be compared for equality with the receiver.
- * @return true if the specified Object is equal to the receiver.
- */
- @Override
- @SuppressWarnings("unchecked")
- public boolean equals(Object otherObj) {
- // overridden for performance only.
- if (!(otherObj instanceof ObjectArrayList)) {
- return super.equals(otherObj);
- }
- if (this == otherObj) {
- return true;
- }
- if (otherObj == null) {
- return false;
- }
- ObjectArrayList<?> other = (ObjectArrayList<?>) otherObj;
- if (size() != other.size()) {
- return false;
- }
-
- Object[] theElements = elements();
- Object[] otherElements = other.elements();
- for (int i = size(); --i >= 0;) {
- if (theElements[i] != otherElements[i]) {
- return false;
- }
- }
- return true;
- }
-
- /**
- * Applies a procedure to each element of the receiver, if any. Starts at index 0, moving rightwards.
- *
- * @param procedure the procedure to be applied. Stops iteration if the procedure returns <tt>false</tt>, otherwise
- * continues.
- * @return <tt>false</tt> if the procedure stopped before all elements were iterated over, <tt>true</tt> otherwise.
- */
- @SuppressWarnings("unchecked")
- public boolean forEach(ObjectProcedure<T> procedure) {
- T[] theElements = (T[]) elements;
- int theSize = size;
-
- for (int i = 0; i < theSize;) {
- if (!procedure.apply(theElements[i++])) {
- return false;
- }
- }
- return true;
- }
-
- /**
- * Returns the element at the specified position in the receiver.
- *
- * @param index index of element to return.
- * @throws IndexOutOfBoundsException index is out of range (index &lt; 0 || index &gt;= size()).
- */
- @SuppressWarnings("unchecked")
- public T get(int index) {
- // overridden for performance only.
- if (index >= size || index < 0) {
- throw new IndexOutOfBoundsException("Index: " + index + ", Size: " + size);
- }
- return (T) elements[index];
- }
-
- /**
- * Returns the element at the specified position in the receiver; <b>WARNING:</b> Does not check preconditions.
- * Provided with invalid parameters this method may return invalid elements without throwing any exception! <b>You
- * should only use this method when you are absolutely sure that the index is within bounds.</b> Precondition
- * (unchecked): <tt>index &gt;= 0 && index &lt; size()</tt>.
- *
- * @param index index of element to return.
- */
- @SuppressWarnings("unchecked")
- public T getQuick(int index) {
- return (T) elements[index];
- }
-
- /**
- * Returns the index of the first occurrence of the specified element. Returns <code>-1</code> if the receiver does
- * not contain this element. Searches between <code>from</code>, inclusive and <code>to</code>, inclusive. Tests for
- * identity.
- *
- * @param element element to search for.
- * @param from the leftmost search position, inclusive.
- * @param to the rightmost search position, inclusive.
- * @return the index of the first occurrence of the element in the receiver; returns <code>-1</code> if the element is
- * not found.
- * @throws IndexOutOfBoundsException index is out of range (<tt>size()&gt;0 && (from&lt;0 || from&gt;to ||
- * to&gt;=size())</tt>).
- */
- public int indexOfFromTo(T element, int from, int to) {
- // overridden for performance only.
- if (size == 0) {
- return -1;
- }
- checkRangeFromTo(from, to, size);
-
- Object[] theElements = elements;
- for (int i = from; i <= to; i++) {
- if (element == theElements[i]) {
- return i;
- } //found
- }
- return -1; //not found
- }
-
- /**
- * Returns the index of the last occurrence of the specified element. Returns <code>-1</code> if the receiver does not
-   * contain this element. Searches backwards from <code>to</code>, inclusive, down to <code>from</code>, inclusive. Tests
- * for identity.
- *
- * @param element element to search for.
- * @param from the leftmost search position, inclusive.
- * @param to the rightmost search position, inclusive.
- * @return the index of the last occurrence of the element in the receiver; returns <code>-1</code> if the element is
- * not found.
- * @throws IndexOutOfBoundsException index is out of range (<tt>size()&gt;0 && (from&lt;0 || from&gt;to ||
- * to&gt;=size())</tt>).
- */
- public int lastIndexOfFromTo(T element, int from, int to) {
- // overridden for performance only.
- if (size == 0) {
- return -1;
- }
- checkRangeFromTo(from, to, size);
-
- Object[] theElements = elements;
- for (int i = to; i >= from; i--) {
- if (element == theElements[i]) {
- return i;
- } //found
- }
- return -1; //not found
- }
-
- /**
- * Returns a new list of the part of the receiver between <code>from</code>, inclusive, and <code>to</code>,
- * inclusive.
- *
- * @param from the index of the first element (inclusive).
- * @param to the index of the last element (inclusive).
- * @return a new list
- * @throws IndexOutOfBoundsException index is out of range (<tt>size()&gt;0 && (from&lt;0 || from&gt;to ||
- * to&gt;=size())</tt>).
- */
- @SuppressWarnings("unchecked")
- public AbstractObjectList<T> partFromTo(int from, int to) {
- if (size == 0) {
- return new ObjectArrayList<>(0);
- }
-
- checkRangeFromTo(from, to, size);
-
- Object[] part = new Object[to - from + 1];
- System.arraycopy(elements, from, part, 0, to - from + 1);
- return new ObjectArrayList<>((T[]) part);
- }
-
- /** Reverses the elements of the receiver. Last becomes first, second last becomes second first, and so on. */
- @Override
- public void reverse() {
- // overridden for performance only.
- int limit = size / 2;
- int j = size - 1;
-
- Object[] theElements = elements;
- for (int i = 0; i < limit;) { //swap
- Object tmp = theElements[i];
- theElements[i++] = theElements[j];
- theElements[j--] = tmp;
- }
- }
-
- /**
- * Replaces the element at the specified position in the receiver with the specified element.
- *
- * @param index index of element to replace.
- * @param element element to be stored at the specified position.
- * @throws IndexOutOfBoundsException index is out of range (index &lt; 0 || index &gt;= size()).
- */
- public void set(int index, T element) {
- // overridden for performance only.
- if (index >= size || index < 0) {
- throw new IndexOutOfBoundsException("Index: " + index + ", Size: " + size);
- }
- elements[index] = element;
- }
-
- /**
- * Replaces the element at the specified position in the receiver with the specified element; <b>WARNING:</b> Does not
- * check preconditions. Provided with invalid parameters this method may access invalid indexes without throwing any
- * exception! <b>You should only use this method when you are absolutely sure that the index is within bounds.</b>
- * Precondition (unchecked): {@code index >= 0 && index < size()}.
- *
- * @param index index of element to replace.
- * @param element element to be stored at the specified position.
- */
- public void setQuick(int index, T element) {
- elements[index] = element;
- }
-
- /**
- * Trims the capacity of the receiver to be the receiver's current size. Releases any superfluous internal memory. An
- * application can use this operation to minimize the storage of the receiver.
- */
- @Override
- public void trimToSize() {
- elements = org.apache.mahout.math.Arrays.trimToCapacity(elements, size());
- }
-
- @Override
- public void removeFromTo(int fromIndex, int toIndex) {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public void replaceFromWith(int from, Collection<T> other) {
- throw new UnsupportedOperationException();
- }
-
- @Override
- protected void beforeInsertDummies(int index, int length) {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public void mergeSortFromTo(int from, int to) {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public void quickSortFromTo(int from, int to) {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public int size() {
- return size;
- }
-}
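
Two details of the Colt-derived API above are easy to miss: forEach applies the procedure from index 0 rightwards and stops as soon as it returns false, and indexOfFromTo searches by identity (==) rather than equals(). A minimal usage sketch against the methods shown in this diff; the demo class name is hypothetical, and ObjectProcedure is assumed to live in org.apache.mahout.math.function with the other Colt-derived function interfaces:

    import org.apache.mahout.math.function.ObjectProcedure;
    import org.apache.mahout.math.list.ObjectArrayList;

    public class ObjectArrayListDemo {
      public static void main(String[] args) {
        ObjectArrayList<String> names = new ObjectArrayList<>();
        names.add("alice");
        names.add("bob");
        names.add("carol");

        // Iteration stops once the procedure returns false.
        names.forEach(new ObjectProcedure<String>() {
          @Override
          public boolean apply(String name) {
            System.out.println(name);
            return !"bob".equals(name); // halt after printing "bob"
          }
        });

        // Identity search: matches here because string literals are interned.
        System.out.println(names.indexOfFromTo("bob", 0, names.size() - 1)); // 1
      }
    }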

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/list/SimpleLongArrayList.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/list/SimpleLongArrayList.java b/math/src/main/java/org/apache/mahout/math/list/SimpleLongArrayList.java
deleted file mode 100644
index 1a765eb..0000000
--- a/math/src/main/java/org/apache/mahout/math/list/SimpleLongArrayList.java
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
-Copyright 1999 CERN - European Organization for Nuclear Research.
-Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
-is hereby granted without fee, provided that the above copyright notice appear in all copies and
-that both that copyright notice and this permission notice appear in supporting documentation.
-CERN makes no representations about the suitability of this software for any purpose.
-It is provided "as is" without expressed or implied warranty.
-*/
-package org.apache.mahout.math.list;
-
-/**
- Resizable list holding <code>long</code> elements; implemented with arrays; not efficient; just to
- demonstrate which methods you must override to implement a fully functional list.
- */
-public class SimpleLongArrayList extends AbstractLongList {
-
- /**
- * The array buffer into which the elements of the list are stored. The capacity of the list is the length of this
- * array buffer.
- */
- private long[] elements;
-
- /** Constructs an empty list. */
- public SimpleLongArrayList() {
- this(10);
- }
-
- /**
-   * Constructs a list containing the specified elements. The initial size and capacity of the list are the length of the
- * array.
- *
- * <b>WARNING:</b> For efficiency reasons and to keep memory usage low, <b>the array is not copied</b>. So if
- * subsequently you modify the specified array directly via the [] operator, be sure you know what you're doing.
- *
-   * @param elements the array backing the constructed list
- */
- public SimpleLongArrayList(long[] elements) {
- elements(elements);
- }
-
- /**
- * Constructs an empty list with the specified initial capacity.
- *
- * @param initialCapacity the number of elements the receiver can hold without auto-expanding itself by allocating new
- * internal memory.
- */
- private SimpleLongArrayList(int initialCapacity) {
- if (initialCapacity < 0) {
- throw new IllegalArgumentException("Illegal Capacity: " + initialCapacity);
- }
-
- this.elements(new long[initialCapacity]);
- size = 0;
- }
-
- /**
- * Ensures that the receiver can hold at least the specified number of elements without needing to allocate new
- * internal memory. If necessary, allocates new internal memory and increases the capacity of the receiver.
- *
- * @param minCapacity the desired minimum capacity.
- */
- @Override
- public void ensureCapacity(int minCapacity) {
- elements = org.apache.mahout.math.Arrays.ensureCapacity(elements, minCapacity);
- }
-
- /**
- * Returns the element at the specified position in the receiver; <b>WARNING:</b> Does not check preconditions.
- * Provided with invalid parameters this method may return invalid elements without throwing any exception! <b>You
- * should only use this method when you are absolutely sure that the index is within bounds.</b> Precondition
- * (unchecked): <tt>index &gt;= 0 && index &lt; size()</tt>.
- *
- * @param index index of element to return.
- */
- @Override
- protected long getQuick(int index) {
- return elements[index];
- }
-
- /**
- * Replaces the element at the specified position in the receiver with the specified element; <b>WARNING:</b> Does not
- * check preconditions. Provided with invalid parameters this method may access invalid indexes without throwing any
- * exception! <b>You should only use this method when you are absolutely sure that the index is within bounds.</b>
- * Precondition (unchecked): <tt>index &gt;= 0 && index &lt; size()</tt>.
- *
- * @param index index of element to replace.
- * @param element element to be stored at the specified position.
- */
- @Override
- protected void setQuick(int index, long element) {
- elements[index] = element;
- }
-
- /**
- * Trims the capacity of the receiver to be the receiver's current size. An application can use this operation to
- * minimize the storage of the receiver.
- */
- @Override
- public void trimToSize() {
- elements = org.apache.mahout.math.Arrays.trimToCapacity(elements, size());
- }
-}
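
As its comment says, this class existed purely to demonstrate the minimal override surface: getQuick, setQuick, ensureCapacity and trimToSize, with everything else inherited. A small sketch of that division of labor, assuming AbstractLongList supplies add(long), get(int) and size() as in the Colt lists this code derives from; the demo class name is hypothetical:

    import org.apache.mahout.math.list.SimpleLongArrayList;

    public class SimpleLongArrayListDemo {
      public static void main(String[] args) {
        SimpleLongArrayList list = new SimpleLongArrayList(new long[] {3L, 1L, 2L});
        list.add(4L); // inherited; routes through ensureCapacity + setQuick above
        for (int i = 0; i < list.size(); i++) {
          System.out.println(list.get(i)); // bounds-checked get delegates to getQuick
        }
      }
    }
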
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/VectorView.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/VectorView.java b/math/src/main/java/org/apache/mahout/math/VectorView.java
deleted file mode 100644
index 62c5490..0000000
--- a/math/src/main/java/org/apache/mahout/math/VectorView.java
+++ /dev/null
@@ -1,238 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math;
-
-import java.util.Iterator;
-
-import com.google.common.collect.AbstractIterator;
-
-/** Implements subset view of a Vector */
-public class VectorView extends AbstractVector {
-
- protected Vector vector;
-
- // the offset into the Vector
- protected int offset;
-
- /** For serialization purposes only */
- public VectorView() {
- super(0);
- }
-
- public VectorView(Vector vector, int offset, int cardinality) {
- super(cardinality);
- this.vector = vector;
- this.offset = offset;
- }
-
- @Override
- protected Matrix matrixLike(int rows, int columns) {
- return ((AbstractVector) vector).matrixLike(rows, columns);
- }
-
- @Override
- public Vector clone() {
- VectorView r = (VectorView) super.clone();
- r.vector = vector.clone();
- r.offset = offset;
- return r;
- }
-
- @Override
- public boolean isDense() {
- return vector.isDense();
- }
-
- @Override
- public boolean isSequentialAccess() {
- return vector.isSequentialAccess();
- }
-
- @Override
- public VectorView like() {
- return new VectorView(vector.like(), offset, size());
- }
-
- @Override
- public Vector like(int cardinality) {
- return vector.like(cardinality);
- }
-
- @Override
- public double getQuick(int index) {
- return vector.getQuick(offset + index);
- }
-
- @Override
- public void setQuick(int index, double value) {
- vector.setQuick(offset + index, value);
- }
-
- @Override
- public int getNumNondefaultElements() {
- return size();
- }
-
- @Override
- public Vector viewPart(int offset, int length) {
- if (offset < 0) {
- throw new IndexException(offset, size());
- }
- if (offset + length > size()) {
- throw new IndexException(offset + length, size());
- }
- return new VectorView(vector, offset + this.offset, length);
- }
-
- /** @return true if index is a valid index in the underlying Vector */
- private boolean isInView(int index) {
- return index >= offset && index < offset + size();
- }
-
- @Override
- public Iterator<Element> iterateNonZero() {
- return new NonZeroIterator();
- }
-
- @Override
- public Iterator<Element> iterator() {
- return new AllIterator();
- }
-
- public final class NonZeroIterator extends AbstractIterator<Element> {
-
- private final Iterator<Element> it;
-
- private NonZeroIterator() {
- it = vector.nonZeroes().iterator();
- }
-
- @Override
- protected Element computeNext() {
- while (it.hasNext()) {
- Element el = it.next();
- if (isInView(el.index()) && el.get() != 0) {
- Element decorated = el; /* vector.getElement(el.index()); */
- return new DecoratorElement(decorated);
- }
- }
- return endOfData();
- }
-
- }
-
- public final class AllIterator extends AbstractIterator<Element> {
-
- private final Iterator<Element> it;
-
- private AllIterator() {
- it = vector.all().iterator();
- }
-
- @Override
- protected Element computeNext() {
- while (it.hasNext()) {
- Element el = it.next();
- if (isInView(el.index())) {
- Element decorated = vector.getElement(el.index());
- return new DecoratorElement(decorated);
- }
- }
- return endOfData(); // No element was found
- }
-
- }
-
- private final class DecoratorElement implements Element {
-
- private final Element decorated;
-
- private DecoratorElement(Element decorated) {
- this.decorated = decorated;
- }
-
- @Override
- public double get() {
- return decorated.get();
- }
-
- @Override
- public int index() {
- return decorated.index() - offset;
- }
-
- @Override
- public void set(double value) {
- decorated.set(value);
- }
- }
-
- @Override
- public double getLengthSquared() {
- double result = 0.0;
- int size = size();
- for (int i = 0; i < size; i++) {
- double value = getQuick(i);
- result += value * value;
- }
- return result;
- }
-
- @Override
- public double getDistanceSquared(Vector v) {
- double result = 0.0;
- int size = size();
- for (int i = 0; i < size; i++) {
- double delta = getQuick(i) - v.getQuick(i);
- result += delta * delta;
- }
- return result;
- }
-
- @Override
- public double getLookupCost() {
- return vector.getLookupCost();
- }
-
- @Override
- public double getIteratorAdvanceCost() {
- // TODO: remove the 2x after fixing the Element iterator
- return 2 * vector.getIteratorAdvanceCost();
- }
-
- @Override
- public boolean isAddConstantTime() {
- return vector.isAddConstantTime();
- }
-
- /**
- * Used internally by assign() to update multiple indices and values at once.
- * Only really useful for sparse vectors (especially SequentialAccessSparseVector).
- * <p>
- * If someone ever adds a new type of sparse vectors, this method must merge (index, value) pairs into the vector.
- *
- * @param updates a mapping of indices to values to merge in the vector.
- */
- @Override
- public void mergeUpdates(OrderedIntDoubleMapping updates) {
- for (int i = 0; i < updates.getNumMappings(); ++i) {
- updates.setIndexAt(i, updates.indexAt(i) + offset);
- }
- vector.mergeUpdates(updates);
- }
-}
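
The important property of VectorView is that it is a live window rather than a copy: getQuick and setQuick apply the offset and delegate to the backing vector, so writes through the view are visible in the original. A small sketch, assuming viewPart on a dense vector hands back such a view (demo class name hypothetical):

    import org.apache.mahout.math.DenseVector;
    import org.apache.mahout.math.Vector;

    public class VectorViewDemo {
      public static void main(String[] args) {
        Vector base = new DenseVector(new double[] {1, 2, 3, 4, 5});
        Vector view = base.viewPart(1, 3); // window over elements 2, 3, 4

        view.setQuick(0, 20.0);            // writes through to base[1]
        System.out.println(base.get(1));   // 20.0
        System.out.println(view.zSum());   // 20 + 3 + 4 = 27, window only
      }
    }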

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/WeightedVector.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/WeightedVector.java b/math/src/main/java/org/apache/mahout/math/WeightedVector.java
deleted file mode 100644
index c8fdfac..0000000
--- a/math/src/main/java/org/apache/mahout/math/WeightedVector.java
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math;
-
-/**
- * Decorates a vector with a floating point weight and an index.
- */
-public class WeightedVector extends DelegatingVector {
- private static final int INVALID_INDEX = -1;
- private double weight;
- private int index;
-
- protected WeightedVector(double weight, int index) {
- super();
- this.weight = weight;
- this.index = index;
- }
-
- public WeightedVector(Vector v, double weight, int index) {
- super(v);
- this.weight = weight;
- this.index = index;
- }
-
- public WeightedVector(Vector v, Vector projection, int index) {
- super(v);
- this.index = index;
- this.weight = v.dot(projection);
- }
-
- public static WeightedVector project(Vector v, Vector projection) {
- return project(v, projection, INVALID_INDEX);
- }
-
- public static WeightedVector project(Vector v, Vector projection, int index) {
- return new WeightedVector(v, projection, index);
- }
-
- public double getWeight() {
- return weight;
- }
-
- public int getIndex() {
- return index;
- }
-
- public void setWeight(double newWeight) {
- this.weight = newWeight;
- }
-
- public void setIndex(int index) {
- this.index = index;
- }
-
- @Override
- public Vector like() {
- return new WeightedVector(getVector().like(), weight, index);
- }
-
- @Override
- public String toString() {
- return String.format("index=%d, weight=%.2f, v=%s", index, weight, getVector());
- }
-
- @Override
- public WeightedVector clone() {
- WeightedVector v = (WeightedVector)super.clone();
- v.weight = weight;
- v.index = index;
- return v;
- }
-}
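
The projection constructor is the interesting one: it sets the weight to v.dot(projection), so a batch of points can be ordered along a shared projection direction. A sketch using only the constructors and accessors shown above (demo class name hypothetical):

    import org.apache.mahout.math.DenseVector;
    import org.apache.mahout.math.Vector;
    import org.apache.mahout.math.WeightedVector;

    public class WeightedVectorDemo {
      public static void main(String[] args) {
        Vector point = new DenseVector(new double[] {1, 0, 2});
        Vector direction = new DenseVector(new double[] {0.5, 0.5, 0.5});

        // weight = point.dot(direction) = 0.5 + 0 + 1.0 = 1.5
        WeightedVector wv = WeightedVector.project(point, direction, 7);
        System.out.println(wv.getWeight()); // 1.5
        System.out.println(wv.getIndex());  // 7
      }
    }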

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/WeightedVectorComparator.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/WeightedVectorComparator.java b/math/src/main/java/org/apache/mahout/math/WeightedVectorComparator.java
deleted file mode 100644
index 9fdd621..0000000
--- a/math/src/main/java/org/apache/mahout/math/WeightedVectorComparator.java
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math;
-
-import java.io.Serializable;
-import java.util.Comparator;
-
-/**
- * Orders {@link WeightedVector} by {@link WeightedVector#getWeight()}.
- */
-public final class WeightedVectorComparator implements Comparator<WeightedVector>, Serializable {
-
- private static final double DOUBLE_EQUALITY_ERROR = 1.0e-8;
-
- @Override
- public int compare(WeightedVector a, WeightedVector b) {
- if (a == b) {
- return 0;
- }
- double aWeight = a.getWeight();
- double bWeight = b.getWeight();
- int r = Double.compare(aWeight, bWeight);
- if (r != 0 && Math.abs(aWeight - bWeight) >= DOUBLE_EQUALITY_ERROR) {
- return r;
- }
- double diff = a.minus(b).norm(1);
- if (diff < 1.0e-12) {
- return 0;
- }
- for (Vector.Element element : a.all()) {
- r = Double.compare(element.get(), b.get(element.index()));
- if (r != 0) {
- return r;
- }
- }
- return 0;
- }
-
-}
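
Note the staged comparison above: weights differing by at least 1.0e-8 decide the order outright, near-ties are first checked for effective equality via the L1 norm of the difference, and anything left falls through to an element-wise comparison so the ordering stays total and deterministic under floating-point noise. A sorting sketch (demo class name hypothetical):

    import java.util.Arrays;

    import org.apache.mahout.math.DenseVector;
    import org.apache.mahout.math.WeightedVector;
    import org.apache.mahout.math.WeightedVectorComparator;

    public class WeightedVectorSortDemo {
      public static void main(String[] args) {
        WeightedVector[] points = {
            new WeightedVector(new DenseVector(new double[] {1, 0}), 2.0, 0),
            new WeightedVector(new DenseVector(new double[] {0, 1}), 1.0, 1),
        };
        Arrays.sort(points, new WeightedVectorComparator());
        System.out.println(points[0].getWeight()); // 1.0 -- ascending by weight
      }
    }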

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/als/AlternatingLeastSquaresSolver.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/als/AlternatingLeastSquaresSolver.java b/math/src/main/java/org/apache/mahout/math/als/AlternatingLeastSquaresSolver.java
deleted file mode 100644
index dbe1f8b..0000000
--- a/math/src/main/java/org/apache/mahout/math/als/AlternatingLeastSquaresSolver.java
+++ /dev/null
@@ -1,116 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.als;
-
-import com.google.common.base.Preconditions;
-import com.google.common.collect.Iterables;
-import org.apache.mahout.math.DenseMatrix;
-import org.apache.mahout.math.Matrix;
-import org.apache.mahout.math.QRDecomposition;
-import org.apache.mahout.math.Vector;
-
-/**
- * See
- * <a href="http://www.hpl.hp.com/personal/Robert_Schreiber/papers/2008%20AAIM%20Netflix/netflix_aaim08(submitted).pdf">
- * this paper.</a>
- */
-public final class AlternatingLeastSquaresSolver {
-
- private AlternatingLeastSquaresSolver() {}
-
- //TODO make feature vectors a simple array
- public static Vector solve(Iterable<Vector> featureVectors, Vector ratingVector, double lambda, int numFeatures) {
-
- Preconditions.checkNotNull(featureVectors, "Feature Vectors cannot be null");
- Preconditions.checkArgument(!Iterables.isEmpty(featureVectors));
- Preconditions.checkNotNull(ratingVector, "Rating Vector cannot be null");
- Preconditions.checkArgument(ratingVector.getNumNondefaultElements() > 0, "Rating Vector cannot be empty");
- Preconditions.checkArgument(Iterables.size(featureVectors) == ratingVector.getNumNondefaultElements());
-
- int nui = ratingVector.getNumNondefaultElements();
-
- Matrix MiIi = createMiIi(featureVectors, numFeatures);
- Matrix RiIiMaybeTransposed = createRiIiMaybeTransposed(ratingVector);
-
- /* compute Ai = MiIi * t(MiIi) + lambda * nui * E */
- Matrix Ai = miTimesMiTransposePlusLambdaTimesNuiTimesE(MiIi, lambda, nui);
- /* compute Vi = MiIi * t(R(i,Ii)) */
- Matrix Vi = MiIi.times(RiIiMaybeTransposed);
- /* compute Ai * ui = Vi */
- return solve(Ai, Vi);
- }
-
- private static Vector solve(Matrix Ai, Matrix Vi) {
- return new QRDecomposition(Ai).solve(Vi).viewColumn(0);
- }
-
- static Matrix addLambdaTimesNuiTimesE(Matrix matrix, double lambda, int nui) {
- Preconditions.checkArgument(matrix.numCols() == matrix.numRows(), "Must be a Square Matrix");
- double lambdaTimesNui = lambda * nui;
- int numCols = matrix.numCols();
- for (int n = 0; n < numCols; n++) {
- matrix.setQuick(n, n, matrix.getQuick(n, n) + lambdaTimesNui);
- }
- return matrix;
- }
-
- private static Matrix miTimesMiTransposePlusLambdaTimesNuiTimesE(Matrix MiIi, double lambda, int nui) {
-
- double lambdaTimesNui = lambda * nui;
- int rows = MiIi.numRows();
-
- double[][] result = new double[rows][rows];
-
- for (int i = 0; i < rows; i++) {
- for (int j = i; j < rows; j++) {
- double dot = MiIi.viewRow(i).dot(MiIi.viewRow(j));
- if (i != j) {
- result[i][j] = dot;
- result[j][i] = dot;
- } else {
- result[i][i] = dot + lambdaTimesNui;
- }
- }
- }
- return new DenseMatrix(result, true);
- }
-
-
- static Matrix createMiIi(Iterable<Vector> featureVectors, int numFeatures) {
- double[][] MiIi = new double[numFeatures][Iterables.size(featureVectors)];
- int n = 0;
- for (Vector featureVector : featureVectors) {
- for (int m = 0; m < numFeatures; m++) {
- MiIi[m][n] = featureVector.getQuick(m);
- }
- n++;
- }
- return new DenseMatrix(MiIi, true);
- }
-
- static Matrix createRiIiMaybeTransposed(Vector ratingVector) {
- Preconditions.checkArgument(ratingVector.isSequentialAccess(), "Ratings should be iterable in Index or Sequential Order");
-
- double[][] RiIiMaybeTransposed = new double[ratingVector.getNumNondefaultElements()][1];
- int index = 0;
- for (Vector.Element elem : ratingVector.nonZeroes()) {
- RiIiMaybeTransposed[index++][0] = elem.get();
- }
- return new DenseMatrix(RiIiMaybeTransposed, true);
- }
-}
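
Restating the contract of solve(): featureVectors holds the feature vectors of exactly the items the user rated, ratingVector the corresponding ratings, and the method solves (MiIi * t(MiIi) + lambda * nui * E) * ui = MiIi * t(R(i,Ii)) by QR decomposition. A single-call sketch; the demo class name is hypothetical, and SequentialAccessSparseVector is used because createRiIiMaybeTransposed insists on sequential access:

    import java.util.Arrays;
    import java.util.List;

    import org.apache.mahout.math.DenseVector;
    import org.apache.mahout.math.SequentialAccessSparseVector;
    import org.apache.mahout.math.Vector;
    import org.apache.mahout.math.als.AlternatingLeastSquaresSolver;

    public class AlsSolveDemo {
      public static void main(String[] args) {
        // feature vectors of the two items this user rated
        List<Vector> itemFeatures = Arrays.asList(
            (Vector) new DenseVector(new double[] {0.1, 0.9}),
            new DenseVector(new double[] {0.8, 0.2}));

        // one rating per feature vector, in the same order
        Vector ratings = new SequentialAccessSparseVector(2);
        ratings.set(0, 5.0);
        ratings.set(1, 3.0);

        Vector userFeatures = AlternatingLeastSquaresSolver.solve(itemFeatures, ratings, 0.065, 2);
        System.out.println(userFeatures); // the user's 2-feature latent vector
      }
    }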

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/als/ImplicitFeedbackAlternatingLeastSquaresSolver.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/als/ImplicitFeedbackAlternatingLeastSquaresSolver.java b/math/src/main/java/org/apache/mahout/math/als/ImplicitFeedbackAlternatingLeastSquaresSolver.java
deleted file mode 100644
index 28bf4b4..0000000
--- a/math/src/main/java/org/apache/mahout/math/als/ImplicitFeedbackAlternatingLeastSquaresSolver.java
+++ /dev/null
@@ -1,171 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.als;
-
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
-import java.util.concurrent.TimeUnit;
-
-import org.apache.mahout.math.DenseMatrix;
-import org.apache.mahout.math.DenseVector;
-import org.apache.mahout.math.Matrix;
-import org.apache.mahout.math.QRDecomposition;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.Vector.Element;
-import org.apache.mahout.math.function.Functions;
-import org.apache.mahout.math.list.IntArrayList;
-import org.apache.mahout.math.map.OpenIntObjectHashMap;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import com.google.common.base.Preconditions;
-
-/** see <a href="http://research.yahoo.com/pub/2433">Collaborative Filtering for Implicit Feedback Datasets</a> */
-public class ImplicitFeedbackAlternatingLeastSquaresSolver {
-
- private final int numFeatures;
- private final double alpha;
- private final double lambda;
- private final int numTrainingThreads;
-
- private final OpenIntObjectHashMap<Vector> Y;
- private final Matrix YtransposeY;
-
- private static final Logger log = LoggerFactory.getLogger(ImplicitFeedbackAlternatingLeastSquaresSolver.class);
-
- public ImplicitFeedbackAlternatingLeastSquaresSolver(int numFeatures, double lambda, double alpha,
- OpenIntObjectHashMap<Vector> Y, int numTrainingThreads) {
- this.numFeatures = numFeatures;
- this.lambda = lambda;
- this.alpha = alpha;
- this.Y = Y;
- this.numTrainingThreads = numTrainingThreads;
- YtransposeY = getYtransposeY(Y);
- }
-
- public Vector solve(Vector ratings) {
- return solve(YtransposeY.plus(getYtransponseCuMinusIYPlusLambdaI(ratings)), getYtransponseCuPu(ratings));
- }
-
- private static Vector solve(Matrix A, Matrix y) {
- return new QRDecomposition(A).solve(y).viewColumn(0);
- }
-
- double confidence(double rating) {
- return 1 + alpha * rating;
- }
-
- /* Y' Y */
- public Matrix getYtransposeY(final OpenIntObjectHashMap<Vector> Y) {
-
- ExecutorService queue = Executors.newFixedThreadPool(numTrainingThreads);
- if (log.isInfoEnabled()) {
- log.info("Starting the computation of Y'Y");
- }
- long startTime = System.nanoTime();
- final IntArrayList indexes = Y.keys();
- final int numIndexes = indexes.size();
-
- final double[][] YtY = new double[numFeatures][numFeatures];
-
- // Compute Y'Y by dot products between the 'columns' of Y
- for (int i = 0; i < numFeatures; i++) {
- for (int j = i; j < numFeatures; j++) {
-
- final int ii = i;
- final int jj = j;
- queue.execute(new Runnable() {
- @Override
- public void run() {
- double dot = 0;
- for (int k = 0; k < numIndexes; k++) {
- Vector row = Y.get(indexes.getQuick(k));
- dot += row.getQuick(ii) * row.getQuick(jj);
- }
- YtY[ii][jj] = dot;
- if (ii != jj) {
- YtY[jj][ii] = dot;
- }
- }
- });
-
- }
- }
- queue.shutdown();
- try {
- queue.awaitTermination(1, TimeUnit.DAYS);
-    } catch (InterruptedException e) {
-      Thread.currentThread().interrupt(); // restore the interrupt flag for callers
-      log.error("Error during Y'Y queue shutdown", e);
-      throw new RuntimeException("Error during Y'Y queue shutdown", e);
-    }
- if (log.isInfoEnabled()) {
- log.info("Computed Y'Y in " + (System.nanoTime() - startTime) / 1000000.0 + " ms" );
- }
- return new DenseMatrix(YtY, true);
- }
-
- /** Y' (Cu - I) Y + λ I */
- private Matrix getYtransponseCuMinusIYPlusLambdaI(Vector userRatings) {
- Preconditions.checkArgument(userRatings.isSequentialAccess(), "need sequential access to ratings!");
-
- /* (Cu -I) Y */
- OpenIntObjectHashMap<Vector> CuMinusIY = new OpenIntObjectHashMap<>(userRatings.getNumNondefaultElements());
- for (Element e : userRatings.nonZeroes()) {
- CuMinusIY.put(e.index(), Y.get(e.index()).times(confidence(e.get()) - 1));
- }
-
- Matrix YtransponseCuMinusIY = new DenseMatrix(numFeatures, numFeatures);
-
- /* Y' (Cu -I) Y by outer products */
- for (Element e : userRatings.nonZeroes()) {
- for (Vector.Element feature : Y.get(e.index()).all()) {
- Vector partial = CuMinusIY.get(e.index()).times(feature.get());
- YtransponseCuMinusIY.viewRow(feature.index()).assign(partial, Functions.PLUS);
- }
- }
-
- /* Y' (Cu - I) Y + λ I add lambda on the diagonal */
- for (int feature = 0; feature < numFeatures; feature++) {
- YtransponseCuMinusIY.setQuick(feature, feature, YtransponseCuMinusIY.getQuick(feature, feature) + lambda);
- }
-
- return YtransponseCuMinusIY;
- }
-
- /** Y' Cu p(u) */
- private Matrix getYtransponseCuPu(Vector userRatings) {
- Preconditions.checkArgument(userRatings.isSequentialAccess(), "need sequential access to ratings!");
-
- Vector YtransponseCuPu = new DenseVector(numFeatures);
-
- for (Element e : userRatings.nonZeroes()) {
- YtransponseCuPu.assign(Y.get(e.index()).times(confidence(e.get())), Functions.PLUS);
- }
-
- return columnVectorAsMatrix(YtransponseCuPu);
- }
-
- private Matrix columnVectorAsMatrix(Vector v) {
- double[][] matrix = new double[numFeatures][1];
- for (Vector.Element e : v.all()) {
- matrix[e.index()][0] = e.get();
- }
- return new DenseMatrix(matrix, true);
- }
-
-}
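
Per the implicit-feedback formulation referenced above, the per-user system is (Y'Y + Y'(Cu - I)Y + lambda*I) xu = Y'Cu p(u) with confidence c = 1 + alpha*r, which is why Y'Y is computed once (in parallel) in the constructor while only the sparse per-user corrections are rebuilt in solve(). A sketch; the demo class name and hyperparameter values are illustrative only:

    import org.apache.mahout.math.DenseVector;
    import org.apache.mahout.math.SequentialAccessSparseVector;
    import org.apache.mahout.math.Vector;
    import org.apache.mahout.math.als.ImplicitFeedbackAlternatingLeastSquaresSolver;
    import org.apache.mahout.math.map.OpenIntObjectHashMap;

    public class ImplicitAlsDemo {
      public static void main(String[] args) {
        // current item-feature matrix Y, keyed by item id
        OpenIntObjectHashMap<Vector> Y = new OpenIntObjectHashMap<>();
        Y.put(0, new DenseVector(new double[] {0.1, 0.9}));
        Y.put(1, new DenseVector(new double[] {0.8, 0.2}));

        // numFeatures = 2, lambda = 0.065, alpha = 40, one training thread
        ImplicitFeedbackAlternatingLeastSquaresSolver solver =
            new ImplicitFeedbackAlternatingLeastSquaresSolver(2, 0.065, 40, Y, 1);

        // implicit feedback (e.g. click counts); zero entries mean "unobserved"
        Vector clicks = new SequentialAccessSparseVector(2);
        clicks.set(0, 3.0);

        System.out.println(solver.solve(clicks)); // the user's latent vector
      }
    }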

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/decomposer/AsyncEigenVerifier.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/decomposer/AsyncEigenVerifier.java b/math/src/main/java/org/apache/mahout/math/decomposer/AsyncEigenVerifier.java
deleted file mode 100644
index 0233848..0000000
--- a/math/src/main/java/org/apache/mahout/math/decomposer/AsyncEigenVerifier.java
+++ /dev/null
@@ -1,80 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.decomposer;
-
-import java.io.Closeable;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
-
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.VectorIterable;
-
-public class AsyncEigenVerifier extends SimpleEigenVerifier implements Closeable {
-
- private final ExecutorService threadPool;
- private EigenStatus status;
- private boolean finished;
- private boolean started;
-
- public AsyncEigenVerifier() {
- threadPool = Executors.newFixedThreadPool(1);
- status = new EigenStatus(-1, 0);
- }
-
- @Override
- public synchronized EigenStatus verify(VectorIterable corpus, Vector vector) {
- if (!finished && !started) { // not yet started or finished, so start!
- status = new EigenStatus(-1, 0);
- Vector vectorCopy = vector.clone();
- threadPool.execute(new VerifierRunnable(corpus, vectorCopy));
- started = true;
- }
- if (finished) {
- finished = false;
- }
- return status;
- }
-
- @Override
- public void close() {
- this.threadPool.shutdownNow();
-  }
-
-  protected EigenStatus innerVerify(VectorIterable corpus, Vector vector) {
- return super.verify(corpus, vector);
- }
-
- private class VerifierRunnable implements Runnable {
- private final VectorIterable corpus;
- private final Vector vector;
-
- protected VerifierRunnable(VectorIterable corpus, Vector vector) {
- this.corpus = corpus;
- this.vector = vector;
- }
-
- @Override
- public void run() {
- EigenStatus status = innerVerify(corpus, vector);
- synchronized (AsyncEigenVerifier.this) {
- AsyncEigenVerifier.this.status = status;
- finished = true;
- started = false;
- }
- }
- }
-}
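
verify() above never blocks: the first call schedules the real check on the single background thread and immediately returns the last known status, so callers poll until inProgress() turns false (this is exactly how HebbianSolver drives it further down). A polling sketch; the demo class name is hypothetical, and a Matrix serves as the VectorIterable corpus just as in HebbianSolver:

    import org.apache.mahout.math.DenseMatrix;
    import org.apache.mahout.math.DenseVector;
    import org.apache.mahout.math.Matrix;
    import org.apache.mahout.math.Vector;
    import org.apache.mahout.math.decomposer.AsyncEigenVerifier;
    import org.apache.mahout.math.decomposer.EigenStatus;

    public class AsyncVerifierDemo {
      public static void main(String[] args) throws InterruptedException {
        Matrix corpus = new DenseMatrix(new double[][] {{2, 0}, {0, 1}});
        Vector candidate = new DenseVector(new double[] {1, 0});

        try (AsyncEigenVerifier verifier = new AsyncEigenVerifier()) {
          // first call schedules the check and returns the placeholder status
          EigenStatus status = verifier.verify(corpus, candidate);
          while (status.inProgress()) {
            Thread.sleep(10); // poll until the background thread finishes
            status = verifier.verify(corpus, candidate);
          }
          System.out.println(status.getEigenValue() + " / " + status.getCosAngle());
        }
      }
    }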

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/decomposer/EigenStatus.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/decomposer/EigenStatus.java b/math/src/main/java/org/apache/mahout/math/decomposer/EigenStatus.java
deleted file mode 100644
index a284f50..0000000
--- a/math/src/main/java/org/apache/mahout/math/decomposer/EigenStatus.java
+++ /dev/null
@@ -1,50 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.decomposer;
-
-public class EigenStatus {
- private final double eigenValue;
- private final double cosAngle;
- private volatile Boolean inProgress;
-
- public EigenStatus(double eigenValue, double cosAngle) {
- this(eigenValue, cosAngle, true);
- }
-
- public EigenStatus(double eigenValue, double cosAngle, boolean inProgress) {
- this.eigenValue = eigenValue;
- this.cosAngle = cosAngle;
- this.inProgress = inProgress;
- }
-
- public double getCosAngle() {
- return cosAngle;
- }
-
- public double getEigenValue() {
- return eigenValue;
- }
-
- public boolean inProgress() {
- return inProgress;
- }
-
- void setInProgress(boolean status) {
- inProgress = status;
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/decomposer/SimpleEigenVerifier.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/decomposer/SimpleEigenVerifier.java b/math/src/main/java/org/apache/mahout/math/decomposer/SimpleEigenVerifier.java
deleted file mode 100644
index 71aaa30..0000000
--- a/math/src/main/java/org/apache/mahout/math/decomposer/SimpleEigenVerifier.java
+++ /dev/null
@@ -1,41 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.mahout.math.decomposer;
-
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.VectorIterable;
-
-public class SimpleEigenVerifier implements SingularVectorVerifier {
-
- @Override
- public EigenStatus verify(VectorIterable corpus, Vector vector) {
- Vector resultantVector = corpus.timesSquared(vector);
- double newNorm = resultantVector.norm(2);
- double oldNorm = vector.norm(2);
- double eigenValue;
- double cosAngle;
- if (newNorm > 0 && oldNorm > 0) {
- eigenValue = newNorm / oldNorm;
-      cosAngle = resultantVector.dot(vector) / (newNorm * oldNorm); // cosine needs the product of both norms in the denominator
- } else {
- eigenValue = 1.0;
- cosAngle = 0.0;
- }
- return new EigenStatus(eigenValue, cosAngle, false);
- }
-
-}
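
In other words, verify() applies the operator B = A'A once (via timesSquared): with w = Bv, the eigenvalue estimate is ||w|| / ||v|| and the convergence measure is cos(w, v) = w.v / (||w|| * ||v||), which approaches 1 as v settles into a fixed direction of B. Worked through on a toy case: for B = diag(4, 1) and v = (1, 0), w = (4, 0), so the estimated eigenvalue is 4/1 = 4 and cos(w, v) = 4 / (4 * 1) = 1, i.e. already converged.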

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/decomposer/SingularVectorVerifier.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/decomposer/SingularVectorVerifier.java b/math/src/main/java/org/apache/mahout/math/decomposer/SingularVectorVerifier.java
deleted file mode 100644
index a9a7af8..0000000
--- a/math/src/main/java/org/apache/mahout/math/decomposer/SingularVectorVerifier.java
+++ /dev/null
@@ -1,25 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.decomposer;
-
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.VectorIterable;
-
-public interface SingularVectorVerifier {
- EigenStatus verify(VectorIterable eigenMatrix, Vector vector);
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/decomposer/hebbian/EigenUpdater.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/decomposer/hebbian/EigenUpdater.java b/math/src/main/java/org/apache/mahout/math/decomposer/hebbian/EigenUpdater.java
deleted file mode 100644
index ac9cc41..0000000
--- a/math/src/main/java/org/apache/mahout/math/decomposer/hebbian/EigenUpdater.java
+++ /dev/null
@@ -1,25 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.decomposer.hebbian;
-
-import org.apache.mahout.math.Vector;
-
-
-public interface EigenUpdater {
- void update(Vector pseudoEigen, Vector trainingVector, TrainingState currentState);
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/decomposer/hebbian/HebbianSolver.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/decomposer/hebbian/HebbianSolver.java b/math/src/main/java/org/apache/mahout/math/decomposer/hebbian/HebbianSolver.java
deleted file mode 100644
index 5b5cc9b..0000000
--- a/math/src/main/java/org/apache/mahout/math/decomposer/hebbian/HebbianSolver.java
+++ /dev/null
@@ -1,342 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.decomposer.hebbian;
-
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Properties;
-import java.util.Random;
-
-import org.apache.mahout.common.RandomUtils;
-import org.apache.mahout.math.DenseMatrix;
-import org.apache.mahout.math.DenseVector;
-import org.apache.mahout.math.Matrix;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.decomposer.AsyncEigenVerifier;
-import org.apache.mahout.math.decomposer.EigenStatus;
-import org.apache.mahout.math.decomposer.SingularVectorVerifier;
-import org.apache.mahout.math.function.PlusMult;
-import org.apache.mahout.math.function.TimesFunction;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * The Hebbian solver is an iterative, sparse, singular value decomposition solver, based on the paper
- * <a href="http://www.dcs.shef.ac.uk/~genevieve/gorrell_webb.pdf">Generalized Hebbian Algorithm for
- * Latent Semantic Analysis</a> (2005) by Genevieve Gorrell and Brandyn Webb (a.k.a. Simon Funk).
- * TODO: more description here! For now: read the inline comments, and the comments for the constructors.
- */
-public class HebbianSolver {
-
- private static final Logger log = LoggerFactory.getLogger(HebbianSolver.class);
- private static final boolean DEBUG = false;
-
- private final EigenUpdater updater;
- private final SingularVectorVerifier verifier;
- private final double convergenceTarget;
- private final int maxPassesPerEigen;
- private final Random rng = RandomUtils.getRandom();
-
- private int numPasses = 0;
-
- /**
- * Creates a new HebbianSolver
- *
- * @param updater
- * {@link EigenUpdater} used to do the actual work of iteratively updating the current "best guess"
- * singular vector one data-point presentation at a time.
- * @param verifier
- * {@link SingularVectorVerifier } an object which perpetually tries to check how close to
- * convergence the current singular vector is (typically is a
- * {@link org.apache.mahout.math.decomposer.AsyncEigenVerifier } which does this
- * in the background in another thread, while the main thread continues to converge)
-   * @param convergenceTarget a small "epsilon" value which tells the solver how close to 1 you want the cosine of
-   * the angle between a proposed eigenvector and that same vector after being multiplied by the (square of the)
-   * input corpus to be: iteration continues while 1 - cosAngle exceeds this target
-   * @param maxPassesPerEigen a cutoff telling the solver how many convergence checks (performed by the verifier) to
-   * attempt before giving up on the current vector, even if the convergenceTarget has not been reached.
- */
- public HebbianSolver(EigenUpdater updater,
- SingularVectorVerifier verifier,
- double convergenceTarget,
- int maxPassesPerEigen) {
- this.updater = updater;
- this.verifier = verifier;
- this.convergenceTarget = convergenceTarget;
- this.maxPassesPerEigen = maxPassesPerEigen;
- }
-
- /**
- * Creates a new HebbianSolver with maxPassesPerEigen = Integer.MAX_VALUE (i.e. keep on iterating until
- * convergenceTarget is reached). <b>Not recommended</b> unless only looking for
- * the first few (5, maybe 10?) singular
- * vectors, as small errors which compound early on quickly put a minimum error on subsequent vectors.
- *
- * @param updater {@link EigenUpdater} used to do the actual work of iteratively updating the current "best guess"
- * singular vector one data-point presentation at a time.
- * @param verifier {@link org.apache.mahout.math.decomposer.SingularVectorVerifier }
- * an object which perpetually tries to check how close to
- * convergence the current singular vector is (typically is a
- * {@link org.apache.mahout.math.decomposer.AsyncEigenVerifier } which does this
- * in the background in another thread, while the main thread continues to converge)
-   * @param convergenceTarget a small "epsilon" value which tells the solver how close to 1 you want the cosine of
-   * the angle between a proposed eigenvector and that same vector after being multiplied by the (square of the)
-   * input corpus to be: iteration continues while 1 - cosAngle exceeds this target
- */
- public HebbianSolver(EigenUpdater updater,
- SingularVectorVerifier verifier,
- double convergenceTarget) {
- this(updater,
- verifier,
- convergenceTarget,
- Integer.MAX_VALUE);
- }
-
- /**
- * <b>This is the recommended constructor to use if you're not sure</b>
- * Creates a new HebbianSolver with the default {@link HebbianUpdater } to do the updating work, and the default
- * {@link org.apache.mahout.math.decomposer.AsyncEigenVerifier } to check for convergence in a
- * (single) background thread.
- *
-   * @param convergenceTarget a small "epsilon" value which tells the solver how close to 1 you want the cosine of
-   * the angle between a proposed eigenvector and that same vector after being multiplied by the (square of the)
-   * input corpus to be: iteration continues while 1 - cosAngle exceeds this target
-   * @param maxPassesPerEigen a cutoff telling the solver how many convergence checks (performed by the verifier) to
-   * attempt before giving up on the current vector, even if the convergenceTarget has not been reached.
- */
- public HebbianSolver(double convergenceTarget, int maxPassesPerEigen) {
- this(new HebbianUpdater(),
- new AsyncEigenVerifier(),
- convergenceTarget,
- maxPassesPerEigen);
- }
-
- /**
- * Creates a new HebbianSolver with the default {@link HebbianUpdater } to do the updating work, and the default
- * {@link org.apache.mahout.math.decomposer.AsyncEigenVerifier } to check for convergence in a (single)
- * background thread, with
- * maxPassesPerEigen set to Integer.MAX_VALUE. <b>Not recommended</b> unless only looking
- * for the first few (5, maybe 10?) singular
- * vectors, as small errors which compound early on quickly put a minimum error on subsequent vectors.
- *
-   * @param convergenceTarget a small "epsilon" value which tells the solver how close to 1 you want the cosine of
-   * the angle between a proposed eigenvector and that same vector after being multiplied by the (square of the)
-   * input corpus to be: iteration continues while 1 - cosAngle exceeds this target
- */
- public HebbianSolver(double convergenceTarget) {
- this(convergenceTarget, Integer.MAX_VALUE);
- }
-
- /**
- * Creates a new HebbianSolver with the default {@link HebbianUpdater } to do the updating work, and the default
- * {@link org.apache.mahout.math.decomposer.AsyncEigenVerifier } to check for convergence in a (single)
- * background thread, with
-   * convergenceTarget set to 0, which means the solver will not use convergence as a loop-exiting criterion (it
-   * still checks for convergence, so progress is logged and singular values are saved).
- *
- * @param numPassesPerEigen the exact number of times the verifier will check convergence status in the background
- * before the solver will move on to the next eigen-vector.
- */
- public HebbianSolver(int numPassesPerEigen) {
- this(0.0, numPassesPerEigen);
- }
-
- /**
- * Primary singular vector solving method.
- *
-   * @param corpus input matrix to find singular vectors of. Need not be symmetric, but should probably be sparse (in
-   * fact the input vectors are not mutated, and are accessed only via dot-products and sums, so they should be
-   * {@link org.apache.mahout.math.SequentialAccessSparseVector })
- * @param desiredRank the number of singular vectors to find (in roughly decreasing order by singular value)
- * @return the final {@link TrainingState } of the solver, after desiredRank singular vectors (and approximate
- * singular values) have been found.
- */
- public TrainingState solve(Matrix corpus,
- int desiredRank) {
- int cols = corpus.numCols();
- Matrix eigens = new DenseMatrix(desiredRank, cols);
- List<Double> eigenValues = new ArrayList<>();
- log.info("Finding {} singular vectors of matrix with {} rows, via Hebbian", desiredRank, corpus.numRows());
- /*
- * The corpusProjections matrix is a running cache of the residual projection of each corpus vector against all
- * of the previously found singular vectors. Without this, if multiple passes over the data is made (per
- * singular vector), recalculating these projections eventually dominates the computational complexity of the
- * solver.
- */
- Matrix corpusProjections = new DenseMatrix(corpus.numRows(), desiredRank);
- TrainingState state = new TrainingState(eigens, corpusProjections);
- for (int i = 0; i < desiredRank; i++) {
- Vector currentEigen = new DenseVector(cols);
- Vector previousEigen = null;
- while (hasNotConverged(currentEigen, corpus, state)) {
- int randomStartingIndex = getRandomStartingIndex(corpus, eigens);
- Vector initialTrainingVector = corpus.viewRow(randomStartingIndex);
- state.setTrainingIndex(randomStartingIndex);
- updater.update(currentEigen, initialTrainingVector, state);
- for (int corpusRow = 0; corpusRow < corpus.numRows(); corpusRow++) {
- state.setTrainingIndex(corpusRow);
- if (corpusRow != randomStartingIndex) {
- updater.update(currentEigen, corpus.viewRow(corpusRow), state);
- }
- }
- state.setFirstPass(false);
- if (DEBUG) {
- if (previousEigen == null) {
- previousEigen = currentEigen.clone();
- } else {
- double dot = currentEigen.dot(previousEigen);
- if (dot > 0.0) {
- dot /= currentEigen.norm(2) * previousEigen.norm(2);
- }
- // log.info("Current pass * previous pass = {}", dot);
- }
- }
- }
- // converged!
- double eigenValue = state.getStatusProgress().get(state.getStatusProgress().size() - 1).getEigenValue();
- // it's actually more efficient to do this to normalize than to call currentEigen = currentEigen.normalize(),
- // because the latter does a clone, which isn't necessary here.
- currentEigen.assign(new TimesFunction(), 1 / currentEigen.norm(2));
- eigens.assignRow(i, currentEigen);
- eigenValues.add(eigenValue);
- state.setCurrentEigenValues(eigenValues);
- log.info("Found eigenvector {}, eigenvalue: {}", i, eigenValue);
-
- /**
- * TODO: Persist intermediate output!
- */
- state.setFirstPass(true);
- state.setNumEigensProcessed(state.getNumEigensProcessed() + 1);
- state.setActivationDenominatorSquared(0);
- state.setActivationNumerator(0);
- state.getStatusProgress().clear();
- numPasses = 0;
- }
- return state;
- }
-
- /**
- * You have to start somewhere...
- * TODO: start instead wherever you find a vector with maximum residual length after subtracting off the projection
- * TODO: onto all previous eigenvectors.
- *
- * @param corpus the corpus matrix
- * @param eigens not currently used, but should be (see above TODO)
- * @return the index into the corpus where the "starting seed" input vector lies.
- */
- private int getRandomStartingIndex(Matrix corpus, Matrix eigens) {
- int index;
- Vector v;
- do {
- double r = rng.nextDouble();
- index = (int) (r * corpus.numRows());
- v = corpus.viewRow(index);
- } while (v == null || v.norm(2) == 0 || v.getNumNondefaultElements() < 5);
- return index;
- }
-
- /**
- * Uses the {@link SingularVectorVerifier } to check for convergence
- *
- * @param currentPseudoEigen the purported singular vector whose convergence is being checked
- * @param corpus the corpus to check against
- * @param state contains the previous eigens, various other solving state {@link TrainingState}
- * @return true if <em>either</em> we have converged, <em>or</em> maxPassesPerEigen has been exceeded.
- */
- protected boolean hasNotConverged(Vector currentPseudoEigen,
- Matrix corpus,
- TrainingState state) {
- numPasses++;
- if (state.isFirstPass()) {
- log.info("First pass through the corpus, no need to check convergence...");
- return true;
- }
- Matrix previousEigens = state.getCurrentEigens();
- log.info("Have made {} passes through the corpus, checking convergence...", numPasses);
- /*
- * Step 1: orthogonalize currentPseudoEigen by subtracting off eigen(i) * helper.get(i)
- * Step 2: zero-out the helper vector because it has already helped.
- */
- for (int i = 0; i < state.getNumEigensProcessed(); i++) {
- Vector previousEigen = previousEigens.viewRow(i);
- currentPseudoEigen.assign(previousEigen, new PlusMult(-state.getHelperVector().get(i)));
- state.getHelperVector().set(i, 0);
- }
- if (currentPseudoEigen.norm(2) > 0) {
- for (int i = 0; i < state.getNumEigensProcessed(); i++) {
- Vector previousEigen = previousEigens.viewRow(i);
- log.info("dot with previous: {}", previousEigen.dot(currentPseudoEigen) / currentPseudoEigen.norm(2));
- }
- }
- /*
- * Step 3: verify how eigen-like the prospective eigen is. This is potentially asynchronous.
- */
- EigenStatus status = verify(corpus, currentPseudoEigen);
- if (status.inProgress()) {
- log.info("Verifier not finished, making another pass...");
- } else {
- log.info("Has 1 - cosAngle: {}, convergence target is: {}", 1.0 - status.getCosAngle(), convergenceTarget);
- state.getStatusProgress().add(status);
- }
- return
- state.getStatusProgress().size() <= maxPassesPerEigen
- && 1.0 - status.getCosAngle() > convergenceTarget;
- }
-
- protected EigenStatus verify(Matrix corpus, Vector currentPseudoEigen) {
- return verifier.verify(corpus, currentPseudoEigen);
- }
-
- public static void main(String[] args) {
- Properties props = new Properties();
- String propertiesFile = args.length > 0 ? args[0] : "config/solver.properties";
- // props.load(new FileInputStream(propertiesFile));
-
- String corpusDir = props.getProperty("solver.input.dir");
- String outputDir = props.getProperty("solver.output.dir");
- if (corpusDir == null || corpusDir.isEmpty() || outputDir == null || outputDir.isEmpty()) {
- log.error("{} must contain values for solver.input.dir and solver.output.dir", propertiesFile);
- return;
- }
- //int inBufferSize = Integer.parseInt(props.getProperty("solver.input.bufferSize"));
- int rank = Integer.parseInt(props.getProperty("solver.output.desiredRank"));
- double convergence = Double.parseDouble(props.getProperty("solver.convergence"));
- int maxPasses = Integer.parseInt(props.getProperty("solver.maxPasses"));
- //int numThreads = Integer.parseInt(props.getProperty("solver.verifier.numThreads"));
-
- HebbianUpdater updater = new HebbianUpdater();
- SingularVectorVerifier verifier = new AsyncEigenVerifier();
- HebbianSolver solver = new HebbianSolver(updater, verifier, convergence, maxPasses);
- Matrix corpus = null;
- /*
- if (numThreads <= 1) {
- // corpus = new DiskBufferedDoubleMatrix(new File(corpusDir), inBufferSize);
- } else {
- // corpus = new ParallelMultiplyingDiskBufferedDoubleMatrix(new File(corpusDir), inBufferSize, numThreads);
- }
- */
- long now = System.currentTimeMillis();
- TrainingState finalState = solver.solve(corpus, rank);
- long time = (System.currentTimeMillis() - now) / 1000;
- log.info("Solved {} eigenVectors in {} seconds. Persisted to {}",
- finalState.getCurrentEigens().rowSize(), time, outputDir);
- }
-
-
-}
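
The convergence test above reduces to the cosine of the angle between the candidate vector v and (A^T A)v: for an exact right singular vector the two are parallel, so 1 - cosAngle approaches 0. A minimal sketch of that measurement, using only the API the deleted code itself relies on (the class and method names here are illustrative, not Mahout API):

    import org.apache.mahout.math.Matrix;
    import org.apache.mahout.math.Vector;

    // Sketch: the quantity the verifier drives below convergenceTarget.
    final class CosAngleSketch {
      static double oneMinusCosAngle(Matrix corpus, Vector v) {
        Vector av = corpus.timesSquared(v);                 // (A^T A) v
        double cos = av.dot(v) / (av.norm(2) * v.norm(2));
        return 1.0 - cos;                                   // ~0 when v is (close to) a singular vector
      }
    }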

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/decomposer/hebbian/HebbianUpdater.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/decomposer/hebbian/HebbianUpdater.java b/math/src/main/java/org/apache/mahout/math/decomposer/hebbian/HebbianUpdater.java
deleted file mode 100644
index 2080c3a..0000000
--- a/math/src/main/java/org/apache/mahout/math/decomposer/hebbian/HebbianUpdater.java
+++ /dev/null
@@ -1,71 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.decomposer.hebbian;
-
-
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.function.PlusMult;
-
-public class HebbianUpdater implements EigenUpdater {
-
- @Override
- public void update(Vector pseudoEigen,
- Vector trainingVector,
- TrainingState currentState) {
- double trainingVectorNorm = trainingVector.norm(2);
- int numPreviousEigens = currentState.getNumEigensProcessed();
- if (numPreviousEigens > 0 && currentState.isFirstPass()) {
- updateTrainingProjectionsVector(currentState, trainingVector, numPreviousEigens - 1);
- }
- if (currentState.getActivationDenominatorSquared() == 0 || trainingVectorNorm == 0) {
- if (currentState.getActivationDenominatorSquared() == 0) {
- pseudoEigen.assign(trainingVector, new PlusMult(1));
- currentState.setHelperVector(currentState.currentTrainingProjection().clone());
- double helperNorm = currentState.getHelperVector().norm(2);
- currentState.setActivationDenominatorSquared(trainingVectorNorm * trainingVectorNorm - helperNorm * helperNorm);
- }
- return;
- }
- currentState.setActivationNumerator(pseudoEigen.dot(trainingVector));
- currentState.setActivationNumerator(
- currentState.getActivationNumerator()
- - currentState.getHelperVector().dot(currentState.currentTrainingProjection()));
-
- double activation = currentState.getActivationNumerator()
- / Math.sqrt(currentState.getActivationDenominatorSquared());
- currentState.setActivationDenominatorSquared(
- currentState.getActivationDenominatorSquared()
- + 2 * activation * currentState.getActivationNumerator()
- + activation * activation
- * (trainingVector.getLengthSquared() - currentState.currentTrainingProjection().getLengthSquared()));
- if (numPreviousEigens > 0) {
- currentState.getHelperVector().assign(currentState.currentTrainingProjection(), new PlusMult(activation));
- }
- pseudoEigen.assign(trainingVector, new PlusMult(activation));
- }
-
- private static void updateTrainingProjectionsVector(TrainingState state,
- Vector trainingVector,
- int previousEigenIndex) {
- Vector previousEigen = state.mostRecentEigen();
- Vector currentTrainingVectorProjection = state.currentTrainingProjection();
- double projection = previousEigen.dot(trainingVector);
- currentTrainingVectorProjection.set(previousEigenIndex, projection);
- }
-
-}
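
Stripped of the helper-vector bookkeeping for previously found eigenvectors, the update above is a Hebbian rule: compute an activation from the dot product of the current pseudo-eigenvector with the training vector, then add the training vector scaled by that activation. A minimal single-eigenvector sketch under that simplification (class and method names are illustrative):

    import org.apache.mahout.math.Matrix;
    import org.apache.mahout.math.Vector;
    import org.apache.mahout.math.function.PlusMult;

    // Sketch: one pass of the first-eigenvector case, assuming pseudoEigen was
    // seeded with a nonzero training vector (as update() does above).
    final class HebbianPassSketch {
      static Vector onePass(Matrix corpus, Vector pseudoEigen) {
        double denomSquared = pseudoEigen.getLengthSquared();
        for (int row = 0; row < corpus.numRows(); row++) {
          Vector x = corpus.viewRow(row);
          double numerator = pseudoEigen.dot(x);
          double activation = numerator / Math.sqrt(denomSquared);
          denomSquared += 2 * activation * numerator
              + activation * activation * x.getLengthSquared();
          pseudoEigen.assign(x, new PlusMult(activation));  // v += activation * x
        }
        return pseudoEigen.normalize();
      }
    }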

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/decomposer/hebbian/TrainingState.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/decomposer/hebbian/TrainingState.java b/math/src/main/java/org/apache/mahout/math/decomposer/hebbian/TrainingState.java
deleted file mode 100644
index af6c2ef..0000000
--- a/math/src/main/java/org/apache/mahout/math/decomposer/hebbian/TrainingState.java
+++ /dev/null
@@ -1,143 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.decomposer.hebbian;
-
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.mahout.math.DenseVector;
-import org.apache.mahout.math.Matrix;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.decomposer.EigenStatus;
-
-public class TrainingState {
-
- private Matrix currentEigens;
- private int numEigensProcessed;
- private List<Double> currentEigenValues;
- private Matrix trainingProjections;
- private int trainingIndex;
- private Vector helperVector;
- private boolean firstPass;
- private List<EigenStatus> statusProgress;
- private double activationNumerator;
- private double activationDenominatorSquared;
-
- TrainingState(Matrix eigens, Matrix projections) {
- currentEigens = eigens;
- trainingProjections = projections;
- trainingIndex = 0;
- helperVector = new DenseVector(eigens.numRows());
- firstPass = true;
- statusProgress = new ArrayList<>();
- activationNumerator = 0;
- activationDenominatorSquared = 0;
- numEigensProcessed = 0;
- }
-
- public Vector mostRecentEigen() {
- return currentEigens.viewRow(numEigensProcessed - 1);
- }
-
- public Vector currentTrainingProjection() {
- if (trainingProjections.viewRow(trainingIndex) == null) {
- trainingProjections.assignRow(trainingIndex, new DenseVector(currentEigens.numCols()));
- }
- return trainingProjections.viewRow(trainingIndex);
- }
-
- public Matrix getCurrentEigens() {
- return currentEigens;
- }
-
- public void setCurrentEigens(Matrix currentEigens) {
- this.currentEigens = currentEigens;
- }
-
- public int getNumEigensProcessed() {
- return numEigensProcessed;
- }
-
- public void setNumEigensProcessed(int numEigensProcessed) {
- this.numEigensProcessed = numEigensProcessed;
- }
-
- public List<Double> getCurrentEigenValues() {
- return currentEigenValues;
- }
-
- public void setCurrentEigenValues(List<Double> currentEigenValues) {
- this.currentEigenValues = currentEigenValues;
- }
-
- public Matrix getTrainingProjections() {
- return trainingProjections;
- }
-
- public void setTrainingProjections(Matrix trainingProjections) {
- this.trainingProjections = trainingProjections;
- }
-
- public int getTrainingIndex() {
- return trainingIndex;
- }
-
- public void setTrainingIndex(int trainingIndex) {
- this.trainingIndex = trainingIndex;
- }
-
- public Vector getHelperVector() {
- return helperVector;
- }
-
- public void setHelperVector(Vector helperVector) {
- this.helperVector = helperVector;
- }
-
- public boolean isFirstPass() {
- return firstPass;
- }
-
- public void setFirstPass(boolean firstPass) {
- this.firstPass = firstPass;
- }
-
- public List<EigenStatus> getStatusProgress() {
- return statusProgress;
- }
-
- public void setStatusProgress(List<EigenStatus> statusProgress) {
- this.statusProgress = statusProgress;
- }
-
- public double getActivationNumerator() {
- return activationNumerator;
- }
-
- public void setActivationNumerator(double activationNumerator) {
- this.activationNumerator = activationNumerator;
- }
-
- public double getActivationDenominatorSquared() {
- return activationDenominatorSquared;
- }
-
- public void setActivationDenominatorSquared(double activationDenominatorSquared) {
- this.activationDenominatorSquared = activationDenominatorSquared;
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/decomposer/lanczos/LanczosSolver.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/decomposer/lanczos/LanczosSolver.java b/math/src/main/java/org/apache/mahout/math/decomposer/lanczos/LanczosSolver.java
deleted file mode 100644
index 61a77db..0000000
--- a/math/src/main/java/org/apache/mahout/math/decomposer/lanczos/LanczosSolver.java
+++ /dev/null
@@ -1,213 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.decomposer.lanczos;
-
-
-import java.util.EnumMap;
-import java.util.Map;
-
-import com.google.common.base.Preconditions;
-import org.apache.mahout.math.Matrix;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.VectorIterable;
-import org.apache.mahout.math.function.DoubleFunction;
-import org.apache.mahout.math.function.PlusMult;
-import org.apache.mahout.math.solver.EigenDecomposition;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * Simple implementation of the <a href="http://en.wikipedia.org/wiki/Lanczos_algorithm">Lanczos algorithm</a> for
- * finding eigenvalues of a symmetric matrix, applied to non-symmetric matrices by applying Matrix.timesSquared(vector)
- * as the "matrix-multiplication" method.<p>
- *
- * See the SSVD code for a better option
- * {@link org.apache.mahout.math.ssvd.SequentialBigSvd}
- * See also the docs on
- * <a href="https://mahout.apache.org/users/dim-reduction/ssvd.html">stochastic
- * projection SVD</a>
- * <p>
- * To avoid floating point overflow problems which arise in power-methods like Lanczos, an initial pass is made
- * through the input matrix to
- * <ul>
- * <li>generate a good starting seed vector by summing all the rows of the input matrix, and</li>
- * <li>compute the trace of inputMatrix<sup>T</sup> * inputMatrix.</li>
- * </ul>
- * <p>
- * This latter value, being the sum of all of the singular values, is used to rescale the entire matrix, effectively
- * forcing the largest singular value to be strictly less than one, and transforming floating point <em>overflow</em>
- * problems into floating point <em>underflow</em> (i.e., very small singular values will become invisible, as they
- * will appear to be zero and the algorithm will terminate).
- * <p>This implementation uses {@link EigenDecomposition} to do the
- * eigenvalue extraction from the small (desiredRank x desiredRank) tridiagonal matrix. Numerical stability is
- * achieved via brute-force: re-orthogonalization against all previous eigenvectors is computed after every pass.
- * This can be made smarter if (when!) this proves to be a major bottleneck. Of course, this step can be parallelized
- * as well.
- * @see org.apache.mahout.math.ssvd.SequentialBigSvd
- */
-@Deprecated
-public class LanczosSolver {
-
- private static final Logger log = LoggerFactory.getLogger(LanczosSolver.class);
-
- public static final double SAFE_MAX = 1.0e150;
-
- public enum TimingSection {
- ITERATE, ORTHOGONALIZE, TRIDIAG_DECOMP, FINAL_EIGEN_CREATE
- }
-
- private final Map<TimingSection, Long> startTimes = new EnumMap<>(TimingSection.class);
- private final Map<TimingSection, Long> times = new EnumMap<>(TimingSection.class);
-
- private static final class Scale extends DoubleFunction {
- private final double d;
-
- private Scale(double d) {
- this.d = d;
- }
-
- @Override
- public double apply(double arg1) {
- return arg1 * d;
- }
- }
-
- public void solve(LanczosState state,
- int desiredRank) {
- solve(state, desiredRank, false);
- }
-
- public void solve(LanczosState state,
- int desiredRank,
- boolean isSymmetric) {
- VectorIterable corpus = state.getCorpus();
- log.info("Finding {} singular vectors of matrix with {} rows, via Lanczos",
- desiredRank, corpus.numRows());
- int i = state.getIterationNumber();
- Vector currentVector = state.getBasisVector(i - 1);
- Vector previousVector = state.getBasisVector(i - 2);
- double beta = 0;
- Matrix triDiag = state.getDiagonalMatrix();
- while (i < desiredRank) {
- startTime(TimingSection.ITERATE);
- Vector nextVector = isSymmetric ? corpus.times(currentVector) : corpus.timesSquared(currentVector);
- log.info("{} passes through the corpus so far...", i);
- if (state.getScaleFactor() <= 0) {
- state.setScaleFactor(calculateScaleFactor(nextVector));
- }
- nextVector.assign(new Scale(1.0 / state.getScaleFactor()));
- if (previousVector != null) {
- nextVector.assign(previousVector, new PlusMult(-beta));
- }
- // now orthogonalize
- double alpha = currentVector.dot(nextVector);
- nextVector.assign(currentVector, new PlusMult(-alpha));
- endTime(TimingSection.ITERATE);
- startTime(TimingSection.ORTHOGONALIZE);
- orthogonalizeAgainstAllButLast(nextVector, state);
- endTime(TimingSection.ORTHOGONALIZE);
- // and normalize
- beta = nextVector.norm(2);
- if (outOfRange(beta) || outOfRange(alpha)) {
- log.warn("Lanczos parameters out of range: alpha = {}, beta = {}. Bailing out early!",
- alpha, beta);
- break;
- }
- nextVector.assign(new Scale(1 / beta));
- state.setBasisVector(i, nextVector);
- previousVector = currentVector;
- currentVector = nextVector;
- // save the projections and norms!
- triDiag.set(i - 1, i - 1, alpha);
- if (i < desiredRank - 1) {
- triDiag.set(i - 1, i, beta);
- triDiag.set(i, i - 1, beta);
- }
- state.setIterationNumber(++i);
- }
- startTime(TimingSection.TRIDIAG_DECOMP);
-
- log.info("Lanczos iteration complete - now to diagonalize the tri-diagonal auxiliary matrix.");
- // at this point, have tridiag all filled out, and basis is all filled out, and orthonormalized
- EigenDecomposition decomp = new EigenDecomposition(triDiag);
-
- Matrix eigenVects = decomp.getV();
- Vector eigenVals = decomp.getRealEigenvalues();
- endTime(TimingSection.TRIDIAG_DECOMP);
- startTime(TimingSection.FINAL_EIGEN_CREATE);
- for (int row = 0; row < i; row++) {
- Vector realEigen = null;
-
- Vector ejCol = eigenVects.viewColumn(row);
- int size = Math.min(ejCol.size(), state.getBasisSize());
- for (int j = 0; j < size; j++) {
- double d = ejCol.get(j);
- Vector rowJ = state.getBasisVector(j);
- if (realEigen == null) {
- realEigen = rowJ.like();
- }
- realEigen.assign(rowJ, new PlusMult(d));
- }
-
- Preconditions.checkState(realEigen != null);
- assert realEigen != null;
-
- realEigen = realEigen.normalize();
- state.setRightSingularVector(row, realEigen);
- double e = eigenVals.get(row) * state.getScaleFactor();
- if (!isSymmetric) {
- e = Math.sqrt(e);
- }
- log.info("Eigenvector {} found with eigenvalue {}", row, e);
- state.setSingularValue(row, e);
- }
- log.info("LanczosSolver finished.");
- endTime(TimingSection.FINAL_EIGEN_CREATE);
- }
-
- protected static double calculateScaleFactor(Vector nextVector) {
- return nextVector.norm(2);
- }
-
- private static boolean outOfRange(double d) {
- return Double.isNaN(d) || d > SAFE_MAX || -d > SAFE_MAX;
- }
-
- protected static void orthogonalizeAgainstAllButLast(Vector nextVector, LanczosState state) {
- for (int i = 0; i < state.getIterationNumber(); i++) {
- Vector basisVector = state.getBasisVector(i);
- double alpha;
- if (basisVector == null || (alpha = nextVector.dot(basisVector)) == 0.0) {
- continue;
- }
- nextVector.assign(basisVector, new PlusMult(-alpha));
- }
- }
-
- private void startTime(TimingSection section) {
- startTimes.put(section, System.nanoTime());
- }
-
- private void endTime(TimingSection section) {
- if (!times.containsKey(section)) {
- times.put(section, 0L);
- }
- times.put(section, times.get(section) + System.nanoTime() - startTimes.get(section));
- }
-
-}
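
As the class Javadoc explains, each Krylov product is divided by a fixed scale factor so iterates stay bounded; as implemented, calculateScaleFactor uses the 2-norm of the first product rather than literally computing a trace. A minimal sketch of that step (the class name is illustrative; the real code stores the scale in LanczosState):

    import org.apache.mahout.math.Vector;
    import org.apache.mahout.math.VectorIterable;

    // Sketch: the overflow-avoidance rescaling applied inside solve() above.
    final class LanczosScaleSketch {
      static Vector scaledStep(VectorIterable corpus, Vector current, double scale) {
        Vector next = corpus.timesSquared(current);  // one Krylov step, (A^T A) v
        if (scale <= 0) {
          scale = next.norm(2);                      // calculateScaleFactor(next)
        }
        return next.divide(scale);                   // bounded-magnitude iterate
      }
    }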

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/decomposer/lanczos/LanczosState.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/decomposer/lanczos/LanczosState.java b/math/src/main/java/org/apache/mahout/math/decomposer/lanczos/LanczosState.java
deleted file mode 100644
index 2ba34bd..0000000
--- a/math/src/main/java/org/apache/mahout/math/decomposer/lanczos/LanczosState.java
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.decomposer.lanczos;
-
-import com.google.common.collect.Maps;
-import org.apache.mahout.math.DenseMatrix;
-import org.apache.mahout.math.Matrix;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.VectorIterable;
-
-import java.util.Map;
-
-@Deprecated
-public class LanczosState {
-
- protected Matrix diagonalMatrix;
- protected final VectorIterable corpus;
- protected double scaleFactor;
- protected int iterationNumber;
- protected final int desiredRank;
- protected Map<Integer, Vector> basis;
- protected final Map<Integer, Double> singularValues;
- protected Map<Integer, Vector> singularVectors;
-
- public LanczosState(VectorIterable corpus, int desiredRank, Vector initialVector) {
- this.corpus = corpus;
- this.desiredRank = desiredRank;
- initializeBasisAndSingularVectors();
- setBasisVector(0, initialVector);
- scaleFactor = 0;
- diagonalMatrix = new DenseMatrix(desiredRank, desiredRank);
- singularValues = Maps.newHashMap();
- iterationNumber = 1;
- }
-
- private void initializeBasisAndSingularVectors() {
- basis = Maps.newHashMap();
- singularVectors = Maps.newHashMap();
- }
-
- public Matrix getDiagonalMatrix() {
- return diagonalMatrix;
- }
-
- public int getIterationNumber() {
- return iterationNumber;
- }
-
- public double getScaleFactor() {
- return scaleFactor;
- }
-
- public VectorIterable getCorpus() {
- return corpus;
- }
-
- public Vector getRightSingularVector(int i) {
- return singularVectors.get(i);
- }
-
- public Double getSingularValue(int i) {
- return singularValues.get(i);
- }
-
- public Vector getBasisVector(int i) {
- return basis.get(i);
- }
-
- public int getBasisSize() {
- return basis.size();
- }
-
- public void setBasisVector(int i, Vector basisVector) {
- basis.put(i, basisVector);
- }
-
- public void setScaleFactor(double scale) {
- scaleFactor = scale;
- }
-
- public void setIterationNumber(int i) {
- iterationNumber = i;
- }
-
- public void setRightSingularVector(int i, Vector vector) {
- singularVectors.put(i, vector);
- }
-
- public void setSingularValue(int i, double value) {
- singularValues.put(i, value);
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/flavor/BackEnum.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/flavor/BackEnum.java b/math/src/main/java/org/apache/mahout/math/flavor/BackEnum.java
deleted file mode 100644
index 1782f04..0000000
--- a/math/src/main/java/org/apache/mahout/math/flavor/BackEnum.java
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.flavor;
-
-/**
- * Matrix backends
- */
-public enum BackEnum {
- JVMMEM,
- NETLIB_BLAS
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/flavor/MatrixFlavor.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/flavor/MatrixFlavor.java b/math/src/main/java/org/apache/mahout/math/flavor/MatrixFlavor.java
deleted file mode 100644
index e1d93f2..0000000
--- a/math/src/main/java/org/apache/mahout/math/flavor/MatrixFlavor.java
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.flavor;
-
-/** A set of matrix structure properties that I denote as "flavor" (by analogy to quarks) */
-public interface MatrixFlavor {
-
- /**
- * The backing store of the matrix -- such as JVM memory, lapack/atlas, Magma etc.
- */
- BackEnum getBacking();
-
- /**
- * Structure flavors
- */
- TraversingStructureEnum getStructure();
-
- boolean isDense();
-
- /**
- * This is the default flavor for {@link org.apache.mahout.math.DenseMatrix}-like structures.
- */
- MatrixFlavor DENSELIKE = new FlavorImpl(BackEnum.JVMMEM, TraversingStructureEnum.ROWWISE, true);
- /**
- * This is the default flavor for {@link org.apache.mahout.math.SparseRowMatrix}-like structures.
- */
- MatrixFlavor SPARSELIKE = new FlavorImpl(BackEnum.JVMMEM, TraversingStructureEnum.ROWWISE, false);
-
- /**
- * This is the default flavor for {@link org.apache.mahout.math.SparseMatrix}-like structures, i.e. sparse matrix
- * blocks, where some, perhaps most, rows may be missing entirely.
- */
- MatrixFlavor SPARSEROWLIKE = new FlavorImpl(BackEnum.JVMMEM, TraversingStructureEnum.SPARSEROWWISE, false);
-
- /**
- * This is the default flavor for {@link org.apache.mahout.math.DiagonalMatrix} and the like.
- */
- MatrixFlavor DIAGONALLIKE = new FlavorImpl(BackEnum.JVMMEM, TraversingStructureEnum.VECTORBACKED, false);
-
- final class FlavorImpl implements MatrixFlavor {
- private BackEnum pBacking;
- private TraversingStructureEnum pStructure;
- private boolean pDense;
-
- public FlavorImpl(BackEnum backing, TraversingStructureEnum structure, boolean dense) {
- pBacking = backing;
- pStructure = structure;
- pDense = dense;
- }
-
- @Override
- public BackEnum getBacking() {
- return pBacking;
- }
-
- @Override
- public TraversingStructureEnum getStructure() {
- return pStructure;
- }
-
- @Override
- public boolean isDense() {
- return pDense;
- }
- }
-
-}
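
Flavors exist so that generic operations can cheaply ask a matrix how it prefers to be traversed instead of probing its concrete class. A minimal sketch of that kind of dispatch (the class name is illustrative):

    import org.apache.mahout.math.DenseMatrix;
    import org.apache.mahout.math.Matrix;
    import org.apache.mahout.math.SparseRowMatrix;
    import org.apache.mahout.math.flavor.TraversingStructureEnum;

    // Sketch: branch on structure hints rather than on concrete matrix types.
    final class FlavorDispatchSketch {
      static String describe(Matrix m) {
        TraversingStructureEnum s = m.getFlavor().getStructure();
        return s + (m.getFlavor().isDense() ? " (dense)" : " (sparse)");
      }

      public static void main(String[] args) {
        System.out.println(describe(new DenseMatrix(3, 3)));      // ROWWISE (dense)
        System.out.println(describe(new SparseRowMatrix(3, 3)));  // ROWWISE (sparse)
      }
    }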

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/flavor/TraversingStructureEnum.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/flavor/TraversingStructureEnum.java b/math/src/main/java/org/apache/mahout/math/flavor/TraversingStructureEnum.java
deleted file mode 100644
index 13c2cf4..0000000
--- a/math/src/main/java/org/apache/mahout/math/flavor/TraversingStructureEnum.java
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.flavor;
-
- /** Hints about how a matrix's structure is best traversed. */
-public enum TraversingStructureEnum {
-
- UNKNOWN,
-
- /**
- * Backing vectors are directly available as row views.
- */
- ROWWISE,
-
- /**
- * Column vectors are directly available as column views.
- */
- COLWISE,
-
- /**
- * Only some row-wise vectors are really present (can use iterateNonEmpty). Corresponds to
- * [[org.apache.mahout.math.SparseMatrix]].
- */
- SPARSEROWWISE,
-
- SPARSECOLWISE,
-
- SPARSEHASH,
-
- VECTORBACKED,
-
- BLOCKIFIED
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/function/DoubleDoubleFunction.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/function/DoubleDoubleFunction.java b/math/src/main/java/org/apache/mahout/math/function/DoubleDoubleFunction.java
deleted file mode 100644
index 466ddd6..0000000
--- a/math/src/main/java/org/apache/mahout/math/function/DoubleDoubleFunction.java
+++ /dev/null
@@ -1,98 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
-Copyright 1999 CERN - European Organization for Nuclear Research.
-Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
-is hereby granted without fee, provided that the above copyright notice appear in all copies and
-that both that copyright notice and this permission notice appear in supporting documentation.
-CERN makes no representations about the suitability of this software for any purpose.
-It is provided "as is" without expressed or implied warranty.
-*/
-
-package org.apache.mahout.math.function;
-
-/**
- * Interface that represents a function object: a function that takes two arguments and returns a single value.
- **/
-public abstract class DoubleDoubleFunction {
-
- /**
- * Apply the function to the arguments and return the result
- *
- * @param arg1 a double for the first argument
- * @param arg2 a double for the second argument
- * @return the result of applying the function
- */
- public abstract double apply(double arg1, double arg2);
-
- /**
- * @return true iff f(x, 0) = x for any x
- */
- public boolean isLikeRightPlus() {
- return false;
- }
-
- /**
- * @return true iff f(0, y) = 0 for any y
- */
- public boolean isLikeLeftMult() {
- return false;
- }
-
- /**
- * @return true iff f(x, 0) = 0 for any x
- */
- public boolean isLikeRightMult() {
- return false;
- }
-
- /**
- * @return true iff f(x, 0) = f(0, y) = 0 for any x, y
- */
- public boolean isLikeMult() {
- return isLikeLeftMult() && isLikeRightMult();
- }
-
- /**
- * @return true iff f(x, y) = f(y, x) for any x, y
- */
- public boolean isCommutative() {
- return false;
- }
-
- /**
- * @return true iff f(x, f(y, z)) = f(f(x, y), z) for any x, y, z
- */
- public boolean isAssociative() {
- return false;
- }
-
- /**
- * @return true iff f(x, y) = f(y, x) for any x, y AND f(x, f(y, z)) = f(f(x, y), z) for any x, y, z
- */
- public boolean isAssociativeAndCommutative() {
- return isAssociative() && isCommutative();
- }
-
- /**
- * @return true iff f(0, 0) != 0
- */
- public boolean isDensifying() {
- return apply(0.0, 0.0) != 0.0;
- }
-}
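
The property hooks above let vector and matrix code skip work, e.g. visiting only nonzeros when a function is mult-like. A minimal subclass declaring the algebraic properties of addition (Mahout's own Functions.PLUS plays this role; the class here is illustrative):

    import org.apache.mahout.math.function.DoubleDoubleFunction;

    // Sketch: f(x, y) = x + y, declaring the properties the hooks above describe.
    final class PlusSketch extends DoubleDoubleFunction {
      @Override
      public double apply(double arg1, double arg2) {
        return arg1 + arg2;
      }

      @Override
      public boolean isLikeRightPlus() {
        return true;  // f(x, 0) = x
      }

      @Override
      public boolean isCommutative() {
        return true;  // f(x, y) = f(y, x)
      }

      @Override
      public boolean isAssociative() {
        return true;  // f(x, f(y, z)) = f(f(x, y), z)
      }
    }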

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/function/DoubleFunction.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/function/DoubleFunction.java b/math/src/main/java/org/apache/mahout/math/function/DoubleFunction.java
deleted file mode 100644
index 7545154..0000000
--- a/math/src/main/java/org/apache/mahout/math/function/DoubleFunction.java
+++ /dev/null
@@ -1,48 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.mahout.math.function;
-
-/*
-Copyright 1999 CERN - European Organization for Nuclear Research.
-Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
-is hereby granted without fee, provided that the above copyright notice appear in all copies and
-that both that copyright notice and this permission notice appear in supporting documentation.
-CERN makes no representations about the suitability of this software for any purpose.
-It is provided "as is" without expressed or implied warranty.
-*/
-
-/**
- * Interface that represents a function object: a function that takes a single argument and returns a single value.
- * @see org.apache.mahout.math.map
- */
-public abstract class DoubleFunction {
-
- /**
- * Apply the function to the argument and return the result
- *
- * @param x double for the argument
- * @return the result of applying the function
- */
- public abstract double apply(double x);
-
- public boolean isDensifying() {
- return Math.abs(apply(0.0)) != 0.0;
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/function/FloatFunction.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/function/FloatFunction.java b/math/src/main/java/org/apache/mahout/math/function/FloatFunction.java
deleted file mode 100644
index 94dfe32..0000000
--- a/math/src/main/java/org/apache/mahout/math/function/FloatFunction.java
+++ /dev/null
@@ -1,36 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.mahout.math.function;
-
-
-/**
- * Interface that represents a function object: a function that takes a single argument and returns a single value.
- *
- */
-public interface FloatFunction {
-
- /**
- * Applies a function to an argument.
- *
- * @param argument argument passed to the function.
- * @return the result of the function.
- */
- float apply(float argument);
-}
r***@apache.org
2018-06-27 14:51:36 UTC
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/SparseRowMatrix.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/SparseRowMatrix.java b/math/src/main/java/org/apache/mahout/math/SparseRowMatrix.java
deleted file mode 100644
index ee54ad0..0000000
--- a/math/src/main/java/org/apache/mahout/math/SparseRowMatrix.java
+++ /dev/null
@@ -1,289 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math;
-
-import org.apache.mahout.math.flavor.MatrixFlavor;
-import org.apache.mahout.math.flavor.TraversingStructureEnum;
-import org.apache.mahout.math.function.DoubleDoubleFunction;
-import org.apache.mahout.math.function.Functions;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.util.Iterator;
-
-/**
- * A sparse matrix with general element values whose rows are accessible quickly. Implemented as a row
- * array of either SequentialAccessSparseVectors or RandomAccessSparseVectors.
- */
-public class SparseRowMatrix extends AbstractMatrix {
- private Vector[] rowVectors;
-
- private final boolean randomAccessRows;
-
- private static final Logger log = LoggerFactory.getLogger(SparseRowMatrix.class);
-
- /**
- * Construct a sparse matrix starting with the provided row vectors.
- *
- * @param rows The number of rows in the result
- * @param columns The number of columns in the result
- * @param rowVectors a Vector[] array of rows
- */
- public SparseRowMatrix(int rows, int columns, Vector[] rowVectors) {
- this(rows, columns, rowVectors, false, rowVectors instanceof RandomAccessSparseVector[]);
- }
-
- public SparseRowMatrix(int rows, int columns, boolean randomAccess) {
- this(rows, columns, randomAccess
- ? new RandomAccessSparseVector[rows]
- : new SequentialAccessSparseVector[rows],
- true,
- randomAccess);
- }
-
- public SparseRowMatrix(int rows, int columns, Vector[] vectors, boolean shallowCopy, boolean randomAccess) {
- super(rows, columns);
- this.randomAccessRows = randomAccess;
- this.rowVectors = vectors.clone();
- for (int row = 0; row < rows; row++) {
- Vector rowVector = vectors[row];
- if (rowVector == null) {
- // create a fresh vector instead of mutating the caller's array
- rowVector = randomAccess
- ? new RandomAccessSparseVector(numCols(), 10)
- : new SequentialAccessSparseVector(numCols(), 10);
- }
- this.rowVectors[row] = shallowCopy ? rowVector : rowVector.clone();
- }
- }
-
- /**
- * Construct a matrix of the given cardinality, with rows defaulting to RandomAccessSparseVector
- * implementation
- *
- * @param rows Number of rows in result
- * @param columns Number of columns in result
- */
- public SparseRowMatrix(int rows, int columns) {
- this(rows, columns, true);
- }
-
- @Override
- public Matrix clone() {
- SparseRowMatrix clone = (SparseRowMatrix) super.clone();
- clone.rowVectors = new Vector[rowVectors.length];
- for (int i = 0; i < rowVectors.length; i++) {
- clone.rowVectors[i] = rowVectors[i].clone();
- }
- return clone;
- }
-
- @Override
- public double getQuick(int row, int column) {
- return rowVectors[row] == null ? 0.0 : rowVectors[row].getQuick(column);
- }
-
- @Override
- public Matrix like() {
- return new SparseRowMatrix(rowSize(), columnSize(), randomAccessRows);
- }
-
- @Override
- public Matrix like(int rows, int columns) {
- return new SparseRowMatrix(rows, columns, randomAccessRows);
- }
-
- @Override
- public void setQuick(int row, int column, double value) {
- rowVectors[row].setQuick(column, value);
- }
-
- @Override
- public int[] getNumNondefaultElements() {
- int[] result = new int[2];
- result[ROW] = rowVectors.length;
- for (int row = 0; row < rowSize(); row++) {
- result[COL] = Math.max(result[COL], rowVectors[row].getNumNondefaultElements());
- }
- return result;
- }
-
- @Override
- public Matrix viewPart(int[] offset, int[] size) {
- if (offset[ROW] < 0) {
- throw new IndexException(offset[ROW], rowVectors.length);
- }
- if (offset[ROW] + size[ROW] > rowVectors.length) {
- throw new IndexException(offset[ROW] + size[ROW], rowVectors.length);
- }
- if (offset[COL] < 0) {
- throw new IndexException(offset[COL], rowVectors[ROW].size());
- }
- if (offset[COL] + size[COL] > rowVectors[ROW].size()) {
- throw new IndexException(offset[COL] + size[COL], rowVectors[ROW].size());
- }
- return new MatrixView(this, offset, size);
- }
-
- @Override
- public Matrix assign(Matrix other, DoubleDoubleFunction function) {
- int rows = rowSize();
- if (rows != other.rowSize()) {
- throw new CardinalityException(rows, other.rowSize());
- }
- int columns = columnSize();
- if (columns != other.columnSize()) {
- throw new CardinalityException(columns, other.columnSize());
- }
- for (int row = 0; row < rows; row++) {
- try {
- Iterator<Vector.Element> sparseRowIterator = ((SequentialAccessSparseVector) this.rowVectors[row])
- .iterateNonZero();
- if (function.isLikeMult()) { // TODO: is this a sufficient test?
- // TODO: this may throw if the row type is not compatible, but rows are currently guaranteed to be
- // SequentialAccessSparseVectors, so we "try" here just in case and warn on failure
- // TODO: can we use iterateNonZero on both rows until the index is the same to get better speedup?
-
- // TODO: SASVs have an iterateNonZero that returns zeros; this should not hurt but is far from optimal.
- // This might perform much better if SparseRowMatrix were backed by RandomAccessSparseVectors, which
- // are backed by fastutil hashmaps whose iterateNonZero really does return only nonzeros.
- while (sparseRowIterator.hasNext()) {
- Vector.Element element = sparseRowIterator.next();
- int col = element.index();
- setQuick(row, col, function.apply(element.get(), other.getQuick(row, col)));
- }
- } else {
- for (int col = 0; col < columns; col++) {
- setQuick(row, col, function.apply(getQuick(row, col), other.getQuick(row, col)));
- }
- }
-
- } catch (ClassCastException e) {
- // Warn and use default implementation
- log.warn("Error casting the row to SequentialAccessSparseVector, this should never happen because" +
- "SparseRomMatrix is always made of SequentialAccessSparseVectors. Proceeding with non-optimzed" +
- "implementation.");
- for (int col = 0; col < columns; col++) {
- setQuick(row, col, function.apply(getQuick(row, col), other.getQuick(row, col)));
- }
- }
- }
- return this;
- }
-
- @Override
- public Matrix assignColumn(int column, Vector other) {
- if (rowSize() != other.size()) {
- throw new CardinalityException(rowSize(), other.size());
- }
- if (column < 0 || column >= columnSize()) {
- throw new IndexException(column, columnSize());
- }
- for (int row = 0; row < rowSize(); row++) {
- rowVectors[row].setQuick(column, other.getQuick(row));
- }
- return this;
- }
-
- @Override
- public Matrix assignRow(int row, Vector other) {
- if (columnSize() != other.size()) {
- throw new CardinalityException(columnSize(), other.size());
- }
- if (row < 0 || row >= rowSize()) {
- throw new IndexException(row, rowSize());
- }
- rowVectors[row].assign(other);
- return this;
- }
-
- /**
- * @param row an int row index
- * @return a shallow view of the Vector at specified row (ie you may mutate the original matrix
- * using this row)
- */
- @Override
- public Vector viewRow(int row) {
- if (row < 0 || row >= rowSize()) {
- throw new IndexException(row, rowSize());
- }
- return rowVectors[row];
- }
-
- @Override
- public Matrix transpose() {
- SparseColumnMatrix scm = new SparseColumnMatrix(columns, rows);
- for (int i = 0; i < rows; i++) {
- Vector row = rowVectors[i];
- if (row.getNumNonZeroElements() > 0) {
- scm.assignColumn(i, row);
- }
- }
- return scm;
- }
-
- @Override
- public Matrix times(Matrix other) {
- if (columnSize() != other.rowSize()) {
- throw new CardinalityException(columnSize(), other.rowSize());
- }
-
- if (other instanceof SparseRowMatrix) {
- SparseRowMatrix y = (SparseRowMatrix) other;
- SparseRowMatrix result = (SparseRowMatrix) like(rowSize(), other.columnSize());
-
- for (int i = 0; i < rows; i++) {
- Vector row = rowVectors[i];
- for (Vector.Element element : row.nonZeroes()) {
- result.rowVectors[i].assign(y.rowVectors[element.index()], Functions.plusMult(element.get()));
- }
- }
- return result;
- } else {
- if (other.viewRow(0).isDense()) {
- // result is dense, but can be computed relatively cheaply
- Matrix result = other.like(rowSize(), other.columnSize());
-
- for (int i = 0; i < rows; i++) {
- Vector row = rowVectors[i];
- Vector r = new DenseVector(other.columnSize());
- for (Vector.Element element : row.nonZeroes()) {
- r.assign(other.viewRow(element.index()), Functions.plusMult(element.get()));
- }
- result.viewRow(i).assign(r);
- }
- return result;
- } else {
- // other is sparse, but not something we understand intimately
- SparseRowMatrix result = (SparseRowMatrix) like(rowSize(), other.columnSize());
-
- for (int i = 0; i < rows; i++) {
- Vector row = rowVectors[i];
- for (Vector.Element element : row.nonZeroes()) {
- result.rowVectors[i].assign(other.viewRow(element.index()), Functions.plusMult(element.get()));
- }
- }
- return result;
- }
- }
- }
-
- @Override
- public MatrixFlavor getFlavor() {
- return MatrixFlavor.SPARSELIKE;
- }
-}
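
The mult-like shortcut in assign() above is sound because f(0, y) = 0 for such functions: entries that are zero in this sparse receiver stay zero whatever the other matrix holds, so only this row's nonzeros need visiting. A minimal sketch using Functions.MULT, which is mult-like (the class name is illustrative; randomAccess=false keeps rows as SequentialAccessSparseVectors so the fast path applies):

    import org.apache.mahout.math.DenseMatrix;
    import org.apache.mahout.math.Matrix;
    import org.apache.mahout.math.SparseRowMatrix;
    import org.apache.mahout.math.function.Functions;

    // Sketch: element-wise multiply touches only the receiver's nonzeros.
    final class MultLikeSketch {
      public static void main(String[] args) {
        Matrix a = new SparseRowMatrix(2, 2, false);
        a.setQuick(0, 0, 3.0);
        Matrix b = new DenseMatrix(2, 2).assign(2.0);
        a.assign(b, Functions.MULT);             // only (0,0) needs work
        System.out.println(a.getQuick(0, 0));    // 6.0; all other cells stay 0
      }
    }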

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/Swapper.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/Swapper.java b/math/src/main/java/org/apache/mahout/math/Swapper.java
deleted file mode 100644
index 1ca3744..0000000
--- a/math/src/main/java/org/apache/mahout/math/Swapper.java
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
-Copyright 1999 CERN - European Organization for Nuclear Research.
-Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
-is hereby granted without fee, provided that the above copyright notice appear in all copies and
-that both that copyright notice and this permission notice appear in supporting documentation.
-CERN makes no representations about the suitability of this software for any purpose.
-It is provided "as is" without expressed or implied warranty.
-*/
-package org.apache.mahout.math;
-
-/**
- * Interface for an object that knows how to swap elements at two positions (a,b).
- */
-public interface Swapper {
-
- /** Swaps the generic data g[a] with g[b]. */
- void swap(int a, int b);
-}
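
A typical implementation swaps entries of an application-owned array, which index-based sort helpers can then drive alongside a comparator. A minimal sketch (the class name is illustrative):

    import org.apache.mahout.math.Swapper;

    // Sketch: a Swapper over a plain int[].
    final class IntArraySwapper implements Swapper {
      private final int[] data;

      IntArraySwapper(int[] data) {
        this.data = data;
      }

      @Override
      public void swap(int a, int b) {
        int tmp = data[a];
        data[a] = data[b];
        data[b] = tmp;
      }
    }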

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/TransposedMatrixView.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/TransposedMatrixView.java b/math/src/main/java/org/apache/mahout/math/TransposedMatrixView.java
deleted file mode 100644
index ede6f35..0000000
--- a/math/src/main/java/org/apache/mahout/math/TransposedMatrixView.java
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math;
-
-import org.apache.mahout.math.flavor.BackEnum;
-import org.apache.mahout.math.flavor.MatrixFlavor;
-import org.apache.mahout.math.flavor.TraversingStructureEnum;
-import org.apache.mahout.math.function.DoubleDoubleFunction;
-import org.apache.mahout.math.function.DoubleFunction;
-
-/**
- * Matrix View backed by an {@link org.apache.mahout.math.function.IntIntFunction}
- */
-public class TransposedMatrixView extends AbstractMatrix {
-
- private Matrix m;
-
- public TransposedMatrixView(Matrix m) {
- super(m.numCols(), m.numRows());
- this.m = m;
- }
-
- @Override
- public Matrix assignColumn(int column, Vector other) {
- m.assignRow(column,other);
- return this;
- }
-
- @Override
- public Matrix assignRow(int row, Vector other) {
- m.assignColumn(row,other);
- return this;
- }
-
- @Override
- public double getQuick(int row, int column) {
- return m.getQuick(column,row);
- }
-
- @Override
- public Matrix like() {
- return m.like(rows, columns);
- }
-
- @Override
- public Matrix like(int rows, int columns) {
- return m.like(rows,columns);
- }
-
- @Override
- public void setQuick(int row, int column, double value) {
- m.setQuick(column, row, value);
- }
-
- @Override
- public Vector viewRow(int row) {
- return m.viewColumn(row);
- }
-
- @Override
- public Vector viewColumn(int column) {
- return m.viewRow(column);
- }
-
- @Override
- public Matrix assign(double value) {
- return m.assign(value);
- }
-
- @Override
- public Matrix assign(Matrix other, DoubleDoubleFunction function) {
- if (other instanceof TransposedMatrixView) {
- m.assign(((TransposedMatrixView) other).m, function);
- } else {
- m.assign(new TransposedMatrixView(other), function);
- }
- return this;
- }
-
- @Override
- public Matrix assign(Matrix other) {
- if (other instanceof TransposedMatrixView) {
- return m.assign(((TransposedMatrixView) other).m);
- } else {
- return m.assign(new TransposedMatrixView(other));
- }
- }
-
- @Override
- public Matrix assign(DoubleFunction function) {
- return m.assign(function);
- }
-
- @Override
- public MatrixFlavor getFlavor() {
- return flavor;
- }
-
- private MatrixFlavor flavor = new MatrixFlavor() {
- @Override
- public BackEnum getBacking() {
- return m.getFlavor().getBacking();
- }
-
- @Override
- public TraversingStructureEnum getStructure() {
- TraversingStructureEnum flavor = m.getFlavor().getStructure();
- switch (flavor) {
- case COLWISE:
- return TraversingStructureEnum.ROWWISE;
- case SPARSECOLWISE:
- return TraversingStructureEnum.SPARSEROWWISE;
- case ROWWISE:
- return TraversingStructureEnum.COLWISE;
- case SPARSEROWWISE:
- return TraversingStructureEnum.SPARSECOLWISE;
- default:
- return flavor;
- }
- }
-
- @Override
- public boolean isDense() {
- return m.getFlavor().isDense();
- }
- };
-
- Matrix getDelegate() {
- return m;
- }
-
-}
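
Because the view only swaps row and column indexing, it copies nothing: reads and writes through the view go straight to the backing matrix. A minimal sketch (the class name is illustrative):

    import org.apache.mahout.math.DenseMatrix;
    import org.apache.mahout.math.Matrix;
    import org.apache.mahout.math.TransposedMatrixView;

    // Sketch: mutations through the transposed view land in the backing matrix.
    final class TransposedViewSketch {
      public static void main(String[] args) {
        Matrix m = new DenseMatrix(2, 3);
        Matrix t = new TransposedMatrixView(m);  // a 3 x 2 view of m
        t.setQuick(2, 1, 5.0);
        System.out.println(m.getQuick(1, 2));    // 5.0 -- same storage
      }
    }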

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/UpperTriangular.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/UpperTriangular.java b/math/src/main/java/org/apache/mahout/math/UpperTriangular.java
deleted file mode 100644
index 29fa6a0..0000000
--- a/math/src/main/java/org/apache/mahout/math/UpperTriangular.java
+++ /dev/null
@@ -1,160 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math;
-
-import org.apache.mahout.math.flavor.BackEnum;
-import org.apache.mahout.math.flavor.MatrixFlavor;
-import org.apache.mahout.math.flavor.TraversingStructureEnum;
-
-/**
- *
- * Quick and dirty implementation of some {@link org.apache.mahout.math.Matrix} methods
- * over a packed upper triangular matrix.
- *
- */
-public class UpperTriangular extends AbstractMatrix {
-
- // values with absolute value below this are treated as 0 during non-upper assignments
- private static final double EPSILON = 1.0e-12;
-
- private double[] values;
-
- /**
- * Constructs an n x n upper triangular matrix.
- *
- * @param n the number of rows and columns
- */
- public UpperTriangular(int n) {
- super(n, n);
- values = new double[n * (n + 1) / 2];
- }
-
- public UpperTriangular(double[] data, boolean shallow) {
- this(elementsToMatrixSize(data != null ? data.length : 0));
- if (data == null) {
- throw new IllegalArgumentException("data");
- }
- values = shallow ? data : data.clone();
- }
-
- public UpperTriangular(Vector data) {
- this(elementsToMatrixSize(data.size()));
-
- for (Vector.Element el:data.nonZeroes()) {
- values[el.index()] = el.get();
- }
- }
-
- private static int elementsToMatrixSize(int dataSize) {
- return (int) Math.round((-1 + Math.sqrt(1 + 8 * dataSize)) / 2);
- }
-
- // copy-constructor
- public UpperTriangular(UpperTriangular mx) {
- this(mx.values, false);
- }
-
- @Override
- public Matrix assignColumn(int column, Vector other) {
- if (columnSize() != other.size()) {
- throw new IndexException(columnSize(), other.size());
- }
- if (other.viewPart(column + 1, other.size() - column - 1).norm(1) > 1.0e-14) {
- throw new IllegalArgumentException("Cannot set lower portion of triangular matrix to non-zero");
- }
- for (Vector.Element element : other.viewPart(0, column).all()) {
- setQuick(element.index(), column, element.get());
- }
- return this;
- }
-
- @Override
- public Matrix assignRow(int row, Vector other) {
- if (columnSize() != other.size()) {
- throw new IndexException(numCols(), other.size());
- }
- for (int i = 0; i < row; i++) {
- if (Math.abs(other.getQuick(i)) > EPSILON) {
- throw new IllegalArgumentException("non-triangular source");
- }
- }
- for (int i = row; i < rows; i++) {
- setQuick(row, i, other.get(i));
- }
- return this;
- }
-
- public Matrix assignNonZeroElementsInRow(int row, double[] other) {
- System.arraycopy(other, row, values, getL(row, row), rows - row);
- return this;
- }
-
- @Override
- public double getQuick(int row, int column) {
- if (row > column) {
- return 0;
- }
- int i = getL(row, column);
- return values[i];
- }
-
- private int getL(int row, int col) {
- /*
- * each row starts with some zero elements that we don't store. this
- * accumulates an offset of (row+1)*row/2
- */
- return col + row * numCols() - (row + 1) * row / 2;
- }
-
- @Override
- public Matrix like() {
- return like(rowSize(), columnSize());
- }
-
- @Override
- public Matrix like(int rows, int columns) {
- return new DenseMatrix(rows, columns);
- }
-
- @Override
- public void setQuick(int row, int column, double value) {
- values[getL(row, column)] = value;
- }
-
- @Override
- public int[] getNumNondefaultElements() {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public Matrix viewPart(int[] offset, int[] size) {
- return new MatrixView(this, offset, size);
- }
-
- public double[] getData() {
- return values;
- }
-
- @Override
- public MatrixFlavor getFlavor() {
- // We kind of consider ourselves a vector-backed but dense matrix for mmul, etc. purposes.
- return new MatrixFlavor.FlavorImpl(BackEnum.JVMMEM, TraversingStructureEnum.VECTORBACKED, true);
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/Vector.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/Vector.java b/math/src/main/java/org/apache/mahout/math/Vector.java
deleted file mode 100644
index c3b1dc9..0000000
--- a/math/src/main/java/org/apache/mahout/math/Vector.java
+++ /dev/null
@@ -1,434 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math;
-
-
-import org.apache.mahout.math.function.DoubleDoubleFunction;
-import org.apache.mahout.math.function.DoubleFunction;
-
-/**
- * The basic interface including numerous convenience functions <p> NOTE: All implementing classes must have a
- * constructor that takes an int for cardinality and a no-arg constructor that can be used for marshalling the Writable
- * instance <p> NOTE: Implementations may choose to reuse the Vector.Element in the Iterable methods
- */
-public interface Vector extends Cloneable {
-
- /** @return a formatted String suitable for output */
- String asFormatString();
-
- /**
- * Assign the value to all elements of the receiver
- *
- * @param value a double value
- * @return the modified receiver
- */
- Vector assign(double value);
-
- /**
- * Assign the values to the receiver
- *
- * @param values a double[] of values
- * @return the modified receiver
- * @throws CardinalityException if the cardinalities differ
- */
- Vector assign(double[] values);
-
- /**
- * Assign the other vector values to the receiver
- *
- * @param other a Vector
- * @return the modified receiver
- * @throws CardinalityException if the cardinalities differ
- */
- Vector assign(Vector other);
-
- /**
- * Apply the function to each element of the receiver
- *
- * @param function a DoubleFunction to apply
- * @return the modified receiver
- */
- Vector assign(DoubleFunction function);
-
- /**
- * Apply the function to each element of the receiver and the corresponding element of the other argument
- *
- * @param other a Vector containing the second arguments to the function
- * @param function a DoubleDoubleFunction to apply
- * @return the modified receiver
- * @throws CardinalityException if the cardinalities differ
- */
- Vector assign(Vector other, DoubleDoubleFunction function);
-
- /**
- * Apply the function to each element of the receiver, using the y value as the second argument of the
- * DoubleDoubleFunction
- *
- * @param f a DoubleDoubleFunction to be applied
- * @param y a double value to be argument to the function
- * @return the modified receiver
- */
- Vector assign(DoubleDoubleFunction f, double y);
-
- /**
- * Return the cardinality of the recipient (the maximum number of values)
- *
- * @return an int
- */
- int size();
-
- /**
- * true if this implementation should be considered dense -- that is, it explicitly
- * represents every value
- *
- * @return true or false
- */
- boolean isDense();
-
- /**
- * true if this implementation should be considered to be iterable in index order in an efficient way.
- * In particular this implies that {@link #all()} and {@link #nonZeroes()} return elements
- * in ascending order by index.
- *
- * @return true iff this implementation should be considered to be iterable in index order in an efficient way.
- */
- boolean isSequentialAccess();
-
- /**
- * Return a copy of the recipient
- *
- * @return a new Vector
- */
- @SuppressWarnings("CloneDoesntDeclareCloneNotSupportedException")
- Vector clone();
-
- Iterable<Element> all();
-
- Iterable<Element> nonZeroes();
-
- /**
- * Return an object of Vector.Element representing an element of this Vector. Useful when designing new iterator
- * types.
- *
- * @param index Index of the Vector.Element required
- * @return The Vector.Element Object
- */
- Element getElement(int index);
-
- /**
- * Merge a set of (index, value) pairs into the vector.
- * @param updates an ordered mapping of indices to values to be merged in.
- */
- void mergeUpdates(OrderedIntDoubleMapping updates);
-
- /**
- * A holder for information about a specific item in the Vector. <p>
- * When using with an Iterator, the implementation
- * may choose to reuse this element, so you may need to make a copy if you want to keep it
- */
- interface Element {
-
- /** @return the value of this vector element. */
- double get();
-
- /** @return the index of this vector element. */
- int index();
-
- /** @param value Set the current element to value. */
- void set(double value);
- }
-
- /**
- * Return a new vector containing the values of the recipient divided by the argument
- *
- * @param x a double value
- * @return a new Vector
- */
- Vector divide(double x);
-
- /**
- * Return the dot product of the recipient and the argument
- *
- * @param x a Vector
- * @return the dot product, a double
- * @throws CardinalityException if the cardinalities differ
- */
- double dot(Vector x);
-
- /**
- * Return the value at the given index
- *
- * @param index an int index
- * @return the double at the index
- * @throws IndexException if the index is out of bounds
- */
- double get(int index);
-
- /**
- * Return the value at the given index, without checking bounds
- *
- * @param index an int index
- * @return the double at the index
- */
- double getQuick(int index);
-
- /**
- * Return an empty vector of the same underlying class as the receiver
- *
- * @return a Vector
- */
- Vector like();
-
- /**
- * Return a new empty vector of the same underlying class as the receiver with given cardinality
- *
- * @param cardinality the cardinality (size) of the new vector
- * @return {@link Vector}
- */
- Vector like(int cardinality);
-
- /**
- * Return a new vector containing the element by element difference of the recipient and the argument
- *
- * @param x a Vector
- * @return a new Vector
- * @throws CardinalityException if the cardinalities differ
- */
- Vector minus(Vector x);
-
- /**
- * Return a new vector containing the normalized (L_2 norm) values of the recipient
- *
- * @return a new Vector
- */
- Vector normalize();
-
- /**
- * Return a new Vector containing the normalized (L_power norm) values of the recipient. <p>
- * See
- * http://en.wikipedia.org/wiki/Lp_space <p>
- * Technically, when {@code 0 < power < 1}, we don't have a norm, just a metric,
- * but we'll overload this here. <p>
- * Also supports {@code power == 0} (number of non-zero elements) and power = {@link
- * Double#POSITIVE_INFINITY} (max element). Again, see the Wikipedia page for more info
- *
- * @param power The power to use. Must be >= 0. May also be {@link Double#POSITIVE_INFINITY}. See the Wikipedia link
- * for more on this.
- * @return a new Vector x such that norm(x, power) == 1
- */
- Vector normalize(double power);
-
- /**
- * Return a new vector containing the log(1 + entry)/ L_2 norm values of the recipient
- *
- * @return a new Vector
- */
- Vector logNormalize();
-
- /**
- * Return a new Vector with a normalized value calculated as log_power(1 + entry)/ L_power norm. <p>
- *
- * @param power The power to use. Must be > 1. Cannot be {@link Double#POSITIVE_INFINITY}.
- * @return a new Vector
- */
- Vector logNormalize(double power);
-
- /**
- * Return the L_power norm of the vector. <p/> See http://en.wikipedia.org/wiki/Lp_space <p>
- * Technically, when {@code 0 < power < 1}, we don't have a norm, just a metric, but we'll overload this here.
- * Also supports power == 0 (number of non-zero elements) and power = {@link Double#POSITIVE_INFINITY} (max
- * element). Again, see the Wikipedia page for more info.
- *
- * @param power The power to use.
- * @see #normalize(double)
- */
- double norm(double power);
-
- /** @return The minimum value in the Vector */
- double minValue();
-
- /** @return The index of the minimum value */
- int minValueIndex();
-
- /** @return The maximum value in the Vector */
- double maxValue();
-
- /** @return The index of the maximum value */
- int maxValueIndex();
-
- /**
- * Return a new vector containing the sum of each value of the recipient and the argument
- *
- * @param x a double
- * @return a new Vector
- */
- Vector plus(double x);
-
- /**
- * Return a new vector containing the element by element sum of the recipient and the argument
- *
- * @param x a Vector
- * @return a new Vector
- * @throws CardinalityException if the cardinalities differ
- */
- Vector plus(Vector x);
-
- /**
- * Set the value at the given index
- *
- * @param index an int index into the receiver
- * @param value a double value to set
- * @throws IndexException if the index is out of bounds
- */
- void set(int index, double value);
-
- /**
- * Set the value at the given index, without checking bounds
- *
- * @param index an int index into the receiver
- * @param value a double value to set
- */
- void setQuick(int index, double value);
-
- /**
- * Increment the value at the given index by the given value.
- *
- * @param index an int index into the receiver
- * @param increment the amount to add to the value at the given index
- */
- void incrementQuick(int index, double increment);
-
- /**
- * Return the number of values in the recipient which are not the default value. For instance, for a
- * sparse vector, this would be the number of non-zero values.
- *
- * @return an int
- */
- int getNumNondefaultElements();
-
- /**
- * Return the number of non zero elements in the vector.
- *
- * @return an int
- */
- int getNumNonZeroElements();
-
- /**
- * Return a new vector containing the product of each value of the recipient and the argument
- *
- * @param x a double argument
- * @return a new Vector
- */
- Vector times(double x);
-
- /**
- * Return a new vector containing the element-wise product of the recipient and the argument
- *
- * @param x a Vector argument
- * @return a new Vector
- * @throws CardinalityException if the cardinalities differ
- */
- Vector times(Vector x);
-
- /**
- * Return a new vector containing the subset of the recipient
- *
- * @param offset an int offset into the receiver
- * @param length the cardinality of the desired result
- * @return a new Vector
- * @throws CardinalityException if the length is greater than the cardinality of the receiver
- * @throws IndexException if the offset is negative or the offset+length is outside of the receiver
- */
- Vector viewPart(int offset, int length);
-
- /**
- * Return the sum of all the elements of the receiver
- *
- * @return a double
- */
- double zSum();
-
- /**
- * Return the cross product of the receiver and the other vector
- *
- * @param other another Vector
- * @return a Matrix
- */
- Matrix cross(Vector other);
-
- /*
- * Need stories for these but keeping them here for now.
- */
- // void getNonZeros(IntArrayList jx, DoubleArrayList values);
- // void foreachNonZero(IntDoubleFunction f);
- // DoubleDoubleFunction map);
- // NewVector assign(Vector y, DoubleDoubleFunction function, IntArrayList
- // nonZeroIndexes);
-
- /**
- * Examples speak louder than words: aggregate(plus, pow(2)) is another way to say
- * getLengthSquared(), aggregate(max, abs) is norm(Double.POSITIVE_INFINITY). To sum all of the positive values,
- * aggregate(plus, max(0)).
- * @param aggregator used to combine the current value of the aggregation with the result of map.apply(nextValue)
- * @param map a function to apply to each element of the vector in turn before passing to the aggregator
- * @return the final aggregation
- */
- double aggregate(DoubleDoubleFunction aggregator, DoubleFunction map);
-
- /**
- * <p>Generalized inner product - take two vectors, iterate over them both, using the combiner to combine together
- * (and possibly map in some way) each pair of values, which are then aggregated with the previous accumulated
- * value in the combiner.</p>
- * <p>
- * Example: dot(other) could be expressed as aggregate(other, Plus, Times), and kernelized inner products (which
- * are symmetric on the indices) work similarly.
- * @param other a vector to aggregate in combination with
- * @param aggregator function we're aggregating with; fa
- * @param combiner function we're combining with; fc
- * @return the final aggregation; {@code if r0 = fc(this[0], other[0]), ri = fa(r_{i-1}, fc(this[i], other[i]))
- * for all i > 0}
- */
- double aggregate(Vector other, DoubleDoubleFunction aggregator, DoubleDoubleFunction combiner);
-
- /**
- * Return the sum of squares of all elements in the vector. Square root of
- * this value is the length of the vector.
- */
- double getLengthSquared();
-
- /**
- * Get the square of the distance between this vector and the other vector.
- */
- double getDistanceSquared(Vector v);
-
- /**
- * Gets an estimate of the cost (in number of operations) it takes to lookup a random element in this vector.
- */
- double getLookupCost();
-
- /**
- * Gets an estimate of the cost (in number of operations) it takes to advance an iterator through the nonzero
- * elements of this vector.
- */
- double getIteratorAdvanceCost();
-
- /**
- * Return true iff adding a new (nonzero) element takes constant time for this vector.
- */
- boolean isAddConstantTime();
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/VectorBinaryAggregate.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/VectorBinaryAggregate.java b/math/src/main/java/org/apache/mahout/math/VectorBinaryAggregate.java
deleted file mode 100644
index 4d3a80f..0000000
--- a/math/src/main/java/org/apache/mahout/math/VectorBinaryAggregate.java
+++ /dev/null
@@ -1,481 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math;
-
-import org.apache.mahout.math.function.DoubleDoubleFunction;
-import org.apache.mahout.math.set.OpenIntHashSet;
-
-import java.util.Iterator;
-
-/**
- * Abstract class encapsulating different algorithms that perform the Vector operations aggregate().
- * x.aggregate(y, fa, fc), for x and y Vectors and fa, fc DoubleDouble functions:
- * - applies the function fc to every pair of corresponding elements of x and y, fc(xi, yi)
- * - constructs the result iteratively, r0 = fc(x0, y0), ri = fa(r_{i-1}, fc(xi, yi)).
- * This works essentially like a map/reduce functional combo.
- *
- * The names of variables, methods and classes used here follow the following conventions:
- * The vector being assigned to (the left hand side) is called this or x.
- * The right hand side is called that or y.
- * The aggregating (reducing) function to be applied is called fa.
- * The combining (mapping) function to be applied is called fc.
- *
- * The different algorithms take into account the different characteristics of vector classes:
- * - whether the vectors support sequential iteration (isSequential())
- * - what the lookup cost is (getLookupCost())
- * - what the iterator advancement cost is (getIteratorAdvanceCost())
- *
- * The names of the actual classes (they're nested in VectorBinaryAggregate) describe the strategy used for
- * aggregation.
- * The most important optimization is iterating just through the nonzeros (only possible if f(0, 0) = 0).
- * There are 4 main possibilities:
- * - iterating through the nonzeros of just one vector and looking up the corresponding elements in the other
- * - iterating through the intersection of nonzeros (those indices where both vectors have nonzero values)
- * - iterating through the union of nonzeros (those indices where at least one of the vectors has a nonzero value)
- * - iterating through all the elements in some way (either through both at the same time, both one after the other,
- * looking up both, looking up just one).
- *
- * The internal details are not important and a particular algorithm should generally not be called explicitly.
- * The best one will be selected through aggregateBest(), which is itself called through Vector.aggregate().
- *
- * See https://docs.google.com/document/d/1g1PjUuvjyh2LBdq2_rKLIcUiDbeOORA1sCJiSsz-JVU/edit# for a more detailed
- * explanation.
- */
-public abstract class VectorBinaryAggregate {
- public static final VectorBinaryAggregate[] OPERATIONS = {
- new AggregateNonzerosIterateThisLookupThat(),
- new AggregateNonzerosIterateThatLookupThis(),
-
- new AggregateIterateIntersection(),
-
- new AggregateIterateUnionSequential(),
- new AggregateIterateUnionRandom(),
-
- new AggregateAllIterateSequential(),
- new AggregateAllIterateThisLookupThat(),
- new AggregateAllIterateThatLookupThis(),
- new AggregateAllLoop(),
- };
-
- /**
- * Returns true iff we can use this algorithm to apply fc to x and y component-wise and aggregate the result using fa.
- */
- public abstract boolean isValid(Vector x, Vector y, DoubleDoubleFunction fa, DoubleDoubleFunction fc);
-
- /**
- * Estimates the cost of using this algorithm to compute the aggregation. The algorithm is assumed to be valid.
- */
- public abstract double estimateCost(Vector x, Vector y, DoubleDoubleFunction fa, DoubleDoubleFunction fc);
-
- /**
- * Main method that applies fc to x and y component-wise aggregating the results with fa. It returns the result of
- * the aggregation.
- */
- public abstract double aggregate(Vector x, Vector y, DoubleDoubleFunction fa, DoubleDoubleFunction fc);
-
- /**
- * The best operation is the least expensive valid one.
- */
- public static VectorBinaryAggregate getBestOperation(Vector x, Vector y, DoubleDoubleFunction fa,
- DoubleDoubleFunction fc) {
- int bestOperationIndex = -1;
- double bestCost = Double.POSITIVE_INFINITY;
- for (int i = 0; i < OPERATIONS.length; ++i) {
- if (OPERATIONS[i].isValid(x, y, fa, fc)) {
- double cost = OPERATIONS[i].estimateCost(x, y, fa, fc);
- if (cost < bestCost) {
- bestCost = cost;
- bestOperationIndex = i;
- }
- }
- }
- return OPERATIONS[bestOperationIndex];
- }
-
- /**
- * This is the method that should be used when aggregating. It selects the best algorithm and applies it.
- */
- public static double aggregateBest(Vector x, Vector y, DoubleDoubleFunction fa, DoubleDoubleFunction fc) {
- return getBestOperation(x, y, fa, fc).aggregate(x, y, fa, fc);
- }
-
- public static class AggregateNonzerosIterateThisLookupThat extends VectorBinaryAggregate {
-
- @Override
- public boolean isValid(Vector x, Vector y, DoubleDoubleFunction fa, DoubleDoubleFunction fc) {
- return fa.isLikeRightPlus() && (fa.isAssociativeAndCommutative() || x.isSequentialAccess())
- && fc.isLikeLeftMult();
- }
-
- @Override
- public double estimateCost(Vector x, Vector y, DoubleDoubleFunction fa, DoubleDoubleFunction fc) {
- return x.getNumNondefaultElements() * x.getIteratorAdvanceCost() * y.getLookupCost();
- }
-
- @Override
- public double aggregate(Vector x, Vector y, DoubleDoubleFunction fa, DoubleDoubleFunction fc) {
- Iterator<Vector.Element> xi = x.nonZeroes().iterator();
- if (!xi.hasNext()) {
- return 0;
- }
- Vector.Element xe = xi.next();
- double result = fc.apply(xe.get(), y.getQuick(xe.index()));
- while (xi.hasNext()) {
- xe = xi.next();
- result = fa.apply(result, fc.apply(xe.get(), y.getQuick(xe.index())));
- }
- return result;
- }
- }
-
- public static class AggregateNonzerosIterateThatLookupThis extends VectorBinaryAggregate {
-
- @Override
- public boolean isValid(Vector x, Vector y, DoubleDoubleFunction fa, DoubleDoubleFunction fc) {
- return fa.isLikeRightPlus() && (fa.isAssociativeAndCommutative() || y.isSequentialAccess())
- && fc.isLikeRightMult();
- }
-
- @Override
- public double estimateCost(Vector x, Vector y, DoubleDoubleFunction fa, DoubleDoubleFunction fc) {
- return y.getNumNondefaultElements() * y.getIteratorAdvanceCost() * x.getLookupCost();
- }
-
- @Override
- public double aggregate(Vector x, Vector y, DoubleDoubleFunction fa, DoubleDoubleFunction fc) {
- Iterator<Vector.Element> yi = y.nonZeroes().iterator();
- if (!yi.hasNext()) {
- return 0;
- }
- Vector.Element ye = yi.next();
- double result = fc.apply(x.getQuick(ye.index()), ye.get());
- while (yi.hasNext()) {
- ye = yi.next();
- result = fa.apply(result, fc.apply(x.getQuick(ye.index()), ye.get()));
- }
- return result;
- }
- }
-
- public static class AggregateIterateIntersection extends VectorBinaryAggregate {
-
- @Override
- public boolean isValid(Vector x, Vector y, DoubleDoubleFunction fa, DoubleDoubleFunction fc) {
- return fa.isLikeRightPlus() && fc.isLikeMult() && x.isSequentialAccess() && y.isSequentialAccess();
- }
-
- @Override
- public double estimateCost(Vector x, Vector y, DoubleDoubleFunction fa, DoubleDoubleFunction fc) {
- return Math.min(x.getNumNondefaultElements() * x.getIteratorAdvanceCost(),
- y.getNumNondefaultElements() * y.getIteratorAdvanceCost());
- }
-
- @Override
- public double aggregate(Vector x, Vector y, DoubleDoubleFunction fa, DoubleDoubleFunction fc) {
- Iterator<Vector.Element> xi = x.nonZeroes().iterator();
- Iterator<Vector.Element> yi = y.nonZeroes().iterator();
- Vector.Element xe = null;
- Vector.Element ye = null;
- boolean advanceThis = true;
- boolean advanceThat = true;
- boolean validResult = false;
- double result = 0;
- while (true) {
- if (advanceThis) {
- if (xi.hasNext()) {
- xe = xi.next();
- } else {
- break;
- }
- }
- if (advanceThat) {
- if (yi.hasNext()) {
- ye = yi.next();
- } else {
- break;
- }
- }
- if (xe.index() == ye.index()) {
- double thisResult = fc.apply(xe.get(), ye.get());
- if (validResult) {
- result = fa.apply(result, thisResult);
- } else {
- result = thisResult;
- validResult = true;
- }
- advanceThis = true;
- advanceThat = true;
- } else {
- if (xe.index() < ye.index()) { // f(x, 0) = 0
- advanceThis = true;
- advanceThat = false;
- } else { // f(0, y) = 0
- advanceThis = false;
- advanceThat = true;
- }
- }
- }
- return result;
- }
- }
-
- public static class AggregateIterateUnionSequential extends VectorBinaryAggregate {
-
- @Override
- public boolean isValid(Vector x, Vector y, DoubleDoubleFunction fa, DoubleDoubleFunction fc) {
- return fa.isLikeRightPlus() && !fc.isDensifying()
- && x.isSequentialAccess() && y.isSequentialAccess();
- }
-
- @Override
- public double estimateCost(Vector x, Vector y, DoubleDoubleFunction fa, DoubleDoubleFunction fc) {
- return Math.max(x.getNumNondefaultElements() * x.getIteratorAdvanceCost(),
- y.getNumNondefaultElements() * y.getIteratorAdvanceCost());
- }
-
- @Override
- public double aggregate(Vector x, Vector y, DoubleDoubleFunction fa, DoubleDoubleFunction fc) {
- Iterator<Vector.Element> xi = x.nonZeroes().iterator();
- Iterator<Vector.Element> yi = y.nonZeroes().iterator();
- Vector.Element xe = null;
- Vector.Element ye = null;
- boolean advanceThis = true;
- boolean advanceThat = true;
- boolean validResult = false;
- double result = 0;
- while (true) {
- if (advanceThis) {
- if (xi.hasNext()) {
- xe = xi.next();
- } else {
- xe = null;
- }
- }
- if (advanceThat) {
- if (yi.hasNext()) {
- ye = yi.next();
- } else {
- ye = null;
- }
- }
- double thisResult;
- if (xe != null && ye != null) { // both vectors have nonzero elements
- if (xe.index() == ye.index()) {
- thisResult = fc.apply(xe.get(), ye.get());
- advanceThis = true;
- advanceThat = true;
- } else {
- if (xe.index() < ye.index()) { // f(x, 0)
- thisResult = fc.apply(xe.get(), 0);
- advanceThis = true;
- advanceThat = false;
- } else {
- thisResult = fc.apply(0, ye.get());
- advanceThis = false;
- advanceThat = true;
- }
- }
- } else if (xe != null) { // just the first one still has nonzeros
- thisResult = fc.apply(xe.get(), 0);
- advanceThis = true;
- advanceThat = false;
- } else if (ye != null) { // just the second one has nonzeros
- thisResult = fc.apply(0, ye.get());
- advanceThis = false;
- advanceThat = true;
- } else { // we're done, both are empty
- break;
- }
- if (validResult) {
- result = fa.apply(result, thisResult);
- } else {
- result = thisResult;
- validResult = true;
- }
- }
- return result;
- }
- }
-
- public static class AggregateIterateUnionRandom extends VectorBinaryAggregate {
-
- @Override
- public boolean isValid(Vector x, Vector y, DoubleDoubleFunction fa, DoubleDoubleFunction fc) {
- return fa.isLikeRightPlus() && !fc.isDensifying()
- && (fa.isAssociativeAndCommutative() || (x.isSequentialAccess() && y.isSequentialAccess()));
- }
-
- @Override
- public double estimateCost(Vector x, Vector y, DoubleDoubleFunction fa, DoubleDoubleFunction fc) {
- return Math.max(x.getNumNondefaultElements() * x.getIteratorAdvanceCost() * y.getLookupCost(),
- y.getNumNondefaultElements() * y.getIteratorAdvanceCost() * x.getLookupCost());
- }
-
- @Override
- public double aggregate(Vector x, Vector y, DoubleDoubleFunction fa, DoubleDoubleFunction fc) {
- OpenIntHashSet visited = new OpenIntHashSet();
- Iterator<Vector.Element> xi = x.nonZeroes().iterator();
- boolean validResult = false;
- double result = 0;
- double thisResult;
- while (xi.hasNext()) {
- Vector.Element xe = xi.next();
- thisResult = fc.apply(xe.get(), y.getQuick(xe.index()));
- if (validResult) {
- result = fa.apply(result, thisResult);
- } else {
- result = thisResult;
- validResult = true;
- }
- visited.add(xe.index());
- }
- Iterator<Vector.Element> yi = y.nonZeroes().iterator();
- while (yi.hasNext()) {
- Vector.Element ye = yi.next();
- if (!visited.contains(ye.index())) {
- thisResult = fc.apply(x.getQuick(ye.index()), ye.get());
- if (validResult) {
- result = fa.apply(result, thisResult);
- } else {
- result = thisResult;
- validResult = true;
- }
- }
- }
- return result;
- }
- }
-
- public static class AggregateAllIterateSequential extends VectorBinaryAggregate {
-
- @Override
- public boolean isValid(Vector x, Vector y, DoubleDoubleFunction fa, DoubleDoubleFunction fc) {
- return x.isSequentialAccess() && y.isSequentialAccess() && !x.isDense() && !y.isDense();
- }
-
- @Override
- public double estimateCost(Vector x, Vector y, DoubleDoubleFunction fa, DoubleDoubleFunction fc) {
- return Math.max(x.size() * x.getIteratorAdvanceCost(), y.size() * y.getIteratorAdvanceCost());
- }
-
- @Override
- public double aggregate(Vector x, Vector y, DoubleDoubleFunction fa, DoubleDoubleFunction fc) {
- Iterator<Vector.Element> xi = x.all().iterator();
- Iterator<Vector.Element> yi = y.all().iterator();
- boolean validResult = false;
- double result = 0;
- while (xi.hasNext() && yi.hasNext()) {
- Vector.Element xe = xi.next();
- double thisResult = fc.apply(xe.get(), yi.next().get());
- if (validResult) {
- result = fa.apply(result, thisResult);
- } else {
- result = thisResult;
- validResult = true;
- }
- }
- return result;
- }
- }
-
- public static class AggregateAllIterateThisLookupThat extends VectorBinaryAggregate {
-
- @Override
- public boolean isValid(Vector x, Vector y, DoubleDoubleFunction fa, DoubleDoubleFunction fc) {
- return (fa.isAssociativeAndCommutative() || x.isSequentialAccess())
- && !x.isDense();
- }
-
- @Override
- public double estimateCost(Vector x, Vector y, DoubleDoubleFunction fa, DoubleDoubleFunction fc) {
- return x.size() * x.getIteratorAdvanceCost() * y.getLookupCost();
- }
-
- @Override
- public double aggregate(Vector x, Vector y, DoubleDoubleFunction fa, DoubleDoubleFunction fc) {
- Iterator<Vector.Element> xi = x.all().iterator();
- boolean validResult = false;
- double result = 0;
- while (xi.hasNext()) {
- Vector.Element xe = xi.next();
- double thisResult = fc.apply(xe.get(), y.getQuick(xe.index()));
- if (validResult) {
- result = fa.apply(result, thisResult);
- } else {
- result = thisResult;
- validResult = true;
- }
- }
- return result;
- }
- }
-
- public static class AggregateAllIterateThatLookupThis extends VectorBinaryAggregate {
-
- @Override
- public boolean isValid(Vector x, Vector y, DoubleDoubleFunction fa, DoubleDoubleFunction fc) {
- return (fa.isAssociativeAndCommutative() || y.isSequentialAccess())
- && !y.isDense();
- }
-
- @Override
- public double estimateCost(Vector x, Vector y, DoubleDoubleFunction fa, DoubleDoubleFunction fc) {
- return y.size() * y.getIteratorAdvanceCost() * x.getLookupCost();
- }
-
- @Override
- public double aggregate(Vector x, Vector y, DoubleDoubleFunction fa, DoubleDoubleFunction fc) {
- Iterator<Vector.Element> yi = y.all().iterator();
- boolean validResult = false;
- double result = 0;
- while (yi.hasNext()) {
- Vector.Element ye = yi.next();
- double thisResult = fc.apply(x.getQuick(ye.index()), ye.get());
- if (validResult) {
- result = fa.apply(result, thisResult);
- } else {
- result = thisResult;
- validResult = true;
- }
- }
- return result;
- }
- }
-
- public static class AggregateAllLoop extends VectorBinaryAggregate {
-
- @Override
- public boolean isValid(Vector x, Vector y, DoubleDoubleFunction fa, DoubleDoubleFunction fc) {
- return true;
- }
-
- @Override
- public double estimateCost(Vector x, Vector y, DoubleDoubleFunction fa, DoubleDoubleFunction fc) {
- return x.size() * x.getLookupCost() * y.getLookupCost();
- }
-
- @Override
- public double aggregate(Vector x, Vector y, DoubleDoubleFunction fa, DoubleDoubleFunction fc) {
- double result = fc.apply(x.getQuick(0), y.getQuick(0));
- int s = x.size();
- for (int i = 1; i < s; ++i) {
- result = fa.apply(result, fc.apply(x.getQuick(i), y.getQuick(i)));
- }
- return result;
- }
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/VectorBinaryAssign.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/VectorBinaryAssign.java b/math/src/main/java/org/apache/mahout/math/VectorBinaryAssign.java
deleted file mode 100644
index f24d552..0000000
--- a/math/src/main/java/org/apache/mahout/math/VectorBinaryAssign.java
+++ /dev/null
@@ -1,667 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math;
-
-import org.apache.mahout.math.Vector.Element;
-import org.apache.mahout.math.function.DoubleDoubleFunction;
-import org.apache.mahout.math.set.OpenIntHashSet;
-
-import java.util.Iterator;
-
-/**
- * Abstract class encapsulating different algorithms that perform the Vector operations assign().
- * x.assign(y, f), for x and y Vectors and f a DoubleDouble function:
- * - applies the function f to every element in x and y, f(xi, yi)
- * - assigns xi = f(xi, yi) for all indices i
- *
- * The names of variables, methods and classes used here follow the following conventions:
- * The vector being assigned to (the left hand side) is called this or x.
- * The right hand side is called that or y.
- * The function to be applied is called f.
- *
- * The different algorithms take into account the different characteristics of vector classes:
- * - whether the vectors support sequential iteration (isSequential())
- * - whether the vectors support constant-time additions (isAddConstantTime())
- * - what the lookup cost is (getLookupCost())
- * - what the iterator advancement cost is (getIteratorAdvanceCost())
- *
- * The names of the actual classes (they're nested in VectorBinaryAssign) describe the strategy used for
- * assignment.
- * The most important optimization is iterating just through the nonzeros (only possible if f(0, 0) = 0).
- * There are 4 main possibilities:
- * - iterating through the nonzeros of just one vector and looking up the corresponding elements in the other
- * - iterating through the intersection of nonzeros (those indices where both vectors have nonzero values)
- * - iterating through the union of nonzeros (those indices where at least one of the vectors has a nonzero value)
- * - iterating through all the elements in some way (either through both at the same time, both one after the other,
- * looking up both, looking up just one).
- * Then, there are two additional sub-possibilities:
- * - if a new value can be added to x in constant time (isAddConstantTime()), the *Inplace updates are used
- * - otherwise (really just for SequentialAccessSparseVectors right now), the *Merge updates are used, where
- * a sorted list of (index, value) pairs is merged into the vector at the end.
- *
- * The internal details are not important and a particular algorithm should generally not be called explicitly.
- * The best one will be selected through assignBest(), which is itself called through Vector.assign().
- *
- * See https://docs.google.com/document/d/1g1PjUuvjyh2LBdq2_rKLIcUiDbeOORA1sCJiSsz-JVU/edit# for a more detailed
- * explanation.
- */
-public abstract class VectorBinaryAssign {
- public static final VectorBinaryAssign[] OPERATIONS = {
- new AssignNonzerosIterateThisLookupThat(),
- new AssignNonzerosIterateThatLookupThisMergeUpdates(),
- new AssignNonzerosIterateThatLookupThisInplaceUpdates(),
-
- new AssignIterateIntersection(),
-
- new AssignIterateUnionSequentialMergeUpdates(),
- new AssignIterateUnionSequentialInplaceUpdates(),
- new AssignIterateUnionRandomMergeUpdates(),
- new AssignIterateUnionRandomInplaceUpdates(),
-
- new AssignAllIterateSequentialMergeUpdates(),
- new AssignAllIterateSequentialInplaceUpdates(),
- new AssignAllIterateThisLookupThatMergeUpdates(),
- new AssignAllIterateThisLookupThatInplaceUpdates(),
- new AssignAllIterateThatLookupThisMergeUpdates(),
- new AssignAllIterateThatLookupThisInplaceUpdates(),
- new AssignAllLoopMergeUpdates(),
- new AssignAllLoopInplaceUpdates(),
- };
-
- /**
- * Returns true iff we can use this algorithm to apply f to x and y component-wise and assign the result to x.
- */
- public abstract boolean isValid(Vector x, Vector y, DoubleDoubleFunction f);
-
- /**
- * Estimates the cost of using this algorithm to compute the assignment. The algorithm is assumed to be valid.
- */
- public abstract double estimateCost(Vector x, Vector y, DoubleDoubleFunction f);
-
- /**
- * Main method that applies f to x and y component-wise assigning the results to x. It returns the modified vector,
- * x.
- */
- public abstract Vector assign(Vector x, Vector y, DoubleDoubleFunction f);
-
- /**
- * The best operation is the least expensive valid one.
- */
- public static VectorBinaryAssign getBestOperation(Vector x, Vector y, DoubleDoubleFunction f) {
- int bestOperationIndex = -1;
- double bestCost = Double.POSITIVE_INFINITY;
- for (int i = 0; i < OPERATIONS.length; ++i) {
- if (OPERATIONS[i].isValid(x, y, f)) {
- double cost = OPERATIONS[i].estimateCost(x, y, f);
- if (cost < bestCost) {
- bestCost = cost;
- bestOperationIndex = i;
- }
- }
- }
- return OPERATIONS[bestOperationIndex];
- }
-
- /**
- * This is the method that should be used when assigning. It selects the best algorithm and applies it.
- * Note that it does NOT invalidate the cached length of the Vector and should only be used through the wrappers
- * in AbstractVector.
- */
- public static Vector assignBest(Vector x, Vector y, DoubleDoubleFunction f) {
- return getBestOperation(x, y, f).assign(x, y, f);
- }
-
- /**
- * If f(0, y) = 0, the zeros in x don't matter and we can simply iterate through the nonzeros of x.
- * To get the corresponding element of y, we perform a lookup.
- * There are no *Merge or *Inplace versions because in this case x cannot become more dense because of f, meaning
- * all changes will occur at indices whose values are already nonzero.
- */
- public static class AssignNonzerosIterateThisLookupThat extends VectorBinaryAssign {
-
- @Override
- public boolean isValid(Vector x, Vector y, DoubleDoubleFunction f) {
- return f.isLikeLeftMult();
- }
-
- @Override
- public double estimateCost(Vector x, Vector y, DoubleDoubleFunction f) {
- return x.getNumNondefaultElements() * x.getIteratorAdvanceCost() * y.getLookupCost();
- }
-
- @Override
- public Vector assign(Vector x, Vector y, DoubleDoubleFunction f) {
- for (Element xe : x.nonZeroes()) {
- xe.set(f.apply(xe.get(), y.getQuick(xe.index())));
- }
- return x;
- }
- }
-
- /**
- * If f(x, 0) = x, the zeros in y don't matter and we can simply iterate through the nonzeros of y.
- * We get the corresponding element of x through a lookup and update x inplace.
- */
- public static class AssignNonzerosIterateThatLookupThisInplaceUpdates extends VectorBinaryAssign {
-
- @Override
- public boolean isValid(Vector x, Vector y, DoubleDoubleFunction f) {
- return f.isLikeRightPlus();
- }
-
- @Override
- public double estimateCost(Vector x, Vector y, DoubleDoubleFunction f) {
- return y.getNumNondefaultElements() * y.getIteratorAdvanceCost() * x.getLookupCost();
- }
-
- @Override
- public Vector assign(Vector x, Vector y, DoubleDoubleFunction f) {
- for (Element ye : y.nonZeroes()) {
- x.setQuick(ye.index(), f.apply(x.getQuick(ye.index()), ye.get()));
- }
- return x;
- }
- }
-
- /**
- * If f(x, 0) = x, the zeros in y don't matter and we can simply iterate through the nonzeros of y.
- * We get the corresponding element of x through a lookup and update x by merging.
- */
- public static class AssignNonzerosIterateThatLookupThisMergeUpdates extends VectorBinaryAssign {
-
- @Override
- public boolean isValid(Vector x, Vector y, DoubleDoubleFunction f) {
- return f.isLikeRightPlus() && y.isSequentialAccess() && !x.isAddConstantTime();
- }
-
- @Override
- public double estimateCost(Vector x, Vector y, DoubleDoubleFunction f) {
- return y.getNumNondefaultElements() * y.getIteratorAdvanceCost() * y.getLookupCost();
- }
-
- @Override
- public Vector assign(Vector x, Vector y, DoubleDoubleFunction f) {
- OrderedIntDoubleMapping updates = new OrderedIntDoubleMapping(false);
- for (Element ye : y.nonZeroes()) {
- updates.set(ye.index(), f.apply(x.getQuick(ye.index()), ye.get()));
- }
- x.mergeUpdates(updates);
- return x;
- }
- }
-
- /**
- * If f(x, 0) = x and f(0, y) = 0 the zeros in x and y don't matter and we can iterate through the nonzeros
- * in both x and y.
- * This is only possible if both x and y support sequential access.
- */
- public static class AssignIterateIntersection extends VectorBinaryAssign {
-
- @Override
- public boolean isValid(Vector x, Vector y, DoubleDoubleFunction f) {
- return f.isLikeLeftMult() && f.isLikeRightPlus() && x.isSequentialAccess() && y.isSequentialAccess();
- }
-
- @Override
- public double estimateCost(Vector x, Vector y, DoubleDoubleFunction f) {
- return Math.min(x.getNumNondefaultElements() * x.getIteratorAdvanceCost(),
- y.getNumNondefaultElements() * y.getIteratorAdvanceCost());
- }
-
- @Override
- public Vector assign(Vector x, Vector y, DoubleDoubleFunction f) {
- Iterator<Vector.Element> xi = x.nonZeroes().iterator();
- Iterator<Vector.Element> yi = y.nonZeroes().iterator();
- Vector.Element xe = null;
- Vector.Element ye = null;
- boolean advanceThis = true;
- boolean advanceThat = true;
- while (true) {
- if (advanceThis) {
- if (xi.hasNext()) {
- xe = xi.next();
- } else {
- break;
- }
- }
- if (advanceThat) {
- if (yi.hasNext()) {
- ye = yi.next();
- } else {
- break;
- }
- }
- if (xe.index() == ye.index()) {
- xe.set(f.apply(xe.get(), ye.get()));
- advanceThis = true;
- advanceThat = true;
- } else {
- if (xe.index() < ye.index()) { // f(x, 0) = 0
- advanceThis = true;
- advanceThat = false;
- } else { // f(0, y) = 0
- advanceThis = false;
- advanceThat = true;
- }
- }
- }
- return x;
- }
- }
-
- /**
- * If f(0, 0) = 0 we only need to visit the union of the nonzeros of x and y.
- * In this case we iterate through them in parallel and update x by merging. Because we're iterating through
- * both vectors at the same time, x and y need to support sequential access.
- */
- public static class AssignIterateUnionSequentialMergeUpdates extends VectorBinaryAssign {
-
- @Override
- public boolean isValid(Vector x, Vector y, DoubleDoubleFunction f) {
- return !f.isDensifying() && x.isSequentialAccess() && y.isSequentialAccess() && !x.isAddConstantTime();
- }
-
- @Override
- public double estimateCost(Vector x, Vector y, DoubleDoubleFunction f) {
- return Math.max(x.getNumNondefaultElements() * x.getIteratorAdvanceCost(),
- y.getNumNondefaultElements() * y.getIteratorAdvanceCost());
- }
-
- @Override
- public Vector assign(Vector x, Vector y, DoubleDoubleFunction f) {
- Iterator<Vector.Element> xi = x.nonZeroes().iterator();
- Iterator<Vector.Element> yi = y.nonZeroes().iterator();
- Vector.Element xe = null;
- Vector.Element ye = null;
- boolean advanceThis = true;
- boolean advanceThat = true;
- OrderedIntDoubleMapping updates = new OrderedIntDoubleMapping(false);
- while (true) {
- if (advanceThis) {
- if (xi.hasNext()) {
- xe = xi.next();
- } else {
- xe = null;
- }
- }
- if (advanceThat) {
- if (yi.hasNext()) {
- ye = yi.next();
- } else {
- ye = null;
- }
- }
- if (xe != null && ye != null) { // both vectors have nonzero elements
- if (xe.index() == ye.index()) {
- xe.set(f.apply(xe.get(), ye.get()));
- advanceThis = true;
- advanceThat = true;
- } else {
- if (xe.index() < ye.index()) { // f(x, 0)
- xe.set(f.apply(xe.get(), 0));
- advanceThis = true;
- advanceThat = false;
- } else {
- updates.set(ye.index(), f.apply(0, ye.get()));
- advanceThis = false;
- advanceThat = true;
- }
- }
- } else if (xe != null) { // just the first one still has nonzeros
- xe.set(f.apply(xe.get(), 0));
- advanceThis = true;
- advanceThat = false;
- } else if (ye != null) { // just the second one has nonzeros
- updates.set(ye.index(), f.apply(0, ye.get()));
- advanceThis = false;
- advanceThat = true;
- } else { // we're done, both are empty
- break;
- }
- }
- x.mergeUpdates(updates);
- return x;
- }
- }
-
- /**
- * If f(0, 0) = 0 we only need to visit the union of the nonzeros of x and y.
- * In this case we iterate through them in parallel and update x inplace. Because we're iterating through
- * both vectors at the same time, x and y need to support sequential access.
- */
- public static class AssignIterateUnionSequentialInplaceUpdates extends VectorBinaryAssign {
-
- @Override
- public boolean isValid(Vector x, Vector y, DoubleDoubleFunction f) {
- return !f.isDensifying() && x.isSequentialAccess() && y.isSequentialAccess() && x.isAddConstantTime();
- }
-
- @Override
- public double estimateCost(Vector x, Vector y, DoubleDoubleFunction f) {
- return Math.max(x.getNumNondefaultElements() * x.getIteratorAdvanceCost(),
- y.getNumNondefaultElements() * y.getIteratorAdvanceCost());
- }
-
- @Override
- public Vector assign(Vector x, Vector y, DoubleDoubleFunction f) {
- Iterator<Vector.Element> xi = x.nonZeroes().iterator();
- Iterator<Vector.Element> yi = y.nonZeroes().iterator();
- Vector.Element xe = null;
- Vector.Element ye = null;
- boolean advanceThis = true;
- boolean advanceThat = true;
- while (true) {
- if (advanceThis) {
- if (xi.hasNext()) {
- xe = xi.next();
- } else {
- xe = null;
- }
- }
- if (advanceThat) {
- if (yi.hasNext()) {
- ye = yi.next();
- } else {
- ye = null;
- }
- }
- if (xe != null && ye != null) { // both vectors have nonzero elements
- if (xe.index() == ye.index()) {
- xe.set(f.apply(xe.get(), ye.get()));
- advanceThis = true;
- advanceThat = true;
- } else {
- if (xe.index() < ye.index()) { // f(x, 0)
- xe.set(f.apply(xe.get(), 0));
- advanceThis = true;
- advanceThat = false;
- } else {
- x.setQuick(ye.index(), f.apply(0, ye.get()));
- advanceThis = false;
- advanceThat = true;
- }
- }
- } else if (xe != null) { // just the first one still has nonzeros
- xe.set(f.apply(xe.get(), 0));
- advanceThis = true;
- advanceThat = false;
- } else if (ye != null) { // just the second one has nonzeros
- x.setQuick(ye.index(), f.apply(0, ye.get()));
- advanceThis = false;
- advanceThat = true;
- } else { // we're done, both are empty
- break;
- }
- }
- return x;
- }
- }
-
- /**
- * If f(0, 0) = 0 we only need to visit the union of the nonzeros of x and y.
- * In this case, we iterate through the nonzeros of x and y alternately (this works even when one of them
- * doesn't support sequential access). Since we're merging the results into x, when iterating through y, the
- * order of iteration matters and y must support sequential access.
- */
- public static class AssignIterateUnionRandomMergeUpdates extends VectorBinaryAssign {
-
- @Override
- public boolean isValid(Vector x, Vector y, DoubleDoubleFunction f) {
- return !f.isDensifying() && !x.isAddConstantTime() && y.isSequentialAccess();
- }
-
- @Override
- public double estimateCost(Vector x, Vector y, DoubleDoubleFunction f) {
- return Math.max(x.getNumNondefaultElements() * x.getIteratorAdvanceCost() * y.getLookupCost(),
- y.getNumNondefaultElements() * y.getIteratorAdvanceCost() * x.getLookupCost());
- }
-
- @Override
- public Vector assign(Vector x, Vector y, DoubleDoubleFunction f) {
- OpenIntHashSet visited = new OpenIntHashSet();
- for (Element xe : x.nonZeroes()) {
- xe.set(f.apply(xe.get(), y.getQuick(xe.index())));
- visited.add(xe.index());
- }
- OrderedIntDoubleMapping updates = new OrderedIntDoubleMapping(false);
- for (Element ye : y.nonZeroes()) {
- if (!visited.contains(ye.index())) {
- updates.set(ye.index(), f.apply(x.getQuick(ye.index()), ye.get()));
- }
- }
- x.mergeUpdates(updates);
- return x;
- }
- }
-
- /**
- * If f(0, 0) = 0 we only need to visit the union of the nonzeros of x and y.
- * In this case, we iterate through the nonzeros of x and y alternately (this works even when one of them
- * doesn't support sequential access). Because updates to x are made in place, neither x nor y needs to support
- * sequential access.
- */
- public static class AssignIterateUnionRandomInplaceUpdates extends VectorBinaryAssign {
-
- @Override
- public boolean isValid(Vector x, Vector y, DoubleDoubleFunction f) {
- return !f.isDensifying() && x.isAddConstantTime();
- }
-
- @Override
- public double estimateCost(Vector x, Vector y, DoubleDoubleFunction f) {
- return Math.max(x.getNumNondefaultElements() * x.getIteratorAdvanceCost() * y.getLookupCost(),
- y.getNumNondefaultElements() * y.getIteratorAdvanceCost() * x.getLookupCost());
- }
-
- @Override
- public Vector assign(Vector x, Vector y, DoubleDoubleFunction f) {
- OpenIntHashSet visited = new OpenIntHashSet();
- for (Element xe : x.nonZeroes()) {
- xe.set(f.apply(xe.get(), y.getQuick(xe.index())));
- visited.add(xe.index());
- }
- for (Element ye : y.nonZeroes()) {
- if (!visited.contains(ye.index())) {
- x.setQuick(ye.index(), f.apply(x.getQuick(ye.index()), ye.get()));
- }
- }
- return x;
- }
- }
-
- public static class AssignAllIterateSequentialMergeUpdates extends VectorBinaryAssign {
-
- @Override
- public boolean isValid(Vector x, Vector y, DoubleDoubleFunction f) {
- return x.isSequentialAccess() && y.isSequentialAccess() && !x.isAddConstantTime() && !x.isDense() && !y.isDense();
- }
-
- @Override
- public double estimateCost(Vector x, Vector y, DoubleDoubleFunction f) {
- return Math.max(x.size() * x.getIteratorAdvanceCost(), y.size() * y.getIteratorAdvanceCost());
- }
-
- @Override
- public Vector assign(Vector x, Vector y, DoubleDoubleFunction f) {
- Iterator<Vector.Element> xi = x.all().iterator();
- Iterator<Vector.Element> yi = y.all().iterator();
- OrderedIntDoubleMapping updates = new OrderedIntDoubleMapping(false);
- while (xi.hasNext() && yi.hasNext()) {
- Element xe = xi.next();
- updates.set(xe.index(), f.apply(xe.get(), yi.next().get()));
- }
- x.mergeUpdates(updates);
- return x;
- }
- }
-
- public static class AssignAllIterateSequentialInplaceUpdates extends VectorBinaryAssign {
-
- @Override
- public boolean isValid(Vector x, Vector y, DoubleDoubleFunction f) {
- return x.isSequentialAccess() && y.isSequentialAccess() && x.isAddConstantTime()
- && !x.isDense() && !y.isDense();
- }
-
- @Override
- public double estimateCost(Vector x, Vector y, DoubleDoubleFunction f) {
- return Math.max(x.size() * x.getIteratorAdvanceCost(), y.size() * y.getIteratorAdvanceCost());
- }
-
- @Override
- public Vector assign(Vector x, Vector y, DoubleDoubleFunction f) {
- Iterator<Vector.Element> xi = x.all().iterator();
- Iterator<Vector.Element> yi = y.all().iterator();
- while (xi.hasNext() && yi.hasNext()) {
- Element xe = xi.next();
- x.setQuick(xe.index(), f.apply(xe.get(), yi.next().get()));
- }
- return x;
- }
- }
-
- public static class AssignAllIterateThisLookupThatMergeUpdates extends VectorBinaryAssign {
-
- @Override
- public boolean isValid(Vector x, Vector y, DoubleDoubleFunction f) {
- return !x.isAddConstantTime() && !x.isDense();
- }
-
- @Override
- public double estimateCost(Vector x, Vector y, DoubleDoubleFunction f) {
- return x.size() * x.getIteratorAdvanceCost() * y.getLookupCost();
- }
-
- @Override
- public Vector assign(Vector x, Vector y, DoubleDoubleFunction f) {
- OrderedIntDoubleMapping updates = new OrderedIntDoubleMapping(false);
- for (Element xe : x.all()) {
- updates.set(xe.index(), f.apply(xe.get(), y.getQuick(xe.index())));
- }
- x.mergeUpdates(updates);
- return x;
- }
- }
-
- public static class AssignAllIterateThisLookupThatInplaceUpdates extends VectorBinaryAssign {
-
- @Override
- public boolean isValid(Vector x, Vector y, DoubleDoubleFunction f) {
- return x.isAddConstantTime() && !x.isDense();
- }
-
- @Override
- public double estimateCost(Vector x, Vector y, DoubleDoubleFunction f) {
- return x.size() * x.getIteratorAdvanceCost() * y.getLookupCost();
- }
-
- @Override
- public Vector assign(Vector x, Vector y, DoubleDoubleFunction f) {
- for (Element xe : x.all()) {
- x.setQuick(xe.index(), f.apply(xe.get(), y.getQuick(xe.index())));
- }
- return x;
- }
- }
-
- public static class AssignAllIterateThatLookupThisMergeUpdates extends VectorBinaryAssign {
-
- @Override
- public boolean isValid(Vector x, Vector y, DoubleDoubleFunction f) {
- return !x.isAddConstantTime() && !y.isDense();
- }
-
- @Override
- public double estimateCost(Vector x, Vector y, DoubleDoubleFunction f) {
- return y.size() * y.getIteratorAdvanceCost() * x.getLookupCost();
- }
-
- @Override
- public Vector assign(Vector x, Vector y, DoubleDoubleFunction f) {
- OrderedIntDoubleMapping updates = new OrderedIntDoubleMapping(false);
- for (Element ye : y.all()) {
- updates.set(ye.index(), f.apply(x.getQuick(ye.index()), ye.get()));
- }
- x.mergeUpdates(updates);
- return x;
- }
- }
-
- public static class AssignAllIterateThatLookupThisInplaceUpdates extends VectorBinaryAssign {
-
- @Override
- public boolean isValid(Vector x, Vector y, DoubleDoubleFunction f) {
- return x.isAddConstantTime() && !y.isDense();
- }
-
- @Override
- public double estimateCost(Vector x, Vector y, DoubleDoubleFunction f) {
- return y.size() * y.getIteratorAdvanceCost() * x.getLookupCost();
- }
-
- @Override
- public Vector assign(Vector x, Vector y, DoubleDoubleFunction f) {
- for (Element ye : y.all()) {
- x.setQuick(ye.index(), f.apply(x.getQuick(ye.index()), ye.get()));
- }
- return x;
- }
- }
-
- public static class AssignAllLoopMergeUpdates extends VectorBinaryAssign {
-
- @Override
- public boolean isValid(Vector x, Vector y, DoubleDoubleFunction f) {
- return !x.isAddConstantTime();
- }
-
- @Override
- public double estimateCost(Vector x, Vector y, DoubleDoubleFunction f) {
- return x.size() * x.getLookupCost() * y.getLookupCost();
- }
-
- @Override
- public Vector assign(Vector x, Vector y, DoubleDoubleFunction f) {
- OrderedIntDoubleMapping updates = new OrderedIntDoubleMapping(false);
- for (int i = 0; i < x.size(); ++i) {
- updates.set(i, f.apply(x.getQuick(i), y.getQuick(i)));
- }
- x.mergeUpdates(updates);
- return x;
- }
- }
-
- public static class AssignAllLoopInplaceUpdates extends VectorBinaryAssign {
-
- @Override
- public boolean isValid(Vector x, Vector y, DoubleDoubleFunction f) {
- return x.isAddConstantTime();
- }
-
- @Override
- public double estimateCost(Vector x, Vector y, DoubleDoubleFunction f) {
- return x.size() * x.getLookupCost() * y.getLookupCost();
- }
-
- @Override
- public Vector assign(Vector x, Vector y, DoubleDoubleFunction f) {
- for (int i = 0; i < x.size(); ++i) {
- x.setQuick(i, f.apply(x.getQuick(i), y.getQuick(i)));
- }
- return x;
- }
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/VectorIterable.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/VectorIterable.java b/math/src/main/java/org/apache/mahout/math/VectorIterable.java
deleted file mode 100644
index 8414fdb..0000000
--- a/math/src/main/java/org/apache/mahout/math/VectorIterable.java
+++ /dev/null
@@ -1,56 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math;
-
-import java.util.Iterator;
-
-public interface VectorIterable extends Iterable<MatrixSlice> {
-
- /* Iterate all rows in order */
- Iterator<MatrixSlice> iterateAll();
-
- /* Iterate all non empty rows in arbitrary order */
- Iterator<MatrixSlice> iterateNonEmpty();
-
- int numSlices();
-
- int numRows();
-
- int numCols();
-
- /**
- * Return a new vector with cardinality equal to getNumRows() of this matrix which is the matrix product of the
- * recipient and the argument
- *
- * @param v a vector with cardinality equal to getNumCols() of the recipient
- * @return a new vector (typically a DenseVector)
- * @throws CardinalityException if this.getNumCols() != v.size()
- */
- Vector times(Vector v);
-
- /**
- * Convenience method for producing this.transpose().times(this.times(v)), which can be implemented with only one pass
- * over the matrix, without making the transpose() call (which can be expensive if the matrix is sparse)
- *
- * @param v a vector with cardinality equal to getNumCols() of the recipient
- * @return a new vector (typically a DenseVector) with cardinality equal to that of the argument.
- * @throws CardinalityException if this.getNumCols() != v.size()
- */
- Vector timesSquared(Vector v);
-
-}
r***@apache.org
2018-06-27 14:51:38 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/PivotedMatrix.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/PivotedMatrix.java b/math/src/main/java/org/apache/mahout/math/PivotedMatrix.java
deleted file mode 100644
index fba1e98..0000000
--- a/math/src/main/java/org/apache/mahout/math/PivotedMatrix.java
+++ /dev/null
@@ -1,288 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math;
-
-import com.google.common.base.Preconditions;
-
-/**
- * Matrix that allows transparent row and column permutation.
- */
-public class PivotedMatrix extends AbstractMatrix {
-
- private Matrix base;
- private int[] rowPivot;
- private int[] rowUnpivot;
- private int[] columnPivot;
- private int[] columnUnpivot;
-
- public PivotedMatrix(Matrix base, int[] pivot) {
- this(base, pivot, java.util.Arrays.copyOf(pivot, pivot.length));
- }
- public PivotedMatrix(Matrix base, int[] rowPivot, int[] columnPivot) {
- super(base.rowSize(), base.columnSize());
-
- this.base = base;
- this.rowPivot = rowPivot;
- rowUnpivot = invert(rowPivot);
-
- this.columnPivot = columnPivot;
- columnUnpivot = invert(columnPivot);
- }
-
- public PivotedMatrix(Matrix base) {
- this(base, identityPivot(base.rowSize()),identityPivot(base.columnSize()));
- }
-
- /**
- * Swaps indexes i and j. This does both row and column permutation.
- *
- * @param i First index to swap.
- * @param j Second index to swap.
- */
- public void swap(int i, int j) {
- swapRows(i, j);
- swapColumns(i, j);
- }
-
- /**
- * Swaps indexes i and j. This does just row permutation.
- *
- * @param i First index to swap.
- * @param j Second index to swap.
- */
- public void swapRows(int i, int j) {
- swap(rowPivot, rowUnpivot, i, j);
- }
-
-
- /**
- * Swaps indexes i and j. This does just column permutation.
- *
- * @param i First index to swap.
- * @param j Second index to swap.
- */
- public void swapColumns(int i, int j) {
- swap(columnPivot, columnUnpivot, i, j);
- }
-
- private static void swap(int[] pivot, int[] unpivot, int i, int j) {
- Preconditions.checkPositionIndex(i, pivot.length);
- Preconditions.checkPositionIndex(j, pivot.length);
- if (i != j) {
- int tmp = pivot[i];
- pivot[i] = pivot[j];
- pivot[j] = tmp;
-
- unpivot[pivot[i]] = i;
- unpivot[pivot[j]] = j;
- }
- }
-
- /**
- * Assign the other vector values to the column of the receiver
- *
- * @param column the int column to assign
- * @param other a Vector
- * @return the modified receiver
- * @throws org.apache.mahout.math.CardinalityException
- * if the cardinalities differ
- */
- @Override
- public Matrix assignColumn(int column, Vector other) {
- // note the reversed pivoting for other
- return base.assignColumn(columnPivot[column], new PermutedVectorView(other, rowUnpivot, rowPivot));
- }
-
- /**
- * Assign the other vector values to the row of the receiver
- *
- * @param row the int row to assign
- * @param other a Vector
- * @return the modified receiver
- * @throws org.apache.mahout.math.CardinalityException
- * if the cardinalities differ
- */
- @Override
- public Matrix assignRow(int row, Vector other) {
- // note the reversed pivoting for other
- return base.assignRow(rowPivot[row], new PermutedVectorView(other, columnUnpivot, columnPivot));
- }
-
- /**
- * Return the column at the given index
- *
- * @param column an int column index
- * @return a Vector at the index
- * @throws org.apache.mahout.math.IndexException
- * if the index is out of bounds
- */
- @Override
- public Vector viewColumn(int column) {
- if (column < 0 || column >= columnSize()) {
- throw new IndexException(column, columnSize());
- }
- return new PermutedVectorView(base.viewColumn(columnPivot[column]), rowPivot, rowUnpivot);
- }
-
- /**
- * Return the row at the given index
- *
- * @param row an int row index
- * @return a Vector at the index
- * @throws org.apache.mahout.math.IndexException
- * if the index is out of bounds
- */
- @Override
- public Vector viewRow(int row) {
- if (row < 0 || row >= rowSize()) {
- throw new IndexException(row, rowSize());
- }
- return new PermutedVectorView(base.viewRow(rowPivot[row]), columnPivot, columnUnpivot);
- }
-
- /**
- * Return the value at the given indexes, without checking bounds
- *
- * @param row an int row index
- * @param column an int column index
- * @return the double at the index
- */
- @Override
- public double getQuick(int row, int column) {
- return base.getQuick(rowPivot[row], columnPivot[column]);
- }
-
- /**
- * Return an empty matrix of the same underlying class as the receiver
- *
- * @return a Matrix
- */
- @Override
- public Matrix like() {
- return new PivotedMatrix(base.like());
- }
-
-
- @Override
- public Matrix clone() {
- PivotedMatrix clone = (PivotedMatrix) super.clone();
-
- base = base.clone();
- rowPivot = rowPivot.clone();
- rowUnpivot = rowUnpivot.clone();
- columnPivot = columnPivot.clone();
- columnUnpivot = columnUnpivot.clone();
-
- return clone;
- }
-
-
- /**
- * Returns an empty matrix of the same underlying class as the receiver and of the specified
- * size.
- *
- * @param rows the int number of rows
- * @param columns the int number of columns
- */
- @Override
- public Matrix like(int rows, int columns) {
- return new PivotedMatrix(base.like(rows, columns));
- }
-
- /**
- * Set the value at the given index, without checking bounds
- *
- * @param row an int row index into the receiver
- * @param column an int column index into the receiver
- * @param value a double value to set
- */
- @Override
- public void setQuick(int row, int column, double value) {
- base.setQuick(rowPivot[row], columnPivot[column], value);
- }
-
- /**
- * Return the number of values in the recipient
- *
- * @return an int[2] containing [row, column] count
- */
- @Override
- public int[] getNumNondefaultElements() {
- return base.getNumNondefaultElements();
- }
-
- /**
- * Return a new matrix containing the subset of the recipient
- *
- * @param offset an int[2] offset into the receiver
- * @param size the int[2] size of the desired result
- * @return a new Matrix that is a view of the original
- * @throws org.apache.mahout.math.CardinalityException
- * if the length is greater than the cardinality of the receiver
- * @throws org.apache.mahout.math.IndexException
- * if the offset is negative or the offset+length is outside of the receiver
- */
- @Override
- public Matrix viewPart(int[] offset, int[] size) {
- return new MatrixView(this, offset, size);
- }
-
- public int rowUnpivot(int k) {
- return rowUnpivot[k];
- }
-
- public int columnUnpivot(int k) {
- return columnUnpivot[k];
- }
-
- public int[] getRowPivot() {
- return rowPivot;
- }
-
- public int[] getInverseRowPivot() {
- return rowUnpivot;
- }
-
- public int[] getColumnPivot() {
- return columnPivot;
- }
-
- public int[] getInverseColumnPivot() {
- return columnUnpivot;
- }
-
- public Matrix getBase() {
- return base;
- }
-
- private static int[] identityPivot(int n) {
- int[] pivot = new int[n];
- for (int i = 0; i < n; i++) {
- pivot[i] = i;
- }
- return pivot;
- }
-
- private static int[] invert(int[] pivot) {
- int[] x = new int[pivot.length];
- for (int i = 0; i < pivot.length; i++) {
- x[pivot[i]] = i;
- }
- return x;
- }
-}
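
Note how cheap the pivoting above is: swap() rewrites two entries in each of the pivot and unpivot arrays, and the base matrix is never touched. A small usage sketch (PivotDemo is an illustrative name):

import org.apache.mahout.math.DenseMatrix;
import org.apache.mahout.math.Matrix;
import org.apache.mahout.math.PivotedMatrix;

public class PivotDemo {
  public static void main(String[] args) {
    Matrix base = new DenseMatrix(new double[][] {{1, 2}, {3, 4}});
    PivotedMatrix p = new PivotedMatrix(base);
    p.swap(0, 1);                        // permute rows and columns 0 <-> 1
    System.out.println(p.get(0, 0));     // 4.0, i.e. base.get(1, 1)
    System.out.println(base.get(0, 0));  // 1.0, the base is untouched
  }
}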

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/QR.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/QR.java b/math/src/main/java/org/apache/mahout/math/QR.java
deleted file mode 100644
index 5992224..0000000
--- a/math/src/main/java/org/apache/mahout/math/QR.java
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.mahout.math;
-
-public interface QR {
- Matrix getQ();
-
- Matrix getR();
-
- boolean hasFullRank();
-
- Matrix solve(Matrix B);
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/QRDecomposition.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/QRDecomposition.java b/math/src/main/java/org/apache/mahout/math/QRDecomposition.java
deleted file mode 100644
index ab5b3d2..0000000
--- a/math/src/main/java/org/apache/mahout/math/QRDecomposition.java
+++ /dev/null
@@ -1,181 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- * Copyright 1999 CERN - European Organization for Nuclear Research.
- * Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
- * is hereby granted without fee, provided that the above copyright notice appear in all copies and
- * that both that copyright notice and this permission notice appear in supporting documentation.
- * CERN makes no representations about the suitability of this software for any purpose.
- * It is provided "as is" without expressed or implied warranty.
- */
-package org.apache.mahout.math;
-
-import org.apache.mahout.math.function.Functions;
-
-import java.util.Locale;
-
-/**
- For an <tt>m x n</tt> matrix <tt>A</tt> with {@code m >= n}, the QR decomposition is an <tt>m x n</tt>
- orthogonal matrix <tt>Q</tt> and an <tt>n x n</tt> upper triangular matrix <tt>R</tt> so that
- <tt>A = Q*R</tt>.
- <P>
- The QR decomposition always exists, even if the matrix does not have
- full rank, so the constructor will never fail. The primary use of the
- QR decomposition is in the least squares solution of non-square systems
- of simultaneous linear equations. This will fail if <tt>hasFullRank()</tt>
- returns <tt>false</tt>.
- */
-
-public class QRDecomposition implements QR {
- private final Matrix q;
- private final Matrix r;
- private final Matrix mType;
- private final boolean fullRank;
- private final int rows;
- private final int columns;
-
- /**
- * Constructs a new QR decomposition object, computed by Gram-Schmidt orthogonalization
- * (the column-normalize-then-orthogonalize loop below, not Householder reflections). The
- * decomposed matrices can be retrieved via instance methods of the returned object.
- *
- * @param a A rectangular matrix.
- * @throws IllegalArgumentException if {@code A.rows() < A.columns()}.
- */
- public QRDecomposition(Matrix a) {
-
- rows = a.rowSize();
- int min = Math.min(a.rowSize(), a.columnSize());
- columns = a.columnSize();
- mType = a.like(1,1);
-
- Matrix qTmp = a.clone();
-
- boolean fullRank = true;
-
- r = new DenseMatrix(min, columns);
-
- for (int i = 0; i < min; i++) {
- Vector qi = qTmp.viewColumn(i);
- double alpha = qi.norm(2);
- if (Math.abs(alpha) > Double.MIN_VALUE) {
- qi.assign(Functions.div(alpha));
- } else {
- if (Double.isInfinite(alpha) || Double.isNaN(alpha)) {
- throw new ArithmeticException("Invalid intermediate result");
- }
- fullRank = false;
- }
- r.set(i, i, alpha);
-
- for (int j = i + 1; j < columns; j++) {
- Vector qj = qTmp.viewColumn(j);
- double norm = qj.norm(2);
- if (Math.abs(norm) > Double.MIN_VALUE) {
- double beta = qi.dot(qj);
- r.set(i, j, beta);
- if (j < min) {
- qj.assign(qi, Functions.plusMult(-beta));
- }
- } else {
- if (Double.isInfinite(norm) || Double.isNaN(norm)) {
- throw new ArithmeticException("Invalid intermediate result");
- }
- }
- }
- }
- if (columns > min) {
- q = qTmp.viewPart(0, rows, 0, min).clone();
- } else {
- q = qTmp;
- }
- this.fullRank = fullRank;
- }
-
- /**
- * Generates and returns the (economy-sized) orthogonal factor <tt>Q</tt>.
- *
- * @return <tt>Q</tt>
- */
- @Override
- public Matrix getQ() {
- return q;
- }
-
- /**
- * Returns the upper triangular factor, <tt>R</tt>.
- *
- * @return <tt>R</tt>
- */
- @Override
- public Matrix getR() {
- return r;
- }
-
- /**
- * Returns whether the matrix <tt>A</tt> has full rank.
- *
- * @return true if <tt>R</tt>, and hence <tt>A</tt>, has full rank.
- */
- @Override
- public boolean hasFullRank() {
- return fullRank;
- }
-
- /**
- * Least squares solution of <tt>A*X = B</tt>; returns <tt>X</tt>.
- *
- * @param B A matrix with as many rows as <tt>A</tt> and any number of columns.
- * @return <tt>X</tt> that minimizes the two norm of <tt>Q*R*X - B</tt>.
- * @throws IllegalArgumentException if <tt>B.rows() != A.rows()</tt>.
- */
- @Override
- public Matrix solve(Matrix B) {
- if (B.numRows() != rows) {
- throw new IllegalArgumentException("Matrix row dimensions must agree.");
- }
-
- int cols = B.numCols();
- Matrix x = mType.like(columns, cols);
-
- // this can all be done a bit more efficiently if we don't actually
- // form explicit versions of Q^T and R but this code isn't so bad
- // and it is much easier to understand
- Matrix qt = getQ().transpose();
- Matrix y = qt.times(B);
-
- Matrix r = getR();
- for (int k = Math.min(columns, rows) - 1; k >= 0; k--) {
- // X[k,] = Y[k,] / R[k,k], note that X[k,] starts with 0 so += is same as =
- x.viewRow(k).assign(y.viewRow(k), Functions.plusMult(1 / r.get(k, k)));
-
- // Y[0:(k-1),] -= R[0:(k-1),k] * X[k,]
- Vector rColumn = r.viewColumn(k).viewPart(0, k);
- for (int c = 0; c < cols; c++) {
- y.viewColumn(c).viewPart(0, k).assign(rColumn, Functions.plusMult(-x.get(k, c)));
- }
- }
- return x;
- }
-
- /**
- * Returns a rough string rendition of a QR.
- */
- @Override
- public String toString() {
- return String.format(Locale.ENGLISH, "QR(%d x %d,fullRank=%s)", rows, columns, hasFullRank());
- }
-}
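
For reference, a least-squares sketch against the API removed above; solve() multiplies by Q' and then back-substitutes through R (QrDemo is an illustrative name):

import org.apache.mahout.math.DenseMatrix;
import org.apache.mahout.math.Matrix;
import org.apache.mahout.math.QRDecomposition;

public class QrDemo {
  public static void main(String[] args) {
    // Overdetermined 3x2 system: minimize ||A*x - b||_2.
    Matrix a = new DenseMatrix(new double[][] {{1, 1}, {1, 2}, {1, 3}});
    Matrix b = new DenseMatrix(new double[][] {{1}, {2}, {2}});
    QRDecomposition qr = new QRDecomposition(a);
    Matrix x = qr.solve(b);              // 2x1 least-squares solution
    System.out.println(qr.hasFullRank());
    System.out.println(x);
  }
}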

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/RandomAccessSparseVector.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/RandomAccessSparseVector.java b/math/src/main/java/org/apache/mahout/math/RandomAccessSparseVector.java
deleted file mode 100644
index c325078..0000000
--- a/math/src/main/java/org/apache/mahout/math/RandomAccessSparseVector.java
+++ /dev/null
@@ -1,303 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math;
-
-import it.unimi.dsi.fastutil.doubles.DoubleIterator;
-import it.unimi.dsi.fastutil.ints.Int2DoubleMap;
-import it.unimi.dsi.fastutil.ints.Int2DoubleMap.Entry;
-import it.unimi.dsi.fastutil.ints.Int2DoubleOpenHashMap;
-import it.unimi.dsi.fastutil.objects.ObjectIterator;
-
-import java.util.Iterator;
-import java.util.NoSuchElementException;
-
-import org.apache.mahout.math.set.AbstractSet;
-
-/** Implements vector that only stores non-zero doubles */
-public class RandomAccessSparseVector extends AbstractVector {
-
- private static final int INITIAL_CAPACITY = 11;
-
- private Int2DoubleOpenHashMap values;
-
- /** For serialization purposes only. */
- public RandomAccessSparseVector() {
- super(0);
- }
-
- public RandomAccessSparseVector(int cardinality) {
- this(cardinality, Math.min(cardinality, INITIAL_CAPACITY)); // arbitrary estimate of 'sparseness'
- }
-
- public RandomAccessSparseVector(int cardinality, int initialCapacity) {
- super(cardinality);
- values = new Int2DoubleOpenHashMap(initialCapacity, .5f);
- }
-
- public RandomAccessSparseVector(Vector other) {
- this(other.size(), other.getNumNondefaultElements());
- for (Element e : other.nonZeroes()) {
- values.put(e.index(), e.get());
- }
- }
-
- private RandomAccessSparseVector(int cardinality, Int2DoubleOpenHashMap values) {
- super(cardinality);
- this.values = values;
- }
-
- public RandomAccessSparseVector(RandomAccessSparseVector other, boolean shallowCopy) {
- super(other.size());
- values = shallowCopy ? other.values : other.values.clone();
- }
-
- @Override
- protected Matrix matrixLike(int rows, int columns) {
- return new SparseMatrix(rows, columns);
- }
-
- @Override
- public RandomAccessSparseVector clone() {
- return new RandomAccessSparseVector(size(), values.clone());
- }
-
- @Override
- public String toString() {
- return sparseVectorToString();
- }
-
- @Override
- public Vector assign(Vector other) {
- if (size() != other.size()) {
- throw new CardinalityException(size(), other.size());
- }
- values.clear();
- for (Element e : other.nonZeroes()) {
- setQuick(e.index(), e.get());
- }
- return this;
- }
-
- @Override
- public void mergeUpdates(OrderedIntDoubleMapping updates) {
- for (int i = 0; i < updates.getNumMappings(); ++i) {
- values.put(updates.getIndices()[i], updates.getValues()[i]);
- }
- }
-
- /**
- * @return false
- */
- @Override
- public boolean isDense() {
- return false;
- }
-
- /**
- * @return false
- */
- @Override
- public boolean isSequentialAccess() {
- return false;
- }
-
- @Override
- public double getQuick(int index) {
- return values.get(index);
- }
-
- @Override
- public void setQuick(int index, double value) {
- invalidateCachedLength();
- if (value == 0.0) {
- values.remove(index);
- } else {
- values.put(index, value);
- }
- }
-
- @Override
- public void incrementQuick(int index, double increment) {
- invalidateCachedLength();
- values.addTo( index, increment);
- }
-
-
- @Override
- public RandomAccessSparseVector like() {
- return new RandomAccessSparseVector(size(), values.size());
- }
-
- @Override
- public Vector like(int cardinality) {
- return new RandomAccessSparseVector(cardinality, values.size());
- }
-
- @Override
- public int getNumNondefaultElements() {
- return values.size();
- }
-
- @Override
- public int getNumNonZeroElements() {
- final DoubleIterator iterator = values.values().iterator();
- int numNonZeros = 0;
- for (int i = values.size(); i-- != 0;) if (iterator.nextDouble() != 0) numNonZeros++;
- return numNonZeros;
- }
-
- @Override
- public double getLookupCost() {
- return 1;
- }
-
- @Override
- public double getIteratorAdvanceCost() {
- return 1 + (AbstractSet.DEFAULT_MAX_LOAD_FACTOR + AbstractSet.DEFAULT_MIN_LOAD_FACTOR) / 2;
- }
-
- /**
- * This is "sort of" constant, but really it might resize the array.
- */
- @Override
- public boolean isAddConstantTime() {
- return true;
- }
-
- /*
- @Override
- public Element getElement(int index) {
- // TODO: this should return a MapElement so as to avoid hashing for both getQuick and setQuick.
- return super.getElement(index);
- }
- */
-
- private final class NonZeroIterator implements Iterator<Element> {
- final ObjectIterator<Int2DoubleMap.Entry> fastIterator = values.int2DoubleEntrySet().fastIterator();
- final RandomAccessElement element = new RandomAccessElement( fastIterator );
-
- @Override
- public boolean hasNext() {
- return fastIterator.hasNext();
- }
-
- @Override
- public Element next() {
- if ( ! hasNext() ) throw new NoSuchElementException();
- element.entry = fastIterator.next();
- return element;
- }
-
- @Override
- public void remove() {
- throw new UnsupportedOperationException();
- }
- }
-
- final class RandomAccessElement implements Element {
- Int2DoubleMap.Entry entry;
- final ObjectIterator<Int2DoubleMap.Entry> fastIterator;
-
- public RandomAccessElement( ObjectIterator<Entry> fastIterator ) {
- super();
- this.fastIterator = fastIterator;
- }
-
- @Override
- public double get() {
- return entry.getDoubleValue();
- }
-
- @Override
- public int index() {
- return entry.getIntKey();
- }
-
- @Override
- public void set( double value ) {
- invalidateCachedLength();
- if (value == 0.0) fastIterator.remove();
- else entry.setValue( value );
- }
- }
- /**
- * NOTE: this implementation reuses the Vector.Element instance for each call of next(). If you need to preserve the
- * instance, you need to make a copy of it
- *
- * @return an {@link Iterator} over the Elements.
- * @see #getElement(int)
- */
- @Override
- public Iterator<Element> iterateNonZero() {
- return new NonZeroIterator();
- }
-
- @Override
- public Iterator<Element> iterator() {
- return new AllIterator();
- }
-
- final class GeneralElement implements Element {
- int index;
- double value;
-
- @Override
- public double get() {
- return value;
- }
-
- @Override
- public int index() {
- return index;
- }
-
- @Override
- public void set( double value ) {
- invalidateCachedLength();
- if (value == 0.0) values.remove( index );
- else values.put( index, value );
- }
- }
-
- private final class AllIterator implements Iterator<Element> {
- private final GeneralElement element = new GeneralElement();
-
- private AllIterator() {
- element.index = -1;
- }
-
- @Override
- public boolean hasNext() {
- return element.index + 1 < size();
- }
-
- @Override
- public Element next() {
- if (!hasNext()) {
- throw new NoSuchElementException();
- }
- element.value = values.get( ++element.index );
- return element;
- }
-
- @Override
- public void remove() {
- throw new UnsupportedOperationException();
- }
- }
-}
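
The NOTE on iterateNonZero() above matters in practice: the iterator recycles one Element instance, so any index or value needed after the loop must be copied out as a primitive. A sketch (ElementCopyDemo is an illustrative name):

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;

public class ElementCopyDemo {
  public static void main(String[] args) {
    Vector v = new RandomAccessSparseVector(10);
    v.setQuick(3, 1.5);
    v.setQuick(7, -2.0);
    List<Integer> indices = new ArrayList<>();
    for (Iterator<Vector.Element> it = v.iterateNonZero(); it.hasNext();) {
      Vector.Element e = it.next();
      indices.add(e.index());  // copy the primitive; the Element itself is reused
    }
    System.out.println(indices);  // [3, 7] in arbitrary hash order
  }
}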

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/RandomTrinaryMatrix.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/RandomTrinaryMatrix.java b/math/src/main/java/org/apache/mahout/math/RandomTrinaryMatrix.java
deleted file mode 100644
index 85de0cd..0000000
--- a/math/src/main/java/org/apache/mahout/math/RandomTrinaryMatrix.java
+++ /dev/null
@@ -1,146 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math;
-
-import java.nio.ByteBuffer;
-import java.util.concurrent.atomic.AtomicInteger;
-
-/**
- * Random matrix. Each value is taken from {-1,0,1} with roughly equal probability. Note
- * that by default, the value is determined by a relatively simple hash of the coordinates.
- * Such a hash is not usable where real randomness is required, but suffices nicely for
- * random projection methods.
- *
- * If the simple hash method is not satisfactory, an optional high quality mode is available
- * which uses a murmur hash of the coordinates.
- */
-public class RandomTrinaryMatrix extends AbstractMatrix {
- private static final AtomicInteger ID = new AtomicInteger();
- private static final int PRIME1 = 104047;
- private static final int PRIME2 = 101377;
- private static final int PRIME3 = 64661;
- private static final long SCALE = 1L << 32;
-
- private final int seed;
-
- // set this to true to use a high quality hash
- private boolean highQuality = false;
-
- public RandomTrinaryMatrix(int seed, int rows, int columns, boolean highQuality) {
- super(rows, columns);
-
- this.highQuality = highQuality;
- this.seed = seed;
- }
-
- public RandomTrinaryMatrix(int rows, int columns) {
- this(ID.incrementAndGet(), rows, columns, false);
- }
-
- @Override
- public Matrix assignColumn(int column, Vector other) {
- throw new UnsupportedOperationException("Can't assign to read-only matrix");
- }
-
- @Override
- public Matrix assignRow(int row, Vector other) {
- throw new UnsupportedOperationException("Can't assign to read-only matrix");
- }
-
- /**
- * Return the value at the given indexes, without checking bounds
- *
- * @param row an int row index
- * @param column an int column index
- * @return the double at the index
- */
- @Override
- public double getQuick(int row, int column) {
- if (highQuality) {
- ByteBuffer buf = ByteBuffer.allocate(8);
- buf.putInt(row);
- buf.putInt(column);
- buf.flip();
- return (MurmurHash.hash64A(buf, seed) & (SCALE - 1)) / (double) SCALE;
- } else {
- // this isn't a fantastic random number generator, but it is just fine for random projections
- return ((((row * PRIME1) + column * PRIME2 + row * column * PRIME3) & 8) * 0.25) - 1;
- }
- }
-
-
- /**
- * Return an empty matrix of the same underlying class as the receiver
- *
- * @return a Matrix
- */
- @Override
- public Matrix like() {
- return new DenseMatrix(rowSize(), columnSize());
- }
-
- /**
- * Returns an empty matrix of the same underlying class as the receiver and of the specified
- * size.
- *
- * @param rows the int number of rows
- * @param columns the int number of columns
- */
- @Override
- public Matrix like(int rows, int columns) {
- return new DenseMatrix(rows, columns);
- }
-
- /**
- * Set the value at the given index, without checking bounds
- *
- * @param row an int row index into the receiver
- * @param column an int column index into the receiver
- * @param value a double value to set
- */
- @Override
- public void setQuick(int row, int column, double value) {
- throw new UnsupportedOperationException("Can't assign to read-only matrix");
- }
-
- /**
- * Return the number of values in the recipient
- *
- * @return an int[2] containing [row, column] count
- */
- @Override
- public int[] getNumNondefaultElements() {
- throw new UnsupportedOperationException("Can't assign to read-only matrix");
- }
-
- /**
- * Return a new matrix containing the subset of the recipient
- *
- * @param offset an int[2] offset into the receiver
- * @param size the int[2] size of the desired result
- * @return a new Matrix that is a view of the original
- * @throws org.apache.mahout.math.CardinalityException
- * if the length is greater than the cardinality of the receiver
- * @throws org.apache.mahout.math.IndexException
- * if the offset is negative or the offset+length is outside of the receiver
- */
- @Override
- public Matrix viewPart(int[] offset, int[] size) {
- return new MatrixView(this, offset, size);
- }
-}
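
Two reading notes on the file above. As written, the simple-hash branch of getQuick() can only produce -1 or +1 (the & 8 mask has exactly two outcomes) and the murmur branch produces a value in [0, 1), so the {-1, 0, 1} claim in the class comment should be read loosely. Typical use is as an implicit random-projection matrix, sketched below (ProjectionDemo is an illustrative name):

import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Matrix;
import org.apache.mahout.math.RandomTrinaryMatrix;
import org.apache.mahout.math.Vector;

public class ProjectionDemo {
  public static void main(String[] args) {
    int highDim = 1000;
    int lowDim = 20;
    // Implicit 1000x20 matrix; entries are recomputed from (row, column, seed).
    Matrix omega = new RandomTrinaryMatrix(42, highDim, lowDim, false);
    Vector x = new DenseVector(highDim).assign(1.0);
    Vector projected = omega.transpose().times(x);  // image in the low dimension
    System.out.println(projected.size());           // 20
  }
}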

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/SequentialAccessSparseVector.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/SequentialAccessSparseVector.java b/math/src/main/java/org/apache/mahout/math/SequentialAccessSparseVector.java
deleted file mode 100644
index f7d67a7..0000000
--- a/math/src/main/java/org/apache/mahout/math/SequentialAccessSparseVector.java
+++ /dev/null
@@ -1,379 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math;
-
-import java.util.Arrays;
-import java.util.Iterator;
-import java.util.NoSuchElementException;
-
-import com.google.common.primitives.Doubles;
-import org.apache.mahout.math.function.Functions;
-
-/**
- * <p>
- * Implements vector that only stores non-zero doubles as a pair of parallel arrays (OrderedIntDoubleMapping),
- * one int[], one double[]. If there are <b>k</b> non-zero elements in the vector, this implementation has
- * O(log(k)) random-access read performance, and O(k) random-access write performance, which is far below that
- * of the hashmap based {@link org.apache.mahout.math.RandomAccessSparseVector RandomAccessSparseVector}. This
- * class is primarily used for operations where all the elements will be accessed in a read-only fashion
- * sequentially: methods which operate not via get() or set(), but via iterateNonZero(), such as (but not
- * limited to):</p>
- * <ul>
- * <li>dot(Vector)</li>
- * <li>addTo(Vector)</li>
- * </ul>
- *
- * See {@link OrderedIntDoubleMapping}
- */
-public class SequentialAccessSparseVector extends AbstractVector {
-
- private OrderedIntDoubleMapping values;
-
- /** For serialization purposes only. */
- public SequentialAccessSparseVector() {
- super(0);
- }
-
- public SequentialAccessSparseVector(int cardinality) {
- this(cardinality, Math.min(100, cardinality / 1000 < 10 ? 10 : cardinality / 1000)); // arbitrary estimate of
- // 'sparseness'
- }
-
- public SequentialAccessSparseVector(int cardinality, int size) {
- super(cardinality);
- values = new OrderedIntDoubleMapping(size);
- }
-
- public SequentialAccessSparseVector(Vector other) {
- this(other.size(), other.getNumNondefaultElements());
-
- if (other.isSequentialAccess()) {
- for (Element e : other.nonZeroes()) {
- set(e.index(), e.get());
- }
- } else {
- // If the incoming Vector to copy is random, then adding items
- // from the Iterator can degrade performance dramatically if
- // the number of elements is large as this Vector tries to stay
- // in order as items are added, so it's better to sort the other
- // Vector's elements by index and then add them to this
- copySortedRandomAccessSparseVector(other);
- }
- }
-
- // Sorts a RandomAccessSparseVectors Elements before adding them to this
- private int copySortedRandomAccessSparseVector(Vector other) {
- int elementCount = other.getNumNondefaultElements();
- OrderedElement[] sortableElements = new OrderedElement[elementCount];
- int s = 0;
- for (Element e : other.nonZeroes()) {
- sortableElements[s++] = new OrderedElement(e.index(), e.get());
- }
- Arrays.sort(sortableElements);
- for (int i = 0; i < sortableElements.length; i++) {
- values.setIndexAt(i, sortableElements[i].index);
- values.setValueAt(i, sortableElements[i].value);
- }
- values = new OrderedIntDoubleMapping(values.getIndices(), values.getValues(), elementCount);
- return elementCount;
- }
-
- public SequentialAccessSparseVector(SequentialAccessSparseVector other, boolean shallowCopy) {
- super(other.size());
- values = shallowCopy ? other.values : other.values.clone();
- }
-
- public SequentialAccessSparseVector(SequentialAccessSparseVector other) {
- this(other.size(), other.getNumNondefaultElements());
- values = other.values.clone();
- }
-
- private SequentialAccessSparseVector(int cardinality, OrderedIntDoubleMapping values) {
- super(cardinality);
- this.values = values;
- }
-
- @Override
- protected Matrix matrixLike(int rows, int columns) {
- //return new SparseRowMatrix(rows, columns);
- return new SparseMatrix(rows, columns);
- }
-
- @SuppressWarnings("CloneDoesntCallSuperClone")
- @Override
- public SequentialAccessSparseVector clone() {
- return new SequentialAccessSparseVector(size(), values.clone());
- }
-
- @Override
- public void mergeUpdates(OrderedIntDoubleMapping updates) {
- values.merge(updates);
- }
-
- @Override
- public String toString() {
- return sparseVectorToString();
- }
-
- /**
- * @return false
- */
- @Override
- public boolean isDense() {
- return false;
- }
-
- /**
- * @return true
- */
- @Override
- public boolean isSequentialAccess() {
- return true;
- }
-
- /**
- * Warning! This takes O(log n) time as it does a binary search behind the scenes!
- * Only use it when STRICTLY necessary.
- * @param index an int index.
- * @return the value at that position in the vector.
- */
- @Override
- public double getQuick(int index) {
- return values.get(index);
- }
-
- /**
- * Warning! This takes O(log n) time as it does a binary search behind the scenes!
- * Only use it when STRICTLY necessary.
- * @param index an int index.
- */
- @Override
- public void setQuick(int index, double value) {
- invalidateCachedLength();
- values.set(index, value);
- }
-
- @Override
- public void incrementQuick(int index, double increment) {
- invalidateCachedLength();
- values.increment(index, increment);
- }
-
- @Override
- public SequentialAccessSparseVector like() {
- return new SequentialAccessSparseVector(size(), values.getNumMappings());
- }
-
- @Override
- public Vector like(int cardinality) {
- return new SequentialAccessSparseVector(cardinality);
- }
-
- @Override
- public int getNumNondefaultElements() {
- return values.getNumMappings();
- }
-
- @Override
- public int getNumNonZeroElements() {
- double[] elementValues = values.getValues();
- int numMappedElements = values.getNumMappings();
- int numNonZeros = 0;
- for (int index = 0; index < numMappedElements; index++) {
- if (elementValues[index] != 0) {
- numNonZeros++;
- }
- }
- return numNonZeros;
- }
-
- @Override
- public double getLookupCost() {
- return Math.max(1, Math.round(Functions.LOG2.apply(getNumNondefaultElements())));
- }
-
- @Override
- public double getIteratorAdvanceCost() {
- return 1;
- }
-
- @Override
- public boolean isAddConstantTime() {
- return false;
- }
-
- @Override
- public Iterator<Element> iterateNonZero() {
-
- // TODO: this is a bug, since NonDefaultIterator doesn't honor the non-zero contract.
- return new NonDefaultIterator();
- }
-
- @Override
- public Iterator<Element> iterator() {
- return new AllIterator();
- }
-
- private final class NonDefaultIterator implements Iterator<Element> {
- private final NonDefaultElement element = new NonDefaultElement();
-
- @Override
- public boolean hasNext() {
- return element.getNextOffset() < values.getNumMappings();
- }
-
- @Override
- public Element next() {
- if (!hasNext()) {
- throw new NoSuchElementException();
- }
- element.advanceOffset();
- return element;
- }
-
- @Override
- public void remove() {
- throw new UnsupportedOperationException();
- }
- }
-
- private final class AllIterator implements Iterator<Element> {
- private final AllElement element = new AllElement();
-
- @Override
- public boolean hasNext() {
- return element.getNextIndex() < SequentialAccessSparseVector.this.size();
- }
-
- @Override
- public Element next() {
- if (!hasNext()) {
- throw new NoSuchElementException();
- }
-
- element.advanceIndex();
- return element;
- }
-
- @Override
- public void remove() {
- throw new UnsupportedOperationException();
- }
- }
-
- private final class NonDefaultElement implements Element {
- private int offset = -1;
-
- void advanceOffset() {
- offset++;
- }
-
- int getNextOffset() {
- return offset + 1;
- }
-
- @Override
- public double get() {
- return values.getValues()[offset];
- }
-
- @Override
- public int index() {
- return values.getIndices()[offset];
- }
-
- @Override
- public void set(double value) {
- invalidateCachedLength();
- values.setValueAt(offset, value);
- }
- }
-
- private final class AllElement implements Element {
- private int index = -1;
- private int nextOffset;
-
- void advanceIndex() {
- index++;
- if (nextOffset < values.getNumMappings() && index > values.getIndices()[nextOffset]) {
- nextOffset++;
- }
- }
-
- int getNextIndex() {
- return index + 1;
- }
-
- @Override
- public double get() {
- if (nextOffset < values.getNumMappings() && index == values.getIndices()[nextOffset]) {
- return values.getValues()[nextOffset];
- } else {
- return OrderedIntDoubleMapping.DEFAULT_VALUE;
- }
- }
-
- @Override
- public int index() {
- return index;
- }
-
- @Override
- public void set(double value) {
- invalidateCachedLength();
- if (nextOffset < values.getNumMappings() && index == values.indexAt(nextOffset)) {
- values.setValueAt(nextOffset, value);
- } else {
- // Yes, this works; the offset into indices of the new value's index will still be nextOffset
- values.set(index, value);
- }
- }
- }
-
- // Comparable Element for sorting Elements by index
- private static final class OrderedElement implements Comparable<OrderedElement> {
- private final int index;
- private final double value;
-
- OrderedElement(int index, double value) {
- this.index = index;
- this.value = value;
- }
-
- @Override
- public int compareTo(OrderedElement that) {
- // both indexes are positive, and neither can be Integer.MAX_VALUE (otherwise there would be
- // an array somewhere with Integer.MAX_VALUE + 1 elements)
- return this.index - that.index;
- }
-
- @Override
- public int hashCode() {
- return index ^ Doubles.hashCode(value);
- }
-
- @Override
- public boolean equals(Object o) {
- if (!(o instanceof OrderedElement)) {
- return false;
- }
- OrderedElement other = (OrderedElement) o;
- return index == other.index && value == other.value;
- }
- }
-}
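
The class comment above reduces to a simple rule: write in index order, read by iteration. A sketch of that access pattern (SeqDemo is an illustrative name):

import org.apache.mahout.math.SequentialAccessSparseVector;
import org.apache.mahout.math.Vector;

public class SeqDemo {
  public static void main(String[] args) {
    Vector v = new SequentialAccessSparseVector(1000000);
    for (int i = 0; i < 100; i++) {
      v.setQuick(i * 37, i + 1.0);  // ascending indices append in amortized O(1)
    }
    double sum = 0;
    for (Vector.Element e : v.nonZeroes()) {
      sum += e.get();               // cheap sequential advance per element
    }
    System.out.println(sum);        // 5050.0
  }
}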

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/SingularValueDecomposition.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/SingularValueDecomposition.java b/math/src/main/java/org/apache/mahout/math/SingularValueDecomposition.java
deleted file mode 100644
index 2abff10..0000000
--- a/math/src/main/java/org/apache/mahout/math/SingularValueDecomposition.java
+++ /dev/null
@@ -1,669 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- * Copyright 1999 CERN - European Organization for Nuclear Research.
- * Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
- * is hereby granted without fee, provided that the above copyright notice appear in all copies and
- * that both that copyright notice and this permission notice appear in supporting documentation.
- * CERN makes no representations about the suitability of this software for any purpose.
- * It is provided "as is" without expressed or implied warranty.
- */
-package org.apache.mahout.math;
-
-public class SingularValueDecomposition implements java.io.Serializable {
-
- /** Arrays for internal storage of U and V. */
- private final double[][] u;
- private final double[][] v;
-
- /** Array for internal storage of singular values. */
- private final double[] s;
-
- /** Row and column dimensions. */
- private final int m;
- private final int n;
-
- /** Handles the case where numRows() < numCols(), using the fact that SVD(A') = V*S*U' implies SVD(A')' = SVD(A). */
- private boolean transpositionNeeded;
-
- /**
- * Constructs and returns a new singular value decomposition object; The
- * decomposed matrices can be retrieved via instance methods of the returned
- * decomposition object.
- *
- * @param arg
- * A rectangular matrix.
- */
- public SingularValueDecomposition(Matrix arg) {
- if (arg.numRows() < arg.numCols()) {
- transpositionNeeded = true;
- }
-
- // Derived from LINPACK code.
- // Initialize.
- double[][] a;
- if (transpositionNeeded) {
- //use the transpose Matrix
- m = arg.numCols();
- n = arg.numRows();
- a = new double[m][n];
- for (int i = 0; i < m; i++) {
- for (int j = 0; j < n; j++) {
- a[i][j] = arg.get(j, i);
- }
- }
- } else {
- m = arg.numRows();
- n = arg.numCols();
- a = new double[m][n];
- for (int i = 0; i < m; i++) {
- for (int j = 0; j < n; j++) {
- a[i][j] = arg.get(i, j);
- }
- }
- }
-
-
- int nu = Math.min(m, n);
- s = new double[Math.min(m + 1, n)];
- u = new double[m][nu];
- v = new double[n][n];
- double[] e = new double[n];
- double[] work = new double[m];
- boolean wantu = true;
- boolean wantv = true;
-
- // Reduce A to bidiagonal form, storing the diagonal elements
- // in s and the super-diagonal elements in e.
-
- int nct = Math.min(m - 1, n);
- int nrt = Math.max(0, Math.min(n - 2, m));
- for (int k = 0; k < Math.max(nct, nrt); k++) {
- if (k < nct) {
-
- // Compute the transformation for the k-th column and
- // place the k-th diagonal in s[k].
- // Compute 2-norm of k-th column without under/overflow.
- s[k] = 0;
- for (int i = k; i < m; i++) {
- s[k] = Algebra.hypot(s[k], a[i][k]);
- }
- if (s[k] != 0.0) {
- if (a[k][k] < 0.0) {
- s[k] = -s[k];
- }
- for (int i = k; i < m; i++) {
- a[i][k] /= s[k];
- }
- a[k][k] += 1.0;
- }
- s[k] = -s[k];
- }
- for (int j = k + 1; j < n; j++) {
- if (k < nct && s[k] != 0.0) {
-
- // Apply the transformation.
-
- double t = 0;
- for (int i = k; i < m; i++) {
- t += a[i][k] * a[i][j];
- }
- t = -t / a[k][k];
- for (int i = k; i < m; i++) {
- a[i][j] += t * a[i][k];
- }
- }
-
- // Place the k-th row of A into e for the
- // subsequent calculation of the row transformation.
-
- e[j] = a[k][j];
- }
- if (wantu && k < nct) {
-
- // Place the transformation in U for subsequent back
- // multiplication.
-
- for (int i = k; i < m; i++) {
- u[i][k] = a[i][k];
- }
- }
- if (k < nrt) {
-
- // Compute the k-th row transformation and place the
- // k-th super-diagonal in e[k].
- // Compute 2-norm without under/overflow.
- e[k] = 0;
- for (int i = k + 1; i < n; i++) {
- e[k] = Algebra.hypot(e[k], e[i]);
- }
- if (e[k] != 0.0) {
- if (e[k + 1] < 0.0) {
- e[k] = -e[k];
- }
- for (int i = k + 1; i < n; i++) {
- e[i] /= e[k];
- }
- e[k + 1] += 1.0;
- }
- e[k] = -e[k];
- if (k + 1 < m && e[k] != 0.0) {
-
- // Apply the transformation.
-
- for (int i = k + 1; i < m; i++) {
- work[i] = 0.0;
- }
- for (int j = k + 1; j < n; j++) {
- for (int i = k + 1; i < m; i++) {
- work[i] += e[j] * a[i][j];
- }
- }
- for (int j = k + 1; j < n; j++) {
- double t = -e[j] / e[k + 1];
- for (int i = k + 1; i < m; i++) {
- a[i][j] += t * work[i];
- }
- }
- }
- if (wantv) {
-
- // Place the transformation in V for subsequent
- // back multiplication.
-
- for (int i = k + 1; i < n; i++) {
- v[i][k] = e[i];
- }
- }
- }
- }
-
- // Set up the final bidiagonal matrix or order p.
-
- int p = Math.min(n, m + 1);
- if (nct < n) {
- s[nct] = a[nct][nct];
- }
- if (m < p) {
- s[p - 1] = 0.0;
- }
- if (nrt + 1 < p) {
- e[nrt] = a[nrt][p - 1];
- }
- e[p - 1] = 0.0;
-
- // If required, generate U.
-
- if (wantu) {
- for (int j = nct; j < nu; j++) {
- for (int i = 0; i < m; i++) {
- u[i][j] = 0.0;
- }
- u[j][j] = 1.0;
- }
- for (int k = nct - 1; k >= 0; k--) {
- if (s[k] != 0.0) {
- for (int j = k + 1; j < nu; j++) {
- double t = 0;
- for (int i = k; i < m; i++) {
- t += u[i][k] * u[i][j];
- }
- t = -t / u[k][k];
- for (int i = k; i < m; i++) {
- u[i][j] += t * u[i][k];
- }
- }
- for (int i = k; i < m; i++) {
- u[i][k] = -u[i][k];
- }
- u[k][k] = 1.0 + u[k][k];
- for (int i = 0; i < k - 1; i++) {
- u[i][k] = 0.0;
- }
- } else {
- for (int i = 0; i < m; i++) {
- u[i][k] = 0.0;
- }
- u[k][k] = 1.0;
- }
- }
- }
-
- // If required, generate V.
-
- if (wantv) {
- for (int k = n - 1; k >= 0; k--) {
- if (k < nrt && e[k] != 0.0) {
- for (int j = k + 1; j < nu; j++) {
- double t = 0;
- for (int i = k + 1; i < n; i++) {
- t += v[i][k] * v[i][j];
- }
- t = -t / v[k + 1][k];
- for (int i = k + 1; i < n; i++) {
- v[i][j] += t * v[i][k];
- }
- }
- }
- for (int i = 0; i < n; i++) {
- v[i][k] = 0.0;
- }
- v[k][k] = 1.0;
- }
- }
-
- // Main iteration loop for the singular values.
-
- int pp = p - 1;
- int iter = 0;
- double eps = Math.pow(2.0, -52.0);
- double tiny = Math.pow(2.0,-966.0);
- while (p > 0) {
- int k;
-
- // Here is where a test for too many iterations would go.
-
- // This section of the program inspects for
- // negligible elements in the s and e arrays. On
- // completion the variables kase and k are set as follows.
-
- // kase = 1 if s(p) and e[k-1] are negligible and k<p
- // kase = 2 if s(k) is negligible and k<p
- // kase = 3 if e[k-1] is negligible, k<p, and
- // s(k), ..., s(p) are not negligible (qr step).
- // kase = 4 if e(p-1) is negligible (convergence).
-
- for (k = p - 2; k >= -1; k--) {
- if (k == -1) {
- break;
- }
- if (Math.abs(e[k]) <= tiny +eps * (Math.abs(s[k]) + Math.abs(s[k + 1]))) {
- e[k] = 0.0;
- break;
- }
- }
- int kase;
- if (k == p - 2) {
- kase = 4;
- } else {
- int ks;
- for (ks = p - 1; ks >= k; ks--) {
- if (ks == k) {
- break;
- }
- double t =
- (ks != p ? Math.abs(e[ks]) : 0.) +
- (ks != k + 1 ? Math.abs(e[ks-1]) : 0.);
- if (Math.abs(s[ks]) <= tiny + eps * t) {
- s[ks] = 0.0;
- break;
- }
- }
- if (ks == k) {
- kase = 3;
- } else if (ks == p - 1) {
- kase = 1;
- } else {
- kase = 2;
- k = ks;
- }
- }
- k++;
-
- // Perform the task indicated by kase.
-
- switch (kase) {
-
- // Deflate negligible s(p).
-
- case 1: {
- double f = e[p - 2];
- e[p - 2] = 0.0;
- for (int j = p - 2; j >= k; j--) {
- double t = Algebra.hypot(s[j], f);
- double cs = s[j] / t;
- double sn = f / t;
- s[j] = t;
- if (j != k) {
- f = -sn * e[j - 1];
- e[j - 1] = cs * e[j - 1];
- }
- if (wantv) {
- for (int i = 0; i < n; i++) {
- t = cs * v[i][j] + sn * v[i][p - 1];
- v[i][p - 1] = -sn * v[i][j] + cs * v[i][p - 1];
- v[i][j] = t;
- }
- }
- }
- }
- break;
-
- // Split at negligible s(k).
-
- case 2: {
- double f = e[k - 1];
- e[k - 1] = 0.0;
- for (int j = k; j < p; j++) {
- double t = Algebra.hypot(s[j], f);
- double cs = s[j] / t;
- double sn = f / t;
- s[j] = t;
- f = -sn * e[j];
- e[j] = cs * e[j];
- if (wantu) {
- for (int i = 0; i < m; i++) {
- t = cs * u[i][j] + sn * u[i][k - 1];
- u[i][k - 1] = -sn * u[i][j] + cs * u[i][k - 1];
- u[i][j] = t;
- }
- }
- }
- }
- break;
-
- // Perform one qr step.
-
- case 3: {
-
- // Calculate the shift.
-
- double scale = Math.max(Math.max(Math.max(Math.max(
- Math.abs(s[p - 1]), Math.abs(s[p - 2])), Math.abs(e[p - 2])),
- Math.abs(s[k])), Math.abs(e[k]));
- double sp = s[p - 1] / scale;
- double spm1 = s[p - 2] / scale;
- double epm1 = e[p - 2] / scale;
- double sk = s[k] / scale;
- double ek = e[k] / scale;
- double b = ((spm1 + sp) * (spm1 - sp) + epm1 * epm1) / 2.0;
- double c = sp * epm1 * sp * epm1;
- double shift = 0.0;
- if (b != 0.0 || c != 0.0) {
- shift = Math.sqrt(b * b + c);
- if (b < 0.0) {
- shift = -shift;
- }
- shift = c / (b + shift);
- }
- double f = (sk + sp) * (sk - sp) + shift;
- double g = sk * ek;
-
- // Chase zeros.
-
- for (int j = k; j < p - 1; j++) {
- double t = Algebra.hypot(f, g);
- double cs = f / t;
- double sn = g / t;
- if (j != k) {
- e[j - 1] = t;
- }
- f = cs * s[j] + sn * e[j];
- e[j] = cs * e[j] - sn * s[j];
- g = sn * s[j + 1];
- s[j + 1] = cs * s[j + 1];
- if (wantv) {
- for (int i = 0; i < n; i++) {
- t = cs * v[i][j] + sn * v[i][j + 1];
- v[i][j + 1] = -sn * v[i][j] + cs * v[i][j + 1];
- v[i][j] = t;
- }
- }
- t = Algebra.hypot(f, g);
- cs = f / t;
- sn = g / t;
- s[j] = t;
- f = cs * e[j] + sn * s[j + 1];
- s[j + 1] = -sn * e[j] + cs * s[j + 1];
- g = sn * e[j + 1];
- e[j + 1] = cs * e[j + 1];
- if (wantu && j < m - 1) {
- for (int i = 0; i < m; i++) {
- t = cs * u[i][j] + sn * u[i][j + 1];
- u[i][j + 1] = -sn * u[i][j] + cs * u[i][j + 1];
- u[i][j] = t;
- }
- }
- }
- e[p - 2] = f;
- iter = iter + 1;
- }
- break;
-
- // Convergence.
-
- case 4: {
-
- // Make the singular values positive.
-
- if (s[k] <= 0.0) {
- s[k] = s[k] < 0.0 ? -s[k] : 0.0;
- if (wantv) {
- for (int i = 0; i <= pp; i++) {
- v[i][k] = -v[i][k];
- }
- }
- }
-
- // Order the singular values.
-
- while (k < pp) {
- if (s[k] >= s[k + 1]) {
- break;
- }
- double t = s[k];
- s[k] = s[k + 1];
- s[k + 1] = t;
- if (wantv && k < n - 1) {
- for (int i = 0; i < n; i++) {
- t = v[i][k + 1];
- v[i][k + 1] = v[i][k];
- v[i][k] = t;
- }
- }
- if (wantu && k < m - 1) {
- for (int i = 0; i < m; i++) {
- t = u[i][k + 1];
- u[i][k + 1] = u[i][k];
- u[i][k] = t;
- }
- }
- k++;
- }
- iter = 0;
- p--;
- }
- break;
- default:
- throw new IllegalStateException();
- }
- }
- }
-
- /**
- * Returns the two norm condition number, which is <tt>max(S) / min(S)</tt>.
- */
- public double cond() {
- return s[0] / s[Math.min(m, n) - 1];
- }
-
- /**
- * @return the diagonal matrix of singular values.
- */
- public Matrix getS() {
- double[][] s = new double[n][n];
- for (int i = 0; i < n; i++) {
- for (int j = 0; j < n; j++) {
- s[i][j] = 0.0;
- }
- s[i][i] = this.s[i];
- }
-
- return new DenseMatrix(s);
- }
-
- /**
- * Returns the diagonal of <tt>S</tt>, which is a one-dimensional array of
- * singular values
- *
- * @return diagonal of <tt>S</tt>.
- */
- public double[] getSingularValues() {
- return s;
- }
-
- /**
- * Returns the left singular vectors <tt>U</tt>.
- *
- * @return <tt>U</tt>
- */
- public Matrix getU() {
- if (transpositionNeeded) { //case numRows() < numCols()
- return new DenseMatrix(v);
- } else {
- int numCols = Math.min(m + 1, n);
- Matrix r = new DenseMatrix(m, numCols);
- for (int i = 0; i < m; i++) {
- for (int j = 0; j < numCols; j++) {
- r.set(i, j, u[i][j]);
- }
- }
-
- return r;
- }
- }
-
- /**
- * Returns the right singular vectors <tt>V</tt>.
- *
- * @return <tt>V</tt>
- */
- public Matrix getV() {
- if (transpositionNeeded) { //case numRows() < numCols()
- int numCols = Math.min(m + 1, n);
- Matrix r = new DenseMatrix(m, numCols);
- for (int i = 0; i < m; i++) {
- for (int j = 0; j < numCols; j++) {
- r.set(i, j, u[i][j]);
- }
- }
-
- return r;
- } else {
- return new DenseMatrix(v);
- }
- }
-
- /** Returns the two norm, which is <tt>max(S)</tt>. */
- public double norm2() {
- return s[0];
- }
-
- /**
- * Returns the effective numerical matrix rank, which is the number of
- * nonnegligible singular values.
- */
- public int rank() {
- double eps = Math.pow(2.0, -52.0);
- double tol = Math.max(m, n) * s[0] * eps;
- int r = 0;
- for (double value : s) {
- if (value > tol) {
- r++;
- }
- }
- return r;
- }
-
- /**
- * @param minSingularValue
- * value below which singular values are ignored (a zero or negative
- * value implies all singular values will be used)
- * @return the n × n covariance matrix.
- * The covariance matrix is V × J × Vt where J is the diagonal matrix of the inverse
- * of the squares of the singular values.
- */
- Matrix getCovariance(double minSingularValue) {
- Matrix j = new DenseMatrix(s.length,s.length);
- Matrix vMat = new DenseMatrix(this.v);
- for (int i = 0; i < s.length; i++) {
- j.set(i, i, s[i] >= minSingularValue ? 1 / (s[i] * s[i]) : 0.0);
- }
- return vMat.times(j).times(vMat.transpose());
- }
-
- /**
- * Returns a String with (propertyName, propertyValue) pairs. Useful for
- * debugging or to quickly get the rough picture. For example,
- *
- * <pre>
- * rank : 3
- * trace : 0
- * </pre>
- */
- @Override
- public String toString() {
- StringBuilder buf = new StringBuilder();
- buf.append("---------------------------------------------------------------------\n");
- buf.append("SingularValueDecomposition(A) --> cond(A), rank(A), norm2(A), U, S, V\n");
- buf.append("---------------------------------------------------------------------\n");
-
- buf.append("cond = ");
- String unknown = "Illegal operation or error: ";
- try {
- buf.append(String.valueOf(this.cond()));
- } catch (IllegalArgumentException exc) {
- buf.append(unknown).append(exc.getMessage());
- }
-
- buf.append("\nrank = ");
- try {
- buf.append(String.valueOf(this.rank()));
- } catch (IllegalArgumentException exc) {
- buf.append(unknown).append(exc.getMessage());
- }
-
- buf.append("\nnorm2 = ");
- try {
- buf.append(String.valueOf(this.norm2()));
- } catch (IllegalArgumentException exc) {
- buf.append(unknown).append(exc.getMessage());
- }
-
- buf.append("\n\nU = ");
- try {
- buf.append(String.valueOf(this.getU()));
- } catch (IllegalArgumentException exc) {
- buf.append(unknown).append(exc.getMessage());
- }
-
- buf.append("\n\nS = ");
- try {
- buf.append(String.valueOf(this.getS()));
- } catch (IllegalArgumentException exc) {
- buf.append(unknown).append(exc.getMessage());
- }
-
- buf.append("\n\nV = ");
- try {
- buf.append(String.valueOf(this.getV()));
- } catch (IllegalArgumentException exc) {
- buf.append(unknown).append(exc.getMessage());
- }
-
- return buf.toString();
- }
-}
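
A small usage sketch for the decomposition removed above, covering the accessors that matter most in practice; the expected output follows from A'A having eigenvalues 40 and 10 (SvdDemo is an illustrative name):

import org.apache.mahout.math.DenseMatrix;
import org.apache.mahout.math.Matrix;
import org.apache.mahout.math.SingularValueDecomposition;

public class SvdDemo {
  public static void main(String[] args) {
    Matrix a = new DenseMatrix(new double[][] {{4, 0}, {3, -5}, {0, 0}});
    SingularValueDecomposition svd = new SingularValueDecomposition(a);
    double[] s = svd.getSingularValues();                 // descending order
    System.out.printf("s = [%.4f, %.4f]%n", s[0], s[1]);  // sqrt(40), sqrt(10)
    System.out.println("rank = " + svd.rank());           // 2
    System.out.println("cond = " + svd.cond());           // s[0]/s[1], about 2.0
  }
}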
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/Centroid.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/Centroid.java b/math/src/main/java/org/apache/mahout/math/Centroid.java
deleted file mode 100644
index dceffe1..0000000
--- a/math/src/main/java/org/apache/mahout/math/Centroid.java
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math;
-
-import org.apache.mahout.math.function.Functions;
-
-/**
- * A centroid is a weighted vector. We have it delegate to the vector itself for lots of operations
- * to make it easy to use vector search classes and such.
- */
-public class Centroid extends WeightedVector {
- public Centroid(WeightedVector original) {
- super(original.getVector().like().assign(original), original.getWeight(), original.getIndex());
- }
-
- public Centroid(int key, Vector initialValue) {
- super(initialValue, 1, key);
- }
-
- public Centroid(int key, Vector initialValue, double weight) {
- super(initialValue, weight, key);
- }
-
- public static Centroid create(int key, Vector initialValue) {
- if (initialValue instanceof WeightedVector) {
- return new Centroid(key, new DenseVector(initialValue), ((WeightedVector) initialValue).getWeight());
- } else {
- return new Centroid(key, new DenseVector(initialValue), 1);
- }
- }
-
- public void update(Vector v) {
- if (v instanceof Centroid) {
- Centroid c = (Centroid) v;
- update(c.delegate, c.getWeight());
- } else {
- update(v, 1);
- }
- }
-
- public void update(Vector other, final double wy) {
- final double wx = getWeight();
- delegate.assign(other, Functions.reweigh(wx, wy));
- setWeight(wx + wy);
- }
-
- @Override
- public Centroid like() {
- return new Centroid(getIndex(), getVector().like(), getWeight());
- }
-
- /**
- * Gets the index (key) of this centroid. Deprecated; use getIndex() instead to keep naming consistent.
- */
- @Deprecated
- public int getKey() {
- return getIndex();
- }
-
- public void addWeight(double newWeight) {
- setWeight(getWeight() + newWeight);
- }
-
- @Override
- public String toString() {
- return String.format("key = %d, weight = %.2f, vector = %s", getIndex(), getWeight(), delegate);
- }
-
- @SuppressWarnings("CloneDoesntCallSuperClone")
- @Override
- public Centroid clone() {
- return new Centroid(this);
- }
-}
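
For context, a minimal, hypothetical sketch of how the Centroid class removed above was used; it assumes Functions.reweigh computes a weighted average, which the weight bookkeeping in update() suggests:

    import org.apache.mahout.math.Centroid;
    import org.apache.mahout.math.DenseVector;

    public class CentroidExample {
      public static void main(String[] args) {
        // a unit-weight point at (1, 2) ...
        Centroid c = new Centroid(0, new DenseVector(new double[]{1, 2}), 1);
        // ... merged with a weight-3 point at (3, 4)
        c.update(new Centroid(1, new DenseVector(new double[]{3, 4}), 3));
        // prints key = 0, weight = 4.00 and the weighted mean (2.5, 3.5)
        System.out.println(c);
      }
    }
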
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/CholeskyDecomposition.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/CholeskyDecomposition.java b/math/src/main/java/org/apache/mahout/math/CholeskyDecomposition.java
deleted file mode 100644
index 5cea8e5..0000000
--- a/math/src/main/java/org/apache/mahout/math/CholeskyDecomposition.java
+++ /dev/null
@@ -1,227 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math;
-
-import com.google.common.base.Preconditions;
-import org.apache.mahout.math.function.Functions;
-
-/**
- * Cholesky decomposition shamelessly ported from JAMA.
- * <p>
- * A Cholesky decomposition of a positive semi-definite matrix A is a lower triangular matrix L such
- * that L L^* = A. If A is full rank, L is unique. If A is real, then it must be symmetric and L
- * will also be real.
- */
-public class CholeskyDecomposition {
- private final PivotedMatrix L;
- private boolean isPositiveDefinite = true;
-
- public CholeskyDecomposition(Matrix a) {
- this(a, true);
- }
-
- public CholeskyDecomposition(Matrix a, boolean pivot) {
- int rows = a.rowSize();
- L = new PivotedMatrix(new DenseMatrix(rows, rows));
-
- // must be square
- Preconditions.checkArgument(rows == a.columnSize(), "Must be a Square Matrix");
-
- if (pivot) {
- decomposeWithPivoting(a);
- } else {
- decompose(a);
- }
- }
-
- private void decomposeWithPivoting(Matrix a) {
- int n = a.rowSize();
- L.assign(a);
-
- // pivoted column-wise submatrix cholesky with simple pivoting
- double uberMax = L.viewDiagonal().aggregate(Functions.MAX, Functions.ABS);
- for (int k = 0; k < n; k++) {
- double max = 0;
- int pivot = k;
- for (int j = k; j < n; j++) {
- if (L.get(j, j) > max) {
- max = L.get(j, j);
- pivot = j;
- if (uberMax < Math.abs(max)) {
- uberMax = Math.abs(max);
- }
- }
- }
- L.swap(k, pivot);
-
- double akk = L.get(k, k);
- double epsilon = 1.0e-10 * Math.max(uberMax, L.viewColumn(k).aggregate(Functions.MAX, Functions.ABS));
-
- if (akk < -epsilon) {
- // can't have decidedly negative element on diagonal
- throw new IllegalArgumentException("Matrix is not positive semi-definite");
- } else if (akk <= epsilon) {
- // degenerate column case. Set all to zero
- L.viewColumn(k).assign(0);
- isPositiveDefinite = false;
-
- // no need to subtract from remaining sub-matrix
- } else {
- // normalize column by diagonal element
- akk = Math.sqrt(Math.max(0, akk));
- L.viewColumn(k).viewPart(k, n - k).assign(Functions.div(akk));
- L.viewColumn(k).viewPart(0, k).assign(0);
-
- // subtract off scaled version of this column to the right
- for (int j = k + 1; j < n; j++) {
- Vector columnJ = L.viewColumn(j).viewPart(k, n - k);
- Vector columnK = L.viewColumn(k).viewPart(k, n - k);
- columnJ.assign(columnK, Functions.minusMult(columnK.get(j - k)));
- }
-
- }
- }
- }
-
- private void decompose(Matrix a) {
- int n = a.rowSize();
- L.assign(a);
-
- // column-wise submatrix cholesky without pivoting
- for (int k = 0; k < n; k++) {
-
- double akk = L.get(k, k);
-
- // set upper part of column to 0.
- L.viewColumn(k).viewPart(0, k).assign(0);
-
- double epsilon = 1.0e-10 * L.viewColumn(k).aggregate(Functions.MAX, Functions.ABS);
- if (akk <= epsilon) {
- // degenerate column case. Set the entire column to zero
- L.viewColumn(k).viewPart(k, n - k).assign(0);
-
- isPositiveDefinite = false;
-
- // no need to subtract from remaining sub-matrix
- } else {
- // normalize column by diagonal element
- akk = Math.sqrt(Math.max(0, akk));
- L.set(k, k, akk);
- L.viewColumn(k).viewPart(k + 1, n - k - 1).assign(Functions.div(akk));
-
- // now subtract scaled version of column
- for (int j = k + 1; j < n; j++) {
- Vector columnJ = L.viewColumn(j).viewPart(j, n - j);
- Vector columnK = L.viewColumn(k).viewPart(j, n - j);
- columnJ.assign(columnK, Functions.minusMult(L.get(j, k)));
- }
- }
- }
- }
-
- public boolean isPositiveDefinite() {
- return isPositiveDefinite;
- }
-
- public Matrix getL() {
- return L.getBase();
- }
-
- public PivotedMatrix getPermutedL() {
- return L;
- }
-
- /**
- * @return the permutation of rows and columns that was applied to L
- */
- public int[] getPivot() {
- return L.getRowPivot();
- }
-
- public int[] getInversePivot() {
- return L.getInverseRowPivot();
- }
-
- /**
- * Compute inv(L) * z efficiently.
- *
- * @param z the right-hand side matrix Z in L * X = Z
- */
- public Matrix solveLeft(Matrix z) {
- int n = L.columnSize();
- int nx = z.columnSize();
-
- Matrix X = new DenseMatrix(n, z.columnSize());
- X.assign(z);
-
- // Solve L*Y = Z using forward-substitution (L is lower triangular)
- // note that k and i have to go in a funny order because L is pivoted
- for (int internalK = 0; internalK < n; internalK++) {
- int k = L.rowUnpivot(internalK);
- for (int j = 0; j < nx; j++) {
- for (int internalI = 0; internalI < internalK; internalI++) {
- int i = L.rowUnpivot(internalI);
- X.set(k, j, X.get(k, j) - X.get(i, j) * L.get(k, i));
- }
- if (L.get(k, k) != 0) {
- X.set(k, j, X.get(k, j) / L.get(k, k));
- } else {
- X.set(k, j, 0);
- }
- }
- }
- return X;
- }
-
- /**
- * Compute z * inv(L') efficiently
- */
- public Matrix solveRight(Matrix z) {
- int n = z.columnSize();
- int nx = z.rowSize();
-
- Matrix x = new DenseMatrix(z.rowSize(), z.columnSize());
- x.assign(z);
-
- // Solve Y*L' = Z using forward-substitution on the transposed system
- for (int internalK = 0; internalK < n; internalK++) {
- int k = L.rowUnpivot(internalK);
- for (int j = 0; j < nx; j++) {
- for (int internalI = 0; internalI < k; internalI++) {
- int i = L.rowUnpivot(internalI);
- x.set(j, k, x.get(j, k) - x.get(j, i) * L.get(k, i));
- if (Double.isInfinite(x.get(j, k)) || Double.isNaN(x.get(j, k))) {
- throw new IllegalStateException(
- String.format("Invalid value found at %d,%d (should not be possible)", j, k));
- }
- }
- if (L.get(k, k) != 0) {
- x.set(j, k, x.get(j, k) / L.get(k, k));
- } else {
- x.set(j, k, 0);
- }
- if (Double.isInfinite(x.get(j, k)) || Double.isNaN(x.get(j, k))) {
- throw new IllegalStateException(String.format("Invalid value found at %d,%d (should not be possible)", j, k));
- }
- }
- }
- return x;
- }
-
-}
-
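
For context, a minimal, hypothetical sketch of the CholeskyDecomposition class removed above (values are illustrative):

    import org.apache.mahout.math.CholeskyDecomposition;
    import org.apache.mahout.math.DenseMatrix;
    import org.apache.mahout.math.Matrix;

    public class CholeskyExample {
      public static void main(String[] args) {
        Matrix a = new DenseMatrix(new double[][]{{4, 2}, {2, 3}});
        CholeskyDecomposition chol = new CholeskyDecomposition(a, false); // no pivoting
        Matrix l = chol.getL();
        System.out.println(chol.isPositiveDefinite()); // true
        // for a positive definite input, L * L' reproduces A up to rounding
        System.out.println(l.times(l.transpose()));
      }
    }
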
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/ConstantVector.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/ConstantVector.java b/math/src/main/java/org/apache/mahout/math/ConstantVector.java
deleted file mode 100644
index f10f631..0000000
--- a/math/src/main/java/org/apache/mahout/math/ConstantVector.java
+++ /dev/null
@@ -1,177 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math;
-
-import java.util.Iterator;
-
-import com.google.common.collect.AbstractIterator;
-
-/**
- * Implements a vector with all the same values.
- */
-public class ConstantVector extends AbstractVector {
- private final double value;
-
- public ConstantVector(double value, int size) {
- super(size);
- this.value = value;
- }
-
- /**
- * Subclasses must override to return an appropriately sparse or dense result
- *
- * @param rows the row cardinality
- * @param columns the column cardinality
- * @return a Matrix
- */
- @Override
- protected Matrix matrixLike(int rows, int columns) {
- return new DenseMatrix(rows, columns);
- }
-
- /**
- * Used internally by assign() to update multiple indices and values at once.
- * Only really useful for sparse vectors (especially SequentialAccessSparseVector).
- * <p>
- * If someone ever adds a new type of sparse vector, this method must merge (index, value) pairs into the vector.
- *
- * @param updates a mapping of indices to values to merge in the vector.
- */
- @Override
- public void mergeUpdates(OrderedIntDoubleMapping updates) {
- throw new UnsupportedOperationException("Cannot mutate a ConstantVector");
- }
-
- /**
- * @return true iff this implementation should be considered dense -- that it explicitly represents
- * every value
- */
- @Override
- public boolean isDense() {
- return true;
- }
-
- /**
- * @return true iff this implementation should be considered to be iterable in index order in an
- * efficient way. In particular this implies that {@link #iterator()} and {@link
- * #iterateNonZero()} return elements in ascending order by index.
- */
- @Override
- public boolean isSequentialAccess() {
- return true;
- }
-
- /**
- * Iterates over all elements <p>
- * NOTE: Implementations may choose to reuse the Element returned
- * for performance reasons, so if you need a copy of it, you should call {@link #getElement(int)}
- * for the given index
- *
- * @return An {@link java.util.Iterator} over all elements
- */
- @Override
- public Iterator<Element> iterator() {
- return new AbstractIterator<Element>() {
- private int i = 0;
- private final int n = size();
- @Override
- protected Element computeNext() {
- if (i < n) {
- return new LocalElement(i++);
- } else {
- return endOfData();
- }
- }
- };
- }
-
- /**
- * Iterates over all non-zero elements.<p>
- * NOTE: Implementations may choose to reuse the Element
- * returned for performance reasons, so if you need a copy of it, you should call {@link
- * #getElement(int)} for the given index
- *
- * @return An {@link java.util.Iterator} over all non-zero elements
- */
- @Override
- public Iterator<Element> iterateNonZero() {
- return iterator();
- }
-
- /**
- * Return the value at the given index, without checking bounds
- *
- * @param index an int index
- * @return the double at the index
- */
- @Override
- public double getQuick(int index) {
- return value;
- }
-
- /**
- * Return an empty vector of the same underlying class as the receiver
- *
- * @return a Vector
- */
- @Override
- public Vector like() {
- return new DenseVector(size());
- }
-
- @Override
- public Vector like(int cardinality) {
- return new DenseVector(cardinality);
- }
-
- /**
- * Set the value at the given index, without checking bounds
- *
- * @param index an int index into the receiver
- * @param value a double value to set
- */
- @Override
- public void setQuick(int index, double value) {
- throw new UnsupportedOperationException("Can't set a value in a constant vector");
- }
-
- /**
- * Return the number of values in the recipient
- *
- * @return an int
- */
- @Override
- public int getNumNondefaultElements() {
- return size();
- }
-
- @Override
- public double getLookupCost() {
- return 1;
- }
-
- @Override
- public double getIteratorAdvanceCost() {
- return 1;
- }
-
- @Override
- public boolean isAddConstantTime() {
- throw new UnsupportedOperationException("Cannot mutate a ConstantVector");
- }
-}
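
For context, a small, hypothetical sketch of the ConstantVector class removed above: it represents n copies of one value in O(1) memory, which makes it handy as an all-ones vector in dot products (values below are illustrative):

    import org.apache.mahout.math.ConstantVector;
    import org.apache.mahout.math.DenseVector;
    import org.apache.mahout.math.Vector;

    public class ConstantVectorExample {
      public static void main(String[] args) {
        Vector ones = new ConstantVector(1, 4);
        Vector v = new DenseVector(new double[]{1, 2, 3, 4});
        // dotting with an all-ones vector sums the entries of v
        System.out.println(v.dot(ones)); // 10.0
      }
    }
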
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/DelegatingVector.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/DelegatingVector.java b/math/src/main/java/org/apache/mahout/math/DelegatingVector.java
deleted file mode 100644
index 0b2e36b..0000000
--- a/math/src/main/java/org/apache/mahout/math/DelegatingVector.java
+++ /dev/null
@@ -1,336 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math;
-
-import org.apache.mahout.math.function.DoubleDoubleFunction;
-import org.apache.mahout.math.function.DoubleFunction;
-
-/**
- * A delegating vector provides an easy way to decorate vectors with weights, ids and the like while
- * keeping all of the Vector functionality.
- *
- * This vector implements LengthCachingVector because almost all delegates cache the length and
- * the cost of false positives is very low.
- */
-public class DelegatingVector implements Vector, LengthCachingVector {
- protected Vector delegate;
-
- public DelegatingVector(Vector v) {
- delegate = v;
- }
-
- protected DelegatingVector() {
- }
-
- public Vector getVector() {
- return delegate;
- }
-
- @Override
- public double aggregate(DoubleDoubleFunction aggregator, DoubleFunction map) {
- return delegate.aggregate(aggregator, map);
- }
-
- @Override
- public double aggregate(Vector other, DoubleDoubleFunction aggregator, DoubleDoubleFunction combiner) {
- return delegate.aggregate(other, aggregator, combiner);
- }
-
- @Override
- public Vector viewPart(int offset, int length) {
- return delegate.viewPart(offset, length);
- }
-
- @SuppressWarnings("CloneDoesntDeclareCloneNotSupportedException")
- @Override
- public Vector clone() {
- DelegatingVector r;
- try {
- r = (DelegatingVector) super.clone();
- } catch (CloneNotSupportedException e) {
- throw new RuntimeException("Clone not supported for DelegatingVector, shouldn't be possible");
- }
- // without this, the clone's delegate would still point at the original vector
- r.delegate = delegate.clone();
- return r;
- }
-
- @Override
- public Iterable<Element> all() {
- return delegate.all();
- }
-
- @Override
- public Iterable<Element> nonZeroes() {
- return delegate.nonZeroes();
- }
-
- @Override
- public Vector divide(double x) {
- return delegate.divide(x);
- }
-
- @Override
- public double dot(Vector x) {
- return delegate.dot(x);
- }
-
- @Override
- public double get(int index) {
- return delegate.get(index);
- }
-
- @Override
- public Element getElement(int index) {
- return delegate.getElement(index);
- }
-
- /**
- * Merge a set of (index, value) pairs into the vector.
- *
- * @param updates an ordered mapping of indices to values to be merged in.
- */
- @Override
- public void mergeUpdates(OrderedIntDoubleMapping updates) {
- delegate.mergeUpdates(updates);
- }
-
- @Override
- public Vector minus(Vector that) {
- return delegate.minus(that);
- }
-
- @Override
- public Vector normalize() {
- return delegate.normalize();
- }
-
- @Override
- public Vector normalize(double power) {
- return delegate.normalize(power);
- }
-
- @Override
- public Vector logNormalize() {
- return delegate.logNormalize();
- }
-
- @Override
- public Vector logNormalize(double power) {
- return delegate.logNormalize(power);
- }
-
- @Override
- public double norm(double power) {
- return delegate.norm(power);
- }
-
- @Override
- public double getLengthSquared() {
- return delegate.getLengthSquared();
- }
-
- @Override
- public void invalidateCachedLength() {
- if (delegate instanceof LengthCachingVector) {
- ((LengthCachingVector) delegate).invalidateCachedLength();
- }
- }
-
- @Override
- public double getDistanceSquared(Vector v) {
- return delegate.getDistanceSquared(v);
- }
-
- @Override
- public double getLookupCost() {
- return delegate.getLookupCost();
- }
-
- @Override
- public double getIteratorAdvanceCost() {
- return delegate.getIteratorAdvanceCost();
- }
-
- @Override
- public boolean isAddConstantTime() {
- return delegate.isAddConstantTime();
- }
-
- @Override
- public double maxValue() {
- return delegate.maxValue();
- }
-
- @Override
- public int maxValueIndex() {
- return delegate.maxValueIndex();
- }
-
- @Override
- public double minValue() {
- return delegate.minValue();
- }
-
- @Override
- public int minValueIndex() {
- return delegate.minValueIndex();
- }
-
- @Override
- public Vector plus(double x) {
- return delegate.plus(x);
- }
-
- @Override
- public Vector plus(Vector x) {
- return delegate.plus(x);
- }
-
- @Override
- public void set(int index, double value) {
- delegate.set(index, value);
- }
-
- @Override
- public Vector times(double x) {
- return delegate.times(x);
- }
-
- @Override
- public Vector times(Vector x) {
- return delegate.times(x);
- }
-
- @Override
- public double zSum() {
- return delegate.zSum();
- }
-
- @Override
- public Vector assign(double value) {
- delegate.assign(value);
- return this;
- }
-
- @Override
- public Vector assign(double[] values) {
- delegate.assign(values);
- return this;
- }
-
- @Override
- public Vector assign(Vector other) {
- delegate.assign(other);
- return this;
- }
-
- @Override
- public Vector assign(DoubleDoubleFunction f, double y) {
- delegate.assign(f, y);
- return this;
- }
-
- @Override
- public Vector assign(DoubleFunction function) {
- delegate.assign(function);
- return this;
- }
-
- @Override
- public Vector assign(Vector other, DoubleDoubleFunction function) {
- delegate.assign(other, function);
- return this;
- }
-
- @Override
- public Matrix cross(Vector other) {
- return delegate.cross(other);
- }
-
- @Override
- public int size() {
- return delegate.size();
- }
-
- @Override
- public String asFormatString() {
- return delegate.asFormatString();
- }
-
- @Override
- public int hashCode() {
- return delegate.hashCode();
- }
-
- @SuppressWarnings("EqualsWhichDoesntCheckParameterClass")
- @Override
- public boolean equals(Object o) {
- return delegate.equals(o);
- }
-
- @Override
- public String toString() {
- return delegate.toString();
- }
-
- @Override
- public boolean isDense() {
- return delegate.isDense();
- }
-
- @Override
- public boolean isSequentialAccess() {
- return delegate.isSequentialAccess();
- }
-
- @Override
- public double getQuick(int index) {
- return delegate.getQuick(index);
- }
-
- @Override
- public Vector like() {
- return new DelegatingVector(delegate.like());
- }
-
- @Override
- public Vector like(int cardinality) {
- return new DelegatingVector(delegate.like(cardinality));
- }
-
- @Override
- public void setQuick(int index, double value) {
- delegate.setQuick(index, value);
- }
-
- @Override
- public void incrementQuick(int index, double increment) {
- delegate.incrementQuick(index, increment);
- }
-
- @Override
- public int getNumNondefaultElements() {
- return delegate.getNumNondefaultElements();
- }
-
- @Override
- public int getNumNonZeroElements() {
- return delegate.getNumNonZeroElements();
- }
-}
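
For context, the point of the DelegatingVector class removed above is subclassing; a hypothetical decorator (LabeledVector is an invented name, in the spirit of WeightedVector) might look like:

    import org.apache.mahout.math.DelegatingVector;
    import org.apache.mahout.math.Vector;

    // attaches a label to a vector while inheriting the full Vector API
    public class LabeledVector extends DelegatingVector {
      private final String label;

      public LabeledVector(String label, Vector v) {
        super(v);
        this.label = label;
      }

      public String getLabel() {
        return label;
      }
    }

A LabeledVector then behaves exactly like its wrapped vector in dot products, norms and assignments, with the label riding along.
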
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/DenseMatrix.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/DenseMatrix.java b/math/src/main/java/org/apache/mahout/math/DenseMatrix.java
deleted file mode 100644
index eac449a..0000000
--- a/math/src/main/java/org/apache/mahout/math/DenseMatrix.java
+++ /dev/null
@@ -1,193 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math;
-
-import org.apache.mahout.math.flavor.MatrixFlavor;
-
-import java.util.Arrays;
-
-/** Matrix of doubles implemented using a 2-d array */
-public class DenseMatrix extends AbstractMatrix {
-
- private double[][] values;
-
- /**
- * Construct a matrix from the given values
- *
- * @param values
- * a double[][]
- */
- public DenseMatrix(double[][] values) {
- this(values, false);
- }
-
- /**
- * Construct a matrix from the given values
- *
- * @param values
- * a double[][]
- * @param shallowCopy directly use the supplied array?
- */
- public DenseMatrix(double[][] values, boolean shallowCopy) {
- super(values.length, values[0].length);
- if (shallowCopy) {
- this.values = values;
- } else {
- this.values = new double[values.length][];
- for (int i = 0; i < values.length; i++) {
- this.values[i] = values[i].clone();
- }
- }
- }
-
- /**
- * Constructs an empty matrix of the given size.
- * @param rows The number of rows in the result.
- * @param columns The number of columns in the result.
- */
- public DenseMatrix(int rows, int columns) {
- super(rows, columns);
- this.values = new double[rows][columns];
- }
-
- /**
- * Returns the backing array
- * @return double[][]
- */
- public double[][] getBackingStructure() {
- return this.values;
- }
-
- @Override
- public Matrix clone() {
- DenseMatrix clone = (DenseMatrix) super.clone();
- clone.values = new double[values.length][];
- for (int i = 0; i < values.length; i++) {
- clone.values[i] = values[i].clone();
- }
- return clone;
- }
-
- @Override
- public double getQuick(int row, int column) {
- return values[row][column];
- }
-
- @Override
- public Matrix like() {
- return like(rowSize(), columnSize());
- }
-
- @Override
- public Matrix like(int rows, int columns) {
- return new DenseMatrix(rows, columns);
- }
-
- @Override
- public void setQuick(int row, int column, double value) {
- values[row][column] = value;
- }
-
- @Override
- public Matrix viewPart(int[] offset, int[] size) {
- int rowOffset = offset[ROW];
- int rowsRequested = size[ROW];
- int columnOffset = offset[COL];
- int columnsRequested = size[COL];
-
- return viewPart(rowOffset, rowsRequested, columnOffset, columnsRequested);
- }
-
- @Override
- public Matrix viewPart(int rowOffset, int rowsRequested, int columnOffset, int columnsRequested) {
- if (rowOffset < 0) {
- throw new IndexException(rowOffset, rowSize());
- }
- if (rowOffset + rowsRequested > rowSize()) {
- throw new IndexException(rowOffset + rowsRequested, rowSize());
- }
- if (columnOffset < 0) {
- throw new IndexException(columnOffset, columnSize());
- }
- if (columnOffset + columnsRequested > columnSize()) {
- throw new IndexException(columnOffset + columnsRequested, columnSize());
- }
- return new MatrixView(this, new int[]{rowOffset, columnOffset}, new int[]{rowsRequested, columnsRequested});
- }
-
- @Override
- public Matrix assign(double value) {
- for (int row = 0; row < rowSize(); row++) {
- Arrays.fill(values[row], value);
- }
- return this;
- }
-
- public Matrix assign(DenseMatrix matrix) {
- // make sure the data field has the correct length
- if (matrix.values[0].length != this.values[0].length || matrix.values.length != this.values.length) {
- this.values = new double[matrix.values.length][matrix.values[0].length];
- }
- // now copy the values
- for (int i = 0; i < this.values.length; i++) {
- System.arraycopy(matrix.values[i], 0, this.values[i], 0, this.values[0].length);
- }
- return this;
- }
-
- @Override
- public Matrix assignColumn(int column, Vector other) {
- if (rowSize() != other.size()) {
- throw new CardinalityException(rowSize(), other.size());
- }
- if (column < 0 || column >= columnSize()) {
- throw new IndexException(column, columnSize());
- }
- for (int row = 0; row < rowSize(); row++) {
- values[row][column] = other.getQuick(row);
- }
- return this;
- }
-
- @Override
- public Matrix assignRow(int row, Vector other) {
- if (columnSize() != other.size()) {
- throw new CardinalityException(columnSize(), other.size());
- }
- if (row < 0 || row >= rowSize()) {
- throw new IndexException(row, rowSize());
- }
- for (int col = 0; col < columnSize(); col++) {
- values[row][col] = other.getQuick(col);
- }
- return this;
- }
-
- @Override
- public Vector viewRow(int row) {
- if (row < 0 || row >= rowSize()) {
- throw new IndexException(row, rowSize());
- }
- return new DenseVector(values[row], true);
- }
-
- @Override
- public MatrixFlavor getFlavor() {
- return MatrixFlavor.DENSELIKE;
- }
-}
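
For context, a minimal, hypothetical sketch of the DenseMatrix class removed above, showing that viewPart() returns a live view backed by the same storage:

    import org.apache.mahout.math.DenseMatrix;
    import org.apache.mahout.math.Matrix;

    public class DenseMatrixExample {
      public static void main(String[] args) {
        Matrix m = new DenseMatrix(new double[][]{{1, 2, 3}, {4, 5, 6}});
        Matrix view = m.viewPart(0, 2, 1, 2); // rows 0..1, columns 1..2
        view.setQuick(0, 0, 9);               // writes through to the original
        System.out.println(m.get(0, 1));      // 9.0
      }
    }
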
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/DenseSymmetricMatrix.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/DenseSymmetricMatrix.java b/math/src/main/java/org/apache/mahout/math/DenseSymmetricMatrix.java
deleted file mode 100644
index 7252b9b..0000000
--- a/math/src/main/java/org/apache/mahout/math/DenseSymmetricMatrix.java
+++ /dev/null
@@ -1,62 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math;
-
-import org.apache.mahout.math.flavor.TraversingStructureEnum;
-
-/**
- * Economy packaging for a dense symmetric in-core matrix.
- */
-public class DenseSymmetricMatrix extends UpperTriangular {
- public DenseSymmetricMatrix(int n) {
- super(n);
- }
-
- public DenseSymmetricMatrix(double[] data, boolean shallow) {
- super(data, shallow);
- }
-
- public DenseSymmetricMatrix(Vector data) {
- super(data);
- }
-
- public DenseSymmetricMatrix(UpperTriangular mx) {
- super(mx);
- }
-
- @Override
- public double getQuick(int row, int column) {
- if (column < row) {
- int swap = row;
- row = column;
- column = swap;
- }
- return super.getQuick(row, column);
- }
-
- @Override
- public void setQuick(int row, int column, double value) {
- if (column < row) {
- int swap = row;
- row = column;
- column = swap;
- }
- super.setQuick(row, column, value);
- }
-
-}
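
For context, a minimal, hypothetical sketch of the DenseSymmetricMatrix class removed above: reads and writes below the diagonal are redirected into the packed upper triangle, so the two mirror cells share storage:

    import org.apache.mahout.math.DenseSymmetricMatrix;

    public class SymmetricExample {
      public static void main(String[] args) {
        DenseSymmetricMatrix s = new DenseSymmetricMatrix(3);
        s.setQuick(2, 0, 5);                  // stored at (0, 2) in the upper triangle
        System.out.println(s.getQuick(0, 2)); // 5.0
      }
    }
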
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/DenseVector.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/DenseVector.java b/math/src/main/java/org/apache/mahout/math/DenseVector.java
deleted file mode 100644
index 3961966..0000000
--- a/math/src/main/java/org/apache/mahout/math/DenseVector.java
+++ /dev/null
@@ -1,442 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math;
-
-import java.util.Arrays;
-import java.util.Iterator;
-import java.util.NoSuchElementException;
-
-import com.google.common.base.Preconditions;
-
-/** Implements vector as an array of doubles */
-public class DenseVector extends AbstractVector {
-
- private double[] values;
-
- /** For serialization purposes only */
- public DenseVector() {
- super(0);
- }
-
- /** Construct a new instance using provided values
- * @param values - array of values
- */
- public DenseVector(double[] values) {
- this(values, false);
- }
-
- public DenseVector(double[] values, boolean shallowCopy) {
- super(values.length);
- this.values = shallowCopy ? values : values.clone();
- }
-
- public DenseVector(DenseVector values, boolean shallowCopy) {
- this(values.values, shallowCopy);
- }
-
- /** Construct a new instance of the given cardinality
- * @param cardinality - number of values in the vector
- */
- public DenseVector(int cardinality) {
- super(cardinality);
- this.values = new double[cardinality];
- }
-
- /**
- * Copy-constructor (for use in turning a sparse vector into a dense one, for example)
- * @param vector The vector to copy
- */
- public DenseVector(Vector vector) {
- super(vector.size());
- values = new double[vector.size()];
- for (Element e : vector.nonZeroes()) {
- values[e.index()] = e.get();
- }
- }
-
- @Override
- public double dot(Vector x) {
- if (!x.isDense()) {
- return super.dot(x);
- } else {
-
- int size = x.size();
- if (values.length != size) {
- throw new CardinalityException(values.length, size);
- }
-
- double sum = 0;
- for (int n = 0; n < size; n++) {
- sum += values[n] * x.getQuick(n);
- }
- return sum;
- }
- }
-
- @Override
- protected Matrix matrixLike(int rows, int columns) {
- return new DenseMatrix(rows, columns);
- }
-
- @SuppressWarnings("CloneDoesntCallSuperClone")
- @Override
- public DenseVector clone() {
- return new DenseVector(values.clone());
- }
-
- /**
- * @return true
- */
- @Override
- public boolean isDense() {
- return true;
- }
-
- /**
- * @return true
- */
- @Override
- public boolean isSequentialAccess() {
- return true;
- }
-
- @Override
- protected double dotSelf() {
- double result = 0.0;
- int max = size();
- for (int i = 0; i < max; i++) {
- result += values[i] * values[i];
- }
- return result;
- }
-
- @Override
- public double getQuick(int index) {
- return values[index];
- }
-
- @Override
- public DenseVector like() {
- return new DenseVector(size());
- }
-
- @Override
- public Vector like(int cardinality) {
- return new DenseVector(cardinality);
- }
-
- @Override
- public void setQuick(int index, double value) {
- invalidateCachedLength();
- values[index] = value;
- }
-
- @Override
- public void incrementQuick(int index, double increment) {
- invalidateCachedLength();
- values[index] += increment;
- }
-
- @Override
- public Vector assign(double value) {
- invalidateCachedLength();
- Arrays.fill(values, value);
- return this;
- }
-
- @Override
- public int getNumNondefaultElements() {
- return values.length;
- }
-
- @Override
- public int getNumNonZeroElements() {
- int numNonZeros = 0;
- for (int index = 0; index < values.length; index++) {
- if (values[index] != 0) {
- numNonZeros++;
- }
- }
- return numNonZeros;
- }
-
- public Vector assign(DenseVector vector) {
- // make sure the data field has the correct length
- if (vector.values.length != this.values.length) {
- this.values = new double[vector.values.length];
- }
- // now copy the values
- System.arraycopy(vector.values, 0, this.values, 0, this.values.length);
- return this;
- }
-
- @Override
- public void mergeUpdates(OrderedIntDoubleMapping updates) {
- int numUpdates = updates.getNumMappings();
- int[] indices = updates.getIndices();
- double[] values = updates.getValues();
- for (int i = 0; i < numUpdates; ++i) {
- this.values[indices[i]] = values[i];
- }
- }
-
- @Override
- public Vector viewPart(int offset, int length) {
- if (offset < 0) {
- throw new IndexException(offset, size());
- }
- if (offset + length > size()) {
- throw new IndexException(offset + length, size());
- }
- return new DenseVectorView(this, offset, length);
- }
-
- @Override
- public double getLookupCost() {
- return 1;
- }
-
- @Override
- public double getIteratorAdvanceCost() {
- return 1;
- }
-
- @Override
- public boolean isAddConstantTime() {
- return true;
- }
-
- /**
- * Returns an iterator over the non-zero elements of this Vector, in ascending index order.
- */
- @Override
- public Iterator<Element> iterateNonZero() {
- return new NonDefaultIterator();
- }
-
- @Override
- public Iterator<Element> iterator() {
- return new AllIterator();
- }
-
- @Override
- public boolean equals(Object o) {
- if (o instanceof DenseVector) {
- // Speedup for DenseVectors
- return Arrays.equals(values, ((DenseVector) o).values);
- }
- return super.equals(o);
- }
-
- public void addAll(Vector v) {
- if (size() != v.size()) {
- throw new CardinalityException(size(), v.size());
- }
-
- for (Element element : v.nonZeroes()) {
- values[element.index()] += element.get();
- }
- }
-
- private final class NonDefaultIterator implements Iterator<Element> {
- private final DenseElement element = new DenseElement();
- private int index = -1;
- private int lookAheadIndex = -1;
-
- @Override
- public boolean hasNext() {
- if (lookAheadIndex == index) { // User calls hasNext() after a next()
- lookAhead();
- } // else user called hasNext() repeatedly.
- return lookAheadIndex < size();
- }
-
- private void lookAhead() {
- lookAheadIndex++;
- while (lookAheadIndex < size() && values[lookAheadIndex] == 0.0) {
- lookAheadIndex++;
- }
- }
-
- @Override
- public Element next() {
- if (lookAheadIndex == index) { // If user called next() without checking hasNext().
- lookAhead();
- }
-
- Preconditions.checkState(lookAheadIndex > index);
- index = lookAheadIndex;
-
- if (index >= size()) { // If the end is reached.
- throw new NoSuchElementException();
- }
-
- element.index = index;
- return element;
- }
-
- @Override
- public void remove() {
- throw new UnsupportedOperationException();
- }
- }
-
- private final class AllIterator implements Iterator<Element> {
- private final DenseElement element = new DenseElement();
-
- private AllIterator() {
- element.index = -1;
- }
-
- @Override
- public boolean hasNext() {
- return element.index + 1 < size();
- }
-
- @Override
- public Element next() {
- if (element.index + 1 >= size()) { // If the end is reached.
- throw new NoSuchElementException();
- }
- element.index++;
- return element;
- }
-
- @Override
- public void remove() {
- throw new UnsupportedOperationException();
- }
- }
-
- private final class DenseElement implements Element {
- int index;
-
- @Override
- public double get() {
- return values[index];
- }
-
- @Override
- public int index() {
- return index;
- }
-
- @Override
- public void set(double value) {
- invalidateCachedLength();
- values[index] = value;
- }
- }
-
- private final class DenseVectorView extends VectorView {
-
- public DenseVectorView(Vector vector, int offset, int cardinality) {
- super(vector, offset, cardinality);
- }
-
- @Override
- public double dot(Vector x) {
-
- // Apply custom dot kernels for pairs of dense vectors or their views to reduce
- // view indirection.
- if (x instanceof DenseVectorView) {
-
- if (size() != x.size())
- throw new IllegalArgumentException("Cardinality mismatch during dot(x,y).");
-
- DenseVectorView xv = (DenseVectorView) x;
- double[] thisValues = ((DenseVector) vector).values;
- double[] thatValues = ((DenseVector) xv.vector).values;
- int untilOffset = offset + size();
-
- int i, j;
- double sum = 0.0;
-
- // Unrolled by four to encourage SIMD (SSE) code generation
- int until4 = offset + (size() & ~3);
- for (
- i = offset, j = xv.offset;
- i < until4;
- i += 4, j += 4
- ) {
- sum += thisValues[i] * thatValues[j] +
- thisValues[i + 1] * thatValues[j + 1] +
- thisValues[i + 2] * thatValues[j + 2] +
- thisValues[i + 3] * thatValues[j + 3];
- }
-
- // Picking up the slack
- for (
- i = offset, j = xv.offset;
- i < untilOffset;
- ) {
- sum += thisValues[i++] * thatValues[j++];
- }
- return sum;
-
- } else if (x instanceof DenseVector ) {
-
- if (size() != x.size())
- throw new IllegalArgumentException("Cardinality mismatch during dot(x,y).");
-
- DenseVector xv = (DenseVector) x;
- double[] thisValues = ((DenseVector) vector).values;
- double[] thatValues = xv.values;
- int untilOffset = offset + size();
-
- int i, j;
- double sum = 0.0;
-
- // Unrolled by four to encourage SIMD (SSE) code generation
- int until4 = offset + (size() & ~3);
- for (
- i = offset, j = 0;
- i < until4;
- i += 4, j += 4
- ) {
- sum += thisValues[i] * thatValues[j] +
- thisValues[i + 1] * thatValues[j + 1] +
- thisValues[i + 2] * thatValues[j + 2] +
- thisValues[i + 3] * thatValues[j + 3];
- }
-
- // Picking up slack
- for ( ;
- i < untilOffset;
- ) {
- sum += thisValues[i++] * thatValues[j++];
- }
- return sum;
-
- } else {
- return super.dot(x);
- }
- }
-
- @Override
- public Vector viewPart(int offset, int length) {
- if (offset < 0) {
- throw new IndexException(offset, size());
- }
- if (offset + length > size()) {
- throw new IndexException(offset + length, size());
- }
- return new DenseVectorView(vector, offset + this.offset, length);
- }
- }
-}
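
For context, a minimal, hypothetical sketch of the DenseVector class removed above; viewPart() hands back a DenseVectorView, so a dot product between two views takes the unrolled dense-on-dense fast path shown in the code:

    import org.apache.mahout.math.DenseVector;
    import org.apache.mahout.math.Vector;

    public class DenseVectorExample {
      public static void main(String[] args) {
        Vector x = new DenseVector(new double[]{1, 2, 3, 4, 5, 6});
        Vector head = x.viewPart(0, 4); // (1, 2, 3, 4)
        Vector tail = x.viewPart(2, 4); // (3, 4, 5, 6)
        System.out.println(head.dot(tail)); // 1*3 + 2*4 + 3*5 + 4*6 = 50.0
      }
    }
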
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/DiagonalMatrix.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/DiagonalMatrix.java b/math/src/main/java/org/apache/mahout/math/DiagonalMatrix.java
deleted file mode 100644
index 070fad2..0000000
--- a/math/src/main/java/org/apache/mahout/math/DiagonalMatrix.java
+++ /dev/null
@@ -1,378 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math;
-
-import org.apache.mahout.math.flavor.MatrixFlavor;
-import org.apache.mahout.math.flavor.TraversingStructureEnum;
-
-import java.util.Iterator;
-import java.util.NoSuchElementException;
-
-public class DiagonalMatrix extends AbstractMatrix implements MatrixTimesOps {
- private final Vector diagonal;
-
- public DiagonalMatrix(Vector values) {
- super(values.size(), values.size());
- this.diagonal = values;
- }
-
- public DiagonalMatrix(Matrix values) {
- this(values.viewDiagonal());
- }
-
- public DiagonalMatrix(double value, int size) {
- this(new ConstantVector(value, size));
- }
-
- public DiagonalMatrix(double[] values) {
- super(values.length, values.length);
- this.diagonal = new DenseVector(values);
- }
-
- public static DiagonalMatrix identity(int size) {
- return new DiagonalMatrix(1, size);
- }
-
- @Override
- public Matrix assignColumn(int column, Vector other) {
- throw new UnsupportedOperationException("Can't assign a column to a diagonal matrix");
- }
-
- /**
- * Assign the other vector values to the row of the receiver
- *
- * @param row the int row to assign
- * @param other a Vector
- * @return the modified receiver
- * @throws CardinalityException if the cardinalities differ
- */
- @Override
- public Matrix assignRow(int row, Vector other) {
- throw new UnsupportedOperationException("Can't assign a row to a diagonal matrix");
- }
-
- @Override
- public Vector viewRow(int row) {
- return new SingleElementVector(row);
- }
-
- @Override
- public Vector viewColumn(int row) {
- return new SingleElementVector(row);
- }
-
- /**
- * Special class to implement views of rows and columns of a diagonal matrix.
- */
- public class SingleElementVector extends AbstractVector {
- private int index;
-
- public SingleElementVector(int index) {
- super(diagonal.size());
- this.index = index;
- }
-
- @Override
- public double getQuick(int index) {
- if (index == this.index) {
- return diagonal.get(index);
- } else {
- return 0;
- }
- }
-
- @Override
- public void set(int index, double value) {
- if (index == this.index) {
- diagonal.set(index, value);
- } else {
- throw new IllegalArgumentException("Can't set off-diagonal element of diagonal matrix");
- }
- }
-
- @Override
- protected Iterator<Element> iterateNonZero() {
- return new Iterator<Element>() {
- boolean more = true;
-
- @Override
- public boolean hasNext() {
- return more;
- }
-
- @Override
- public Element next() {
- if (more) {
- more = false;
- return new Element() {
- @Override
- public double get() {
- return diagonal.get(index);
- }
-
- @Override
- public int index() {
- return index;
- }
-
- @Override
- public void set(double value) {
- diagonal.set(index, value);
- }
- };
- } else {
- throw new NoSuchElementException("Only one non-zero element in a row or column of a diagonal matrix");
- }
- }
-
- @Override
- public void remove() {
- throw new UnsupportedOperationException("Can't remove from vector view");
- }
- };
- }
-
- @Override
- protected Iterator<Element> iterator() {
- return new Iterator<Element>() {
- int i = -1; // start before index 0: next() pre-increments, so element 0 is not skipped
-
- Element r = new Element() {
- @Override
- public double get() {
- if (i == index) {
- return diagonal.get(index);
- } else {
- return 0;
- }
- }
-
- @Override
- public int index() {
- return i;
- }
-
- @Override
- public void set(double value) {
- if (i == index) {
- diagonal.set(index, value);
- } else {
- throw new IllegalArgumentException("Can't set any element but diagonal");
- }
- }
- };
-
- @Override
- public boolean hasNext() {
- return i < diagonal.size() - 1;
- }
-
- @Override
- public Element next() {
- if (i < SingleElementVector.this.size() - 1) {
- i++;
- return r;
- } else {
- throw new NoSuchElementException("Attempted to access past the last element of the vector");
- }
- }
-
-
- @Override
- public void remove() {
- throw new UnsupportedOperationException("Default operation");
- }
- };
- }
-
- @Override
- protected Matrix matrixLike(int rows, int columns) {
- return new DiagonalMatrix(rows, columns);
- }
-
- @Override
- public boolean isDense() {
- return false;
- }
-
- @Override
- public boolean isSequentialAccess() {
- return true;
- }
-
- @Override
- public void mergeUpdates(OrderedIntDoubleMapping updates) {
- throw new UnsupportedOperationException("Default operation");
- }
-
- @Override
- public Vector like() {
- return new DenseVector(size());
- }
-
- @Override
- public Vector like(int cardinality) {
- return new DenseVector(cardinality);
- }
-
- @Override
- public void setQuick(int index, double value) {
- if (index == this.index) {
- diagonal.set(this.index, value);
- } else {
- throw new IllegalArgumentException("Can't set off-diagonal element of DiagonalMatrix");
- }
- }
-
- @Override
- public int getNumNondefaultElements() {
- return 1;
- }
-
- @Override
- public double getLookupCost() {
- return 0;
- }
-
- @Override
- public double getIteratorAdvanceCost() {
- return 1;
- }
-
- @Override
- public boolean isAddConstantTime() {
- return false;
- }
- }
-
- /**
- * Provides a view of the diagonal of a matrix.
- */
- @Override
- public Vector viewDiagonal() {
- return this.diagonal;
- }
-
- /**
- * Return the value at the given location, without checking bounds
- *
- * @param row an int row index
- * @param column an int column index
- * @return the double at the index
- */
- @Override
- public double getQuick(int row, int column) {
- if (row == column) {
- return diagonal.get(row);
- } else {
- return 0;
- }
- }
-
- /**
- * Return an empty matrix of the same underlying class as the receiver
- *
- * @return a Matrix
- */
- @Override
- public Matrix like() {
- return new SparseRowMatrix(rowSize(), columnSize());
- }
-
- /**
- * Returns an empty matrix of the same underlying class as the receiver and of the specified
- * size.
- *
- * @param rows the int number of rows
- * @param columns the int number of columns
- */
- @Override
- public Matrix like(int rows, int columns) {
- return new SparseRowMatrix(rows, columns);
- }
-
- @Override
- public void setQuick(int row, int column, double value) {
- if (row == column) {
- diagonal.set(row, value);
- } else {
- throw new UnsupportedOperationException("Can't set off-diagonal element");
- }
- }
-
- /**
- * Return the number of values in the recipient
- *
- * @return an int[2] containing [row, column] count
- */
- @Override
- public int[] getNumNondefaultElements() {
- throw new UnsupportedOperationException("Don't understand how to implement this");
- }
-
- /**
- * Return a new matrix containing the subset of the recipient
- *
- * @param offset an int[2] offset into the receiver
- * @param size the int[2] size of the desired result
- * @return a new Matrix that is a view of the original
- * @throws CardinalityException if the length is greater than the cardinality of the receiver
- * @throws IndexException if the offset is negative or the offset+length is outside of the
- * receiver
- */
- @Override
- public Matrix viewPart(int[] offset, int[] size) {
- return new MatrixView(this, offset, size);
- }
-
- @Override
- public Matrix times(Matrix other) {
- return timesRight(other);
- }
-
- @Override
- public Matrix timesRight(Matrix that) {
- if (that.numRows() != diagonal.size()) {
- throw new IllegalArgumentException("Incompatible number of rows in the right operand of matrix multiplication.");
- }
- Matrix m = that.like();
- for (int row = 0; row < diagonal.size(); row++) {
- m.assignRow(row, that.viewRow(row).times(diagonal.getQuick(row)));
- }
- return m;
- }
-
- @Override
- public Matrix timesLeft(Matrix that) {
- if (that.numCols() != diagonal.size()) {
- throw new IllegalArgumentException(
- "Incompatible number of columns in the left operand of matrix-matrix multiplication.");
- }
- Matrix m = that.like();
- for (int col = 0; col < diagonal.size(); col++) {
- m.assignColumn(col, that.viewColumn(col).times(diagonal.getQuick(col)));
- }
- return m;
- }
-
- @Override
- public MatrixFlavor getFlavor() {
- return MatrixFlavor.DIAGONALLIKE;
- }
-
-}
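
For context, a minimal, hypothetical sketch of the DiagonalMatrix class removed above; times() delegates to timesRight(), which scales row i of the operand by diagonal[i] without ever materializing the diagonal as a full matrix:

    import org.apache.mahout.math.DenseMatrix;
    import org.apache.mahout.math.DiagonalMatrix;
    import org.apache.mahout.math.Matrix;

    public class DiagonalExample {
      public static void main(String[] args) {
        Matrix d = new DiagonalMatrix(new double[]{1, 2, 3});
        Matrix m = new DenseMatrix(new double[][]{{1, 1}, {1, 1}, {1, 1}});
        System.out.println(d.times(m)); // rows become (1, 1), (2, 2), (3, 3)
      }
    }
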
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/FileBasedMatrix.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/FileBasedMatrix.java b/math/src/main/java/org/apache/mahout/math/FileBasedMatrix.java
deleted file mode 100644
index 3a19318..0000000
--- a/math/src/main/java/org/apache/mahout/math/FileBasedMatrix.java
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math;
-
-import com.google.common.base.Preconditions;
-import com.google.common.collect.Lists;
-
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.nio.DoubleBuffer;
-import java.nio.MappedByteBuffer;
-import java.nio.channels.FileChannel;
-import java.util.List;
-
-/**
- * Provides a way to get data from a file and treat it as if it were a matrix, but avoids putting all that
- * data onto the Java heap. Instead, the file is mapped into non-heap memory as a DoubleBuffer and we access
- * that instead.
- */
-public final class FileBasedMatrix extends AbstractMatrix {
- private final int rowsPerBlock;
- private final List<DoubleBuffer> content = Lists.newArrayList();
-
- /**
- * Constructs an empty matrix of the given size.
- *
- * @param rows The number of rows in the result.
- * @param columns The number of columns in the result.
- */
- public FileBasedMatrix(int rows, int columns) {
- super(rows, columns);
- long maxRows = ((1L << 31) - 1) / (columns * 8);
- if (rows > maxRows) {
- rowsPerBlock = (int) maxRows;
- } else {
- rowsPerBlock = rows;
- }
- }
-
- private void addData(DoubleBuffer content) {
- this.content.add(content);
- }
-
- public void setData(File f, boolean loadNow) throws IOException {
- Preconditions.checkArgument(f.length() == rows * columns * 8L, "File " + f + " is wrong length");
-
- for (int i = 0; i < (rows + rowsPerBlock - 1) / rowsPerBlock; i++) {
- long start = i * rowsPerBlock * columns * 8L;
- long size = rowsPerBlock * columns * 8L;
- MappedByteBuffer buf = new FileInputStream(f).getChannel().map(FileChannel.MapMode.READ_ONLY, start,
- Math.min(f.length() - start, size));
- if (loadNow) {
- buf.load();
- }
- addData(buf.asDoubleBuffer());
- }
- }
-
- public static void writeMatrix(File f, Matrix m) throws IOException {
- Preconditions.checkArgument(f.canWrite(), "Can't write to output file");
- FileOutputStream fos = new FileOutputStream(f);
- try {
- ByteBuffer buf = ByteBuffer.allocate(m.columnSize() * 8);
- for (MatrixSlice row : m) {
- buf.clear();
- for (Vector.Element element : row.vector().all()) {
- buf.putDouble(element.get());
- }
- buf.flip();
- fos.write(buf.array());
- }
- } finally {
- fos.close();
- }
- }
-
- /**
- * Assign the other vector values to the column of the receiver
- *
- * @param column the int row to assign
- * @param other a Vector
- * @return the modified receiver
- * @throws org.apache.mahout.math.CardinalityException
- * if the cardinalities differ
- */
- @Override
- public Matrix assignColumn(int column, Vector other) {
- throw new UnsupportedOperationException("Default operation");
- }
-
- /**
- * Assign the other vector values to the row of the receiver
- *
- * @param row the int row to assign
- * @param other a Vector
- * @return the modified receiver
- * @throws org.apache.mahout.math.CardinalityException
- * if the cardinalities differ
- */
- @Override
- public Matrix assignRow(int row, Vector other) {
- throw new UnsupportedOperationException("Default operation");
- }
-
- /**
- * Return the value at the given indexes, without checking bounds
- *
- * @param row an int row index
- * @param column an int column index
- * @return the double at the index
- */
- @Override
- public double getQuick(int row, int column) {
- int block = row / rowsPerBlock;
- return content.get(block).get((row % rowsPerBlock) * columns + column);
- }
-
- /**
- * Return an empty matrix of the same underlying class as the receiver
- *
- * @return a Matrix
- */
- @Override
- public Matrix like() {
- throw new UnsupportedOperationException("Default operation");
- }
-
- /**
- * Returns an empty matrix of the same underlying class as the receiver and of the specified size.
- *
- * @param rows the int number of rows
- * @param columns the int number of columns
- */
- @Override
- public Matrix like(int rows, int columns) {
- return new DenseMatrix(rows, columns);
- }
-
- /**
- * Set the value at the given index, without checking bounds
- *
- * @param row an int row index into the receiver
- * @param column an int column index into the receiver
- * @param value a double value to set
- */
- @Override
- public void setQuick(int row, int column, double value) {
- throw new UnsupportedOperationException("Default operation");
- }
-
- /**
- * Return a view into part of a matrix. Changes to the view will change the
- * original matrix.
- *
- * @param offset an int[2] offset into the receiver
- * @param size the int[2] size of the desired result
- * @return a matrix that shares storage with part of the original matrix.
- * @throws org.apache.mahout.math.CardinalityException
- * if the length is greater than the cardinality of the receiver
- * @throws org.apache.mahout.math.IndexException
- * if the offset is negative or the offset+length is outside of the receiver
- */
- @Override
- public Matrix viewPart(int[] offset, int[] size) {
- throw new UnsupportedOperationException("Default operation");
- }
-}
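
For context, a minimal, hypothetical round trip through the FileBasedMatrix class removed above (the temp file is illustrative): writeMatrix() serializes row-major doubles, and setData() maps the file back in off-heap:

    import java.io.File;
    import java.io.IOException;

    import org.apache.mahout.math.DenseMatrix;
    import org.apache.mahout.math.FileBasedMatrix;
    import org.apache.mahout.math.Matrix;

    public class FileMatrixExample {
      public static void main(String[] args) throws IOException {
        File f = File.createTempFile("matrix", ".bin");
        Matrix m = new DenseMatrix(new double[][]{{1, 2}, {3, 4}});
        FileBasedMatrix.writeMatrix(f, m);
        FileBasedMatrix mapped = new FileBasedMatrix(2, 2);
        mapped.setData(f, true); // map the file and pre-fault the pages
        System.out.println(mapped.getQuick(1, 0)); // 3.0, read from the mapped buffer
      }
    }
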
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/FileBasedSparseBinaryMatrix.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/FileBasedSparseBinaryMatrix.java b/math/src/main/java/org/apache/mahout/math/FileBasedSparseBinaryMatrix.java
deleted file mode 100644
index 0b0c25e..0000000
--- a/math/src/main/java/org/apache/mahout/math/FileBasedSparseBinaryMatrix.java
+++ /dev/null
@@ -1,535 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math;
-
-import java.io.DataOutputStream;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.nio.IntBuffer;
-import java.nio.channels.FileChannel;
-import java.util.Collections;
-import java.util.Iterator;
-import java.util.List;
-
-import com.google.common.base.Function;
-import com.google.common.base.Preconditions;
-import com.google.common.collect.AbstractIterator;
-import com.google.common.collect.Iterables;
-import com.google.common.collect.Lists;
-
-/**
- * Provides a way to get data from a file and treat it as if it were a matrix, but avoids putting
- * all that data onto the Java heap. Instead, the file is mapped into non-heap memory and read
- * through an IntBuffer view. Because the values in the matrix are binary and sparse, we never
- * store the values themselves, just the locations of the non-zero entries.
- * <p>
- * Currently file data is formatted as follows:
- * <p>
- * <ul> <li>A magic number to indicate the file format.</li> <li>The size of the matrix (number of
- * rows and columns).</li> <li>The number of non-zero entries in each row.</li> <li>For each row,
- * a sorted list of the non-zero column indices.</li> </ul>
- * <p>
- * It would be preferable to use something like protobufs to define the format so that we can use
- * different row formats for different kinds of data. For instance, Golay coding of column numbers
- * or compressed bit vectors might be good representations for some purposes.
- */
-public final class FileBasedSparseBinaryMatrix extends AbstractMatrix {
- private static final int MAGIC_NUMBER_V0 = 0x12d7067d;
-
- private final List<IntBuffer> data = Lists.newArrayList();
- private int[] bufferIndex;
- private int[] rowOffset;
- private int[] rowSize;
-
- /**
- * Constructs an empty matrix of the given size.
- *
- * @param rows The number of rows in the result.
- * @param columns The number of columns in the result.
- */
- public FileBasedSparseBinaryMatrix(int rows, int columns) {
- super(rows, columns);
- }
-
- public void setData(File f) throws IOException {
- List<ByteBuffer> buffers = Lists.newArrayList();
- FileChannel input = new FileInputStream(f).getChannel();
-
- buffers.add(input.map(FileChannel.MapMode.READ_ONLY, 0, Math.min(Integer.MAX_VALUE, f.length())));
- data.add(buffers.get(0).asIntBuffer());
- Preconditions.checkArgument(buffers.get(0).getInt() == MAGIC_NUMBER_V0, "Wrong type of file");
-
- int rows = buffers.get(0).getInt();
- int cols = buffers.get(0).getInt();
- Preconditions.checkArgument(rows == rowSize());
- Preconditions.checkArgument(cols == columnSize());
-
- rowOffset = new int[rows];
- rowSize = new int[rows];
- bufferIndex = new int[rows];
-
- int offset = 12 + 4 * rows;
- for (int i = 0; i < rows; i++) {
- int size = buffers.get(0).getInt();
- int buffer = 0;
-      while (buffer < buffers.size()) {
-        if (offset + size * 4 <= buffers.get(buffer).limit()) {
-          break;
-        } else {
-          offset -= buffers.get(buffer).capacity();
-          buffer++;  // move on to the next mapped buffer; without this the loop cannot terminate
-        }
-      }
- if (buffer == buffers.size()) {
- buffers.add(input.map(FileChannel.MapMode.READ_ONLY, 0, Math.min(Integer.MAX_VALUE, f.length() - offset)));
- data.add(buffers.get(buffer).asIntBuffer());
- }
- rowOffset[i] = offset / 4;
- rowSize[i] = size;
- bufferIndex[i] = buffer;
-
-// final SparseBinaryVector v = new SparseBinaryVector(buffers.get(buffer), columns, offset, size);
-// this.rows.add(v);
- offset += size * 4;
- }
- }
-
- public static void writeMatrix(File f, Matrix m) throws IOException {
- Preconditions.checkArgument(f.canWrite(), "Can't write to output file");
- FileOutputStream fos = new FileOutputStream(f);
-
- // write header
- DataOutputStream out = new DataOutputStream(fos);
- out.writeInt(MAGIC_NUMBER_V0);
- out.writeInt(m.rowSize());
- out.writeInt(m.columnSize());
-
- // compute offsets and write row headers
- for (MatrixSlice row : m) {
- int nondefaultElements = row.vector().getNumNondefaultElements();
- out.writeInt(nondefaultElements);
- }
-
- // write rows
- for (MatrixSlice row : m) {
- List<Integer> columns = Lists.newArrayList(Iterables.transform(row.vector().nonZeroes(),
- new Function<Vector.Element, Integer>() {
- @Override
- public Integer apply(Vector.Element element) {
- return element.index();
- }
- }));
- Collections.sort(columns);
-
- for (Integer column : columns) {
- out.writeInt(column);
- }
- }
-
- out.close();
- fos.close();
- }
-
- /**
- * Assign the other vector values to the column of the receiver
- *
-   * @param column the int column to assign
- * @param other a Vector
- * @return the modified receiver
- * @throws org.apache.mahout.math.CardinalityException
- * if the cardinalities differ
- */
- @Override
- public Matrix assignColumn(int column, Vector other) {
- throw new UnsupportedOperationException("Default operation");
- }
-
- /**
- * Assign the other vector values to the row of the receiver
- *
- * @param row the int row to assign
- * @param other a Vector
- * @return the modified receiver
- * @throws org.apache.mahout.math.CardinalityException
- * if the cardinalities differ
- */
- @Override
- public Matrix assignRow(int row, Vector other) {
- throw new UnsupportedOperationException("Default operation");
- }
-
- /**
- * Return the value at the given indexes, without checking bounds
- *
- * @param rowIndex an int row index
- * @param columnIndex an int column index
- * @return the double at the index
- */
- @Override
- public double getQuick(int rowIndex, int columnIndex) {
- IntBuffer tmp = data.get(bufferIndex[rowIndex]).asReadOnlyBuffer();
- tmp.position(rowOffset[rowIndex]);
-    tmp.limit(rowOffset[rowIndex] + rowSize[rowIndex]);  // absolute limit, as in viewRow()
- tmp = tmp.slice();
- return searchForIndex(tmp, columnIndex);
- }
-
- private static double searchForIndex(IntBuffer row, int columnIndex) {
- int high = row.limit();
- if (high == 0) {
- return 0;
- }
- int low = 0;
- while (high > low) {
- int mid = (low + high) / 2;
- if (row.get(mid) < columnIndex) {
- low = mid + 1;
- } else {
- high = mid;
- }
- }
- if (low >= row.limit()) {
- return 0;
- } else if (high == low && row.get(low) == columnIndex) {
- return 1;
- } else {
- return 0;
- }
- }
-
- /**
- * Return an empty matrix of the same underlying class as the receiver
- *
- * @return a Matrix
- */
- @Override
- public Matrix like() {
- throw new UnsupportedOperationException("Default operation");
- }
-
- /**
- * Returns an empty matrix of the same underlying class as the receiver and of the specified
- * size.
- *
- * @param rows the int number of rows
- * @param columns the int number of columns
- */
- @Override
- public Matrix like(int rows, int columns) {
- return new DenseMatrix(rows, columns);
- }
-
- /**
- * Set the value at the given index, without checking bounds
- *
- * @param row an int row index into the receiver
- * @param column an int column index into the receiver
- * @param value a double value to set
- */
- @Override
- public void setQuick(int row, int column, double value) {
- throw new UnsupportedOperationException("Default operation");
- }
-
- /**
- * Return a view into part of a matrix. Changes to the view will change the original matrix.
- *
- * @param offset an int[2] offset into the receiver
- * @param size the int[2] size of the desired result
- * @return a matrix that shares storage with part of the original matrix.
- * @throws org.apache.mahout.math.CardinalityException
- * if the length is greater than the cardinality of the receiver
- * @throws org.apache.mahout.math.IndexException
- * if the offset is negative or the offset+length is outside of the receiver
- */
- @Override
- public Matrix viewPart(int[] offset, int[] size) {
- throw new UnsupportedOperationException("Default operation");
- }
-
- /**
- * Returns a view of a row. Changes to the view will affect the original.
- *
- * @param rowIndex Which row to return.
- * @return A vector that references the desired row.
- */
- @Override
- public Vector viewRow(int rowIndex) {
- IntBuffer tmp = data.get(bufferIndex[rowIndex]).asReadOnlyBuffer();
- tmp.position(rowOffset[rowIndex]);
- tmp.limit(rowOffset[rowIndex] + rowSize[rowIndex]);
- tmp = tmp.slice();
- return new SparseBinaryVector(tmp, columnSize());
- }
-
- private static class SparseBinaryVector extends AbstractVector {
- private final IntBuffer buffer;
- private final int maxIndex;
-
- private SparseBinaryVector(IntBuffer buffer, int maxIndex) {
- super(maxIndex);
- this.buffer = buffer;
- this.maxIndex = maxIndex;
- }
-
- SparseBinaryVector(ByteBuffer row, int maxIndex, int offset, int size) {
- super(maxIndex);
- row = row.asReadOnlyBuffer();
- row.position(offset);
- row.limit(offset + size * 4);
- row = row.slice();
- this.buffer = row.slice().asIntBuffer();
- this.maxIndex = maxIndex;
- }
-
- /**
- * Subclasses must override to return an appropriately sparse or dense result
- *
- * @param rows the row cardinality
- * @param columns the column cardinality
- * @return a Matrix
- */
- @Override
- protected Matrix matrixLike(int rows, int columns) {
- throw new UnsupportedOperationException("Default operation");
- }
-
- /**
- * Used internally by assign() to update multiple indices and values at once.
- * Only really useful for sparse vectors (especially SequentialAccessSparseVector).
- * <p/>
-     * If someone ever adds a new type of sparse vector, this method must merge (index, value) pairs into it.
-     *
-     * @param updates a mapping of indices to values to merge into the vector.
- */
- @Override
- public void mergeUpdates(OrderedIntDoubleMapping updates) {
- throw new UnsupportedOperationException("Cannot mutate SparseBinaryVector");
- }
-
- /**
- * @return true iff this implementation should be considered dense -- that it explicitly represents
- * every value
- */
- @Override
- public boolean isDense() {
- return false;
- }
-
- /**
- * @return true iff this implementation should be considered to be iterable in index order in an
- * efficient way. In particular this implies that {@link #iterator()} and {@link
- * #iterateNonZero()} return elements in ascending order by index.
- */
- @Override
- public boolean isSequentialAccess() {
- return true;
- }
-
- /**
- * Iterates over all elements
- *
- * NOTE: Implementations may choose to reuse the Element returned
- * for performance reasons, so if you need a copy of it, you should call {@link #getElement(int)}
- * for the given index
- *
- * @return An {@link java.util.Iterator} over all elements
- */
- @Override
- public Iterator<Element> iterator() {
- return new AbstractIterator<Element>() {
- int i = 0;
-
- @Override
- protected Element computeNext() {
- if (i < maxIndex) {
- return new Element() {
- int index = i++;
- /**
- * @return the value of this vector element.
- */
- @Override
- public double get() {
- return getQuick(index);
- }
-
- /**
- * @return the index of this vector element.
- */
- @Override
- public int index() {
- return index;
- }
-
- /**
- * @param value Set the current element to value.
- */
- @Override
- public void set(double value) {
- throw new UnsupportedOperationException("Default operation");
- }
- };
- } else {
- return endOfData();
- }
- }
- };
- }
-
- /**
- * Iterates over all non-zero elements. <p/> NOTE: Implementations may choose to reuse the Element
- * returned for performance reasons, so if you need a copy of it, you should call {@link
- * #getElement(int)} for the given index
- *
- * @return An {@link java.util.Iterator} over all non-zero elements
- */
- @Override
- public Iterator<Element> iterateNonZero() {
- return new AbstractIterator<Element>() {
- int i = 0;
- @Override
- protected Element computeNext() {
- if (i < buffer.limit()) {
- return new BinaryReadOnlyElement(buffer.get(i++));
- } else {
- return endOfData();
- }
- }
- };
- }
-
- /**
- * Return the value at the given index, without checking bounds
- *
- * @param index an int index
- * @return the double at the index
- */
- @Override
- public double getQuick(int index) {
- return searchForIndex(buffer, index);
- }
-
- /**
- * Return an empty vector of the same underlying class as the receiver
- *
- * @return a Vector
- */
- @Override
- public Vector like() {
- return new RandomAccessSparseVector(size());
- }
-
- @Override
- public Vector like(int cardinality) {
- return new RandomAccessSparseVector(cardinality);
- }
-
- /**
- * Copy the vector for fast operations.
- *
- * @return a Vector
- */
- @Override
- protected Vector createOptimizedCopy() {
- return new RandomAccessSparseVector(size()).assign(this);
- }
-
- /**
- * Set the value at the given index, without checking bounds
- *
- * @param index an int index into the receiver
- * @param value a double value to set
- */
- @Override
- public void setQuick(int index, double value) {
- throw new UnsupportedOperationException("Read-only view");
- }
-
- /**
- * Set the value at the given index, without checking bounds
- *
- * @param index an int index into the receiver
- * @param increment a double value to set
- */
- @Override
- public void incrementQuick(int index, double increment) {
- throw new UnsupportedOperationException("Read-only view");
- }
-
- /**
- * Return the number of values in the recipient which are not the default value. For instance, for
- * a sparse vector, this would be the number of non-zero values.
- *
- * @return an int
- */
- @Override
- public int getNumNondefaultElements() {
- return buffer.limit();
- }
-
- @Override
- public double getLookupCost() {
- return 1;
- }
-
- @Override
- public double getIteratorAdvanceCost() {
- return 1;
- }
-
- @Override
- public boolean isAddConstantTime() {
- throw new UnsupportedOperationException("Can't add binary value");
- }
- }
-
- public static class BinaryReadOnlyElement implements Vector.Element {
- private final int index;
-
- public BinaryReadOnlyElement(int index) {
- this.index = index;
- }
-
- /**
- * @return the value of this vector element.
- */
- @Override
- public double get() {
- return 1;
- }
-
- /**
- * @return the index of this vector element.
- */
- @Override
- public int index() {
- return index;
- }
-
- /**
- * @param value Set the current element to value.
- */
- @Override
- public void set(double value) {
- throw new UnsupportedOperationException("Can't set binary value");
- }
- }
-}

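To make the format description above concrete, here is a hypothetical round trip through this class: writeMatrix() lays out the magic number, the dimensions, the per-row counts and the sorted column lists, and setData() maps the file back read-only. SparseRowMatrix is just a convenient source here; any matrix with 0/1 values would do:

    import java.io.File;
    import org.apache.mahout.math.FileBasedSparseBinaryMatrix;
    import org.apache.mahout.math.Matrix;
    import org.apache.mahout.math.SparseRowMatrix;

    public class SparseBinaryRoundTrip {
      public static void main(String[] args) throws Exception {
        Matrix m = new SparseRowMatrix(10, 20);
        m.set(3, 7, 1);                                 // only the 0/1 pattern survives the format

        File f = File.createTempFile("sparse-binary", ".bin");
        FileBasedSparseBinaryMatrix.writeMatrix(f, m);  // magic, dims, counts, column lists

        FileBasedSparseBinaryMatrix fb = new FileBasedSparseBinaryMatrix(10, 20);
        fb.setData(f);                                  // maps the file read-only
        System.out.println(fb.getQuick(3, 7));          // 1.0
        System.out.println(fb.getQuick(3, 8));          // 0.0
      }
    }
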
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/FunctionalMatrixView.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/FunctionalMatrixView.java b/math/src/main/java/org/apache/mahout/math/FunctionalMatrixView.java
deleted file mode 100644
index 9028e23..0000000
--- a/math/src/main/java/org/apache/mahout/math/FunctionalMatrixView.java
+++ /dev/null
@@ -1,99 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math;
-
-import org.apache.mahout.math.flavor.BackEnum;
-import org.apache.mahout.math.flavor.MatrixFlavor;
-import org.apache.mahout.math.flavor.TraversingStructureEnum;
-import org.apache.mahout.math.function.IntIntFunction;
-
-/**
- * Matrix View backed by an {@link IntIntFunction}
- */
-class FunctionalMatrixView extends AbstractMatrix {
-
- /**
- * view generator function
- */
- private IntIntFunction gf;
- private boolean denseLike;
- private MatrixFlavor flavor;
-
- public FunctionalMatrixView(int rows, int columns, IntIntFunction gf) {
- this(rows, columns, gf, false);
- }
-
-  /**
-   * @param rows number of rows in the view
-   * @param columns number of columns in the view
-   * @param gf generator function
-   * @param denseLike whether like() should create a dense or a sparse matrix.
-   */
- public FunctionalMatrixView(int rows, int columns, IntIntFunction gf, boolean denseLike) {
- super(rows, columns);
- this.gf = gf;
- this.denseLike = denseLike;
- flavor = new MatrixFlavor.FlavorImpl(BackEnum.JVMMEM, TraversingStructureEnum.BLOCKIFIED, denseLike);
- }
-
- @Override
- public Matrix assignColumn(int column, Vector other) {
- throw new UnsupportedOperationException("Assignment to a matrix not supported");
- }
-
- @Override
- public Matrix assignRow(int row, Vector other) {
- throw new UnsupportedOperationException("Assignment to a matrix view not supported");
- }
-
- @Override
- public double getQuick(int row, int column) {
- return gf.apply(row, column);
- }
-
- @Override
- public Matrix like() {
- return like(rows, columns);
- }
-
- @Override
- public Matrix like(int rows, int columns) {
- if (denseLike)
- return new DenseMatrix(rows, columns);
- else
- return new SparseMatrix(rows, columns);
- }
-
- @Override
- public void setQuick(int row, int column, double value) {
- throw new UnsupportedOperationException("Assignment to a matrix view not supported");
- }
-
- @Override
- public Vector viewRow(int row) {
- return new MatrixVectorView(this, row, 0, 0, 1, denseLike);
- }
-
- @Override
- public Vector viewColumn(int column) {
- return new MatrixVectorView(this, 0, column, 1, 0, denseLike);
- }
-
- @Override
- public MatrixFlavor getFlavor() {
- return flavor;
- }
-}

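FunctionalMatrixView itself is package-private; callers normally reach it through the Matrices factory (Matrices.functionalMatrixView, as far as I can tell). A small sketch, assuming Java 8 lambdas are accepted for the single-method IntIntFunction, that views a Hilbert matrix without ever materializing it:

    import org.apache.mahout.math.Matrices;
    import org.apache.mahout.math.Matrix;

    public class FunctionalViewSketch {
      public static void main(String[] args) {
        // Every read is computed on the fly from the generator function.
        Matrix hilbert = Matrices.functionalMatrixView(4, 4, (row, col) -> 1.0 / (row + col + 1));
        System.out.println(hilbert.get(0, 0));  // 1.0
        System.out.println(hilbert.get(2, 3));  // 0.1666...
        // hilbert.set(0, 0, 2.0) would throw UnsupportedOperationException: the view is read-only.
      }
    }
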
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/IndexException.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/IndexException.java b/math/src/main/java/org/apache/mahout/math/IndexException.java
deleted file mode 100644
index 489d536..0000000
--- a/math/src/main/java/org/apache/mahout/math/IndexException.java
+++ /dev/null
@@ -1,30 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math;
-
-/**
- * Exception thrown when a matrix or vector is accessed at an index, or dimension,
- * which does not logically exist in the entity.
- */
-public class IndexException extends IllegalArgumentException {
-
- public IndexException(int index, int cardinality) {
- super("Index " + index + " is outside allowable range of [0," + cardinality + ')');
- }
-
-}

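For what it's worth, the half-open range in the message matches the usual guard; a tiny sketch of the bounds check that produces it:

    import org.apache.mahout.math.IndexException;

    public class BoundsGuardSketch {
      static void checkIndex(int index, int cardinality) {
        if (index < 0 || index >= cardinality) {
          throw new IndexException(index, cardinality);
        }
      }

      public static void main(String[] args) {
        checkIndex(3, 5);  // fine
        checkIndex(7, 5);  // throws: "Index 7 is outside allowable range of [0,5)"
      }
    }
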
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/LengthCachingVector.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/LengthCachingVector.java b/math/src/main/java/org/apache/mahout/math/LengthCachingVector.java
deleted file mode 100644
index 770ccc4..0000000
--- a/math/src/main/java/org/apache/mahout/math/LengthCachingVector.java
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math;
-
-/**
- * Marker interface for vectors that may cache their squared length.
- */
-interface LengthCachingVector {
- /**
- * Gets the currently cached squared length or if there is none, recalculates
- * the value and returns that.
- * @return The sum of the squares of all elements in the vector.
- */
- double getLengthSquared();
-
- /**
- * Invalidates the length cache. This should be called by all mutators of the vector.
- */
- void invalidateCachedLength();
-}
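
A minimal sketch of the contract this interface describes, with a plain double[] in place of a real vector implementation (class and field names here are illustrative only):

    public class CachingVectorSketch {
      private final double[] values;
      private double lengthSquared = -1;  // negative means "nothing cached"

      public CachingVectorSketch(double[] values) {
        this.values = values;
      }

      public double getLengthSquared() {
        if (lengthSquared < 0) {          // recompute only when the cache is invalid
          double sum = 0;
          for (double v : values) {
            sum += v * v;
          }
          lengthSquared = sum;
        }
        return lengthSquared;
      }

      public void set(int index, double value) {
        values[index] = value;
        invalidateCachedLength();         // every mutator must invalidate, per the contract
      }

      public void invalidateCachedLength() {
        lengthSquared = -1;
      }
    }
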
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/Sorting.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/Sorting.java b/math/src/main/java/org/apache/mahout/math/Sorting.java
deleted file mode 100644
index 93293ac..0000000
--- a/math/src/main/java/org/apache/mahout/math/Sorting.java
+++ /dev/null
@@ -1,2297 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math;
-
-import java.io.Serializable;
-import java.util.Arrays;
-import java.util.Comparator;
-
-import com.google.common.base.Preconditions;
-import org.apache.mahout.math.function.ByteComparator;
-import org.apache.mahout.math.function.CharComparator;
-import org.apache.mahout.math.function.DoubleComparator;
-import org.apache.mahout.math.function.FloatComparator;
-import org.apache.mahout.math.function.IntComparator;
-import org.apache.mahout.math.function.LongComparator;
-import org.apache.mahout.math.function.ShortComparator;
-
-public final class Sorting {
-
- /* Specifies when to switch to insertion sort */
- private static final int SIMPLE_LENGTH = 7;
- static final int SMALL = 7;
-
- private Sorting() {}
-
- private static <T> int med3(T[] array, int a, int b, int c, Comparator<T> comp) {
- T x = array[a];
- T y = array[b];
- T z = array[c];
- int comparisonxy = comp.compare(x, y);
- int comparisonxz = comp.compare(x, z);
- int comparisonyz = comp.compare(y, z);
- return comparisonxy < 0 ? (comparisonyz < 0 ? b
- : (comparisonxz < 0 ? c : a)) : (comparisonyz > 0 ? b
- : (comparisonxz > 0 ? c : a));
- }
-
- private static int med3(byte[] array, int a, int b, int c, ByteComparator comp) {
- byte x = array[a];
- byte y = array[b];
- byte z = array[c];
- int comparisonxy = comp.compare(x, y);
- int comparisonxz = comp.compare(x, z);
- int comparisonyz = comp.compare(y, z);
- return comparisonxy < 0 ? (comparisonyz < 0 ? b
- : (comparisonxz < 0 ? c : a)) : (comparisonyz > 0 ? b
- : (comparisonxz > 0 ? c : a));
- }
-
- private static int med3(char[] array, int a, int b, int c, CharComparator comp) {
- char x = array[a];
- char y = array[b];
- char z = array[c];
- int comparisonxy = comp.compare(x, y);
- int comparisonxz = comp.compare(x, z);
- int comparisonyz = comp.compare(y, z);
- return comparisonxy < 0 ? (comparisonyz < 0 ? b
- : (comparisonxz < 0 ? c : a)) : (comparisonyz > 0 ? b
- : (comparisonxz > 0 ? c : a));
- }
-
- private static int med3(double[] array, int a, int b, int c,
- DoubleComparator comp) {
- double x = array[a];
- double y = array[b];
- double z = array[c];
- int comparisonxy = comp.compare(x, y);
- int comparisonxz = comp.compare(x, z);
- int comparisonyz = comp.compare(y, z);
- return comparisonxy < 0 ? (comparisonyz < 0 ? b
- : (comparisonxz < 0 ? c : a)) : (comparisonyz > 0 ? b
- : (comparisonxz > 0 ? c : a));
- }
-
- private static int med3(float[] array, int a, int b, int c,
- FloatComparator comp) {
- float x = array[a];
- float y = array[b];
- float z = array[c];
- int comparisonxy = comp.compare(x, y);
- int comparisonxz = comp.compare(x, z);
- int comparisonyz = comp.compare(y, z);
- return comparisonxy < 0 ? (comparisonyz < 0 ? b
- : (comparisonxz < 0 ? c : a)) : (comparisonyz > 0 ? b
- : (comparisonxz > 0 ? c : a));
- }
-
- private static int med3(int[] array, int a, int b, int c, IntComparator comp) {
- int x = array[a];
- int y = array[b];
- int z = array[c];
- int comparisonxy = comp.compare(x, y);
- int comparisonxz = comp.compare(x, z);
- int comparisonyz = comp.compare(y, z);
- return comparisonxy < 0 ? (comparisonyz < 0 ? b
- : (comparisonxz < 0 ? c : a)) : (comparisonyz > 0 ? b
- : (comparisonxz > 0 ? c : a));
- }
-
-  /**
-   * This is used for 'external' sorting. The comparator takes <em>indices</em>,
-   * not values, and compares the external values found at those indices.
-   * @param a index of the first candidate
-   * @param b index of the second candidate
-   * @param c index of the third candidate
-   * @param comp comparator over indices
-   * @return whichever of a, b and c points at the median of the three external values
-   */
- private static int med3(int a, int b, int c, IntComparator comp) {
- int comparisonab = comp.compare(a, b);
- int comparisonac = comp.compare(a, c);
- int comparisonbc = comp.compare(b, c);
- return comparisonab < 0
- ? (comparisonbc < 0 ? b : (comparisonac < 0 ? c : a))
- : (comparisonbc > 0 ? b : (comparisonac > 0 ? c : a));
- }
-
- private static int med3(long[] array, int a, int b, int c, LongComparator comp) {
- long x = array[a];
- long y = array[b];
- long z = array[c];
- int comparisonxy = comp.compare(x, y);
- int comparisonxz = comp.compare(x, z);
- int comparisonyz = comp.compare(y, z);
- return comparisonxy < 0 ? (comparisonyz < 0 ? b
- : (comparisonxz < 0 ? c : a)) : (comparisonyz > 0 ? b
- : (comparisonxz > 0 ? c : a));
- }
-
- private static int med3(short[] array, int a, int b, int c,
- ShortComparator comp) {
- short x = array[a];
- short y = array[b];
- short z = array[c];
- int comparisonxy = comp.compare(x, y);
- int comparisonxz = comp.compare(x, z);
- int comparisonyz = comp.compare(y, z);
- return comparisonxy < 0 ? (comparisonyz < 0 ? b
- : (comparisonxz < 0 ? c : a)) : (comparisonyz > 0 ? b
- : (comparisonxz > 0 ? c : a));
- }
-
- /**
- * Sorts the specified range in the array in a specified order.
- *
- * @param array
- * the {@code byte} array to be sorted.
- * @param start
- * the start index to sort.
- * @param end
- * the last + 1 index to sort.
- * @param comp
- * the comparison that determines the sort.
- * @throws IllegalArgumentException
- * if {@code start > end}.
- * @throws ArrayIndexOutOfBoundsException
- * if {@code start < 0} or {@code end > array.length}.
- */
- public static void quickSort(byte[] array, int start, int end,
- ByteComparator comp) {
- Preconditions.checkNotNull(array);
- checkBounds(array.length, start, end);
- quickSort0(start, end, array, comp);
- }
-
- private static void checkBounds(int arrLength, int start, int end) {
- if (start > end) {
- // K0033=Start index ({0}) is greater than end index ({1})
- throw new IllegalArgumentException("Start index " + start
- + " is greater than end index " + end);
- }
- if (start < 0) {
- throw new ArrayIndexOutOfBoundsException("Array index out of range "
- + start);
- }
- if (end > arrLength) {
- throw new ArrayIndexOutOfBoundsException("Array index out of range "
- + end);
- }
- }
-
- private static void quickSort0(int start, int end, byte[] array, ByteComparator comp) {
- byte temp;
- int length = end - start;
- if (length < 7) {
- for (int i = start + 1; i < end; i++) {
- for (int j = i; j > start && comp.compare(array[j - 1], array[j]) > 0; j--) {
- temp = array[j];
- array[j] = array[j - 1];
- array[j - 1] = temp;
- }
- }
- return;
- }
- int middle = (start + end) / 2;
- if (length > 7) {
- int bottom = start;
- int top = end - 1;
- if (length > 40) {
- length /= 8;
- bottom = med3(array, bottom, bottom + length, bottom + (2 * length),
- comp);
- middle = med3(array, middle - length, middle, middle + length, comp);
- top = med3(array, top - (2 * length), top - length, top, comp);
- }
- middle = med3(array, bottom, middle, top, comp);
- }
- byte partionValue = array[middle];
- int a = start;
- int b = a;
- int c = end - 1;
- int d = c;
- while (true) {
- int comparison;
- while (b <= c && (comparison = comp.compare(array[b], partionValue)) <= 0) {
- if (comparison == 0) {
- temp = array[a];
- array[a++] = array[b];
- array[b] = temp;
- }
- b++;
- }
- while (c >= b && (comparison = comp.compare(array[c], partionValue)) >= 0) {
- if (comparison == 0) {
- temp = array[c];
- array[c] = array[d];
- array[d--] = temp;
- }
- c--;
- }
- if (b > c) {
- break;
- }
- temp = array[b];
- array[b++] = array[c];
- array[c--] = temp;
- }
- length = a - start < b - a ? a - start : b - a;
- int l = start;
- int h = b - length;
- while (length-- > 0) {
- temp = array[l];
- array[l++] = array[h];
- array[h++] = temp;
- }
- length = d - c < end - 1 - d ? d - c : end - 1 - d;
- l = b;
- h = end - length;
- while (length-- > 0) {
- temp = array[l];
- array[l++] = array[h];
- array[h++] = temp;
- }
- if ((length = b - a) > 0) {
- quickSort0(start, start + length, array, comp);
- }
- if ((length = d - c) > 0) {
- quickSort0(end - length, end, array, comp);
- }
- }
-
-
- /**
- * Sorts some external data with QuickSort.
- *
- * @param start
- * the start index to sort.
- * @param end
- * the last + 1 index to sort.
- * @param comp
- * the comparator.
- * @param swap an object that can exchange the positions of two items.
- * @throws IllegalArgumentException
- * if {@code start > end}.
- * @throws ArrayIndexOutOfBoundsException
-   *           if {@code start < 0}.
- */
- public static void quickSort(int start, int end, IntComparator comp, Swapper swap) {
- checkBounds(end + 1, start, end);
- quickSort0(start, end, comp, swap);
- }
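
Worth noting: because the comparator and the Swapper both work on positions rather than values, this overload can co-sort parallel arrays. A sketch, assuming Java 8 lambdas are accepted for the single-method IntComparator and Swapper types:

    import org.apache.mahout.math.Sorting;

    public class ParallelArraySortSketch {
      public static void main(String[] args) {
        int[] keys = {3, 1, 2};
        double[] vals = {0.3, 0.1, 0.2};
        Sorting.quickSort(0, keys.length,
            (a, b) -> Integer.compare(keys[a], keys[b]),  // compare the values at two positions
            (a, b) -> {                                   // swap both arrays in lock step
              int k = keys[a]; keys[a] = keys[b]; keys[b] = k;
              double v = vals[a]; vals[a] = vals[b]; vals[b] = v;
            });
        // keys is now {1, 2, 3} and vals is {0.1, 0.2, 0.3}
      }
    }
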
-
- private static void quickSort0(int start, int end, IntComparator comp, Swapper swap) {
- int length = end - start;
- if (length < 7) {
- insertionSort(start, end, comp, swap);
- return;
- }
- int middle = (start + end) / 2;
- if (length > 7) {
- int bottom = start;
- int top = end - 1;
- if (length > 40) {
- // for lots of data, bottom, middle and top are medians near the beginning, middle or end of the data
- int skosh = length / 8;
- bottom = med3(bottom, bottom + skosh, bottom + (2 * skosh), comp);
- middle = med3(middle - skosh, middle, middle + skosh, comp);
- top = med3(top - (2 * skosh), top - skosh, top, comp);
- }
- middle = med3(bottom, middle, top, comp);
- }
-
- int partitionIndex = middle; // an index, not a value.
-
- // regions from a to b and from c to d are what we will recursively sort
- int a = start;
- int b = a;
- int c = end - 1;
- int d = c;
- while (b <= c) {
- // copy all values equal to the partition value to before a..b. In the process, advance b
- // as long as values less than the partition or equal are found, also stop when a..b collides with c..d
- int comparison;
- while (b <= c && (comparison = comp.compare(b, partitionIndex)) <= 0) {
- if (comparison == 0) {
- if (a == partitionIndex) {
- partitionIndex = b;
- } else if (b == partitionIndex) {
- partitionIndex = a;
- }
- swap.swap(a, b);
- a++;
- }
- b++;
- }
- // at this point [start..a) has partition values, [a..b) has values < partition
- // also, either b>c or v[b] > partition value
-
- while (c >= b && (comparison = comp.compare(c, partitionIndex)) >= 0) {
- if (comparison == 0) {
- if (c == partitionIndex) {
- partitionIndex = d;
- } else if (d == partitionIndex) {
- partitionIndex = c;
- }
- swap.swap(c, d);
-
- d--;
- }
- c--;
- }
- // now we also know that [d..end] contains partition values,
- // [c..d) contains values > partition value
- // also, either b>c or (v[b] > partition OR v[c] < partition)
-
- if (b <= c) {
- // v[b] > partition OR v[c] < partition
- // swapping will let us continue to grow the two regions
- if (c == partitionIndex) {
- partitionIndex = b;
- } else if (b == partitionIndex) {
- partitionIndex = d;
- }
- swap.swap(b, c);
- b++;
- c--;
- }
- }
- // now we know
- // b = c+1
- // [start..a) and [d..end) contain partition value
- // all of [a..b) are less than partition
- // all of [c..d) are greater than partition
-
- // shift [a..b) to beginning
- length = Math.min(a - start, b - a);
- int l = start;
- int h = b - length;
- while (length-- > 0) {
- swap.swap(l, h);
- l++;
- h++;
- }
-
- // shift [c..d) to end
- length = Math.min(d - c, end - 1 - d);
- l = b;
- h = end - length;
- while (length-- > 0) {
- swap.swap(l, h);
- l++;
- h++;
- }
-
- // recurse left and right
- length = b - a;
- if (length > 0) {
- quickSort0(start, start + length, comp, swap);
- }
-
- length = d - c;
- if (length > 0) {
- quickSort0(end - length, end, comp, swap);
- }
- }
-
- /**
- * In-place insertion sort that is fast for pre-sorted data.
- *
- * @param start Where to start sorting (inclusive)
- * @param end Where to stop (exclusive)
- * @param comp Sort order.
- * @param swap How to swap items.
- */
- private static void insertionSort(int start, int end, IntComparator comp, Swapper swap) {
- for (int i = start + 1; i < end; i++) {
- for (int j = i; j > start && comp.compare(j - 1, j) > 0; j--) {
- swap.swap(j - 1, j);
- }
- }
- }
- /**
- * Sorts the specified range in the array in a specified order.
- *
- * @param array
- * the {@code char} array to be sorted.
- * @param start
- * the start index to sort.
- * @param end
-   *          the last + 1 index to sort.
-   * @param comp
-   *          the comparator.
- * @throws IllegalArgumentException
- * if {@code start > end}.
- * @throws ArrayIndexOutOfBoundsException
- * if {@code start < 0} or {@code end > array.length}.
- */
- public static void quickSort(char[] array, int start, int end, CharComparator comp) {
- Preconditions.checkNotNull(array);
- checkBounds(array.length, start, end);
- quickSort0(start, end, array, comp);
- }
-
- private static void quickSort0(int start, int end, char[] array, CharComparator comp) {
- char temp;
- int length = end - start;
- if (length < 7) {
- for (int i = start + 1; i < end; i++) {
- for (int j = i; j > start && comp.compare(array[j - 1], array[j]) > 0; j--) {
- temp = array[j];
- array[j] = array[j - 1];
- array[j - 1] = temp;
- }
- }
- return;
- }
- int middle = (start + end) / 2;
- if (length > 7) {
- int bottom = start;
- int top = end - 1;
- if (length > 40) {
- length /= 8;
- bottom = med3(array, bottom, bottom + length, bottom + (2 * length),
- comp);
- middle = med3(array, middle - length, middle, middle + length, comp);
- top = med3(array, top - (2 * length), top - length, top, comp);
- }
- middle = med3(array, bottom, middle, top, comp);
- }
- char partionValue = array[middle];
- int a = start;
- int b = a;
- int c = end - 1;
- int d = c;
- while (true) {
- int comparison;
- while (b <= c && (comparison = comp.compare(array[b], partionValue)) <= 0) {
- if (comparison == 0) {
- temp = array[a];
- array[a++] = array[b];
- array[b] = temp;
- }
- b++;
- }
- while (c >= b && (comparison = comp.compare(array[c], partionValue)) >= 0) {
- if (comparison == 0) {
- temp = array[c];
- array[c] = array[d];
- array[d--] = temp;
- }
- c--;
- }
- if (b > c) {
- break;
- }
- temp = array[b];
- array[b++] = array[c];
- array[c--] = temp;
- }
- length = a - start < b - a ? a - start : b - a;
- int l = start;
- int h = b - length;
- while (length-- > 0) {
- temp = array[l];
- array[l++] = array[h];
- array[h++] = temp;
- }
- length = d - c < end - 1 - d ? d - c : end - 1 - d;
- l = b;
- h = end - length;
- while (length-- > 0) {
- temp = array[l];
- array[l++] = array[h];
- array[h++] = temp;
- }
- if ((length = b - a) > 0) {
- quickSort0(start, start + length, array, comp);
- }
- if ((length = d - c) > 0) {
- quickSort0(end - length, end, array, comp);
- }
- }
-
- /**
- * Sorts the specified range in the array in a specified order.
- *
- * @param array
- * the {@code double} array to be sorted.
- * @param start
- * the start index to sort.
- * @param end
- * the last + 1 index to sort.
- * @param comp
- * the comparison.
- * @throws IllegalArgumentException
- * if {@code start > end}.
- * @throws ArrayIndexOutOfBoundsException
- * if {@code start < 0} or {@code end > array.length}.
- * @see Double#compareTo(Double)
- */
- public static void quickSort(double[] array, int start, int end, DoubleComparator comp) {
- Preconditions.checkNotNull(array);
- checkBounds(array.length, start, end);
- quickSort0(start, end, array, comp);
- }
-
- private static void quickSort0(int start, int end, double[] array, DoubleComparator comp) {
- double temp;
- int length = end - start;
- if (length < 7) {
- for (int i = start + 1; i < end; i++) {
- for (int j = i; j > start && comp.compare(array[j], array[j - 1]) < 0; j--) {
- temp = array[j];
- array[j] = array[j - 1];
- array[j - 1] = temp;
- }
- }
- return;
- }
- int middle = (start + end) / 2;
- if (length > 7) {
- int bottom = start;
- int top = end - 1;
- if (length > 40) {
- length /= 8;
- bottom = med3(array, bottom, bottom + length, bottom + (2 * length),
- comp);
- middle = med3(array, middle - length, middle, middle + length, comp);
- top = med3(array, top - (2 * length), top - length, top, comp);
- }
- middle = med3(array, bottom, middle, top, comp);
- }
- double partionValue = array[middle];
- int a = start;
- int b = a;
- int c = end - 1;
- int d = c;
- while (true) {
- int comparison;
- while (b <= c && (comparison = comp.compare(partionValue, array[b])) >= 0) {
- if (comparison == 0) {
- temp = array[a];
- array[a++] = array[b];
- array[b] = temp;
- }
- b++;
- }
- while (c >= b && (comparison = comp.compare(array[c], partionValue)) >= 0) {
- if (comparison == 0) {
- temp = array[c];
- array[c] = array[d];
- array[d--] = temp;
- }
- c--;
- }
- if (b > c) {
- break;
- }
- temp = array[b];
- array[b++] = array[c];
- array[c--] = temp;
- }
- length = a - start < b - a ? a - start : b - a;
- int l = start;
- int h = b - length;
- while (length-- > 0) {
- temp = array[l];
- array[l++] = array[h];
- array[h++] = temp;
- }
- length = d - c < end - 1 - d ? d - c : end - 1 - d;
- l = b;
- h = end - length;
- while (length-- > 0) {
- temp = array[l];
- array[l++] = array[h];
- array[h++] = temp;
- }
- if ((length = b - a) > 0) {
- quickSort0(start, start + length, array, comp);
- }
- if ((length = d - c) > 0) {
- quickSort0(end - length, end, array, comp);
- }
- }
-
- /**
- * Sorts the specified range in the array in a specified order.
- *
- * @param array
- * the {@code float} array to be sorted.
- * @param start
- * the start index to sort.
- * @param end
- * the last + 1 index to sort.
- * @param comp
- * the comparator.
- * @throws IllegalArgumentException
- * if {@code start > end}.
- * @throws ArrayIndexOutOfBoundsException
- * if {@code start < 0} or {@code end > array.length}.
- */
- public static void quickSort(float[] array, int start, int end, FloatComparator comp) {
- Preconditions.checkNotNull(array);
- checkBounds(array.length, start, end);
- quickSort0(start, end, array, comp);
- }
-
- private static void quickSort0(int start, int end, float[] array, FloatComparator comp) {
- float temp;
- int length = end - start;
- if (length < 7) {
- for (int i = start + 1; i < end; i++) {
- for (int j = i; j > start && comp.compare(array[j], array[j - 1]) < 0; j--) {
- temp = array[j];
- array[j] = array[j - 1];
- array[j - 1] = temp;
- }
- }
- return;
- }
- int middle = (start + end) / 2;
- if (length > 7) {
- int bottom = start;
- int top = end - 1;
- if (length > 40) {
- length /= 8;
- bottom = med3(array, bottom, bottom + length, bottom + (2 * length),
- comp);
- middle = med3(array, middle - length, middle, middle + length, comp);
- top = med3(array, top - (2 * length), top - length, top, comp);
- }
- middle = med3(array, bottom, middle, top, comp);
- }
- float partionValue = array[middle];
- int a = start;
- int b = a;
- int c = end - 1;
- int d = c;
- while (true) {
- int comparison;
- while (b <= c && (comparison = comp.compare(partionValue, array[b])) >= 0) {
- if (comparison == 0) {
- temp = array[a];
- array[a++] = array[b];
- array[b] = temp;
- }
- b++;
- }
- while (c >= b && (comparison = comp.compare(array[c], partionValue)) >= 0) {
- if (comparison == 0) {
- temp = array[c];
- array[c] = array[d];
- array[d--] = temp;
- }
- c--;
- }
- if (b > c) {
- break;
- }
- temp = array[b];
- array[b++] = array[c];
- array[c--] = temp;
- }
- length = a - start < b - a ? a - start : b - a;
- int l = start;
- int h = b - length;
- while (length-- > 0) {
- temp = array[l];
- array[l++] = array[h];
- array[h++] = temp;
- }
- length = d - c < end - 1 - d ? d - c : end - 1 - d;
- l = b;
- h = end - length;
- while (length-- > 0) {
- temp = array[l];
- array[l++] = array[h];
- array[h++] = temp;
- }
- if ((length = b - a) > 0) {
- quickSort0(start, start + length, array, comp);
- }
- if ((length = d - c) > 0) {
- quickSort0(end - length, end, array, comp);
- }
- }
-
- /**
- * Sorts the specified range in the array in a specified order.
- *
- * @param array
- * the {@code int} array to be sorted.
- * @param start
- * the start index to sort.
- * @param end
- * the last + 1 index to sort.
- * @param comp
- * the comparator.
- * @throws IllegalArgumentException
- * if {@code start > end}.
- * @throws ArrayIndexOutOfBoundsException
- * if {@code start < 0} or {@code end > array.length}.
- */
- public static void quickSort(int[] array, int start, int end, IntComparator comp) {
- Preconditions.checkNotNull(array);
- checkBounds(array.length, start, end);
- quickSort0(start, end, array, comp);
- }
-
- private static void quickSort0(int start, int end, int[] array, IntComparator comp) {
- int temp;
- int length = end - start;
- if (length < 7) {
- for (int i = start + 1; i < end; i++) {
- for (int j = i; j > start && comp.compare(array[j - 1], array[j]) > 0; j--) {
- temp = array[j];
- array[j] = array[j - 1];
- array[j - 1] = temp;
- }
- }
- return;
- }
- int middle = (start + end) / 2;
- if (length > 7) {
- int bottom = start;
- int top = end - 1;
- if (length > 40) {
- length /= 8;
- bottom = med3(array, bottom, bottom + length, bottom + (2 * length),
- comp);
- middle = med3(array, middle - length, middle, middle + length, comp);
- top = med3(array, top - (2 * length), top - length, top, comp);
- }
- middle = med3(array, bottom, middle, top, comp);
- }
- int partionValue = array[middle];
- int a = start;
- int b = a;
- int c = end - 1;
- int d = c;
- while (true) {
- int comparison;
- while (b <= c && (comparison = comp.compare(array[b], partionValue)) <= 0) {
- if (comparison == 0) {
- temp = array[a];
- array[a++] = array[b];
- array[b] = temp;
- }
- b++;
- }
- while (c >= b && (comparison = comp.compare(array[c], partionValue)) >= 0) {
- if (comparison == 0) {
- temp = array[c];
- array[c] = array[d];
- array[d--] = temp;
- }
- c--;
- }
- if (b > c) {
- break;
- }
- temp = array[b];
- array[b++] = array[c];
- array[c--] = temp;
- }
- length = a - start < b - a ? a - start : b - a;
- int l = start;
- int h = b - length;
- while (length-- > 0) {
- temp = array[l];
- array[l++] = array[h];
- array[h++] = temp;
- }
- length = d - c < end - 1 - d ? d - c : end - 1 - d;
- l = b;
- h = end - length;
- while (length-- > 0) {
- temp = array[l];
- array[l++] = array[h];
- array[h++] = temp;
- }
- if ((length = b - a) > 0) {
- quickSort0(start, start + length, array, comp);
- }
- if ((length = d - c) > 0) {
- quickSort0(end - length, end, array, comp);
- }
- }
-
- /**
- * Sorts the specified range in the array in a specified order.
- *
- * @param array
- * the {@code long} array to be sorted.
- * @param start
- * the start index to sort.
- * @param end
- * the last + 1 index to sort.
- * @param comp
- * the comparator.
- * @throws IllegalArgumentException
- * if {@code start > end}.
- * @throws ArrayIndexOutOfBoundsException
- * if {@code start < 0} or {@code end > array.length}.
- */
- public static void quickSort(long[] array, int start, int end, LongComparator comp) {
- Preconditions.checkNotNull(array);
- checkBounds(array.length, start, end);
- quickSort0(start, end, array, comp);
- }
-
- private static void quickSort0(int start, int end, long[] array, LongComparator comp) {
- long temp;
- int length = end - start;
- if (length < 7) {
- for (int i = start + 1; i < end; i++) {
- for (int j = i; j > start && comp.compare(array[j - 1], array[j]) > 0; j--) {
- temp = array[j];
- array[j] = array[j - 1];
- array[j - 1] = temp;
- }
- }
- return;
- }
- int middle = (start + end) / 2;
- if (length > 7) {
- int bottom = start;
- int top = end - 1;
- if (length > 40) {
- length /= 8;
- bottom = med3(array, bottom, bottom + length, bottom + (2 * length),
- comp);
- middle = med3(array, middle - length, middle, middle + length, comp);
- top = med3(array, top - (2 * length), top - length, top, comp);
- }
- middle = med3(array, bottom, middle, top, comp);
- }
- long partionValue = array[middle];
- int a = start;
- int b = a;
- int c = end - 1;
- int d = c;
- while (true) {
- int comparison;
- while (b <= c && (comparison = comp.compare(array[b], partionValue)) <= 0) {
- if (comparison == 0) {
- temp = array[a];
- array[a++] = array[b];
- array[b] = temp;
- }
- b++;
- }
- while (c >= b && (comparison = comp.compare(array[c], partionValue)) >= 0) {
- if (comparison == 0) {
- temp = array[c];
- array[c] = array[d];
- array[d--] = temp;
- }
- c--;
- }
- if (b > c) {
- break;
- }
- temp = array[b];
- array[b++] = array[c];
- array[c--] = temp;
- }
- length = a - start < b - a ? a - start : b - a;
- int l = start;
- int h = b - length;
- while (length-- > 0) {
- temp = array[l];
- array[l++] = array[h];
- array[h++] = temp;
- }
- length = d - c < end - 1 - d ? d - c : end - 1 - d;
- l = b;
- h = end - length;
- while (length-- > 0) {
- temp = array[l];
- array[l++] = array[h];
- array[h++] = temp;
- }
- if ((length = b - a) > 0) {
- quickSort0(start, start + length, array, comp);
- }
- if ((length = d - c) > 0) {
- quickSort0(end - length, end, array, comp);
- }
- }
-
- /**
- * Sorts the specified range in the array in a specified order.
- *
- * @param array
- * the array to be sorted.
- * @param start
- * the start index to sort.
- * @param end
- * the last + 1 index to sort.
- * @param comp
- * the comparator.
- * @throws IllegalArgumentException
- * if {@code start > end}.
- * @throws ArrayIndexOutOfBoundsException
- * if {@code start < 0} or {@code end > array.length}.
- */
- public static <T> void quickSort(T[] array, int start, int end, Comparator<T> comp) {
- Preconditions.checkNotNull(array);
- checkBounds(array.length, start, end);
- quickSort0(start, end, array, comp);
- }
-
- private static final class ComparableAdaptor<T extends Comparable<? super T>>
- implements Comparator<T>, Serializable {
-
- @Override
- public int compare(T o1, T o2) {
- return o1.compareTo(o2);
- }
-
- }
-
- /**
- * Sort the specified range of an array of object that implement the Comparable
- * interface.
- * @param <T> The type of object.
- * @param array the array.
- * @param start the first index.
- * @param end the last index (exclusive).
- */
- public static <T extends Comparable<? super T>> void quickSort(T[] array, int start, int end) {
- quickSort(array, start, end, new ComparableAdaptor<T>());
- }
-
- private static <T> void quickSort0(int start, int end, T[] array, Comparator<T> comp) {
- T temp;
- int length = end - start;
- if (length < 7) {
- for (int i = start + 1; i < end; i++) {
- for (int j = i; j > start && comp.compare(array[j - 1], array[j]) > 0; j--) {
- temp = array[j];
- array[j] = array[j - 1];
- array[j - 1] = temp;
- }
- }
- return;
- }
- int middle = (start + end) / 2;
- if (length > 7) {
- int bottom = start;
- int top = end - 1;
- if (length > 40) {
- length /= 8;
- bottom = med3(array, bottom, bottom + length, bottom + (2 * length),
- comp);
- middle = med3(array, middle - length, middle, middle + length, comp);
- top = med3(array, top - (2 * length), top - length, top, comp);
- }
- middle = med3(array, bottom, middle, top, comp);
- }
- T partionValue = array[middle];
- int a = start;
- int b = a;
- int c = end - 1;
- int d = c;
- while (true) {
- int comparison;
- while (b <= c && (comparison = comp.compare(array[b], partionValue)) <= 0) {
- if (comparison == 0) {
- temp = array[a];
- array[a++] = array[b];
- array[b] = temp;
- }
- b++;
- }
- while (c >= b && (comparison = comp.compare(array[c], partionValue)) >= 0) {
- if (comparison == 0) {
- temp = array[c];
- array[c] = array[d];
- array[d--] = temp;
- }
- c--;
- }
- if (b > c) {
- break;
- }
- temp = array[b];
- array[b++] = array[c];
- array[c--] = temp;
- }
- length = a - start < b - a ? a - start : b - a;
- int l = start;
- int h = b - length;
- while (length-- > 0) {
- temp = array[l];
- array[l++] = array[h];
- array[h++] = temp;
- }
- length = d - c < end - 1 - d ? d - c : end - 1 - d;
- l = b;
- h = end - length;
- while (length-- > 0) {
- temp = array[l];
- array[l++] = array[h];
- array[h++] = temp;
- }
- if ((length = b - a) > 0) {
- quickSort0(start, start + length, array, comp);
- }
- if ((length = d - c) > 0) {
- quickSort0(end - length, end, array, comp);
- }
- }
-
- /**
-   * Sorts the specified range in the array in a specified order.
-   *
-   * @param array
-   *          the {@code short} array to be sorted.
-   * @param start
-   *          the start index to sort.
-   * @param end
-   *          the last + 1 index to sort.
-   * @param comp
-   *          the comparator.
- * @throws IllegalArgumentException
- * if {@code start > end}.
- * @throws ArrayIndexOutOfBoundsException
- * if {@code start < 0} or {@code end > array.length}.
- */
- public static void quickSort(short[] array, int start, int end, ShortComparator comp) {
- Preconditions.checkNotNull(array);
- checkBounds(array.length, start, end);
- quickSort0(start, end, array, comp);
- }
-
- private static void quickSort0(int start, int end, short[] array, ShortComparator comp) {
- short temp;
- int length = end - start;
- if (length < 7) {
- for (int i = start + 1; i < end; i++) {
- for (int j = i; j > start && comp.compare(array[j - 1], array[j]) > 0; j--) {
- temp = array[j];
- array[j] = array[j - 1];
- array[j - 1] = temp;
- }
- }
- return;
- }
- int middle = (start + end) / 2;
- if (length > 7) {
- int bottom = start;
- int top = end - 1;
- if (length > 40) {
- length /= 8;
- bottom = med3(array, bottom, bottom + length, bottom + (2 * length),
- comp);
- middle = med3(array, middle - length, middle, middle + length, comp);
- top = med3(array, top - (2 * length), top - length, top, comp);
- }
- middle = med3(array, bottom, middle, top, comp);
- }
- short partionValue = array[middle];
- int a = start;
- int b = a;
- int c = end - 1;
- int d = c;
- while (true) {
- int comparison;
-      while (b <= c && (comparison = comp.compare(array[b], partionValue)) <= 0) {
- if (comparison == 0) {
- temp = array[a];
- array[a++] = array[b];
- array[b] = temp;
- }
- b++;
- }
-      while (c >= b && (comparison = comp.compare(array[c], partionValue)) >= 0) {
- if (comparison == 0) {
- temp = array[c];
- array[c] = array[d];
- array[d--] = temp;
- }
- c--;
- }
- if (b > c) {
- break;
- }
- temp = array[b];
- array[b++] = array[c];
- array[c--] = temp;
- }
- length = a - start < b - a ? a - start : b - a;
- int l = start;
- int h = b - length;
- while (length-- > 0) {
- temp = array[l];
- array[l++] = array[h];
- array[h++] = temp;
- }
- length = d - c < end - 1 - d ? d - c : end - 1 - d;
- l = b;
- h = end - length;
- while (length-- > 0) {
- temp = array[l];
- array[l++] = array[h];
- array[h++] = temp;
- }
- if ((length = b - a) > 0) {
- quickSort0(start, start + length, array, comp);
- }
- if ((length = d - c) > 0) {
- quickSort0(end - length, end, array, comp);
- }
- }
-
- /**
- * Perform a merge sort on the specified range of an array.
- *
- * @param <T> the type of object in the array.
- * @param array the array.
- * @param start first index.
- * @param end last index (exclusive).
- * @param comp comparator object.
- */
- @SuppressWarnings("unchecked") // required to make the temp array work, afaict.
- public static <T> void mergeSort(T[] array, int start, int end, Comparator<T> comp) {
- checkBounds(array.length, start, end);
- int length = end - start;
- if (length <= 0) {
- return;
- }
-
- T[] out = (T[]) new Object[array.length];
- System.arraycopy(array, start, out, start, length);
- mergeSort(out, array, start, end, comp);
- }
-
- /**
- * Perform a merge sort of the specific range of an array of objects that implement
- * Comparable.
- * @param <T> the type of the objects in the array.
- * @param array the array.
- * @param start the first index.
- * @param end the last index (exclusive).
- */
- public static <T extends Comparable<? super T>> void mergeSort(T[] array, int start, int end) {
- mergeSort(array, start, end, new ComparableAdaptor<T>());
- }
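
Usage is straightforward, and only the requested range is touched. A quick sketch on boxed integers:

    import org.apache.mahout.math.Sorting;

    public class MergeSortRangeSketch {
      public static void main(String[] args) {
        Integer[] data = {5, 3, 9, 1, 7};
        Sorting.mergeSort(data, 1, 4);  // sorts indices 1..3 only
        for (Integer d : data) {
          System.out.print(d + " ");    // prints: 5 1 3 9 7
        }
      }
    }
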
-
- /**
-   * Performs a sort on the section of the array between the given indices using
-   * a merge sort in which the merge step is performed by exponential search.
-   * O(n log n) performance is guaranteed, and in the average case it is faster
-   * than a merge sort whose merge step uses linear search.
- *
- * @param in
- * - the array for sorting.
- * @param out
- * - the result, sorted array.
- * @param start
- * the start index
- * @param end
- * the end index + 1
- * @param c
- * - the comparator to determine the order of the array.
- */
- private static <T> void mergeSort(T[] in, T[] out, int start, int end, Comparator<T> c) {
- int len = end - start;
- // use insertion sort for small arrays
- if (len <= SIMPLE_LENGTH) {
- for (int i = start + 1; i < end; i++) {
- T current = out[i];
- T prev = out[i - 1];
- if (c.compare(prev, current) > 0) {
- int j = i;
- do {
- out[j--] = prev;
- } while (j > start && (c.compare(prev = out[j - 1], current) > 0));
- out[j] = current;
- }
- }
- return;
- }
- int med = (end + start) >>> 1;
- mergeSort(out, in, start, med, c);
- mergeSort(out, in, med, end, c);
-
- // merging
-
- // if arrays are already sorted - no merge
- if (c.compare(in[med - 1], in[med]) <= 0) {
- System.arraycopy(in, start, out, start, len);
- return;
- }
- int r = med;
- int i = start;
-
- // use merging with exponential search
- do {
- T fromVal = in[start];
- T rVal = in[r];
- if (c.compare(fromVal, rVal) <= 0) {
- int l_1 = find(in, rVal, -1, start + 1, med - 1, c);
- int toCopy = l_1 - start + 1;
- System.arraycopy(in, start, out, i, toCopy);
- i += toCopy;
- out[i++] = rVal;
- r++;
- start = l_1 + 1;
- } else {
- int r_1 = find(in, fromVal, 0, r + 1, end - 1, c);
- int toCopy = r_1 - r + 1;
- System.arraycopy(in, r, out, i, toCopy);
- i += toCopy;
- out[i++] = fromVal;
- start++;
- r = r_1 + 1;
- }
- } while ((end - r) > 0 && (med - start) > 0);
-
- // copy rest of array
- if ((end - r) <= 0) {
- System.arraycopy(in, start, out, i, med - start);
- } else {
- System.arraycopy(in, r, out, i, end - r);
- }
- }
-
-  /**
-   * Finds the position in the sorted range of the array at which the element
-   * should be inserted to keep the range sorted. Uses exponential search.
-   *
-   * @param arr
-   *          the array with an already sorted range
-   * @param val
-   *          the object to be inserted
-   * @param bnd
-   *          either 0 or -1. With -1, the returned position is after any
-   *          elements equal to val; with 0, it is before them.
-   * @param l
-   *          the start index
-   * @param r
-   *          the end index
-   * @param c
-   *          the comparator used to compare objects
-   */
- private static <T> int find(T[] arr, T val, int bnd, int l, int r, Comparator<T> c) {
- int m = l;
- int d = 1;
- while (m <= r) {
- if (c.compare(val, arr[m]) > bnd) {
- l = m + 1;
- } else {
- r = m - 1;
- break;
- }
- m += d;
- d <<= 1;
- }
- while (l <= r) {
- m = (l + r) >>> 1;
- if (c.compare(val, arr[m]) > bnd) {
- l = m + 1;
- } else {
- r = m - 1;
- }
- }
- return l - 1;
- }
-
- private static final ByteComparator NATURAL_BYTE_COMPARISON = new ByteComparator() {
- @Override
- public int compare(byte o1, byte o2) {
- return o1 - o2;
- }
- };
-
- /**
- * Perform a merge sort on a range of a byte array, using numerical order.
- * @param array the array.
- * @param start the first index.
- * @param end the last index (exclusive).
- */
- public static void mergeSort(byte[] array, int start, int end) {
- mergeSort(array, start, end, NATURAL_BYTE_COMPARISON);
- }
-
- /**
- * Perform a merge sort on a range of a byte array using a specified ordering.
- * @param array the array.
- * @param start the first index.
- * @param end the last index (exclusive).
- * @param comp the comparator object.
- */
- public static void mergeSort(byte[] array, int start, int end, ByteComparator comp) {
- checkBounds(array.length, start, end);
- byte[] out = Arrays.copyOf(array, array.length);
- mergeSort(out, array, start, end, comp);
- }
-
- private static void mergeSort(byte[] in, byte[] out, int start, int end, ByteComparator c) {
- int len = end - start;
- // use insertion sort for small arrays
- if (len <= SIMPLE_LENGTH) {
- for (int i = start + 1; i < end; i++) {
- byte current = out[i];
- byte prev = out[i - 1];
- if (c.compare(prev, current) > 0) {
- int j = i;
- do {
- out[j--] = prev;
- } while (j > start && (c.compare(prev = out[j - 1], current) > 0));
- out[j] = current;
- }
- }
- return;
- }
- int med = (end + start) >>> 1;
- mergeSort(out, in, start, med, c);
- mergeSort(out, in, med, end, c);
-
- // merging
-
- // if arrays are already sorted - no merge
- if (c.compare(in[med - 1], in[med]) <= 0) {
- System.arraycopy(in, start, out, start, len);
- return;
- }
- int r = med;
- int i = start;
-
- // use merging with exponential search
- do {
- byte fromVal = in[start];
- byte rVal = in[r];
- if (c.compare(fromVal, rVal) <= 0) {
- int l_1 = find(in, rVal, -1, start + 1, med - 1, c);
- int toCopy = l_1 - start + 1;
- System.arraycopy(in, start, out, i, toCopy);
- i += toCopy;
- out[i++] = rVal;
- r++;
- start = l_1 + 1;
- } else {
- int r_1 = find(in, fromVal, 0, r + 1, end - 1, c);
- int toCopy = r_1 - r + 1;
- System.arraycopy(in, r, out, i, toCopy);
- i += toCopy;
- out[i++] = fromVal;
- start++;
- r = r_1 + 1;
- }
- } while ((end - r) > 0 && (med - start) > 0);
-
- // copy rest of array
- if ((end - r) <= 0) {
- System.arraycopy(in, start, out, i, med - start);
- } else {
- System.arraycopy(in, r, out, i, end - r);
- }
- }
-
- private static int find(byte[] arr, byte val, int bnd, int l, int r, ByteComparator c) {
- int m = l;
- int d = 1;
- while (m <= r) {
- if (c.compare(val, arr[m]) > bnd) {
- l = m + 1;
- } else {
- r = m - 1;
- break;
- }
- m += d;
- d <<= 1;
- }
- while (l <= r) {
- m = (l + r) >>> 1;
- if (c.compare(val, arr[m]) > bnd) {
- l = m + 1;
- } else {
- r = m - 1;
- }
- }
- return l - 1;
- }
-
- private static final CharComparator NATURAL_CHAR_COMPARISON = new CharComparator() {
- @Override
- public int compare(char o1, char o2) {
- return o1 - o2;
- }
- };
-
- /**
- * Perform a merge sort on a range of a char array, using numerical order.
- * @param array the array.
- * @param start the first index.
- * @param end the last index (exclusive).
- */
- public static void mergeSort(char[] array, int start, int end) {
- mergeSort(array, start, end, NATURAL_CHAR_COMPARISON);
- }
-
- /**
- * Perform a merge sort on a range of a char array using a specified ordering.
- * @param array the array.
- * @param start the first index.
- * @param end the last index (exclusive).
- * @param comp the comparator object.
- */
- public static void mergeSort(char[] array, int start, int end, CharComparator comp) {
- checkBounds(array.length, start, end);
- char[] out = Arrays.copyOf(array, array.length);
- mergeSort(out, array, start, end, comp);
- }
-
- private static void mergeSort(char[] in, char[] out, int start, int end, CharComparator c) {
- int len = end - start;
- // use insertion sort for small arrays
- if (len <= SIMPLE_LENGTH) {
- for (int i = start + 1; i < end; i++) {
- char current = out[i];
- char prev = out[i - 1];
- if (c.compare(prev, current) > 0) {
- int j = i;
- do {
- out[j--] = prev;
- } while (j > start && (c.compare(prev = out[j - 1], current) > 0));
- out[j] = current;
- }
- }
- return;
- }
- int med = (end + start) >>> 1;
- mergeSort(out, in, start, med, c);
- mergeSort(out, in, med, end, c);
-
- // merging
-
- // if arrays are already sorted - no merge
- if (c.compare(in[med - 1], in[med]) <= 0) {
- System.arraycopy(in, start, out, start, len);
- return;
- }
- int r = med;
- int i = start;
-
- // use merging with exponential search
- do {
- char fromVal = in[start];
- char rVal = in[r];
- if (c.compare(fromVal, rVal) <= 0) {
- int l_1 = find(in, rVal, -1, start + 1, med - 1, c);
- int toCopy = l_1 - start + 1;
- System.arraycopy(in, start, out, i, toCopy);
- i += toCopy;
- out[i++] = rVal;
- r++;
- start = l_1 + 1;
- } else {
- int r_1 = find(in, fromVal, 0, r + 1, end - 1, c);
- int toCopy = r_1 - r + 1;
- System.arraycopy(in, r, out, i, toCopy);
- i += toCopy;
- out[i++] = fromVal;
- start++;
- r = r_1 + 1;
- }
- } while ((end - r) > 0 && (med - start) > 0);
-
- // copy rest of array
- if ((end - r) <= 0) {
- System.arraycopy(in, start, out, i, med - start);
- } else {
- System.arraycopy(in, r, out, i, end - r);
- }
- }
-
- private static int find(char[] arr, char val, int bnd, int l, int r, CharComparator c) {
- int m = l;
- int d = 1;
- while (m <= r) {
- if (c.compare(val, arr[m]) > bnd) {
- l = m + 1;
- } else {
- r = m - 1;
- break;
- }
- m += d;
- d <<= 1;
- }
- while (l <= r) {
- m = (l + r) >>> 1;
- if (c.compare(val, arr[m]) > bnd) {
- l = m + 1;
- } else {
- r = m - 1;
- }
- }
- return l - 1;
- }
-
- private static final ShortComparator NATURAL_SHORT_COMPARISON = new ShortComparator() {
- @Override
- public int compare(short o1, short o2) {
- return o1 - o2;
- }
- };
-
- /**
- * Perform a merge sort on a range of a short array, using numerical order.
- * @param array the array.
- * @param start the first index.
- * @param end the last index (exclusive).
- */
- public static void mergeSort(short[] array, int start, int end) {
- mergeSort(array, start, end, NATURAL_SHORT_COMPARISON);
- }
-
- /**
- * Perform a merge sort on a range of a short array using a specified ordering.
- * @param array the array.
- * @param start the first index.
- * @param end the last index (exclusive).
- * @param comp the comparator object.
- */
- public static void mergeSort(short[] array, int start, int end, ShortComparator comp) {
- checkBounds(array.length, start, end);
- short[] out = Arrays.copyOf(array, array.length);
- mergeSort(out, array, start, end, comp);
- }
-
- private static void mergeSort(short[] in, short[] out, int start, int end, ShortComparator c) {
- int len = end - start;
- // use insertion sort for small arrays
- if (len <= SIMPLE_LENGTH) {
- for (int i = start + 1; i < end; i++) {
- short current = out[i];
- short prev = out[i - 1];
- if (c.compare(prev, current) > 0) {
- int j = i;
- do {
- out[j--] = prev;
- } while (j > start && (c.compare(prev = out[j - 1], current) > 0));
- out[j] = current;
- }
- }
- return;
- }
- int med = (end + start) >>> 1;
- mergeSort(out, in, start, med, c);
- mergeSort(out, in, med, end, c);
-
- // merging
-
- // if arrays are already sorted - no merge
- if (c.compare(in[med - 1], in[med]) <= 0) {
- System.arraycopy(in, start, out, start, len);
- return;
- }
- int r = med;
- int i = start;
-
- // use merging with exponential search
- do {
- short fromVal = in[start];
- short rVal = in[r];
- if (c.compare(fromVal, rVal) <= 0) {
- int l_1 = find(in, rVal, -1, start + 1, med - 1, c);
- int toCopy = l_1 - start + 1;
- System.arraycopy(in, start, out, i, toCopy);
- i += toCopy;
- out[i++] = rVal;
- r++;
- start = l_1 + 1;
- } else {
- int r_1 = find(in, fromVal, 0, r + 1, end - 1, c);
- int toCopy = r_1 - r + 1;
- System.arraycopy(in, r, out, i, toCopy);
- i += toCopy;
- out[i++] = fromVal;
- start++;
- r = r_1 + 1;
- }
- } while ((end - r) > 0 && (med - start) > 0);
-
- // copy rest of array
- if ((end - r) <= 0) {
- System.arraycopy(in, start, out, i, med - start);
- } else {
- System.arraycopy(in, r, out, i, end - r);
- }
- }
-
- private static int find(short[] arr, short val, int bnd, int l, int r, ShortComparator c) {
- int m = l;
- int d = 1;
- while (m <= r) {
- if (c.compare(val, arr[m]) > bnd) {
- l = m + 1;
- } else {
- r = m - 1;
- break;
- }
- m += d;
- d <<= 1;
- }
- while (l <= r) {
- m = (l + r) >>> 1;
- if (c.compare(val, arr[m]) > bnd) {
- l = m + 1;
- } else {
- r = m - 1;
- }
- }
- return l - 1;
- }
-
- private static final IntComparator NATURAL_INT_COMPARISON = new IntComparator() {
- @Override
- public int compare(int o1, int o2) {
- return o1 < o2 ? -1 : o1 > o2 ? 1 : 0;
- }
- };
-
- /**
- * Perform a merge sort on a range of an int array, using numerical order.
- * @param array the array.
- * @param start the first index.
- * @param end the last index (exclusive).
- */
- public static void mergeSort(int[] array, int start, int end) {
- mergeSort(array, start, end, NATURAL_INT_COMPARISON);
- }
-
- /**
- * Perform a merge sort on a range of an int array using a specified ordering.
- * @param array the array.
- * @param start the first index.
- * @param end the last index (exclusive).
- * @param comp the comparator object.
- */
- public static void mergeSort(int[] array, int start, int end, IntComparator comp) {
- checkBounds(array.length, start, end);
- int[] out = Arrays.copyOf(array, array.length);
- mergeSort(out, array, start, end, comp);
- }
-
- /**
- * Perform a merge sort on a range of an int array using a specified ordering.
- * @param in the input buffer.
- * @param out the buffer that receives the sorted result.
- * @param start the first index.
- * @param end the last index (exclusive).
- * @param c the comparator object.
- */
- private static void mergeSort(int[] in, int[] out, int start, int end, IntComparator c) {
- int len = end - start;
- // use insertion sort for small arrays
- if (len <= SIMPLE_LENGTH) {
- for (int i = start + 1; i < end; i++) {
- int current = out[i];
- int prev = out[i - 1];
- if (c.compare(prev, current) > 0) {
- int j = i;
- do {
- out[j--] = prev;
- } while (j > start && (c.compare(prev = out[j - 1], current) > 0));
- out[j] = current;
- }
- }
- return;
- }
- int med = (end + start) >>> 1;
- mergeSort(out, in, start, med, c);
- mergeSort(out, in, med, end, c);
-
- // merging
-
- // if arrays are already sorted - no merge
- if (c.compare(in[med - 1], in[med]) <= 0) {
- System.arraycopy(in, start, out, start, len);
- return;
- }
- int r = med;
- int i = start;
-
- // use merging with exponential search
- do {
- int fromVal = in[start];
- int rVal = in[r];
- if (c.compare(fromVal, rVal) <= 0) {
- int l_1 = find(in, rVal, -1, start + 1, med - 1, c);
- int toCopy = l_1 - start + 1;
- System.arraycopy(in, start, out, i, toCopy);
- i += toCopy;
- out[i++] = rVal;
- r++;
- start = l_1 + 1;
- } else {
- int r_1 = find(in, fromVal, 0, r + 1, end - 1, c);
- int toCopy = r_1 - r + 1;
- System.arraycopy(in, r, out, i, toCopy);
- i += toCopy;
- out[i++] = fromVal;
- start++;
- r = r_1 + 1;
- }
- } while ((end - r) > 0 && (med - start) > 0);
-
- // copy rest of array
- if ((end - r) <= 0) {
- System.arraycopy(in, start, out, i, med - start);
- } else {
- System.arraycopy(in, r, out, i, end - r);
- }
- }
-
- private static int find(int[] arr, int val, int bnd, int l, int r, IntComparator c) {
- int m = l;
- int d = 1;
- while (m <= r) {
- if (c.compare(val, arr[m]) > bnd) {
- l = m + 1;
- } else {
- r = m - 1;
- break;
- }
- m += d;
- d <<= 1;
- }
- while (l <= r) {
- m = (l + r) >>> 1;
- if (c.compare(val, arr[m]) > bnd) {
- l = m + 1;
- } else {
- r = m - 1;
- }
- }
- return l - 1;
- }
-
-
- private static final LongComparator NATURAL_LONG_COMPARISON = new LongComparator() {
- @Override
- public int compare(long o1, long o2) {
- return o1 < o2 ? -1 : o1 > o2 ? 1 : 0;
- }
- };
-
- /**
- * Perform a merge sort on a range of a long array using numerical order.
- * @param array the array.
- * @param start the first index.
- * @param end the last index (exclusive).
- */
- public static void mergeSort(long[] array, int start, int end) {
- mergeSort(array, start, end, NATURAL_LONG_COMPARISON);
- }
-
- /**
- * Perform a merge sort on a range of a long array using a specified ordering.
- * @param array the array.
- * @param start the first index.
- * @param end the last index (exclusive).
- * @param comp the comparator object.
- */
- public static void mergeSort(long[] array, int start, int end, LongComparator comp) {
- checkBounds(array.length, start, end);
- long[] out = Arrays.copyOf(array, array.length);
- mergeSort(out, array, start, end, comp);
- }
-
- private static void mergeSort(long[] in, long[] out, int start, int end, LongComparator c) {
- int len = end - start;
- // use insertion sort for small arrays
- if (len <= SIMPLE_LENGTH) {
- for (int i = start + 1; i < end; i++) {
- long current = out[i];
- long prev = out[i - 1];
- if (c.compare(prev, current) > 0) {
- int j = i;
- do {
- out[j--] = prev;
- } while (j > start && (c.compare(prev = out[j - 1], current) > 0));
- out[j] = current;
- }
- }
- return;
- }
- int med = (end + start) >>> 1;
- mergeSort(out, in, start, med, c);
- mergeSort(out, in, med, end, c);
-
- // merging
-
- // if arrays are already sorted - no merge
- if (c.compare(in[med - 1], in[med]) <= 0) {
- System.arraycopy(in, start, out, start, len);
- return;
- }
- int r = med;
- int i = start;
-
- // use merging with exponential search
- do {
- long fromVal = in[start];
- long rVal = in[r];
- if (c.compare(fromVal, rVal) <= 0) {
- int l_1 = find(in, rVal, -1, start + 1, med - 1, c);
- int toCopy = l_1 - start + 1;
- System.arraycopy(in, start, out, i, toCopy);
- i += toCopy;
- out[i++] = rVal;
- r++;
- start = l_1 + 1;
- } else {
- int r_1 = find(in, fromVal, 0, r + 1, end - 1, c);
- int toCopy = r_1 - r + 1;
- System.arraycopy(in, r, out, i, toCopy);
- i += toCopy;
- out[i++] = fromVal;
- start++;
- r = r_1 + 1;
- }
- } while ((end - r) > 0 && (med - start) > 0);
-
- // copy rest of array
- if ((end - r) <= 0) {
- System.arraycopy(in, start, out, i, med - start);
- } else {
- System.arraycopy(in, r, out, i, end - r);
- }
- }
-
- private static int find(long[] arr, long val, int bnd, int l, int r, LongComparator c) {
- int m = l;
- int d = 1;
- while (m <= r) {
- if (c.compare(val, arr[m]) > bnd) {
- l = m + 1;
- } else {
- r = m - 1;
- break;
- }
- m += d;
- d <<= 1;
- }
- while (l <= r) {
- m = (l + r) >>> 1;
- if (c.compare(val, arr[m]) > bnd) {
- l = m + 1;
- } else {
- r = m - 1;
- }
- }
- return l - 1;
- }
-
- private static final FloatComparator NATURAL_FLOAT_COMPARISON = new FloatComparator() {
- @Override
- public int compare(float o1, float o2) {
- return Float.compare(o1, o2);
- }
- };
-
- /**
- * Perform a merge sort on a range of a float array, using Float.compare as the ordering.
- * @param array the array.
- * @param start the first index.
- * @param end the last index (exclusive).
- */
- public static void mergeSort(float[] array, int start, int end) {
- mergeSort(array, start, end, NATURAL_FLOAT_COMPARISON);
- }
-
- /**
- * Perform a merge sort on a range of a float array using a specified ordering.
- * @param array the array.
- * @param start the first index.
- * @param end the last index (exclusive).
- * @param comp the comparator object.
- */
- public static void mergeSort(float[] array, int start, int end, FloatComparator comp) {
- checkBounds(array.length, start, end);
- float[] out = Arrays.copyOf(array, array.length);
- mergeSort(out, array, start, end, comp);
- }
-
- private static void mergeSort(float[] in, float[] out, int start, int end, FloatComparator c) {
- int len = end - start;
- // use insertion sort for small arrays
- if (len <= SIMPLE_LENGTH) {
- for (int i = start + 1; i < end; i++) {
- float current = out[i];
- float prev = out[i - 1];
- if (c.compare(prev, current) > 0) {
- int j = i;
- do {
- out[j--] = prev;
- } while (j > start && (c.compare(prev = out[j - 1], current) > 0));
- out[j] = current;
- }
- }
- return;
- }
- int med = (end + start) >>> 1;
- mergeSort(out, in, start, med, c);
- mergeSort(out, in, med, end, c);
-
- // merging
-
- // if arrays are already sorted - no merge
- if (c.compare(in[med - 1], in[med]) <= 0) {
- System.arraycopy(in, start, out, start, len);
- return;
- }
- int r = med;
- int i = start;
-
- // use merging with exponential search
- do {
- float fromVal = in[start];
- float rVal = in[r];
- if (c.compare(fromVal, rVal) <= 0) {
- int l_1 = find(in, rVal, -1, start + 1, med - 1, c);
- int toCopy = l_1 - start + 1;
- System.arraycopy(in, start, out, i, toCopy);
- i += toCopy;
- out[i++] = rVal;
- r++;
- start = l_1 + 1;
- } else {
- int r_1 = find(in, fromVal, 0, r + 1, end - 1, c);
- int toCopy = r_1 - r + 1;
- System.arraycopy(in, r, out, i, toCopy);
- i += toCopy;
- out[i++] = fromVal;
- start++;
- r = r_1 + 1;
- }
- } while ((end - r) > 0 && (med - start) > 0);
-
- // copy rest of array
- if ((end - r) <= 0) {
- System.arraycopy(in, start, out, i, med - start);
- } else {
- System.arraycopy(in, r, out, i, end - r);
- }
- }
-
- private static int find(float[] arr, float val, int bnd, int l, int r, FloatComparator c) {
- int m = l;
- int d = 1;
- while (m <= r) {
- if (c.compare(val, arr[m]) > bnd) {
- l = m + 1;
- } else {
- r = m - 1;
- break;
- }
- m += d;
- d <<= 1;
- }
- while (l <= r) {
- m = (l + r) >>> 1;
- if (c.compare(val, arr[m]) > bnd) {
- l = m + 1;
- } else {
- r = m - 1;
- }
- }
- return l - 1;
- }
-
- private static final DoubleComparator NATURAL_DOUBLE_COMPARISON = new DoubleComparator() {
- @Override
- public int compare(double o1, double o2) {
- return Double.compare(o1, o2);
- }
- };
-
-
- /**
- * Perform a merge sort on a range of a double array, using Double.compare as the ordering.
- * @param array the array.
- * @param start the first index.
- * @param end the last index (exclusive).
- */
- public static void mergeSort(double[] array, int start, int end) {
- mergeSort(array, start, end, NATURAL_DOUBLE_COMPARISON);
- }
-
- /**
- * Perform a merge sort on a range of a double array using a specified ordering.
- * @param array the array.
- * @param start the first index.
- * @param end the last index (exclusive).
- * @param comp the comparator object.
- */
- public static void mergeSort(double[] array, int start, int end, DoubleComparator comp) {
- checkBounds(array.length, start, end);
- double[] out = Arrays.copyOf(array, array.length);
- mergeSort(out, array, start, end, comp);
- }
-
- private static void mergeSort(double[] in, double[] out, int start, int end, DoubleComparator c) {
- int len = end - start;
- // use insertion sort for small arrays
- if (len <= SIMPLE_LENGTH) {
- for (int i = start + 1; i < end; i++) {
- double current = out[i];
- double prev = out[i - 1];
- if (c.compare(prev, current) > 0) {
- int j = i;
- do {
- out[j--] = prev;
- } while (j > start && (c.compare(prev = out[j - 1], current) > 0));
- out[j] = current;
- }
- }
- return;
- }
- int med = (end + start) >>> 1;
- mergeSort(out, in, start, med, c);
- mergeSort(out, in, med, end, c);
-
- // merging
-
- // if arrays are already sorted - no merge
- if (c.compare(in[med - 1], in[med]) <= 0) {
- System.arraycopy(in, start, out, start, len);
- return;
- }
- int r = med;
- int i = start;
-
- // use merging with exponential search
- do {
- double fromVal = in[start];
- double rVal = in[r];
- if (c.compare(fromVal, rVal) <= 0) {
- int l_1 = find(in, rVal, -1, start + 1, med - 1, c);
- int toCopy = l_1 - start + 1;
- System.arraycopy(in, start, out, i, toCopy);
- i += toCopy;
- out[i++] = rVal;
- r++;
- start = l_1 + 1;
- } else {
- int r_1 = find(in, fromVal, 0, r + 1, end - 1, c);
- int toCopy = r_1 - r + 1;
- System.arraycopy(in, r, out, i, toCopy);
- i += toCopy;
- out[i++] = fromVal;
- start++;
- r = r_1 + 1;
- }
- } while ((end - r) > 0 && (med - start) > 0);
-
- // copy rest of array
- if ((end - r) <= 0) {
- System.arraycopy(in, start, out, i, med - start);
- } else {
- System.arraycopy(in, r, out, i, end - r);
- }
- }
-
- private static int find(double[] arr, double val, int bnd, int l, int r, DoubleComparator c) {
- int m = l;
- int d = 1;
- while (m <= r) {
- if (c.compare(val, arr[m]) > bnd) {
- l = m + 1;
- } else {
- r = m - 1;
- break;
- }
- m += d;
- d <<= 1;
- }
- while (l <= r) {
- m = (l + r) >>> 1;
- if (c.compare(val, arr[m]) > bnd) {
- l = m + 1;
- } else {
- r = m - 1;
- }
- }
- return l - 1;
- }
-
- /**
- * Transforms two consecutive sorted ranges into a single sorted range. The initial ranges are
- * {@code [first, middle)} and {@code [middle, last)}, and the resulting range is {@code [first, last)}.
- * Elements in the first input range will precede equal elements in the second.
- */
- static void inplaceMerge(int first, int middle, int last, IntComparator comp, Swapper swapper) {
- if (first >= middle || middle >= last) {
- return;
- }
- if (last - first == 2) {
- if (comp.compare(middle, first) < 0) {
- swapper.swap(first, middle);
- }
- return;
- }
- int firstCut;
- int secondCut;
- if (middle - first > last - middle) {
- firstCut = first + (middle - first) / 2;
- secondCut = lowerBound(middle, last, firstCut, comp);
- } else {
- secondCut = middle + (last - middle) / 2;
- firstCut = upperBound(first, middle, secondCut, comp);
- }
-
- // rotate(firstCut, middle, secondCut, swapper);
- // is manually inlined for speed (jitter inlining seems to work only for small call depths, even if methods
- // are "static private")
- // speedup = 1.7
- // begin inline
- int first2 = firstCut;
- int middle2 = middle;
- int last2 = secondCut;
- if (middle2 != first2 && middle2 != last2) {
- int first1 = first2;
- int last1 = middle2;
- while (first1 < --last1) {
- swapper.swap(first1++, last1);
- }
- first1 = middle2;
- last1 = last2;
- while (first1 < --last1) {
- swapper.swap(first1++, last1);
- }
- first1 = first2;
- last1 = last2;
- while (first1 < --last1) {
- swapper.swap(first1++, last1);
- }
- }
- // end inline
-
- middle = firstCut + (secondCut - middle);
- inplaceMerge(first, firstCut, middle, comp, swapper);
- inplaceMerge(middle, secondCut, last, comp, swapper);
- }
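The manually inlined block above is the classic triple-reversal rotation: reversing [first, middle), then [middle, last), then the whole of [first, last) moves the right-hand run in front of the left-hand run. A standalone sketch on a plain array (hypothetical demo code) makes the effect visible:

    import java.util.Arrays;

    public class RotateDemo {
      // Reverse a[first, last); the same swap loop as inlined above.
      static void reverse(int[] a, int first, int last) {
        while (first < --last) {
          int tmp = a[first];
          a[first++] = a[last];
          a[last] = tmp;
        }
      }

      public static void main(String[] args) {
        int[] a = {1, 2, 3, 7, 8};
        int first = 0, middle = 3, last = 5;
        reverse(a, first, middle); // {3, 2, 1, 7, 8}
        reverse(a, middle, last);  // {3, 2, 1, 8, 7}
        reverse(a, first, last);   // {7, 8, 1, 2, 3}
        System.out.println(Arrays.toString(a)); // [7, 8, 1, 2, 3]
      }
    }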
-
- /**
- * Performs a binary search on an already-sorted range: finds the first position where an element can be inserted
- * without violating the ordering. Sorting is by a user-supplied comparison function.
- *
- * @param first Beginning of the range.
- * @param last One past the end of the range.
- * @param x Element to be searched for.
- * @param comp Comparison function.
- * @return The largest index i such that, for every j in the range <code>[first, i)</code>,
- * {@code comp.apply(array[j], x)} is {@code true}.
- * @see Sorting#upperBound
- */
- static int lowerBound(int first, int last, int x, IntComparator comp) {
- int len = last - first;
- while (len > 0) {
- int half = len / 2;
- int middle = first + half;
- if (comp.compare(middle, x) < 0) {
- first = middle + 1;
- len -= half + 1;
- } else {
- len = half;
- }
- }
- return first;
- }
-
- /**
- * Sorts the specified range of elements according to the order induced by the specified comparator. All elements in
- * the range must be <i>mutually comparable</i> by the specified comparator (that is, <tt>c.compare(a, b)</tt> must
- * not throw an exception for any indexes <tt>a</tt> and <tt>b</tt> in the range).<p>
- *
- * This sort is guaranteed to be <i>stable</i>: equal elements will not be reordered as a result of the sort.<p>
- *
- * The sorting algorithm is a modified mergesort (in which the merge is omitted if the highest element in the low
- * sublist is less than the lowest element in the high sublist). This algorithm offers guaranteed n*log(n)
- * performance, and can approach linear performance on nearly sorted lists.
- *
- * @param fromIndex the index of the first element (inclusive) to be sorted.
- * @param toIndex the index of the last element (exclusive) to be sorted.
- * @param c the comparator to determine the order of the generic data.
- * @param swapper an object that knows how to swap the elements at any two indexes (a,b).
- * @see IntComparator
- * @see Swapper
- */
- public static void mergeSort(int fromIndex, int toIndex, IntComparator c, Swapper swapper) {
- /*
- We retain the same method signature as quickSort.
- Given only a comparator and swapper we do not know how to copy and move elements from/to temporary arrays.
- Hence, in contrast to the JDK mergesorts this is an "in-place" mergesort, i.e. does not allocate any temporary
- arrays.
- A non-inplace mergesort would perhaps be faster in most cases, but would require non-intuitive delegate objects...
- */
- int length = toIndex - fromIndex;
-
- // Insertion sort on smallest arrays
- if (length < SMALL) {
- for (int i = fromIndex; i < toIndex; i++) {
- for (int j = i; j > fromIndex && (c.compare(j - 1, j) > 0); j--) {
- swapper.swap(j, j - 1);
- }
- }
- return;
- }
-
- // Recursively sort halves
- int mid = (fromIndex + toIndex) / 2;
- mergeSort(fromIndex, mid, c, swapper);
- mergeSort(mid, toIndex, c, swapper);
-
- // If list is already sorted, nothing left to do. This is an
- // optimization that results in faster sorts for nearly ordered lists.
- if (c.compare(mid - 1, mid) <= 0) {
- return;
- }
-
- // Merge sorted halves
- inplaceMerge(fromIndex, mid, toIndex, c, swapper);
- }
-
- /**
- * Performs a binary search on an already-sorted range: finds the last position where an element can be inserted
- * without violating the ordering. Sorting is by a user-supplied comparison function.
- *
- * @param first Beginning of the range.
- * @param last One past the end of the range.
- * @param x Element to be searched for.
- * @param comp Comparison function.
- * @return The largest index i such that, for every j in the range <code>[first, i)</code>,
- * {@code comp.apply(x, array[j])} is {@code false}.
- * @see Sorting#lowerBound
- */
- static int upperBound(int first, int last, int x, IntComparator comp) {
- int len = last - first;
- while (len > 0) {
- int half = len / 2;
- int middle = first + half;
- if (comp.compare(x, middle) < 0) {
- len = half;
- } else {
- first = middle + 1;
- len -= half + 1;
- }
- }
- return first;
- }
-
-}
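As a usage sketch for the comparator/swapper mergeSort above: because the sort only sees indexes, it can keep parallel arrays aligned. The IntComparator and Swapper import locations are assumed from this module's conventions; the data is hypothetical.

    import org.apache.mahout.math.Sorting;
    import org.apache.mahout.math.Swapper;                // assumed package
    import org.apache.mahout.math.function.IntComparator; // assumed package

    public class ParallelSortDemo {
      public static void main(String[] args) {
        final int[] keys = {3, 1, 2};
        final String[] values = {"c", "a", "b"};

        // Compare positions a and b by the keys currently stored there.
        IntComparator byKey = new IntComparator() {
          @Override
          public int compare(int a, int b) {
            return keys[a] < keys[b] ? -1 : keys[a] > keys[b] ? 1 : 0;
          }
        };
        // Swap both arrays so rows stay aligned.
        Swapper swapper = new Swapper() {
          @Override
          public void swap(int a, int b) {
            int k = keys[a]; keys[a] = keys[b]; keys[b] = k;
            String v = values[a]; values[a] = values[b]; values[b] = v;
          }
        };

        // Stable, in-place mergesort over the index range [0, 3).
        Sorting.mergeSort(0, keys.length, byKey, swapper);
        // keys is now {1, 2, 3}; values is {"a", "b", "c"}.
      }
    }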

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/SparseColumnMatrix.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/SparseColumnMatrix.java b/math/src/main/java/org/apache/mahout/math/SparseColumnMatrix.java
deleted file mode 100644
index eeffc78..0000000
--- a/math/src/main/java/org/apache/mahout/math/SparseColumnMatrix.java
+++ /dev/null
@@ -1,220 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math;
-
-import org.apache.mahout.math.flavor.TraversingStructureEnum;
-
-/**
- * Sparse matrix with general element values whose columns are accessible quickly. Implemented as a column array of
- * SparseVectors.
- *
- * @deprecated has many inconsistencies. Use a transpose view of SparseRowMatrix for fast column-wise iteration.
- */
-public class SparseColumnMatrix extends AbstractMatrix {
-
- private Vector[] columnVectors;
-
- /**
- * Construct a matrix of the given cardinality with the given data columns
- *
- * @param rows the number of rows
- * @param columns the number of columns
- * @param columnVectors a Vector[] array of columns
- */
- public SparseColumnMatrix(int rows, int columns, Vector[] columnVectors) {
- this(rows, columns, columnVectors, false);
- }
-
- public SparseColumnMatrix(int rows, int columns, Vector[] columnVectors, boolean shallow) {
- super(rows, columns);
- if (shallow) {
- this.columnVectors = columnVectors;
- } else {
- this.columnVectors = columnVectors.clone();
- for (int col = 0; col < columnSize(); col++) {
- this.columnVectors[col] = this.columnVectors[col].clone();
- }
- }
- }
-
- /**
- * Construct a matrix of the given cardinality
- *
- * @param rows # of rows
- * @param columns # of columns
- */
- public SparseColumnMatrix(int rows, int columns) {
- super(rows, columns);
- this.columnVectors = new RandomAccessSparseVector[columnSize()];
- for (int col = 0; col < columnSize(); col++) {
- this.columnVectors[col] = new RandomAccessSparseVector(rowSize());
- }
- }
-
- @Override
- public Matrix clone() {
- SparseColumnMatrix clone = (SparseColumnMatrix) super.clone();
- clone.columnVectors = new Vector[columnVectors.length];
- for (int i = 0; i < columnVectors.length; i++) {
- clone.columnVectors[i] = columnVectors[i].clone();
- }
- return clone;
- }
-
- /**
- * Abstracted out for the iterator
- *
- * @return {@link #numCols()}
- */
- @Override
- public int numSlices() {
- return numCols();
- }
-
- @Override
- public double getQuick(int row, int column) {
- return columnVectors[column] == null ? 0.0 : columnVectors[column].getQuick(row);
- }
-
- @Override
- public Matrix like() {
- return new SparseColumnMatrix(rowSize(), columnSize());
- }
-
- @Override
- public Matrix like(int rows, int columns) {
- return new SparseColumnMatrix(rows, columns);
- }
-
- @Override
- public void setQuick(int row, int column, double value) {
- if (columnVectors[column] == null) {
- columnVectors[column] = new RandomAccessSparseVector(rowSize());
- }
- columnVectors[column].setQuick(row, value);
- }
-
- @Override
- public int[] getNumNondefaultElements() {
- int[] result = new int[2];
- result[COL] = columnVectors.length;
- for (int col = 0; col < columnSize(); col++) {
- result[ROW] = Math.max(result[ROW], columnVectors[col]
- .getNumNondefaultElements());
- }
- return result;
- }
-
- @Override
- public Matrix viewPart(int[] offset, int[] size) {
- if (offset[ROW] < 0) {
- throw new IndexException(offset[ROW], columnVectors[COL].size());
- }
- if (offset[ROW] + size[ROW] > columnVectors[COL].size()) {
- throw new IndexException(offset[ROW] + size[ROW], columnVectors[COL].size());
- }
- if (offset[COL] < 0) {
- throw new IndexException(offset[COL], columnVectors.length);
- }
- if (offset[COL] + size[COL] > columnVectors.length) {
- throw new IndexException(offset[COL] + size[COL], columnVectors.length);
- }
- return new MatrixView(this, offset, size);
- }
-
- @Override
- public Matrix assignColumn(int column, Vector other) {
- if (rowSize() != other.size()) {
- throw new CardinalityException(rowSize(), other.size());
- }
- if (column < 0 || column >= columnSize()) {
- throw new IndexException(column, columnSize());
- }
- columnVectors[column].assign(other);
- return this;
- }
-
- @Override
- public Matrix assignRow(int row, Vector other) {
- if (columnSize() != other.size()) {
- throw new CardinalityException(columnSize(), other.size());
- }
- if (row < 0 || row >= rowSize()) {
- throw new IndexException(row, rowSize());
- }
- for (int col = 0; col < columnSize(); col++) {
- columnVectors[col].setQuick(row, other.getQuick(col));
- }
- return this;
- }
-
- @Override
- public Vector viewColumn(int column) {
- if (column < 0 || column >= columnSize()) {
- throw new IndexException(column, columnSize());
- }
- return columnVectors[column];
- }
-
- @Override
- public Matrix transpose() {
- SparseRowMatrix srm = new SparseRowMatrix(columns, rows);
- for (int i = 0; i < columns; i++) {
- Vector col = columnVectors[i];
- if (col.getNumNonZeroElements() > 0) {
- // this should already be optimized
- srm.assignRow(i, col);
- }
- }
- return srm;
- }
-
- @Override
- public String toString() {
- int row = 0;
- int maxRowsToDisplay = 10;
- int maxColsToDisplay = 20;
- int colsToDisplay = maxColsToDisplay;
-
- if(maxColsToDisplay > columnSize()){
- colsToDisplay = columnSize();
- }
-
- StringBuilder s = new StringBuilder("{\n");
- for (MatrixSlice next : this.transpose()) {
- if (row < maxRowsToDisplay) {
- s.append(" ")
- .append(next.index())
- .append(" =>\t")
- .append(new VectorView(next.vector(), 0, colsToDisplay))
- .append('\n');
- row++;
- }
- }
-
- String returnString = s.toString();
- if (maxColsToDisplay <= columnSize()) {
- returnString = returnString.replace("}", " ... }");
- }
-
- if (maxRowsToDisplay <= rowSize()) {
- return returnString + "... }";
- } else {
- return returnString + "}";
- }
- }
-
-}
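Per the deprecation note above, column-wise work is better done through a transposed view of a row-major sparse matrix. A minimal sketch, assuming the Matrices.transposedView helper from this module (treat the helper name as an assumption):

    import org.apache.mahout.math.Matrices; // assumed helper class
    import org.apache.mahout.math.Matrix;
    import org.apache.mahout.math.SparseRowMatrix;
    import org.apache.mahout.math.Vector;

    public class ColumnAccessDemo {
      public static void main(String[] args) {
        // Store the data row-wise, which SparseRowMatrix handles efficiently.
        Matrix m = new SparseRowMatrix(100, 50);
        m.setQuick(3, 7, 2.5);

        // A transposed view copies nothing: column j of m is row j of the view.
        Matrix mt = Matrices.transposedView(m);
        Vector columnSeven = mt.viewRow(7);
        System.out.println(columnSeven.get(3)); // 2.5
      }
    }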

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/SparseMatrix.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/SparseMatrix.java b/math/src/main/java/org/apache/mahout/math/SparseMatrix.java
deleted file mode 100644
index a75ac55..0000000
--- a/math/src/main/java/org/apache/mahout/math/SparseMatrix.java
+++ /dev/null
@@ -1,245 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math;
-
-import it.unimi.dsi.fastutil.ints.Int2ObjectMap.Entry;
-import it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap;
-import it.unimi.dsi.fastutil.objects.ObjectIterator;
-
-import java.util.Iterator;
-import java.util.Map;
-
-import org.apache.mahout.math.flavor.MatrixFlavor;
-import org.apache.mahout.math.function.DoubleDoubleFunction;
-import org.apache.mahout.math.function.Functions;
-import org.apache.mahout.math.list.IntArrayList;
-
-import com.google.common.collect.AbstractIterator;
-
-/** Doubly sparse matrix. Implemented as a Map of RandomAccessSparseVector rows */
-public class SparseMatrix extends AbstractMatrix {
-
- private Int2ObjectOpenHashMap<Vector> rowVectors;
-
- /**
- * Construct a matrix of the given cardinality with the given row map
- *
- * @param rows number of rows
- * @param columns number of columns
- * @param rowVectors a {@code Map<Integer, RandomAccessSparseVector>} of rows
- */
- public SparseMatrix(int rows, int columns, Map<Integer, Vector> rowVectors) {
- this(rows, columns, rowVectors, false);
- }
-
- public SparseMatrix(int rows, int columns, Map<Integer, Vector> rowVectors, boolean shallow) {
-
- // Why is this passing in a map? Iterating it is pretty inefficient compared to simple lists...
- super(rows, columns);
- this.rowVectors = new Int2ObjectOpenHashMap<>();
- if (shallow) {
- for (Map.Entry<Integer, Vector> entry : rowVectors.entrySet()) {
- this.rowVectors.put(entry.getKey().intValue(), entry.getValue());
- }
- } else {
- for (Map.Entry<Integer, Vector> entry : rowVectors.entrySet()) {
- this.rowVectors.put(entry.getKey().intValue(), entry.getValue().clone());
- }
- }
- }
-
- /**
- * Construct a matrix with specified number of rows and columns.
- */
- public SparseMatrix(int rows, int columns) {
- super(rows, columns);
- this.rowVectors = new Int2ObjectOpenHashMap<>();
- }
-
- @Override
- public Matrix clone() {
- SparseMatrix clone = new SparseMatrix(numRows(), numCols());
- for (MatrixSlice slice : this) {
- clone.rowVectors.put(slice.index(), slice.clone());
- }
- return clone;
- }
-
- @Override
- public int numSlices() {
- return rowVectors.size();
- }
-
- public Iterator<MatrixSlice> iterateNonEmpty() {
- final int[] keys = rowVectors.keySet().toIntArray();
- return new AbstractIterator<MatrixSlice>() {
- private int slice;
- @Override
- protected MatrixSlice computeNext() {
- if (slice >= rowVectors.size()) {
- return endOfData();
- }
- int i = keys[slice];
- Vector row = rowVectors.get(i);
- slice++;
- return new MatrixSlice(row, i);
- }
- };
- }
-
- @Override
- public double getQuick(int row, int column) {
- Vector r = rowVectors.get(row);
- return r == null ? 0.0 : r.getQuick(column);
- }
-
- @Override
- public Matrix like() {
- return new SparseMatrix(rowSize(), columnSize());
- }
-
- @Override
- public Matrix like(int rows, int columns) {
- return new SparseMatrix(rows, columns);
- }
-
- @Override
- public void setQuick(int row, int column, double value) {
- Vector r = rowVectors.get(row);
- if (r == null) {
- r = new RandomAccessSparseVector(columnSize());
- rowVectors.put(row, r);
- }
- r.setQuick(column, value);
- }
-
- @Override
- public int[] getNumNondefaultElements() {
- int[] result = new int[2];
- result[ROW] = rowVectors.size();
- for (Vector row : rowVectors.values()) {
- result[COL] = Math.max(result[COL], row.getNumNondefaultElements());
- }
- return result;
- }
-
- @Override
- public Matrix viewPart(int[] offset, int[] size) {
- if (offset[ROW] < 0) {
- throw new IndexException(offset[ROW], rowSize());
- }
- if (offset[ROW] + size[ROW] > rowSize()) {
- throw new IndexException(offset[ROW] + size[ROW], rowSize());
- }
- if (offset[COL] < 0) {
- throw new IndexException(offset[COL], columnSize());
- }
- if (offset[COL] + size[COL] > columnSize()) {
- throw new IndexException(offset[COL] + size[COL], columnSize());
- }
- return new MatrixView(this, offset, size);
- }
-
- @Override
- public Matrix assign(Matrix other, DoubleDoubleFunction function) {
- //TODO generalize to other kinds of functions
- if (Functions.PLUS.equals(function) && other instanceof SparseMatrix) {
- int rows = rowSize();
- if (rows != other.rowSize()) {
- throw new CardinalityException(rows, other.rowSize());
- }
- int columns = columnSize();
- if (columns != other.columnSize()) {
- throw new CardinalityException(columns, other.columnSize());
- }
-
- SparseMatrix otherSparse = (SparseMatrix) other;
- for(ObjectIterator<Entry<Vector>> fastIterator = otherSparse.rowVectors.int2ObjectEntrySet().fastIterator();
- fastIterator.hasNext();) {
- final Entry<Vector> entry = fastIterator.next();
- final int rowIndex = entry.getIntKey();
- Vector row = rowVectors.get(rowIndex);
- if (row == null) {
- rowVectors.put(rowIndex, entry.getValue().clone());
- } else {
- row.assign(entry.getValue(), Functions.PLUS);
- }
- }
- return this;
- } else {
- return super.assign(other, function);
- }
- }
-
- @Override
- public Matrix assignColumn(int column, Vector other) {
- if (rowSize() != other.size()) {
- throw new CardinalityException(rowSize(), other.size());
- }
- if (column < 0 || column >= columnSize()) {
- throw new IndexException(column, columnSize());
- }
- for (int row = 0; row < rowSize(); row++) {
- double val = other.getQuick(row);
- if (val != 0.0) {
- Vector r = rowVectors.get(row);
- if (r == null) {
- r = new RandomAccessSparseVector(columnSize());
- rowVectors.put(row, r);
- }
- r.setQuick(column, val);
- }
- }
- return this;
- }
-
- @Override
- public Matrix assignRow(int row, Vector other) {
- if (columnSize() != other.size()) {
- throw new CardinalityException(columnSize(), other.size());
- }
- if (row < 0 || row >= rowSize()) {
- throw new IndexException(row, rowSize());
- }
- rowVectors.put(row, other);
- return this;
- }
-
- @Override
- public Vector viewRow(int row) {
- if (row < 0 || row >= rowSize()) {
- throw new IndexException(row, rowSize());
- }
- Vector res = rowVectors.get(row);
- if (res == null) {
- res = new RandomAccessSparseVector(columnSize());
- rowVectors.put(row, res);
- }
- return res;
- }
-
- /** special method necessary for efficient serialization */
- public IntArrayList nonZeroRowIndices() {
- return new IntArrayList(rowVectors.keySet().toIntArray());
- }
-
- @Override
- public MatrixFlavor getFlavor() {
- return MatrixFlavor.SPARSEROWLIKE;
- }
-}
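A short usage sketch of the Functions.PLUS fast path in assign above (class names from the deleted file; the data is hypothetical):

    import org.apache.mahout.math.SparseMatrix;
    import org.apache.mahout.math.function.Functions;

    public class SparseAddDemo {
      public static void main(String[] args) {
        SparseMatrix a = new SparseMatrix(1000, 1000);
        SparseMatrix b = new SparseMatrix(1000, 1000);
        a.setQuick(1, 2, 3.0);
        b.setQuick(1, 2, 4.0);
        b.setQuick(5, 6, 1.0);

        // Functions.PLUS with another SparseMatrix takes the sparse fast path:
        // only b's non-empty rows are visited; rows missing in a are cloned.
        a.assign(b, Functions.PLUS);

        System.out.println(a.getQuick(1, 2)); // 7.0
        System.out.println(a.getQuick(5, 6)); // 1.0
      }
    }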
r***@apache.org
2018-06-27 14:51:44 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java-templates/org/apache/mahout/math/list/AbstractValueTypeList.java.t
----------------------------------------------------------------------
diff --git a/math/src/main/java-templates/org/apache/mahout/math/list/AbstractValueTypeList.java.t b/math/src/main/java-templates/org/apache/mahout/math/list/AbstractValueTypeList.java.t
deleted file mode 100644
index 343472a..0000000
--- a/math/src/main/java-templates/org/apache/mahout/math/list/AbstractValueTypeList.java.t
+++ /dev/null
@@ -1,851 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
- /*
-Copyright © 1999 CERN - European Organization for Nuclear Research.
-Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
-is hereby granted without fee, provided that the above copyright notice appear in all copies and
-that both that copyright notice and this permission notice appear in supporting documentation.
-CERN makes no representations about the suitability of this software for any purpose.
-It is provided "as is" without expressed or implied warranty.
-*/
-package org.apache.mahout.math.list;
-//CHECKSTYLE:OFF
-import org.apache.mahout.math.Sorting;
-import org.apache.mahout.math.buffer.${valueTypeCap}BufferConsumer;
-import org.apache.mahout.math.function.${valueTypeCap}Comparator;
-import org.apache.mahout.math.function.${valueTypeCap}Procedure;
-//CHECKSTYLE:ON
-
-import java.util.ArrayList;
-import java.util.List;
-
-/**
- Abstract base class for resizable lists holding <code>${valueType}</code> elements.
-*/
-public abstract class Abstract${valueTypeCap}List extends AbstractList implements ${valueTypeCap}BufferConsumer {
-
- /**
- * The size of the list. This is a read-only variable for all methods except setSizeRaw(int newSize). If you violate
- * this principle in subclasses, you should know exactly what you are doing.
- */
- protected int size;
-
- /**
- * Appends the specified element to the end of this list.
- *
- * @param element element to be appended to this list.
- */
- public void add(${valueType} element) {
- beforeInsert(size, element);
- }
-
- /**
- * Appends all elements of the specified list to the receiver.
- *
- * @param other the list of which all elements shall be appended.
- */
- public void addAllOf(Abstract${valueTypeCap}List other) {
- addAllOfFromTo(other, 0, other.size() - 1);
- }
-
- /**
- * Appends the part of the specified list between <code>from</code> (inclusive) and <code>to</code> (inclusive) to the
- * receiver.
- *
- * @param other the list to be added to the receiver.
- * @param from the index of the first element to be appended (inclusive).
- * @param to the index of the last element to be appended (inclusive).
- * @throws IndexOutOfBoundsException index is out of range (<tt>other.size()&gt;0 && (from&lt;0 || from&gt;to ||
- * to&gt;=other.size())</tt>).
- */
- public void addAllOfFromTo(Abstract${valueTypeCap}List other, int from, int to) {
- beforeInsertAllOfFromTo(size, other, from, to);
- }
-
- /**
- * Appends the specified list to the end of this list.
- * @param other the list to be appended.
- **/
- @Override
- public void addAllOf(${valueTypeCap}ArrayList other) {
- addAllOfFromTo(other, 0, other.size() - 1);
- }
-
- /**
- * Inserts the specified element before the specified position into the receiver. Shifts the element currently at that
- * position (if any) and any subsequent elements to the right.
- *
- * @param index index before which the specified element is to be inserted (must be in [0,size]).
- * @param element element to be inserted.
- * @throws IndexOutOfBoundsException index is out of range (<tt>index &lt; 0 || index &gt; size()</tt>).
- */
- public void beforeInsert(int index, ${valueType} element) {
- beforeInsertDummies(index, 1);
- set(index, element);
- }
-
- /**
- * Inserts the part of the specified list between <code>otherFrom</code> (inclusive) and <code>otherTo</code>
- * (inclusive) before the specified position into the receiver. Shifts the element currently at that position (if any)
- * and any subsequent elements to the right.
- *
- * @param index index before which to insert the first element from the specified list (must be in [0,size]).
- * @param other list of which a part is to be inserted into the receiver.
- * @param from the index of the first element to be inserted (inclusive).
- * @param to the index of the last element to be inserted (inclusive).
- * @throws IndexOutOfBoundsException index is out of range (<tt>other.size()&gt;0 && (from&lt;0 || from&gt;to ||
- * to&gt;=other.size())</tt>).
- * @throws IndexOutOfBoundsException index is out of range (<tt>index &lt; 0 || index &gt; size()</tt>).
- */
- public void beforeInsertAllOfFromTo(int index, Abstract${valueTypeCap}List other, int from, int to) {
- int length = to - from + 1;
- this.beforeInsertDummies(index, length);
- this.replaceFromToWithFrom(index, index + length - 1, other, from);
- }
-
- /**
- * Inserts <tt>length</tt> dummy elements before the specified position into the receiver. Shifts the element
- * currently at that position (if any) and any subsequent elements to the right. <b>This method must set the new size
- * to be <tt>size()+length</tt>.</b>
- *
- * @param index index before which to insert dummy elements (must be in [0,size]).
- * @param length number of dummy elements to be inserted.
- * @throws IndexOutOfBoundsException if <tt>index &lt; 0 || index &gt; size()</tt>.
- */
- @Override
- protected void beforeInsertDummies(int index, int length) {
- if (index > size || index < 0) {
- throw new IndexOutOfBoundsException("Index: " + index + ", Size: " + size);
- }
- if (length > 0) {
- ensureCapacity(size + length);
- setSizeRaw(size + length);
- replaceFromToWithFrom(index + length, size - 1, this, index);
- }
- }
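A brief usage sketch of the insertion machinery above, using the generated DoubleArrayList as a concrete instance of this template (the sample values are hypothetical):

    import org.apache.mahout.math.list.DoubleArrayList;

    public class InsertDemo {
      public static void main(String[] args) {
        DoubleArrayList list = new DoubleArrayList();
        list.add(1.0);
        list.add(4.0);

        DoubleArrayList other = new DoubleArrayList();
        other.add(2.0);
        other.add(3.0);

        // Opens two dummy slots before index 1 via beforeInsertDummies,
        // then overwrites them with other[0..1] via replaceFromToWithFrom.
        list.beforeInsertAllOfFromTo(1, other, 0, 1);
        // list now holds 1.0, 2.0, 3.0, 4.0
        System.out.println(list);
      }
    }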
-
- /**
- * Searches the receiver for the specified value using the binary search algorithm. The receiver
- * <strong>must</strong> be sorted (as by the sort method) prior to making this call. If it is not sorted, the
- * results are undefined: in particular, the call may enter an infinite loop. If the receiver contains multiple
- * elements equal to the specified object, there is no guarantee which instance will be found.
- *
- * @param key the value to be searched for.
- * @return index of the search key, if it is contained in the receiver; otherwise, <tt>(-(<i>insertion point</i>) -
- * 1)</tt>. The <i>insertion point</i> is defined as the point at which the value would be inserted into
- * the receiver: the index of the first element greater than the key, or <tt>receiver.size()</tt>, if all
- * elements in the receiver are less than the specified key. Note that this guarantees that the return value
- * will be &gt;= 0 if and only if the key is found.
- * @see java.util.Arrays
- */
- public int binarySearch(${valueType} key) {
- return this.binarySearchFromTo(key, 0, size - 1);
- }
-
- /**
- * Searches the receiver for the specified value using the binary search algorithm. The receiver
- * <strong>must</strong> be sorted (as by the sort method) prior to making this call. If it is not sorted, the
- * results are undefined: in particular, the call may enter an infinite loop. If the receiver contains multiple
- * elements equal to the specified object, there is no guarantee which instance will be found.
- *
- * @param key the value to be searched for.
- * @param from the leftmost search position, inclusive.
- * @param to the rightmost search position, inclusive.
- * @return index of the search key, if it is contained in the receiver; otherwise, <tt>(-(<i>insertion point</i>) -
- * 1)</tt>. The <i>insertion point</i> is defined as the point at which the value would be inserted into
- * the receiver: the index of the first element greater than the key, or <tt>receiver.size()</tt>, if all
- * elements in the receiver are less than the specified key. Note that this guarantees that the return value
- * will be &gt;= 0 if and only if the key is found.
- * @see java.util.Arrays
- */
- public int binarySearchFromTo(${valueType} key, int from, int to) {
- int low = from;
- int high = to;
- while (low <= high) {
- int mid = (low + high) / 2;
- ${valueType} midVal = get(mid);
-
- if (midVal < key) {
- low = mid + 1;
- } else if (midVal > key) {
- high = mid - 1;
- } else {
- return mid;
- } // key found
- }
- return -(low + 1); // key not found.
- }
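The negative encoding of the return value matches java.util.Arrays.binarySearch, so the insertion point of a missing key can always be recovered as -(result) - 1. A tiny illustration using the JDK analogue (hypothetical values):

    import java.util.Arrays;

    public class BinarySearchEncodingDemo {
      public static void main(String[] args) {
        int[] sorted = {10, 20, 30};
        System.out.println(Arrays.binarySearch(sorted, 20)); // 1 (found at index 1)
        int miss = Arrays.binarySearch(sorted, 25);
        System.out.println(miss);                            // -3 (not found)
        int insertionPoint = -miss - 1;
        System.out.println(insertionPoint);                  // 2
      }
    }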
-
- /**
- * Returns a deep copy of the receiver.
- *
- * @return a deep copy of the receiver.
- */
- @Override
- public Object clone() {
- return partFromTo(0, size - 1);
- }
-
- /**
- * Returns true if the receiver contains the specified element.
- *
- * @param elem element whose presence in the receiver is to be tested.
- */
- public boolean contains(${valueType} elem) {
- return indexOfFromTo(elem, 0, size - 1) >= 0;
- }
-
- /**
- * Deletes the first element from the receiver that is identical to the specified element. Does nothing, if no such
- * matching element is contained.
- *
- * @param element the element to be deleted.
- */
- public void delete(${valueType} element) {
- int index = indexOfFromTo(element, 0, size - 1);
- if (index >= 0) {
- remove(index);
- }
- }
-
- /**
- * Returns the elements currently stored, possibly including invalid elements between size and capacity.
- *
- * <b>WARNING:</b> For efficiency reasons and to keep memory usage low, this method may decide <b>not to copy the
- * array</b>. So if subsequently you modify the returned array directly via the [] operator, be sure you know what
- * you're doing.
- *
- * @return the elements currently stored.
- */
- public ${valueType}[] elements() {
- ${valueType}[] myElements = new ${valueType}[size];
- for (int i = size; --i >= 0;) {
- myElements[i] = getQuick(i);
- }
- return myElements;
- }
-
- /**
- * Sets the receiver's elements to be the specified array. The size and capacity of the list is the length of the
- * array. <b>WARNING:</b> For efficiency reasons and to keep memory usage low, this method may decide <b>not to copy
- * the array</b>. So if you subsequently modify the specified array directly via the [] operator, be sure you know what
- * you're doing.
- *
- * @param elements the new elements to be stored.
- * @return the receiver itself.
- */
- public Abstract${valueTypeCap}List elements(${valueType}[] elements) {
- clear();
- addAllOfFromTo(new ${valueTypeCap}ArrayList(elements), 0, elements.length - 1);
- return this;
- }
-
- /**
- * Ensures that the receiver can hold at least the specified number of elements without needing to allocate new
- * internal memory. If necessary, allocates new internal memory and increases the capacity of the receiver.
- *
- * @param minCapacity the desired minimum capacity.
- */
- public abstract void ensureCapacity(int minCapacity);
-
- /**
- * Compares the specified Object with the receiver. Returns true if and only if the specified Object is also an
- * ArrayList of the same type, both Lists have the same size, and all corresponding pairs of elements in the two Lists
- * are identical. In other words, two Lists are defined to be equal if they contain the same elements in the same
- * order.
- *
- * @param otherObj the Object to be compared for equality with the receiver.
- * @return true if the specified Object is equal to the receiver.
- */
- public boolean equals(Object otherObj) { //delta
- if (otherObj == null) {
- return false;
- }
- if (!(otherObj instanceof Abstract${valueTypeCap}List)) {
- return false;
- }
- if (this == otherObj) {
- return true;
- }
- Abstract${valueTypeCap}List other = (Abstract${valueTypeCap}List) otherObj;
- if (size() != other.size()) {
- return false;
- }
-
- for (int i = size(); --i >= 0;) {
- if (getQuick(i) != other.getQuick(i)) {
- return false;
- }
- }
- return true;
- }
-
- /**
- * Sets the specified range of elements in the specified array to the specified value.
- *
- * @param from the index of the first element (inclusive) to be filled with the specified value.
- * @param to the index of the last element (inclusive) to be filled with the specified value.
- * @param val the value to be stored in the specified elements of the receiver.
- */
- public void fillFromToWith(int from, int to, ${valueType} val) {
- checkRangeFromTo(from, to, this.size);
- for (int i = from; i <= to;) {
- setQuick(i++, val);
- }
- }
-
- /**
- * Applies a procedure to each element of the receiver, if any. Starts at index 0, moving rightwards.
- *
- * @param procedure the procedure to be applied. Stops iteration if the procedure returns <tt>false</tt>, otherwise
- * continues.
- * @return <tt>false</tt> if the procedure stopped before all elements were iterated over, <tt>true</tt> otherwise.
- */
- public boolean forEach(${valueTypeCap}Procedure procedure) {
- for (int i = 0; i < size;) {
- if (!procedure.apply(get(i++))) {
- return false;
- }
- }
- return true;
- }
-
- /**
- * Returns the element at the specified position in the receiver.
- *
- * @param index index of element to return.
- * @throws IndexOutOfBoundsException index is out of range (index &lt; 0 || index &gt;= size()).
- */
- public ${valueType} get(int index) {
- if (index >= size || index < 0) {
- throw new IndexOutOfBoundsException("Index: " + index + ", Size: " + size);
- }
- return getQuick(index);
- }
-
- /**
- * Returns the element at the specified position in the receiver; <b>WARNING:</b> Does not check preconditions.
- * Provided with invalid parameters this method may return invalid elements without throwing any exception! <b>You
- * should only use this method when you are absolutely sure that the index is within bounds.</b> Precondition
- * (unchecked): <tt>index &gt;= 0 && index &lt; size()</tt>.
- *
- * This method is normally only used internally in large loops where bounds are explicitly checked before the loop and
- * need not be rechecked within the loop. However, when desperate, you can give this method <tt>public</tt>
- * visibility in subclasses.
- *
- * @param index index of element to return.
- */
- protected abstract ${valueType} getQuick(int index);
-
- /**
- * Returns the index of the first occurrence of the specified element. Returns <code>-1</code> if the receiver does
- * not contain this element.
- *
- * @param element the element to be searched for.
- * @return the index of the first occurrence of the element in the receiver; returns <code>-1</code> if the element is
- * not found.
- */
- public int indexOf(${valueType} element) { //delta
- return indexOfFromTo(element, 0, size - 1);
- }
-
- /**
- * Returns the index of the first occurrence of the specified element. Returns <code>-1</code> if the receiver does
- * not contain this element. Searches between <code>from</code>, inclusive and <code>to</code>, inclusive. Tests for
- * identity.
- *
- * @param element element to search for.
- * @param from the leftmost search position, inclusive.
- * @param to the rightmost search position, inclusive.
- * @return the index of the first occurrence of the element in the receiver; returns <code>-1</code> if the element is
- * not found.
- * @throws IndexOutOfBoundsException index is out of range (<tt>size()&gt;0 && (from&lt;0 || from&gt;to ||
- * to&gt;=size())</tt>).
- */
- public int indexOfFromTo(${valueType} element, int from, int to) {
- checkRangeFromTo(from, to, size);
-
- for (int i = from; i <= to; i++) {
- if (element == getQuick(i)) {
- return i;
- } //found
- }
- return -1; //not found
- }
-
- /**
- * Returns the index of the last occurrence of the specified element. Returns <code>-1</code> if the receiver does not
- * contain this element.
- *
- * @param element the element to be searched for.
- * @return the index of the last occurrence of the element in the receiver; returns <code>-1</code> if the element is
- * not found.
- */
- public int lastIndexOf(${valueType} element) {
- return lastIndexOfFromTo(element, 0, size - 1);
- }
-
- /**
- * Returns the index of the last occurrence of the specified element. Returns <code>-1</code> if the receiver does not
- * contain this element. Searches beginning at <code>to</code>, inclusive until <code>from</code>, inclusive. Tests
- * for identity.
- *
- * @param element element to search for.
- * @param from the leftmost search position, inclusive.
- * @param to the rightmost search position, inclusive.
- * @return the index of the last occurrence of the element in the receiver; returns <code>-1</code> if the element is
- * not found.
- * @throws IndexOutOfBoundsException index is out of range (<tt>size()&gt;0 && (from&lt;0 || from&gt;to ||
- * to&gt;=size())</tt>).
- */
- public int lastIndexOfFromTo(${valueType} element, int from, int to) {
- checkRangeFromTo(from, to, size());
-
- for (int i = to; i >= from; i--) {
- if (element == getQuick(i)) {
- return i;
- } //found
- }
- return -1; //not found
- }
-
- /**
- * Sorts the specified range of the receiver into ascending order.
- *
- * The sorting algorithm is a modified mergesort (in which the merge is omitted if the highest element in the low
- * sublist is less than the lowest element in the high sublist). This algorithm offers guaranteed n*log(n)
- * performance, and can approach linear performance on nearly sorted lists.
- *
- * <p><b>You should never call this method unless you are sure that this particular sorting algorithm is the right one
- * for your data set.</b> It is generally better to call <tt>sort()</tt> or <tt>sortFromTo(...)</tt> instead, because
- * those methods automatically choose the best sorting algorithm.
- *
- * @param from the index of the first element (inclusive) to be sorted.
- * @param to the index of the last element (inclusive) to be sorted.
- * @throws IndexOutOfBoundsException index is out of range (<tt>size()&gt;0 && (from&lt;0 || from&gt;to ||
- * to&gt;=size())</tt>).
- */
- @Override
- public void mergeSortFromTo(int from, int to) {
- int mySize = size();
- checkRangeFromTo(from, to, mySize);
-
- ${valueType}[] myElements = elements();
- Sorting.mergeSort(myElements, from, to + 1);
- elements(myElements);
- setSizeRaw(mySize);
- }
-
- /**
- * Sorts the receiver according to the order induced by the specified comparator. All elements in the range must be
- * <i>mutually comparable</i> by the specified comparator (that is, <tt>c.compare(e1, e2)</tt> must not throw a
- * <tt>ClassCastException</tt> for any elements <tt>e1</tt> and <tt>e2</tt> in the range).<p>
- *
- * This sort is guaranteed to be <i>stable</i>: equal elements will not be reordered as a result of the sort.<p>
- *
- * The sorting algorithm is a modified mergesort (in which the merge is omitted if the highest element in the low
- * sublist is less than the lowest element in the high sublist). This algorithm offers guaranteed n*log(n)
- * performance, and can approach linear performance on nearly sorted lists.
- *
- * @param from the index of the first element (inclusive) to be sorted.
- * @param to the index of the last element (inclusive) to be sorted.
- * @param c the comparator to determine the order of the receiver.
- * @throws ClassCastException if the array contains elements that are not <i>mutually comparable</i> using
- * the specified comparator.
- * @throws IllegalArgumentException if <tt>fromIndex &gt; toIndex</tt>
- * @throws ArrayIndexOutOfBoundsException if <tt>fromIndex &lt; 0</tt> or <tt>toIndex &gt; a.length</tt>
- * @throws IndexOutOfBoundsException index is out of range (<tt>size()&gt;0 && (from&lt;0 || from&gt;to ||
- * to&gt;=size())</tt>).
- */
- public void mergeSortFromTo(int from, int to, ${valueTypeCap}Comparator c) {
- int mySize = size();
- checkRangeFromTo(from, to, mySize);
-
- ${valueType}[] myElements = elements();
- Sorting.mergeSort(myElements, from, to + 1, c);
- elements(myElements);
- setSizeRaw(mySize);
- }
-
- /**
- * Returns a new list of the part of the receiver between <code>from</code>, inclusive, and <code>to</code>,
- * inclusive.
- *
- * @param from the index of the first element (inclusive).
- * @param to the index of the last element (inclusive).
- * @return a new list
- * @throws IndexOutOfBoundsException index is out of range (<tt>size()&gt;0 && (from&lt;0 || from&gt;to ||
- * to&gt;=size())</tt>).
- */
- public Abstract${valueTypeCap}List partFromTo(int from, int to) {
- checkRangeFromTo(from, to, size);
-
- int length = to - from + 1;
- ${valueTypeCap}ArrayList part = new ${valueTypeCap}ArrayList(length);
- part.addAllOfFromTo(this, from, to);
- return part;
- }
-
- /**
- * Sorts the specified range of the receiver into ascending numerical order. The sorting algorithm is a tuned
- * quicksort, adapted from Jon L. Bentley and M. Douglas McIlroy's "Engineering a Sort Function", Software-Practice
- * and Experience, Vol. 23(11) P. 1249-1265 (November 1993). This algorithm offers n*log(n) performance on many data
- * sets that cause other quicksorts to degrade to quadratic performance.
- *
- * <p><b>You should never call this method unless you are sure that this particular sorting algorithm is the right one
- * for your data set.</b> It is generally better to call <tt>sort()</tt> or <tt>sortFromTo(...)</tt> instead, because
- * those methods automatically choose the best sorting algorithm.
- *
- * @param from the index of the first element (inclusive) to be sorted.
- * @param to the index of the last element (inclusive) to be sorted.
- * @throws IndexOutOfBoundsException index is out of range (<tt>size()&gt;0 && (from&lt;0 || from&gt;to ||
- * to&gt;=size())</tt>).
- */
- @Override
- public void quickSortFromTo(int from, int to) {
- int mySize = size();
- checkRangeFromTo(from, to, mySize);
-
- ${valueType}[] myElements = elements();
- java.util.Arrays.sort(myElements, from, to + 1);
- elements(myElements);
- setSizeRaw(mySize);
- }
-
- /**
- * Sorts the receiver according to the order induced by the specified comparator. All elements in the range must be
- * <i>mutually comparable</i> by the specified comparator (that is, <tt>c.compare(e1, e2)</tt> must not throw a
- * <tt>ClassCastException</tt> for any elements <tt>e1</tt> and <tt>e2</tt> in the range).<p>
- *
- * The sorting algorithm is a tuned quicksort, adapted from Jon L. Bentley and M. Douglas McIlroy's "Engineering a
- * Sort Function", Software-Practice and Experience, Vol. 23(11) P. 1249-1265 (November 1993). This algorithm offers
- * n*log(n) performance on many data sets that cause other quicksorts to degrade to quadratic performance.
- *
- * @param from the index of the first element (inclusive) to be sorted.
- * @param to the index of the last element (inclusive) to be sorted.
- * @param c the comparator to determine the order of the receiver.
- * @throws ClassCastException if the array contains elements that are not <i>mutually comparable</i> using
- * the specified comparator.
- * @throws IllegalArgumentException if <tt>fromIndex &gt; toIndex</tt>
- * @throws ArrayIndexOutOfBoundsException if <tt>fromIndex &lt; 0</tt> or <tt>toIndex &gt; a.length</tt>
- * @throws IndexOutOfBoundsException index is out of range (<tt>size()&gt;0 && (from&lt;0 || from&gt;to ||
- * to&gt;=size())</tt>).
- */
- public void quickSortFromTo(int from, int to, ${valueTypeCap}Comparator c) {
- int mySize = size();
- checkRangeFromTo(from, to, mySize);
-
- ${valueType}[] myElements = elements();
- Sorting.quickSort(myElements, from, to + 1, c);
- elements(myElements);
- setSizeRaw(mySize);
- }
-
- /**
- * Removes from the receiver all elements that are contained in the specified list. Tests for identity.
- *
- * @param other the other list.
- * @return <code>true</code> if the receiver changed as a result of the call.
- */
- public boolean removeAll(Abstract${valueTypeCap}List other) {
- if (other.isEmpty()) {
- return false;
- } //nothing to do
- int limit = other.size() - 1;
- int j = 0;
-
- for (int i = 0; i < size; i++) {
- if (other.indexOfFromTo(getQuick(i), 0, limit) < 0) {
- setQuick(j++, getQuick(i));
- }
- }
-
- boolean modified = (j != size);
- setSize(j);
- return modified;
- }
-
- /**
- * Removes from the receiver all elements whose index is between <code>from</code>, inclusive and <code>to</code>,
- * inclusive. Shifts any succeeding elements to the left (reduces their index). This call shortens the list by
- * <tt>(to - from + 1)</tt> elements.
- *
- * @param from index of first element to be removed.
- * @param to index of last element to be removed.
- * @throws IndexOutOfBoundsException index is out of range (<tt>size()&gt;0 && (from&lt;0 || from&gt;to ||
- * to&gt;=size())</tt>).
- */
- @Override
- public void removeFromTo(int from, int to) {
- checkRangeFromTo(from, to, size);
- int numMoved = size - to - 1;
- if (numMoved > 0) {
- replaceFromToWithFrom(from, from - 1 + numMoved, this, to + 1);
- //fillFromToWith(from+numMoved, size-1, 0.0f); //delta
- }
- int width = to - from + 1;
- if (width > 0) {
- setSizeRaw(size - width);
- }
- }
-
- /**
- * Replaces a number of elements in the receiver with the same number of elements of another list. Replaces elements
- * in the receiver, between <code>from</code> (inclusive) and <code>to</code> (inclusive), with elements of
- * <code>other</code>, starting from <code>otherFrom</code> (inclusive).
- *
- * @param from the position of the first element to be replaced in the receiver
- * @param to the position of the last element to be replaced in the receiver
- * @param other list holding elements to be copied into the receiver.
- * @param otherFrom position of first element within other list to be copied.
- */
- public void replaceFromToWithFrom(int from, int to, Abstract${valueTypeCap}List other, int otherFrom) {
- int length = to - from + 1;
- if (length > 0) {
- checkRangeFromTo(from, to, size());
- checkRangeFromTo(otherFrom, otherFrom + length - 1, other.size());
-
- // unambiguous copy (it may hold other==this)
- if (from <= otherFrom) {
- while (--length >= 0) {
- setQuick(from++, other.getQuick(otherFrom++));
- }
- } else {
- int otherTo = otherFrom + length - 1;
- while (--length >= 0) {
- setQuick(to--, other.getQuick(otherTo--));
- }
- }
- }
- }
-
- /**
- * Replaces the part between <code>from</code> (inclusive) and <code>to</code> (inclusive) with the other list's part
- * between <code>otherFrom</code> and <code>otherTo</code>. Powerful (and tricky) method! The two parts need not be
- * of the same size (part A can be either smaller or larger than part B). Parts may overlap. Receiver and other list
- * may (but need not) be identical. If <code>from &gt; to</code>, then inserts the other part before <code>from</code>.
- *
- * @param from the first element of the receiver (inclusive)
- * @param to the last element of the receiver (inclusive)
- * @param other the other list (may be identical with receiver)
- * @param otherFrom the first element of the other list (inclusive)
- * @param otherTo the last element of the other list (inclusive)
- *
- * <p><b>Examples:</b><pre>
- * a=[0, 1, 2, 3, 4, 5, 6, 7]
- * b=[50, 60, 70, 80, 90]
- * a.R(...)=a.replaceFromToWithFromTo(...)
- *
- * a.R(3,5,b,0,4)-->[0, 1, 2, 50, 60, 70, 80, 90,
- * 6, 7]
- * a.R(1,6,b,0,4)-->[0, 50, 60, 70, 80, 90, 7]
- * a.R(0,6,b,0,4)-->[50, 60, 70, 80, 90, 7]
- * a.R(3,5,b,1,2)-->[0, 1, 2, 60, 70, 6, 7]
- * a.R(1,6,b,1,2)-->[0, 60, 70, 7]
- * a.R(0,6,b,1,2)-->[60, 70, 7]
- * a.R(5,3,b,0,4)-->[0, 1, 2, 3, 4, 50, 60, 70,
- * 80, 90, 5, 6, 7]
- * a.R(5,0,b,0,4)-->[0, 1, 2, 3, 4, 50, 60, 70,
- * 80, 90, 5, 6, 7]
- * a.R(5,3,b,1,2)-->[0, 1, 2, 3, 4, 60, 70, 5, 6,
- * 7]
- * a.R(5,0,b,1,2)-->[0, 1, 2, 3, 4, 60, 70, 5, 6,
- * 7]
- *
- * Extreme cases:
- * a.R(5,3,b,0,0)-->[0, 1, 2, 3, 4, 50, 5, 6, 7]
- * a.R(5,3,b,4,4)-->[0, 1, 2, 3, 4, 90, 5, 6, 7]
- * a.R(3,5,a,0,1)-->[0, 1, 2, 0, 1, 6, 7]
- * a.R(3,5,a,3,5)-->[0, 1, 2, 3, 4, 5, 6, 7]
- * a.R(3,5,a,4,4)-->[0, 1, 2, 4, 6, 7]
- * a.R(5,3,a,0,4)-->[0, 1, 2, 3, 4, 0, 1, 2, 3, 4,
- * 5, 6, 7]
- * a.R(0,-1,b,0,4)-->[50, 60, 70, 80, 90, 0, 1, 2,
- * 3, 4, 5, 6, 7]
- * a.R(0,-1,a,0,4)-->[0, 1, 2, 3, 4, 0, 1, 2, 3,
- * 4, 5, 6, 7]
- * a.R(8,0,a,0,4)-->[0, 1, 2, 3, 4, 5, 6, 7, 0, 1,
- * 2, 3, 4]
- * </pre>
- */
- public void replaceFromToWithFromTo(int from, int to, Abstract${valueTypeCap}List other, int otherFrom, int otherTo) {
- if (otherFrom > otherTo) {
- throw new IndexOutOfBoundsException("otherFrom: " + otherFrom + ", otherTo: " + otherTo);
- }
-
- if (this == other && to - from != otherTo - otherFrom) { // avoid stumbling over my own feet
- replaceFromToWithFromTo(from, to, partFromTo(otherFrom, otherTo), 0, otherTo - otherFrom);
- return;
- }
-
- int length = otherTo - otherFrom + 1;
- int diff = length;
- int theLast = from - 1;
-
- if (to >= from) {
- diff -= (to - from + 1);
- theLast = to;
- }
-
- if (diff > 0) {
- beforeInsertDummies(theLast + 1, diff);
- } else {
- if (diff < 0) {
- removeFromTo(theLast + diff, theLast - 1);
- }
- }
-
- if (length > 0) {
- replaceFromToWithFrom(from, from + length - 1, other, otherFrom);
- }
- }
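
A short sketch of the replace semantics, assuming the int instantiation (IntArrayList); the values mirror the a.R(1,6,b,0,4) example in the Javadoc above, and ReplaceDemo is an illustrative name.

import org.apache.mahout.math.list.IntArrayList;

public class ReplaceDemo {
  public static void main(String[] args) {
    IntArrayList a = new IntArrayList(new int[] {0, 1, 2, 3, 4, 5, 6, 7});
    IntArrayList b = new IntArrayList(new int[] {50, 60, 70, 80, 90});
    // Replace a[1..6] with b[0..4]; the parts differ in size, so the receiver shrinks by one.
    a.replaceFromToWithFromTo(1, 6, b, 0, 4);
    System.out.println(a);  // [0, 50, 60, 70, 80, 90, 7]
  }
}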
-
- /**
- * Retains (keeps) only the elements in the receiver that are contained in the specified other list. In other words,
- * removes from the receiver all of its elements that are not contained in the specified other list.
- *
- * @param other the other list to test against.
- * @return <code>true</code> if the receiver changed as a result of the call.
- */
- public boolean retainAll(Abstract${valueTypeCap}List other) {
- if (other.isEmpty()) {
- if (size == 0) {
- return false;
- }
- setSize(0);
- return true;
- }
-
- int limit = other.size() - 1;
- int j = 0;
- for (int i = 0; i < size; i++) {
- if (other.indexOfFromTo(getQuick(i), 0, limit) >= 0) {
- setQuick(j++, getQuick(i));
- }
- }
-
- boolean modified = (j != size);
- setSize(j);
- return modified;
- }
-
- /** Reverses the elements of the receiver. Last becomes first, second last becomes second first, and so on. */
- @Override
- public void reverse() {
- int limit = size() / 2;
- int j = size() - 1;
-
- for (int i = 0; i < limit;) { //swap
- ${valueType} tmp = getQuick(i);
- setQuick(i++, getQuick(j));
- setQuick(j--, tmp);
- }
- }
-
- /**
- * Replaces the element at the specified position in the receiver with the specified element.
- *
- * @param index index of element to replace.
- * @param element element to be stored at the specified position.
- * @throws IndexOutOfBoundsException if <tt>index &lt; 0 || index &gt;= size()</tt>.
- */
- public void set(int index, ${valueType} element) {
- if (index >= size || index < 0) {
- throw new IndexOutOfBoundsException("Index: " + index + ", Size: " + size);
- }
- setQuick(index, element);
- }
-
- /**
- * Replaces the element at the specified position in the receiver with the specified element; <b>WARNING:</b> Does not
- * check preconditions. Provided with invalid parameters this method may access invalid indexes without throwing any
- * exception! <b>You should only use this method when you are absolutely sure that the index is within bounds.</b>
- * Precondition (unchecked): <tt>index &gt;= 0 && index &lt; size()</tt>.
- *
- * This method is normally only used internally in large loops where bounds are explicitly checked before the loop and
- * need not be rechecked within the loop. However, if desperately needed, you can give this method <tt>public</tt>
- * visibility in subclasses.
- *
- * @param index index of element to replace.
- * @param element element to be stored at the specified position.
- */
- protected abstract void setQuick(int index, ${valueType} element);
-
- /**
- * Sets the size of the receiver without modifying it otherwise. This method should not release or allocate new memory
- * but simply set some instance variable like <tt>size</tt>.
- *
- * If your subclass overrides and delegates size changing methods to some other object, you must make sure that those
- * overriding methods not only update the size of the delegate but also of this class. For example:
- * <pre>
- * public class DatabaseList extends Abstract${valueTypeCap}List {
- *   ...
- *   public void removeFromTo(int from, int to) {
- *     myDatabase.removeFromTo(from, to);
- *     this.setSizeRaw(size - (to - from + 1));
- *   }
- * }
- * </pre>
- */
- protected void setSizeRaw(int newSize) {
- size = newSize;
- }
-
- /** Returns the number of elements contained in the receiver. */
- @Override
- public int size() {
- return size;
- }
-
- /**
- * Returns a list which is a concatenation of <code>times</code> times the receiver.
- *
- * @param times the number of times the receiver shall be copied.
- */
- public Abstract${valueTypeCap}List times(int times) {
- Abstract${valueTypeCap}List newList = new ${valueTypeCap}ArrayList(times * size());
- for (int i = times; --i >= 0;) {
- newList.addAllOfFromTo(this, 0, size() - 1);
- }
- return newList;
- }
-
- /** Returns an <code>ArrayList</code> containing all the elements in the receiver. */
- public List<${valueObjectType}> toList() {
- int mySize = size();
- List<${valueObjectType}> list = new ArrayList<${valueObjectType}>(mySize);
- for (int i = 0; i < mySize; i++) {
- list.add(get(i));
- }
- return list;
- }
-
- public ${valueType}[] toArray(${valueType}[] values) {
- int mySize = size();
- ${valueType}[] myElements;
- if (values.length >= mySize) {
- myElements = values;
- } else {
- myElements = new ${valueType}[mySize];
- }
- for (int i = size; --i >= 0;) {
- myElements[i] = getQuick(i);
- }
- return myElements;
- }
-
- /** Returns a string representation of the receiver, containing the String representation of each element. */
- public String toString() {
- return org.apache.mahout.math.Arrays.toString(partFromTo(0, size() - 1).elements());
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java-templates/org/apache/mahout/math/list/ValueTypeArrayList.java.t
----------------------------------------------------------------------
diff --git a/math/src/main/java-templates/org/apache/mahout/math/list/ValueTypeArrayList.java.t b/math/src/main/java-templates/org/apache/mahout/math/list/ValueTypeArrayList.java.t
deleted file mode 100644
index af14c0f..0000000
--- a/math/src/main/java-templates/org/apache/mahout/math/list/ValueTypeArrayList.java.t
+++ /dev/null
@@ -1,659 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-/*
-Copyright © 1999 CERN - European Organization for Nuclear Research.
-Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
-is hereby granted without fee, provided that the above copyright notice appear in all copies and
-that both that copyright notice and this permission notice appear in supporting documentation.
-CERN makes no representations about the suitability of this software for any purpose.
-It is provided "as is" without expressed or implied warranty.
-*/
-package org.apache.mahout.math.list;
-
-import org.apache.mahout.math.function.${valueTypeCap}Procedure;
-
-/**
- Resizable list holding <code>${valueType}</code> elements; implemented with arrays.
-*/
-
-public class ${valueTypeCap}ArrayList extends Abstract${valueTypeCap}List implements Cloneable {
-
- /**
- * The array buffer into which the elements of the list are stored. The capacity of the list is the length of this
- * array buffer.
- */
- private ${valueType}[] elements;
-
- /** Constructs an empty list. */
- public ${valueTypeCap}ArrayList() {
- this(10);
- }
-
- /**
- * Constructs a list containing the specified elements. The initial size and capacity of the list are the length of the
- * array.
- *
- * <b>WARNING:</b> For efficiency reasons and to keep memory usage low, <b>the array is not copied</b>. So if
- * subsequently you modify the specified array directly via the [] operator, be sure you know what you're doing.
- *
- * @param elements the array backing the constructed list
- */
- public ${valueTypeCap}ArrayList(${valueType}[] elements) {
- elements(elements);
- }
-
- /**
- * Constructs an empty list with the specified initial capacity.
- *
- * @param initialCapacity the number of elements the receiver can hold without auto-expanding itself by allocating new
- * internal memory.
- */
- public ${valueTypeCap}ArrayList(int initialCapacity) {
- this(new ${valueType}[initialCapacity]);
- setSizeRaw(0);
- }
-
- /**
- * Appends the specified element to the end of this list.
- *
- * @param element element to be appended to this list.
- */
- public void add(${valueType} element) {
- // overridden for performance only.
- if (size == elements.length) {
- ensureCapacity(size + 1);
- }
- elements[size++] = element;
- }
-
- /**
- * Inserts the specified element before the specified position into the receiver. Shifts the element currently at that
- * position (if any) and any subsequent elements to the right.
- *
- * @param index index before which the specified element is to be inserted (must be in [0,size]).
- * @param element element to be inserted.
- * @throws IndexOutOfBoundsException index is out of range (<tt>index &lt; 0 || index &gt; size()</tt>).
- */
- public void beforeInsert(int index, ${valueType} element) {
- // overridden for performance only.
- if (size == index) {
- add(element);
- return;
- }
- if (index > size || index < 0) {
- throw new IndexOutOfBoundsException("Index: " + index + ", Size: " + size);
- }
- ensureCapacity(size + 1);
- System.arraycopy(elements, index, elements, index + 1, size - index);
- elements[index] = element;
- size++;
- }
-
- /**
- * Searches the receiver for the specified value using the binary search algorithm. The receiver
- * <strong>must</strong> be sorted (as by the sort method) prior to making this call. If it is not sorted, the
- * results are undefined: in particular, the call may enter an infinite loop. If the receiver contains multiple
- * elements equal to the specified object, there is no guarantee which instance will be found.
- *
- * @param key the value to be searched for.
- * @param from the leftmost search position, inclusive.
- * @param to the rightmost search position, inclusive.
- * @return index of the search key, if it is contained in the receiver; otherwise, <tt>(-(<i>insertion point</i>) -
- * 1)</tt>. The <i>insertion point</i> is defined as the point at which the value would be inserted into
- * the receiver: the index of the first element greater than the key, or <tt>receiver.size()</tt>, if all
- * elements in the receiver are less than the specified key. Note that this guarantees that the return value
- * will be &gt;= 0 if and only if the key is found.
- * @see org.apache.mahout.math.BinarySearch
- * @see java.util.Arrays
- */
- @Override
- public int binarySearchFromTo(${valueType} key, int from, int to) {
- return org.apache.mahout.math.BinarySearch.binarySearchFromTo(elements, key, from, to);
- }
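
A sketch of decoding the encoded insertion point, assuming the int instantiation (IntArrayList); BinarySearchDemo is an illustrative name, and the list must already be sorted.

import org.apache.mahout.math.list.IntArrayList;

public class BinarySearchDemo {
  public static void main(String[] args) {
    IntArrayList list = new IntArrayList(new int[] {10, 20, 40, 80});
    int hit = list.binarySearchFromTo(40, 0, list.size() - 1);   // 2: found at index 2
    int miss = list.binarySearchFromTo(30, 0, list.size() - 1);  // -3: not found, encodes -(2) - 1
    if (miss < 0) {
      int insertionPoint = -miss - 1;         // 2: where 30 keeps the list sorted
      list.beforeInsert(insertionPoint, 30);
    }
    System.out.println(hit);                  // 2
    System.out.println(list);                 // [10, 20, 30, 40, 80]
  }
}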
-
- /**
- * Returns a deep copy of the receiver.
- *
- * @return a deep copy of the receiver.
- */
- @Override
- public Object clone() {
- // overridden for performance only.
- ${valueTypeCap}ArrayList clone = new ${valueTypeCap}ArrayList(elements.clone());
- clone.setSizeRaw(size);
- return clone;
- }
-
- /**
- * Returns a deep copy of the receiver; uses <code>clone()</code> and casts the result.
- *
- * @return a deep copy of the receiver.
- */
- public ${valueTypeCap}ArrayList copy() {
- return (${valueTypeCap}ArrayList) clone();
- }
-
- #if ($valueType != 'float' && $valueType != 'double' && $valueType != 'long')
- /**
- * Sorts the specified range of the receiver into ascending numerical order.
- *
- * The sorting algorithm is a count sort. This algorithm offers guaranteed performance of
- * <tt>O(max(n, max-min+1))</tt> and space requirements of one <tt>int[max-min+1]</tt> buffer.
- * <p>This algorithm is only applicable if max-min+1 is not large! But
- * if applicable, it usually outperforms quicksort by a factor of 3-4.
- *
- * @param from the index of the first element (inclusive) to be sorted.
- * @param to the index of the last element (inclusive) to be sorted.
- * @param min the smallest element contained in the range.
- * @param max the largest element contained in the range.
- */
- protected void countSortFromTo(int from, int to, ${valueType} min, ${valueType} max) {
- if (size == 0) {
- return;
- }
- checkRangeFromTo(from, to, size);
-
- ${valueType} width = (${valueType})(max - min + 1);
-
- int[] counts = new int[width];
- ${valueType}[] theElements = elements;
- for (int i = from; i <= to;) {
- counts[(theElements[i++] - min)]++;
- }
-
- int fromIndex = from;
- ${valueType} val = min;
- for (int i = 0; i < width; i++, val++) {
- int c = counts[i];
- if (c > 0) {
- if (c == 1) {
- theElements[fromIndex++] = val;
- } else {
- int toIndex = fromIndex + c - 1;
- fillFromToWith(fromIndex, toIndex, val);
- fromIndex = toIndex + 1;
- }
- }
- }
- }
- #end
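
A tiny standalone sketch of the count sort idea used above: tally one bucket per possible value, then replay the buckets in ascending order. Plain int arrays are used so the sketch does not depend on the generated list classes.

public class CountSortSketch {
  public static void main(String[] args) {
    int[] a = {4, 2, 2, 3};
    int min = 2;
    int max = 4;
    int[] counts = new int[max - min + 1];             // one bucket per value in [min, max]
    for (int v : a) {
      counts[v - min]++;                               // counts becomes {2, 1, 1}
    }
    int k = 0;
    for (int i = 0; i < counts.length; i++) {          // replay buckets in ascending order
      for (int c = counts[i]; c > 0; c--) {
        a[k++] = min + i;
      }
    }
    System.out.println(java.util.Arrays.toString(a));  // [2, 2, 3, 4]
  }
}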
-
- /**
- * Returns the elements currently stored, including invalid elements between size and capacity, if any.
- *
- * <b>WARNING:</b> For efficiency reasons and to keep memory usage low, <b>the array is not copied</b>. So if
- * subsequently you modify the returned array directly via the [] operator, be sure you know what you're doing.
- *
- * @return the elements currently stored.
- */
- public ${valueType}[] elements() {
- return elements;
- }
-
- /**
- * Sets the receiver's elements to be the specified array (not a copy of it).
- *
- * The size and capacity of the list are the length of the array. <b>WARNING:</b> For efficiency reasons and to keep
- * memory usage low, <b>the array is not copied</b>. So if subsequently you modify the specified array directly via
- * the [] operator, be sure you know what you're doing.
- *
- * @param elements the new elements to be stored.
- * @return the receiver itself.
- */
- public final Abstract${valueTypeCap}List elements(${valueType}[] elements) {
- this.elements = elements;
- this.size = elements.length;
- return this;
- }
-
- /**
- * Ensures that the receiver can hold at least the specified number of elements without needing to allocate new
- * internal memory. If necessary, allocates new internal memory and increases the capacity of the receiver.
- *
- * @param minCapacity the desired minimum capacity.
- */
- public void ensureCapacity(int minCapacity) {
- elements = org.apache.mahout.math.Arrays.ensureCapacity(elements, minCapacity);
- }
-
- /**
- * Compares the specified Object with the receiver. Returns true if and only if the specified Object is also an
- * ArrayList of the same type, both Lists have the same size, and all corresponding pairs of elements in the two Lists
- * are identical. In other words, two Lists are defined to be equal if they contain the same elements in the same
- * order.
- *
- * @param otherObj the Object to be compared for equality with the receiver.
- * @return true if the specified Object is equal to the receiver.
- */
- public boolean equals(Object otherObj) { //delta
- if (otherObj == null) {
- return false;
- }
- // overridden for performance only.
- if (!(otherObj instanceof ${valueTypeCap}ArrayList)) {
- return super.equals(otherObj);
- }
- if (this == otherObj) {
- return true;
- }
- ${valueTypeCap}ArrayList other = (${valueTypeCap}ArrayList) otherObj;
- if (size() != other.size()) {
- return false;
- }
-
- ${valueType}[] theElements = elements();
- ${valueType}[] otherElements = other.elements();
- for (int i = size(); --i >= 0;) {
- if (theElements[i] != otherElements[i]) {
- return false;
- }
- }
- return true;
- }
-
- /**
- * Applies a procedure to each element of the receiver, if any. Starts at index 0, moving rightwards.
- *
- * @param procedure the procedure to be applied. Stops iteration if the procedure returns <tt>false</tt>, otherwise
- * continues.
- * @return <tt>false</tt> if the procedure stopped before all elements were iterated over, <tt>true</tt> otherwise.
- */
- public boolean forEach(${valueTypeCap}Procedure procedure) {
- // overridden for performance only.
- ${valueType}[] theElements = elements;
- int theSize = size;
-
- for (int i = 0; i < theSize;) {
- if (!procedure.apply(theElements[i++])) {
- return false;
- }
- }
- return true;
- }
-
- /**
- * Returns the element at the specified position in the receiver.
- *
- * @param index index of element to return.
- * @throws IndexOutOfBoundsException index is out of range (index &lt; 0 || index &gt;= size()).
- */
- public ${valueType} get(int index) {
- // overridden for performance only.
- if (index >= size || index < 0) {
- throw new IndexOutOfBoundsException("Index: " + index + ", Size: " + size);
- }
- return elements[index];
- }
-
- /**
- * Returns the element at the specified position in the receiver; <b>WARNING:</b> Does not check preconditions.
- * Provided with invalid parameters this method may return invalid elements without throwing any exception! <b>You
- * should only use this method when you are absolutely sure that the index is within bounds.</b> Precondition
- * (unchecked): <tt>index &gt;= 0 && index &lt; size()</tt>.
- *
- * @param index index of element to return.
- */
- @Override
- public ${valueType} getQuick(int index) {
- return elements[index];
- }
-
- /**
- * Returns the index of the first occurrence of the specified element. Returns <code>-1</code> if the receiver does
- * not contain this element. Searches between <code>from</code>, inclusive and <code>to</code>, inclusive. Tests for
- * identity.
- *
- * @param element element to search for.
- * @param from the leftmost search position, inclusive.
- * @param to the rightmost search position, inclusive.
- * @return the index of the first occurrence of the element in the receiver; returns <code>-1</code> if the element is
- * not found.
- * @throws IndexOutOfBoundsException index is out of range (<tt>size()&gt;0 && (from&lt;0 || from&gt;to ||
- * to&gt;=size())</tt>).
- */
- @Override
- public int indexOfFromTo(${valueType} element, int from, int to) {
- // overridden for performance only.
- if (size == 0) {
- return -1;
- }
- checkRangeFromTo(from, to, size);
-
- ${valueType}[] theElements = elements;
- for (int i = from; i <= to; i++) {
- if (element == theElements[i]) {
- return i;
- } //found
- }
- return -1; //not found
- }
-
- /**
- * Returns the index of the last occurrence of the specified element. Returns <code>-1</code> if the receiver does not
- * contain this element. Searches beginning at <code>to</code>, inclusive until <code>from</code>, inclusive. Tests
- * for identity.
- *
- * @param element element to search for.
- * @param from the leftmost search position, inclusive.
- * @param to the rightmost search position, inclusive.
- * @return the index of the last occurrence of the element in the receiver; returns <code>-1</code> if the element is
- * not found.
- * @throws IndexOutOfBoundsException index is out of range (<tt>size()&gt;0 && (from&lt;0 || from&gt;to ||
- * to&gt;=size())</tt>).
- */
- @Override
- public int lastIndexOfFromTo(${valueType} element, int from, int to) {
- // overridden for performance only.
- if (size == 0) {
- return -1;
- }
- checkRangeFromTo(from, to, size);
-
- ${valueType}[] theElements = elements;
- for (int i = to; i >= from; i--) {
- if (element == theElements[i]) {
- return i;
- } //found
- }
- return -1; //not found
- }
-
- /**
- * Returns a new list of the part of the receiver between <code>from</code>, inclusive, and <code>to</code>,
- * inclusive.
- *
- * @param from the index of the first element (inclusive).
- * @param to the index of the last element (inclusive).
- * @return a new list
- * @throws IndexOutOfBoundsException index is out of range (<tt>size()&gt;0 && (from&lt;0 || from&gt;to ||
- * to&gt;=size())</tt>).
- */
- @Override
- public Abstract${valueTypeCap}List partFromTo(int from, int to) {
- if (size == 0) {
- return new ${valueTypeCap}ArrayList(0);
- }
-
- checkRangeFromTo(from, to, size);
-
- ${valueType}[] part = new ${valueType}[to - from + 1];
- System.arraycopy(elements, from, part, 0, to - from + 1);
- return new ${valueTypeCap}ArrayList(part);
- }
-
- /**
- * Removes from the receiver all elements that are contained in the specified list. Tests for identity.
- *
- * @param other the other list.
- * @return <code>true</code> if the receiver changed as a result of the call.
- */
- @Override
- public boolean removeAll(Abstract${valueTypeCap}List other) {
- // overridden for performance only.
- if (!(other instanceof ${valueTypeCap}ArrayList)) {
- return super.removeAll(other);
- }
-
- /* There are two possibilities to do the thing
- a) use other.indexOf(...)
- b) sort other, then use other.binarySearch(...)
-
- Let's try to figure out which one is faster. Let M=size, N=other.size, then
- a) takes O(M*N) steps
- b) takes O(N*logN + M*logN) steps (sorting is O(N*logN) and binarySearch is O(logN))
-
- Hence, if N*logN + M*logN < M*N, we use b) otherwise we use a).
- */
- if (other.isEmpty()) {
- return false;
- } //nothing to do
- int limit = other.size() - 1;
- int j = 0;
- ${valueType}[] theElements = elements;
- int mySize = size();
-
- double N = (double) other.size();
- double M = (double) mySize;
- if ((N + M) * org.apache.mahout.collections.Arithmetic.log2(N) < M * N) {
- // it is faster to sort other before searching in it
- ${valueTypeCap}ArrayList sortedList = (${valueTypeCap}ArrayList) other.clone();
- sortedList.quickSort();
-
- for (int i = 0; i < mySize; i++) {
- if (sortedList.binarySearchFromTo(theElements[i], 0, limit) < 0) {
- theElements[j++] = theElements[i];
- }
- }
- } else {
- // it is faster to search in other without sorting
- for (int i = 0; i < mySize; i++) {
- if (other.indexOfFromTo(theElements[i], 0, limit) < 0) {
- theElements[j++] = theElements[i];
- }
- }
- }
-
- boolean modified = (j != mySize);
- setSize(j);
- return modified;
- }
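
A standalone sketch of the cost model in the comment above: sorting other costs O(N*log N) and each membership test O(log N), versus O(N) per linear scan, so sorting pays off whenever (N + M)*log2(N) < M*N. The class and method names are illustrative.

public class RemoveAllCostModel {
  static boolean sortingPaysOff(double m, double n) {
    double log2n = Math.log(n) / Math.log(2);  // same quantity as Arithmetic.log2(n)
    return (n + m) * log2n < m * n;            // sort + binary search beats M linear scans
  }

  public static void main(String[] args) {
    System.out.println(sortingPaysOff(1000, 1000));  // true: sorting wins for non-trivial sizes
    System.out.println(sortingPaysOff(2, 2));        // false: tiny lists favor the plain scan
  }
}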
-
- /**
- * Replaces a number of elements in the receiver with the same number of elements of another list. Replaces elements
- * in the receiver, between <code>from</code> (inclusive) and <code>to</code> (inclusive), with elements of
- * <code>other</code>, starting from <code>otherFrom</code> (inclusive).
- *
- * @param from the position of the first element to be replaced in the receiver
- * @param to the position of the last element to be replaced in the receiver
- * @param other list holding elements to be copied into the receiver.
- * @param otherFrom position of first element within other list to be copied.
- */
- @Override
- public void replaceFromToWithFrom(int from, int to, Abstract${valueTypeCap}List other, int otherFrom) {
- // overridden for performance only.
- if (!(other instanceof ${valueTypeCap}ArrayList)) {
- // slower
- super.replaceFromToWithFrom(from, to, other, otherFrom);
- return;
- }
- int length = to - from + 1;
- if (length > 0) {
- checkRangeFromTo(from, to, size());
- checkRangeFromTo(otherFrom, otherFrom + length - 1, other.size());
- System.arraycopy(((${valueTypeCap}ArrayList) other).elements, otherFrom, elements, from, length);
- }
- }
-
- /**
- * Retains (keeps) only the elements in the receiver that are contained in the specified other list. In other words,
- * removes from the receiver all of its elements that are not contained in the specified other list.
- *
- * @param other the other list to test against.
- * @return <code>true</code> if the receiver changed as a result of the call.
- */
- @Override
- public boolean retainAll(Abstract${valueTypeCap}List other) {
- // overridden for performance only.
- if (!(other instanceof ${valueTypeCap}ArrayList)) {
- return super.retainAll(other);
- }
-
- /* There are two possibilities to do the thing
- a) use other.indexOf(...)
- b) sort other, then use other.binarySearch(...)
-
- Let's try to figure out which one is faster. Let M=size, N=other.size, then
- a) takes O(M*N) steps
- b) takes O(N*logN + M*logN) steps (sorting is O(N*logN) and binarySearch is O(logN))
-
- Hence, if N*logN + M*logN < M*N, we use b) otherwise we use a).
- */
- int limit = other.size() - 1;
- int j = 0;
- ${valueType}[] theElements = elements;
- int mySize = size();
-
- double N = (double) other.size();
- double M = (double) mySize;
- if ((N + M) * org.apache.mahout.collections.Arithmetic.log2(N) < M * N) {
- // it is faster to sort other before searching in it
- ${valueTypeCap}ArrayList sortedList = (${valueTypeCap}ArrayList) other.clone();
- sortedList.quickSort();
-
- for (int i = 0; i < mySize; i++) {
- if (sortedList.binarySearchFromTo(theElements[i], 0, limit) >= 0) {
- theElements[j++] = theElements[i];
- }
- }
- } else {
- // it is faster to search in other without sorting
- for (int i = 0; i < mySize; i++) {
- if (other.indexOfFromTo(theElements[i], 0, limit) >= 0) {
- theElements[j++] = theElements[i];
- }
- }
- }
-
- boolean modified = (j != mySize);
- setSize(j);
- return modified;
- }
-
- /** Reverses the elements of the receiver. Last becomes first, second last becomes second first, and so on. */
- @Override
- public void reverse() {
- // overridden for performance only.
- int limit = size / 2;
- int j = size - 1;
-
- ${valueType}[] theElements = elements;
- for (int i = 0; i < limit;) { //swap
- ${valueType} tmp = theElements[i];
- theElements[i++] = theElements[j];
- theElements[j--] = tmp;
- }
- }
-
- /**
- * Replaces the element at the specified position in the receiver with the specified element.
- *
- * @param index index of element to replace.
- * @param element element to be stored at the specified position.
- * @throws IndexOutOfBoundsException index is out of range (index &lt; 0 || index &gt;= size()).
- */
- @Override
- public void set(int index, ${valueType} element) {
- // overridden for performance only.
- if (index >= size || index < 0) {
- throw new IndexOutOfBoundsException("Index: " + index + ", Size: " + size);
- }
- elements[index] = element;
- }
-
- /**
- * Replaces the element at the specified position in the receiver with the specified element; <b>WARNING:</b> Does not
- * check preconditions. Provided with invalid parameters this method may access invalid indexes without throwing any
- * exception! <b>You should only use this method when you are absolutely sure that the index is within bounds.</b>
- * Precondition (unchecked): <tt>index &gt;= 0 && index &lt; size()</tt>.
- *
- * @param index index of element to replace.
- * @param element element to be stored at the specified position.
- */
- @Override
- public void setQuick(int index, ${valueType} element) {
- elements[index] = element;
- }
-
- /**
- * Sorts the specified range of the receiver into ascending order.
- *
- * The sorting algorithm is dynamically chosen according to the characteristics of the data set. Currently quicksort
- * and countsort are considered. Countsort is not always applicable, but if applicable, it usually outperforms
- * quicksort by a factor of 3-4.
- *
- * <p>Best case performance: O(N). Worst case performance: O(N^2) (a degenerated quicksort). Best case space
- * requirements: 0 KB. Worst case space requirements: 40 KB.
- *
- * @param from the index of the first element (inclusive) to be sorted.
- * @param to the index of the last element (inclusive) to be sorted.
- * @throws IndexOutOfBoundsException index is out of range (<tt>size()&gt;0 && (from&lt;0 || from&gt;to ||
- * to&gt;=size())</tt>).
- */
- @Override
- public void sortFromTo(int from, int to) {
- /*
- * Computes min and max and decides on this basis.
- * In practice the additional overhead is very small compared to the potential gains.
- */
-
- if (size == 0) {
- return;
- }
- checkRangeFromTo(from, to, size);
-
- // determine minimum and maximum.
- ${valueType} min = elements[from];
- ${valueType} max = elements[from];
-
- ${valueType}[] theElements = elements;
- for (int i = from + 1; i <= to;) {
- ${valueType} elem = theElements[i++];
- if (elem > max) {
- max = elem;
- } else if (elem < min) {
- min = elem;
- }
- }
-
- #if ($valueType == 'byte' || $valueType == 'char' || $valueType == 'int')
- // try to figure out which option is fastest.
- double N = (double) to - (double) from + 1.0;
- double quickSortEstimate = N * Math.log(N) / 0.6931471805599453; // O(N*log(N,base=2)) ; ln(2)=0.6931471805599453
-
- double width = (double) max - (double) min + 1.0;
- double countSortEstimate = Math.max(width, N); // O(Max(width,N))
-
- int widthThreshold = 10000; // never consider options resulting in outrageous memory allocations.
- if (width < widthThreshold && countSortEstimate < quickSortEstimate) {
- countSortFromTo(from, to, min, max);
- } else {
- quickSortFromTo(from, to);
- }
- #else
- quickSortFromTo(from, to);
- #end
- }
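
A worked instance of the estimates above, as a sketch: for N = 1000 elements drawn from a byte-like range of width 256, count sort is predicted to cost max(256, 1000) = 1000 steps while quicksort is predicted to cost about N*log2(N) ~ 9966, so countSortFromTo is chosen.

public class SortChoiceSketch {
  public static void main(String[] args) {
    double n = 1000.0;                                                // elements to sort
    double width = 256.0;                                             // max - min + 1
    double quickSortEstimate = n * Math.log(n) / 0.6931471805599453;  // ~ N*log2(N) ~ 9966
    double countSortEstimate = Math.max(width, n);                    // ~ 1000
    boolean useCountSort = width < 10000 && countSortEstimate < quickSortEstimate;
    System.out.println(useCountSort);                                 // true
  }
}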
-
- /**
- * Trims the capacity of the receiver to be the receiver's current size. Releases any superfluous internal memory. An
- * application can use this operation to minimize the storage of the receiver.
- */
- @Override
- public void trimToSize() {
- elements = org.apache.mahout.math.Arrays.trimToCapacity(elements, size());
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java-templates/org/apache/mahout/math/map/AbstractKeyTypeObjectMap.java.t
----------------------------------------------------------------------
diff --git a/math/src/main/java-templates/org/apache/mahout/math/map/AbstractKeyTypeObjectMap.java.t b/math/src/main/java-templates/org/apache/mahout/math/map/AbstractKeyTypeObjectMap.java.t
deleted file mode 100644
index 4ffbe3a..0000000
--- a/math/src/main/java-templates/org/apache/mahout/math/map/AbstractKeyTypeObjectMap.java.t
+++ /dev/null
@@ -1,467 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-/*
-Copyright © 1999 CERN - European Organization for Nuclear Research.
-Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
-is hereby granted without fee, provided that the above copyright notice appear in all copies and
-that both that copyright notice and this permission notice appear in supporting documentation.
-CERN makes no representations about the suitability of this software for any purpose.
-It is provided "as is" without expressed or implied warranty.
-*/
-package org.apache.mahout.math.map;
-
-import java.util.ArrayList;
-import java.util.List;
-import java.nio.IntBuffer;
-import java.util.Arrays;
-
-import org.apache.mahout.math.Sorting;
-import org.apache.mahout.math.Swapper;
-import org.apache.mahout.math.set.HashUtils;
-import org.apache.mahout.math.function.IntComparator;
-
-import org.apache.mahout.math.function.${keyTypeCap}ObjectProcedure;
-import org.apache.mahout.math.function.${keyTypeCap}Procedure;
-import org.apache.mahout.math.list.${keyTypeCap}ArrayList;
-import org.apache.mahout.math.set.AbstractSet;
-
-public abstract class Abstract${keyTypeCap}ObjectMap<T> extends AbstractSet {
-
- /**
- * Returns <tt>true</tt> if the receiver contains the specified key.
- *
- * @return <tt>true</tt> if the receiver contains the specified key.
- */
- public boolean containsKey(final ${keyType} key) {
- return !forEachKey(
- new ${keyTypeCap}Procedure() {
- @Override
- public boolean apply(${keyType} iterKey) {
- return (key != iterKey);
- }
- }
- );
- }
-
- /**
- * Returns <tt>true</tt> if the receiver contains the specified value. Tests for identity.
- *
- * @return <tt>true</tt> if the receiver contains the specified value.
- */
- public boolean containsValue(final T value) {
- return !forEachPair(
- new ${keyTypeCap}ObjectProcedure<T>() {
- @Override
- public boolean apply(${keyType} iterKey, Object iterValue) {
- return (value != iterValue);
- }
- }
- );
- }
-
- /**
- * Returns a deep copy of the receiver; uses <code>clone()</code> and casts the result.
- *
- * @return a deep copy of the receiver.
- */
- @SuppressWarnings("unchecked") // seemingly unavoidable.
- public Abstract${keyTypeCap}ObjectMap<T> copy() {
- return this.getClass().cast(clone());
- }
-
- /**
- * Compares the specified object with this map for equality. Returns <tt>true</tt> if the given object is also a map
- * and the two maps represent the same mappings. More formally, two maps <tt>m1</tt> and <tt>m2</tt> represent the
- * same mappings iff
- * <pre>
- * m1.forEachPair(
- * new ${keyTypeCap}ObjectProcedure() {
- * public boolean apply(${keyType} key, Object value) {
- * return m2.containsKey(key) && m2.get(key) == value;
- * }
- * }
- * )
- * &&
- * m2.forEachPair(
- * new ${keyTypeCap}ObjectProcedure() {
- * public boolean apply(${keyType} key, Object value) {
- * return m1.containsKey(key) && m1.get(key) == value;
- * }
- * }
- * );
- * </pre>
- *
- * This implementation first checks if the specified object is this map; if so it returns <tt>true</tt>. Then, it
- * checks if the specified object is a map whose size is identical to the size of this map; if not, it returns
- * <tt>false</tt>. If so, it applies the iteration as described above.
- *
- * @param obj object to be compared for equality with this map.
- * @return <tt>true</tt> if the specified object is equal to this map.
- */
- @SuppressWarnings("unchecked") // incompressible
- public boolean equals(Object obj) {
- if (obj == this) {
- return true;
- }
-
- if (!(obj instanceof Abstract${keyTypeCap}ObjectMap)) {
- return false;
- }
- final Abstract${keyTypeCap}ObjectMap other = (Abstract${keyTypeCap}ObjectMap) obj;
- if (other.size() != size()) {
- return false;
- }
-
- return
- forEachPair(
- new ${keyTypeCap}ObjectProcedure() {
- @Override
- public boolean apply(${keyType} key, Object value) {
- return other.containsKey(key) && other.get(key) == value;
- }
- }
- )
- &&
- other.forEachPair(
- new ${keyTypeCap}ObjectProcedure() {
- @Override
- public boolean apply(${keyType} key, Object value) {
- return containsKey(key) && get(key) == value;
- }
- }
- );
- }
-
- public int hashCode() {
- final int[] buf = new int[size()];
- forEachPair(
- new ${keyTypeCap}ObjectProcedure() {
- int i = 0;
-
- @Override
- public boolean apply(${keyType} key, Object value) {
- buf[i++] = HashUtils.hash(key) ^ value.hashCode();
- return true;
- }
- }
- );
- Arrays.sort(buf);
- return IntBuffer.wrap(buf).hashCode();
- }
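
A sketch of why the construction above is order-independent: the per-pair hashes are sorted before being combined, so two equal maps hash identically no matter what order forEachPair visits them in. The pairHashes argument stands in for the HashUtils.hash(key) ^ value.hashCode() values built above.

import java.nio.IntBuffer;
import java.util.Arrays;

public class OrderIndependentHashSketch {
  static int hashPairs(int[] pairHashes) {
    int[] buf = pairHashes.clone();
    Arrays.sort(buf);                        // canonical order erases iteration order
    return IntBuffer.wrap(buf).hashCode();   // content-based hash of the sorted buffer
  }

  public static void main(String[] args) {
    System.out.println(hashPairs(new int[] {3, 1, 2}) == hashPairs(new int[] {2, 3, 1}));  // true
  }
}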
-
-
-
- /**
- * Applies a procedure to each key of the receiver, if any. Note: Iterates over the keys in no particular order.
- * Subclasses can define a particular order, for example, "sorted by key". All methods which <i>can</i> be expressed
- * in terms of this method (most methods can) <i>must guarantee</i> to use the <i>same</i> order defined by this
- * method, even if that order is unspecified. This is necessary so that, for example, methods <tt>keys</tt> and
- * <tt>values</tt> will yield association pairs, not two uncorrelated lists.
- *
- * @param procedure the procedure to be applied. Stops iteration if the procedure returns <tt>false</tt>, otherwise
- * continues.
- * @return <tt>false</tt> if the procedure stopped before all keys were iterated over, <tt>true</tt> otherwise.
- */
- public abstract boolean forEachKey(${keyTypeCap}Procedure procedure);
-
- /**
- * Applies a procedure to each (key,value) pair of the receiver, if any. Iteration order is guaranteed to be
- * <i>identical</i> to the order used by method {@link #forEachKey(${keyTypeCap}Procedure)}.
- *
- * @param procedure the procedure to be applied. Stops iteration if the procedure returns <tt>false</tt>, otherwise
- * continues.
- * @return <tt>false</tt> if the procedure stopped before all keys were iterated over, <tt>true</tt> otherwise.
- */
- public boolean forEachPair(final ${keyTypeCap}ObjectProcedure<T> procedure) {
- return forEachKey(
- new ${keyTypeCap}Procedure() {
- @Override
- public boolean apply(${keyType} key) {
- return procedure.apply(key, get(key));
- }
- }
- );
- }
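
A minimal usage sketch, assuming the int instantiation of this template family (OpenIntObjectHashMap and IntObjectProcedure from the companion templates); ForEachPairDemo is an illustrative name.

import org.apache.mahout.math.function.IntObjectProcedure;
import org.apache.mahout.math.map.OpenIntObjectHashMap;

public class ForEachPairDemo {
  public static void main(String[] args) {
    OpenIntObjectHashMap<String> map = new OpenIntObjectHashMap<String>();
    map.put(1, "one");
    map.put(2, "two");
    // Pairs are visited in the same (unspecified) order as forEachKey; returning false
    // from apply would stop the iteration early.
    map.forEachPair(new IntObjectProcedure<String>() {
      @Override
      public boolean apply(int key, String value) {
        System.out.println(key + "->" + value);
        return true;
      }
    });
  }
}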
-
- /**
- * Returns the value associated with the specified key. It is often a good idea to first check with {@link
- * #containsKey(${keyType})} whether the given key has an associated value, i.e. whether an association
- * exists for the given key.
- *
- * @param key the key to be searched for.
- * @return the value associated with the specified key; <tt>null</tt> if no such key is present.
- */
- public abstract T get(${keyType} key);
-
- /**
- * Returns a list filled with all keys contained in the receiver. The returned list has a size that equals
- * <tt>this.size()</tt>. Iteration order is guaranteed to be <i>identical</i> to the order used by method {@link
- * #forEachKey(${keyTypeCap}Procedure)}. <p> This method can be used to iterate over the keys of the receiver.
- *
- * @return the keys.
- */
- public ${keyTypeCap}ArrayList keys() {
- ${keyTypeCap}ArrayList list = new ${keyTypeCap}ArrayList(size());
- keys(list);
- return list;
- }
-
- /**
- * Fills all keys contained in the receiver into the specified list. Fills the list, starting at index 0. After this
- * call returns the specified list has a new size that equals <tt>this.size()</tt>. Iteration order is guaranteed to
- * be <i>identical</i> to the order used by method {@link #forEachKey(${keyTypeCap}Procedure)}. <p> This method can be used to
- * iterate over the keys of the receiver.
- *
- * @param list the list to be filled, can have any size.
- */
- public void keys(final ${keyTypeCap}ArrayList list) {
- list.clear();
- forEachKey(
- new ${keyTypeCap}Procedure() {
- @Override
- public boolean apply(${keyType} key) {
- list.add(key);
- return true;
- }
- }
- );
- }
-
- /**
- * Fills all keys <i>sorted ascending by their associated value</i> into the specified list. Fills into the list,
- * starting at index 0. After this call returns the specified list has a new size that equals <tt>this.size()</tt>.
- * Primary sort criterion is "value", secondary sort criterion is "key". This means that if any two values are equal,
- * the smaller key comes first. <p> <b>Example:</b> <br> <tt>keys = (8,7,6), values = (1,2,2) --> keyList =
- * (8,6,7)</tt>
- *
- * @param keyList the list to be filled, can have any size.
- */
- public void keysSortedByValue(${keyTypeCap}ArrayList keyList) {
- pairsSortedByValue(keyList, new ArrayList<T>(size()));
- }
-
- /**
- * Fills all pairs satisfying a given condition into the specified lists. Fills into the lists, starting at index 0.
- * After this call returns the specified lists both have a new size, the number of pairs satisfying the condition.
- * Iteration order is guaranteed to be <i>identical</i> to the order used by method {@link #forEachKey(${keyTypeCap}Procedure)}.
- * <p> <b>Example:</b> <br>
- * <pre>
- * ${keyTypeCap}ObjectProcedure condition = new ${keyTypeCap}ObjectProcedure() { // match even keys only
- * public boolean apply(${keyType} key, Object value) { return key%2==0; }
- * }
- * keys = (8,7,6), values = (1,2,2) --> keyList = (6,8), valueList = (2,1)
- * </pre>
- *
- * @param condition the condition to be matched. Takes the current key as first and the current value as second
- * argument.
- * @param keyList the list to be filled with keys, can have any size.
- * @param valueList the list to be filled with values, can have any size.
- */
- public void pairsMatching(final ${keyTypeCap}ObjectProcedure<T> condition,
- final ${keyTypeCap}ArrayList keyList,
- final List<T> valueList) {
- keyList.clear();
- valueList.clear();
-
- forEachPair(
- new ${keyTypeCap}ObjectProcedure<T>() {
- @Override
- public boolean apply(${keyType} key, T value) {
- if (condition.apply(key, value)) {
- keyList.add(key);
- valueList.add(value);
- }
- return true;
- }
- }
- );
- }
-
- /**
- * Fills all keys and values <i>sorted ascending by key</i> into the specified lists. Fills into the lists, starting
- * at index 0. After this call returns the specified lists both have a new size that equals <tt>this.size()</tt>. <p>
- * <b>Example:</b> <br> <tt>keys = (8,7,6), values = (1,2,2) --> keyList = (6,7,8), valueList = (2,2,1)</tt>
- *
- * @param keyList the list to be filled with keys, can have any size.
- * @param valueList the list to be filled with values, can have any size.
- */
- @SuppressWarnings("unchecked")
- public void pairsSortedByKey(${keyTypeCap}ArrayList keyList, List<T> valueList) {
- keys(keyList);
- keyList.sort();
- // the following is straightforward if not the most space-efficient possibility
- Object[] tempValueList = new Object[keyList.size()];
-
- for (int i = keyList.size(); --i >= 0;) {
- tempValueList[i] = get(keyList.getQuick(i));
- }
- valueList.clear();
- for (Object value : tempValueList) {
- valueList.add((T) value);
- }
-
- }
-
- /**
- * Fills all keys and values <i>sorted ascending by value according to natural ordering</i> into the specified lists.
- * Fills into the lists, starting at index 0. After this call returns the specified lists both have a new size that
- * equals <tt>this.size()</tt>. Primary sort criterion is "value", secondary sort criterion is "key". This means that
- * if any two values are equal, the smaller key comes first. <p> <b>Example:</b> <br> <tt>keys = (8,7,6), values =
- * (1,2,2) --> keyList = (8,6,7), valueList = (1,2,2)</tt>
- *
- * @param keyList the list to be filled with keys, can have any size.
- * @param valueList the list to be filled with values, can have any size.
- */
- @SuppressWarnings("unchecked")
- public void pairsSortedByValue(${keyTypeCap}ArrayList keyList, List<T> valueList) {
- keys(keyList);
- values(valueList);
-
- if (!valueList.isEmpty() && !(valueList.get(0) instanceof Comparable)) {
- throw new UnsupportedOperationException("Cannot sort the values; "
- + valueList.get(0).getClass()
- + " does not implement Comparable");
- }
-
- final ${keyType}[] k = keyList.elements();
- final List<T> valueRef = valueList;
- Swapper swapper = new Swapper() {
- @Override
- public void swap(int a, int b) {
- T t1 = valueRef.get(a);
- valueRef.set(a, valueRef.get(b));
- valueRef.set(b, t1);
- ${keyType} t2 = k[a];
- k[a] = k[b];
- k[b] = t2;
- }
- };
-
- IntComparator comp = new IntComparator() {
- @Override
- public int compare(int a, int b) {
- int ab = ((Comparable)valueRef.get(a)).compareTo(valueRef.get(b));
- return ab < 0 ? -1 : ab > 0 ? 1 : (k[a] < k[b] ? -1 : (k[a] == k[b] ? 0 : 1));
- }
- };
-
- Sorting.quickSort(0, keyList.size(), comp, swapper);
- }
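
A short sketch of the tie-breaking rule above, assuming the int instantiation (OpenIntObjectHashMap, IntArrayList); it reproduces the keys = (8,7,6) style example from the Javadoc, and PairsDemo is an illustrative name.

import java.util.ArrayList;
import java.util.List;
import org.apache.mahout.math.list.IntArrayList;
import org.apache.mahout.math.map.OpenIntObjectHashMap;

public class PairsDemo {
  public static void main(String[] args) {
    OpenIntObjectHashMap<String> map = new OpenIntObjectHashMap<String>();
    map.put(8, "a");
    map.put(7, "b");
    map.put(6, "b");
    IntArrayList keys = new IntArrayList();
    List<String> values = new ArrayList<String>();
    map.pairsSortedByValue(keys, values);
    // Values ascend; the equal values "b" are tie-broken by the smaller key first.
    System.out.println(keys);    // [8, 6, 7]
    System.out.println(values);  // [a, b, b]
  }
}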
-
- /**
- * Associates the given key with the given value. Replaces any old <tt>(key,someOtherValue)</tt> association, if
- * existing.
- *
- * @param key the key the value shall be associated with.
- * @param value the value to be associated.
- * @return <tt>true</tt> if the receiver did not already contain such a key; <tt>false</tt> if the receiver
- * already contained such a key, in which case the new value has replaced the formerly associated value.
- */
- public abstract boolean put(${keyType} key, T value);
-
- /**
- * Removes the given key with its associated element from the receiver, if present.
- *
- * @param key the key to be removed from the receiver.
- * @return <tt>true</tt> if the receiver contained the specified key, <tt>false</tt> otherwise.
- */
- public abstract boolean removeKey(${keyType} key);
-
- /**
- * Returns a string representation of the receiver, containing the String representation of each key-value pair,
- * sorted ascending by key.
- */
- public String toString() {
- ${keyTypeCap}ArrayList theKeys = keys();
- theKeys.sort();
-
- StringBuilder buf = new StringBuilder();
- buf.append('[');
- int maxIndex = theKeys.size() - 1;
- for (int i = 0; i <= maxIndex; i++) {
- ${keyType} key = theKeys.get(i);
- buf.append(String.valueOf(key));
- buf.append("->");
- buf.append(String.valueOf(get(key)));
- if (i < maxIndex) {
- buf.append(", ");
- }
- }
- buf.append(']');
- return buf.toString();
- }
-
- /**
- * Returns a string representation of the receiver, containing the String representation of each key-value pair,
- * sorted ascending by value, according to natural ordering.
- */
- public String toStringByValue() {
- ${keyTypeCap}ArrayList theKeys = new ${keyTypeCap}ArrayList();
- keysSortedByValue(theKeys);
-
- StringBuilder buf = new StringBuilder();
- buf.append('[');
- int maxIndex = theKeys.size() - 1;
- for (int i = 0; i <= maxIndex; i++) {
- ${keyType} key = theKeys.get(i);
- buf.append(String.valueOf(key));
- buf.append("->");
- buf.append(String.valueOf(get(key)));
- if (i < maxIndex) {
- buf.append(", ");
- }
- }
- buf.append(']');
- return buf.toString();
- }
-
- /**
- * Returns a list filled with all values contained in the receiver. The returned list has a size that equals
- * <tt>this.size()</tt>. Iteration order is guaranteed to be <i>identical</i> to the order used by method {@link
- * #forEachKey(${keyTypeCap}Procedure)}. <p> This method can be used to iterate over the values of the receiver.
- *
- * @return the values.
- */
- public List<T> values() {
- List<T> list = new ArrayList<T>(size());
- values(list);
- return list;
- }
-
- /**
- * Fills all values contained in the receiver into the specified list. Fills the list, starting at index 0. After this
- * call returns the specified list has a new size that equals <tt>this.size()</tt>. Iteration order is guaranteed to
- * be <i>identical</i> to the order used by method {@link #forEachKey(${keyTypeCap}Procedure)}. <p> This method can be used to
- * iterate over the values of the receiver.
- *
- * @param list the list to be filled, can have any size.
- */
- public void values(final List<T> list) {
- list.clear();
- forEachKey(
- new ${keyTypeCap}Procedure() {
- @Override
- public boolean apply(${keyType} key) {
- list.add(get(key));
- return true;
- }
- }
- );
- }
-}
r***@apache.org
2018-06-27 14:51:41 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/AbstractMatrix.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/AbstractMatrix.java b/math/src/main/java/org/apache/mahout/math/AbstractMatrix.java
deleted file mode 100644
index eaaa397..0000000
--- a/math/src/main/java/org/apache/mahout/math/AbstractMatrix.java
+++ /dev/null
@@ -1,834 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math;
-
-import com.google.common.collect.AbstractIterator;
-import com.google.common.collect.Maps;
-import org.apache.mahout.math.flavor.MatrixFlavor;
-import org.apache.mahout.math.function.DoubleDoubleFunction;
-import org.apache.mahout.math.function.DoubleFunction;
-import org.apache.mahout.math.function.Functions;
-import org.apache.mahout.math.function.PlusMult;
-import org.apache.mahout.math.function.VectorFunction;
-
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.Map;
-
-/**
- * A few universal implementations of convenience functions for a JVM-backed matrix.
- */
-public abstract class AbstractMatrix implements Matrix {
-
- protected Map<String, Integer> columnLabelBindings;
- protected Map<String, Integer> rowLabelBindings;
- protected int rows;
- protected int columns;
-
- protected AbstractMatrix(int rows, int columns) {
- this.rows = rows;
- this.columns = columns;
- }
-
- @Override
- public int columnSize() {
- return columns;
- }
-
- @Override
- public int rowSize() {
- return rows;
- }
-
- @Override
- public Iterator<MatrixSlice> iterator() {
- return iterateAll();
- }
-
- @Override
- public Iterator<MatrixSlice> iterateAll() {
- return new AbstractIterator<MatrixSlice>() {
- private int row;
-
- @Override
- protected MatrixSlice computeNext() {
- if (row >= numRows()) {
- return endOfData();
- }
- int i = row++;
- return new MatrixSlice(viewRow(i), i);
- }
- };
- }
-
- @Override
- public Iterator<MatrixSlice> iterateNonEmpty() {
- return iterator();
- }
-
- /**
- * Abstracted out for the iterator
- *
- * @return numRows() for row-based iterator, numColumns() for column-based.
- */
- @Override
- public int numSlices() {
- return numRows();
- }
-
- @Override
- public double get(String rowLabel, String columnLabel) {
- if (columnLabelBindings == null || rowLabelBindings == null) {
- throw new IllegalStateException("Unbound label");
- }
- Integer row = rowLabelBindings.get(rowLabel);
- Integer col = columnLabelBindings.get(columnLabel);
- if (row == null || col == null) {
- throw new IllegalStateException("Unbound label");
- }
-
- return get(row, col);
- }
-
- @Override
- public Map<String, Integer> getColumnLabelBindings() {
- return columnLabelBindings;
- }
-
- @Override
- public Map<String, Integer> getRowLabelBindings() {
- return rowLabelBindings;
- }
-
- @Override
- public void set(String rowLabel, double[] rowData) {
- if (columnLabelBindings == null) {
- throw new IllegalStateException("Unbound label");
- }
- Integer row = rowLabelBindings.get(rowLabel);
- if (row == null) {
- throw new IllegalStateException("Unbound label");
- }
- set(row, rowData);
- }
-
- @Override
- public void set(String rowLabel, int row, double[] rowData) {
- if (rowLabelBindings == null) {
- rowLabelBindings = new HashMap<>();
- }
- rowLabelBindings.put(rowLabel, row);
- set(row, rowData);
- }
-
- @Override
- public void set(String rowLabel, String columnLabel, double value) {
- if (columnLabelBindings == null || rowLabelBindings == null) {
- throw new IllegalStateException("Unbound label");
- }
- Integer row = rowLabelBindings.get(rowLabel);
- Integer col = columnLabelBindings.get(columnLabel);
- if (row == null || col == null) {
- throw new IllegalStateException("Unbound label");
- }
- set(row, col, value);
- }
-
- @Override
- public void set(String rowLabel, String columnLabel, int row, int column, double value) {
- if (rowLabelBindings == null) {
- rowLabelBindings = new HashMap<>();
- }
- rowLabelBindings.put(rowLabel, row);
- if (columnLabelBindings == null) {
- columnLabelBindings = new HashMap<>();
- }
- columnLabelBindings.put(columnLabel, column);
-
- set(row, column, value);
- }
-
- @Override
- public void setColumnLabelBindings(Map<String, Integer> bindings) {
- columnLabelBindings = bindings;
- }
-
- @Override
- public void setRowLabelBindings(Map<String, Integer> bindings) {
- rowLabelBindings = bindings;
- }
-
- // index into int[2] for column value
- public static final int COL = 1;
-
- // index into int[2] for row value
- public static final int ROW = 0;
-
- @Override
- public int numRows() {
- return rowSize();
- }
-
- @Override
- public int numCols() {
- return columnSize();
- }
-
- @Override
- public String asFormatString() {
- return toString();
- }
-
- @Override
- public Matrix assign(double value) {
- int rows = rowSize();
- int columns = columnSize();
- for (int row = 0; row < rows; row++) {
- for (int col = 0; col < columns; col++) {
- setQuick(row, col, value);
- }
- }
- return this;
- }
-
- @Override
- public Matrix assign(double[][] values) {
- int rows = rowSize();
- if (rows != values.length) {
- throw new CardinalityException(rows, values.length);
- }
- int columns = columnSize();
- for (int row = 0; row < rows; row++) {
- if (columns == values[row].length) {
- for (int col = 0; col < columns; col++) {
- setQuick(row, col, values[row][col]);
- }
- } else {
- throw new CardinalityException(columns, values[row].length);
- }
- }
- return this;
- }
-
- @Override
- public Matrix assign(Matrix other, DoubleDoubleFunction function) {
- int rows = rowSize();
- if (rows != other.rowSize()) {
- throw new CardinalityException(rows, other.rowSize());
- }
- int columns = columnSize();
- if (columns != other.columnSize()) {
- throw new CardinalityException(columns, other.columnSize());
- }
- for (int row = 0; row < rows; row++) {
- for (int col = 0; col < columns; col++) {
- setQuick(row, col, function.apply(getQuick(row, col), other.getQuick(
- row, col)));
- }
- }
- return this;
- }
-
- @Override
- public Matrix assign(Matrix other) {
- int rows = rowSize();
- if (rows != other.rowSize()) {
- throw new CardinalityException(rows, other.rowSize());
- }
- int columns = columnSize();
- if (columns != other.columnSize()) {
- throw new CardinalityException(columns, other.columnSize());
- }
- for (int row = 0; row < rows; row++) {
- for (int col = 0; col < columns; col++) {
- setQuick(row, col, other.getQuick(row, col));
- }
- }
- return this;
- }
-
- @Override
- public Matrix assign(DoubleFunction function) {
- int rows = rowSize();
- int columns = columnSize();
- for (int row = 0; row < rows; row++) {
- for (int col = 0; col < columns; col++) {
- setQuick(row, col, function.apply(getQuick(row, col)));
- }
- }
- return this;
- }
-
- /**
- * Collects the results of a function applied to each row of a matrix.
- *
- * @param f The function to be applied to each row.
- * @return The vector of results.
- */
- @Override
- public Vector aggregateRows(VectorFunction f) {
- Vector r = new DenseVector(numRows());
- int n = numRows();
- for (int row = 0; row < n; row++) {
- r.set(row, f.apply(viewRow(row)));
- }
- return r;
- }
-
- /**
- * Returns a view of a row. Changes to the view will affect the original.
- *
- * @param row Which row to return.
- * @return A vector that references the desired row.
- */
- @Override
- public Vector viewRow(int row) {
- return new MatrixVectorView(this, row, 0, 0, 1);
- }
-
-
- /**
-   * Returns a view of a column. Changes to the view will affect the original.
- *
- * @param column Which column to return.
- * @return A vector that references the desired column.
- */
- @Override
- public Vector viewColumn(int column) {
- return new MatrixVectorView(this, 0, column, 1, 0);
- }
-
- /**
- * Provides a view of the diagonal of a matrix.
- */
- @Override
- public Vector viewDiagonal() {
- return new MatrixVectorView(this, 0, 0, 1, 1);
- }
-
- /**
- * Collects the results of a function applied to each element of a matrix and then aggregated.
- *
- * @param combiner A function that combines the results of the mapper.
- * @param mapper A function to apply to each element.
- * @return The result.
- */
- @Override
- public double aggregate(final DoubleDoubleFunction combiner, final DoubleFunction mapper) {
- return aggregateRows(new VectorFunction() {
- @Override
- public double apply(Vector v) {
- return v.aggregate(combiner, mapper);
- }
- }).aggregate(combiner, Functions.IDENTITY);
- }
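
Because aggregate(combiner, mapper) maps every entry and folds the results, common matrix reductions become one-liners. A hedged usage sketch, assuming the usual DenseMatrix(double[][]) constructor and the Functions.PLUS / Functions.pow function objects already used in this file:

    import org.apache.mahout.math.DenseMatrix;
    import org.apache.mahout.math.Matrix;
    import org.apache.mahout.math.function.Functions;

    public class AggregateSketch {
      public static void main(String[] args) {
        Matrix m = new DenseMatrix(new double[][]{{1, 2}, {3, 4}});
        // Squared Frobenius norm: 1 + 4 + 9 + 16 = 30.
        double frobSq = m.aggregate(Functions.PLUS, Functions.pow(2));
        System.out.println(frobSq); // 30.0
      }
    }
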
-
- /**
- * Collects the results of a function applied to each column of a matrix.
- *
- * @param f The function to be applied to each column.
- * @return The vector of results.
- */
- @Override
- public Vector aggregateColumns(VectorFunction f) {
- Vector r = new DenseVector(numCols());
- for (int col = 0; col < numCols(); col++) {
- r.set(col, f.apply(viewColumn(col)));
- }
- return r;
- }
-
- @Override
- public double determinant() {
- int rows = rowSize();
- int columns = columnSize();
- if (rows != columns) {
- throw new CardinalityException(rows, columns);
- }
-
-    if (rows == 1) {
-      // Base case missing in the original: the determinant of a 1x1 matrix is its single entry.
-      return getQuick(0, 0);
-    } else if (rows == 2) {
- return getQuick(0, 0) * getQuick(1, 1) - getQuick(0, 1) * getQuick(1, 0);
- } else {
- // TODO: this really should just be one line:
- // TODO: new CholeskyDecomposition(this).getL().viewDiagonal().aggregate(Functions.TIMES)
- int sign = 1;
- double ret = 0;
-
- for (int i = 0; i < columns; i++) {
- Matrix minor = new DenseMatrix(rows - 1, columns - 1);
- for (int j = 1; j < rows; j++) {
- boolean flag = false; /* column offset flag */
- for (int k = 0; k < columns; k++) {
- if (k == i) {
- flag = true;
- continue;
- }
- minor.set(j - 1, flag ? k - 1 : k, getQuick(j, k));
- }
- }
- ret += getQuick(0, i) * sign * minor.determinant();
- sign *= -1;
-
- }
-
- return ret;
- }
-
- }
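
The else-branch above is a textbook Laplace (cofactor) expansion along the first row, det(A) = sum_i (-1)^i * a(0,i) * det(minor(0,i)); it is exponential in the matrix size, which is why the TODO suggests a decomposition instead. A hedged check against a matrix with a known determinant, assuming the DenseMatrix(double[][]) constructor:

    import org.apache.mahout.math.DenseMatrix;
    import org.apache.mahout.math.Matrix;

    public class DeterminantSketch {
      public static void main(String[] args) {
        // By hand: 1*(5*9 - 6*8) - 2*(4*9 - 6*7) + 3*(4*8 - 5*7)
        //        = -3 + 12 - 9 = 0 (the rows are linearly dependent).
        Matrix m = new DenseMatrix(new double[][]{
            {1, 2, 3},
            {4, 5, 6},
            {7, 8, 9}});
        System.out.println(m.determinant()); // 0.0
      }
    }
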
-
- @SuppressWarnings("CloneDoesntDeclareCloneNotSupportedException")
- @Override
- public Matrix clone() {
- AbstractMatrix clone;
- try {
- clone = (AbstractMatrix) super.clone();
- } catch (CloneNotSupportedException cnse) {
- throw new IllegalStateException(cnse); // can't happen
- }
- if (rowLabelBindings != null) {
- clone.rowLabelBindings = Maps.newHashMap(rowLabelBindings);
- }
- if (columnLabelBindings != null) {
- clone.columnLabelBindings = Maps.newHashMap(columnLabelBindings);
- }
- return clone;
- }
-
- @Override
- public Matrix divide(double x) {
- Matrix result = like();
- for (int row = 0; row < rowSize(); row++) {
- for (int col = 0; col < columnSize(); col++) {
- result.setQuick(row, col, getQuick(row, col) / x);
- }
- }
- return result;
- }
-
- @Override
- public double get(int row, int column) {
- if (row < 0 || row >= rowSize()) {
- throw new IndexException(row, rowSize());
- }
- if (column < 0 || column >= columnSize()) {
- throw new IndexException(column, columnSize());
- }
- return getQuick(row, column);
- }
-
- @Override
- public Matrix minus(Matrix other) {
- int rows = rowSize();
- if (rows != other.rowSize()) {
- throw new CardinalityException(rows, other.rowSize());
- }
- int columns = columnSize();
- if (columns != other.columnSize()) {
- throw new CardinalityException(columns, other.columnSize());
- }
- Matrix result = like();
- for (int row = 0; row < rows; row++) {
- for (int col = 0; col < columns; col++) {
- result.setQuick(row, col, getQuick(row, col)
- - other.getQuick(row, col));
- }
- }
- return result;
- }
-
- @Override
- public Matrix plus(double x) {
- Matrix result = like();
- int rows = rowSize();
- int columns = columnSize();
- for (int row = 0; row < rows; row++) {
- for (int col = 0; col < columns; col++) {
- result.setQuick(row, col, getQuick(row, col) + x);
- }
- }
- return result;
- }
-
- @Override
- public Matrix plus(Matrix other) {
- int rows = rowSize();
- if (rows != other.rowSize()) {
- throw new CardinalityException(rows, other.rowSize());
- }
- int columns = columnSize();
- if (columns != other.columnSize()) {
- throw new CardinalityException(columns, other.columnSize());
- }
- Matrix result = like();
- for (int row = 0; row < rows; row++) {
- for (int col = 0; col < columns; col++) {
- result.setQuick(row, col, getQuick(row, col)
- + other.getQuick(row, col));
- }
- }
- return result;
- }
-
- @Override
- public void set(int row, int column, double value) {
- if (row < 0 || row >= rowSize()) {
- throw new IndexException(row, rowSize());
- }
- if (column < 0 || column >= columnSize()) {
- throw new IndexException(column, columnSize());
- }
- setQuick(row, column, value);
- }
-
- @Override
- public void set(int row, double[] data) {
- int columns = columnSize();
- if (columns < data.length) {
- throw new CardinalityException(columns, data.length);
- }
- int rows = rowSize();
- if (row < 0 || row >= rows) {
- throw new IndexException(row, rowSize());
- }
- for (int i = 0; i < columns; i++) {
- setQuick(row, i, data[i]);
- }
- }
-
- @Override
- public Matrix times(double x) {
- Matrix result = like();
- int rows = rowSize();
- int columns = columnSize();
- for (int row = 0; row < rows; row++) {
- for (int col = 0; col < columns; col++) {
- result.setQuick(row, col, getQuick(row, col) * x);
- }
- }
- return result;
- }
-
- @Override
- public Matrix times(Matrix other) {
- int columns = columnSize();
- if (columns != other.rowSize()) {
- throw new CardinalityException(columns, other.rowSize());
- }
- int rows = rowSize();
- int otherColumns = other.columnSize();
- Matrix result = like(rows, otherColumns);
- for (int row = 0; row < rows; row++) {
- for (int col = 0; col < otherColumns; col++) {
- double sum = 0.0;
- for (int k = 0; k < columns; k++) {
- sum += getQuick(row, k) * other.getQuick(k, col);
- }
- result.setQuick(row, col, sum);
- }
- }
- return result;
- }
-
- @Override
- public Vector times(Vector v) {
- int columns = columnSize();
- if (columns != v.size()) {
- throw new CardinalityException(columns, v.size());
- }
- int rows = rowSize();
- Vector w = new DenseVector(rows);
- for (int row = 0; row < rows; row++) {
- w.setQuick(row, v.dot(viewRow(row)));
- }
- return w;
- }
-
- @Override
- public Vector timesSquared(Vector v) {
- int columns = columnSize();
- if (columns != v.size()) {
- throw new CardinalityException(columns, v.size());
- }
- int rows = rowSize();
- Vector w = new DenseVector(columns);
- for (int i = 0; i < rows; i++) {
- Vector xi = viewRow(i);
- double d = xi.dot(v);
- if (d != 0.0) {
- w.assign(xi, new PlusMult(d));
- }
-
- }
- return w;
- }
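
timesSquared computes A' * (A * v) in a single pass over the rows: each row x_i contributes (x_i . v) * x_i to the accumulator, and the sum of those contributions is exactly A'Av. A small hedged check, assuming the DenseMatrix(double[][]) and DenseVector(double[]) constructors:

    import org.apache.mahout.math.DenseMatrix;
    import org.apache.mahout.math.DenseVector;
    import org.apache.mahout.math.Matrix;
    import org.apache.mahout.math.Vector;

    public class TimesSquaredSketch {
      public static void main(String[] args) {
        Matrix a = new DenseMatrix(new double[][]{{1, 2}, {3, 4}});
        Vector v = new DenseVector(new double[]{1, 1});
        // A*v = (3, 7); A'*(3, 7) = (1*3 + 3*7, 2*3 + 4*7) = (24, 34).
        System.out.println(a.timesSquared(v)); // {0:24.0,1:34.0}
      }
    }
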
-
- @Override
- public Matrix transpose() {
- int rows = rowSize();
- int columns = columnSize();
- Matrix result = like(columns, rows);
- for (int row = 0; row < rows; row++) {
- for (int col = 0; col < columns; col++) {
- result.setQuick(col, row, getQuick(row, col));
- }
- }
- return result;
- }
-
- @Override
- public Matrix viewPart(int rowOffset, int rowsRequested, int columnOffset, int columnsRequested) {
- return viewPart(new int[]{rowOffset, columnOffset}, new int[]{rowsRequested, columnsRequested});
- }
-
- @Override
- public Matrix viewPart(int[] offset, int[] size) {
-
- if (offset[ROW] < 0) {
- throw new IndexException(offset[ROW], 0);
- }
- if (offset[ROW] + size[ROW] > rowSize()) {
- throw new IndexException(offset[ROW] + size[ROW], rowSize());
- }
- if (offset[COL] < 0) {
- throw new IndexException(offset[COL], 0);
- }
- if (offset[COL] + size[COL] > columnSize()) {
- throw new IndexException(offset[COL] + size[COL], columnSize());
- }
-
- return new MatrixView(this, offset, size);
- }
-
-
- @Override
- public double zSum() {
- double result = 0;
- for (int row = 0; row < rowSize(); row++) {
- for (int col = 0; col < columnSize(); col++) {
- result += getQuick(row, col);
- }
- }
- return result;
- }
-
- @Override
- public int[] getNumNondefaultElements() {
- return new int[]{rowSize(), columnSize()};
- }
-
- protected static class TransposeViewVector extends AbstractVector {
-
- private final Matrix matrix;
- private final int transposeOffset;
- private final int numCols;
- private final boolean rowToColumn;
-
- protected TransposeViewVector(Matrix m, int offset) {
- this(m, offset, true);
- }
-
- protected TransposeViewVector(Matrix m, int offset, boolean rowToColumn) {
- super(rowToColumn ? m.numRows() : m.numCols());
- matrix = m;
- this.transposeOffset = offset;
- this.rowToColumn = rowToColumn;
- numCols = rowToColumn ? m.numCols() : m.numRows();
- }
-
- @SuppressWarnings("CloneDoesntCallSuperClone")
- @Override
- public Vector clone() {
- Vector v = new DenseVector(size());
- v.assign(this, Functions.PLUS);
- return v;
- }
-
- @Override
- public boolean isDense() {
- return true;
- }
-
- @Override
- public boolean isSequentialAccess() {
- return true;
- }
-
- @Override
- protected Matrix matrixLike(int rows, int columns) {
- return matrix.like(rows, columns);
- }
-
- @Override
- public Iterator<Element> iterator() {
- return new AbstractIterator<Element>() {
- private int i;
-
- @Override
- protected Element computeNext() {
- if (i >= size()) {
- return endOfData();
- }
- return getElement(i++);
- }
- };
- }
-
- /**
- * Currently delegates to {@link #iterator()}.
- * TODO: This could be optimized to at least skip empty rows if there are many of them.
- *
- * @return an iterator (currently dense).
- */
- @Override
- public Iterator<Element> iterateNonZero() {
- return iterator();
- }
-
- @Override
- public Element getElement(final int i) {
- return new Element() {
- @Override
- public double get() {
- return getQuick(i);
- }
-
- @Override
- public int index() {
- return i;
- }
-
- @Override
- public void set(double value) {
- setQuick(i, value);
- }
- };
- }
-
- /**
- * Used internally by assign() to update multiple indices and values at once.
- * Only really useful for sparse vectors (especially SequentialAccessSparseVector).
- * <p>
-     * If someone ever adds a new type of sparse vector, this method must merge (index, value) pairs into the vector.
- *
- * @param updates a mapping of indices to values to merge in the vector.
- */
- @Override
- public void mergeUpdates(OrderedIntDoubleMapping updates) {
- throw new UnsupportedOperationException("Cannot mutate TransposeViewVector");
- }
-
- @Override
- public double getQuick(int index) {
- Vector v = rowToColumn ? matrix.viewColumn(index) : matrix.viewRow(index);
- return v == null ? 0.0 : v.getQuick(transposeOffset);
- }
-
- @Override
- public void setQuick(int index, double value) {
- Vector v = rowToColumn ? matrix.viewColumn(index) : matrix.viewRow(index);
- if (v == null) {
- v = newVector(numCols);
- if (rowToColumn) {
- matrix.assignColumn(index, v);
- } else {
- matrix.assignRow(index, v);
- }
- }
- v.setQuick(transposeOffset, value);
- }
-
- protected Vector newVector(int cardinality) {
- return new DenseVector(cardinality);
- }
-
- @Override
- public Vector like() {
- return new DenseVector(size());
- }
-
- public Vector like(int cardinality) {
- return new DenseVector(cardinality);
- }
-
- /**
-   * TODO: currently I don't know of an efficient way to get this value correctly.
- *
- * @return the number of nonzero entries
- */
- @Override
- public int getNumNondefaultElements() {
- return size();
- }
-
- @Override
- public double getLookupCost() {
- return (rowToColumn ? matrix.viewColumn(0) : matrix.viewRow(0)).getLookupCost();
- }
-
- @Override
- public double getIteratorAdvanceCost() {
- return (rowToColumn ? matrix.viewColumn(0) : matrix.viewRow(0)).getIteratorAdvanceCost();
- }
-
- @Override
- public boolean isAddConstantTime() {
- return (rowToColumn ? matrix.viewColumn(0) : matrix.viewRow(0)).isAddConstantTime();
- }
- }
-
- @Override
- public String toString() {
- int row = 0;
- int maxRowsToDisplay = 10;
- int maxColsToDisplay = 20;
- int colsToDisplay = maxColsToDisplay;
-
-    if (maxColsToDisplay > columnSize()) {
- colsToDisplay = columnSize();
- }
-
-
- StringBuilder s = new StringBuilder("{\n");
- Iterator<MatrixSlice> it = iterator();
- while ((it.hasNext()) && (row < maxRowsToDisplay)) {
- MatrixSlice next = it.next();
- s.append(" ").append(next.index())
- .append(" =>\t")
- .append(new VectorView(next.vector(), 0, colsToDisplay))
- .append('\n');
-      row++;
- }
- String returnString = s.toString();
-    if (maxColsToDisplay < columnSize()) {
-      returnString = returnString.replace("}", " ... } ");
-    }
-    if (maxRowsToDisplay < rowSize()) {
-      return returnString + "... }";
-    } else {
-      return returnString + "}";
-    }
- }
-
- @Override
- public MatrixFlavor getFlavor() {
- throw new UnsupportedOperationException("Flavor support not implemented for this matrix.");
- }
-
- ////////////// Matrix flavor trait ///////////////////
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/AbstractVector.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/AbstractVector.java b/math/src/main/java/org/apache/mahout/math/AbstractVector.java
deleted file mode 100644
index 27eddbc..0000000
--- a/math/src/main/java/org/apache/mahout/math/AbstractVector.java
+++ /dev/null
@@ -1,684 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math;
-
-import java.util.Iterator;
-
-import com.google.common.base.Preconditions;
-import org.apache.mahout.common.RandomUtils;
-import org.apache.mahout.math.function.DoubleDoubleFunction;
-import org.apache.mahout.math.function.DoubleFunction;
-import org.apache.mahout.math.function.Functions;
-
-/** Implementations of generic capabilities like sum of elements and dot products */
-public abstract class AbstractVector implements Vector, LengthCachingVector {
-
- private int size;
- protected double lengthSquared = -1.0;
-
- protected AbstractVector(int size) {
- this.size = size;
- }
-
- @Override
- public Iterable<Element> all() {
- return new Iterable<Element>() {
- @Override
- public Iterator<Element> iterator() {
- return AbstractVector.this.iterator();
- }
- };
- }
-
- @Override
- public Iterable<Element> nonZeroes() {
- return new Iterable<Element>() {
- @Override
- public Iterator<Element> iterator() {
- return iterateNonZero();
- }
- };
- }
-
- /**
- * Iterates over all elements <p>
- * NOTE: Implementations may choose to reuse the Element returned for performance
- * reasons, so if you need a copy of it, you should call {@link #getElement(int)} for the given index
- *
- * @return An {@link Iterator} over all elements
- */
- protected abstract Iterator<Element> iterator();
-
- /**
- * Iterates over all non-zero elements. <p>
- * NOTE: Implementations may choose to reuse the Element returned for
- * performance reasons, so if you need a copy of it, you should call {@link #getElement(int)} for the given index
- *
- * @return An {@link Iterator} over all non-zero elements
- */
- protected abstract Iterator<Element> iterateNonZero();
- /**
- * Aggregates a vector by applying a mapping function fm(x) to every component and aggregating
- * the results with an aggregating function fa(x, y).
- *
- * @param aggregator used to combine the current value of the aggregation with the result of map.apply(nextValue)
- * @param map a function to apply to each element of the vector in turn before passing to the aggregator
- * @return the result of the aggregation
- */
- @Override
- public double aggregate(DoubleDoubleFunction aggregator, DoubleFunction map) {
- if (size == 0) {
- return 0;
- }
-
-    // If the aggregator is associative and commutative and is likeLeftMult (fa(0, y) = 0), there is
-    // at least one zero in the vector (size > getNumNondefaultElements), and the mapper preserves zero
-    // (fm(0) = 0, i.e. it is not densifying), then the zero cascades through the aggregation and the final result is 0.
- if (aggregator.isAssociativeAndCommutative() && aggregator.isLikeLeftMult()
- && size > getNumNondefaultElements() && !map.isDensifying()) {
- return 0;
- }
-
- double result;
- if (isSequentialAccess() || aggregator.isAssociativeAndCommutative()) {
- Iterator<Element> iterator;
- // If fm(0) = 0 and fa(x, 0) = x, we can skip all zero values.
- if (!map.isDensifying() && aggregator.isLikeRightPlus()) {
- iterator = iterateNonZero();
- if (!iterator.hasNext()) {
- return 0;
- }
- } else {
- iterator = iterator();
- }
- Element element = iterator.next();
- result = map.apply(element.get());
- while (iterator.hasNext()) {
- element = iterator.next();
- result = aggregator.apply(result, map.apply(element.get()));
- }
- } else {
- result = map.apply(getQuick(0));
- for (int i = 1; i < size; i++) {
- result = aggregator.apply(result, map.apply(getQuick(i)));
- }
- }
-
- return result;
- }
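
The same aggregation machinery gives one-line vector reductions, and the short-circuits above mean a sparse vector only visits its non-zero entries whenever the mapper is non-densifying. A hedged usage sketch, assuming the DenseVector(double[]) constructor:

    import org.apache.mahout.math.DenseVector;
    import org.apache.mahout.math.Vector;
    import org.apache.mahout.math.function.Functions;

    public class VectorAggregateSketch {
      public static void main(String[] args) {
        Vector v = new DenseVector(new double[]{1, -2, 3});
        // L1 norm: |1| + |-2| + |3| = 6.
        System.out.println(v.aggregate(Functions.PLUS, Functions.ABS));     // 6.0
        // Maximum element: max(1, -2, 3) = 3.
        System.out.println(v.aggregate(Functions.MAX, Functions.IDENTITY)); // 3.0
      }
    }
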
-
- @Override
- public double aggregate(Vector other, DoubleDoubleFunction aggregator, DoubleDoubleFunction combiner) {
- Preconditions.checkArgument(size == other.size(), "Vector sizes differ");
- if (size == 0) {
- return 0;
- }
- return VectorBinaryAggregate.aggregateBest(this, other, aggregator, combiner);
- }
-
- /**
- * Subclasses must override to return an appropriately sparse or dense result
- *
- * @param rows the row cardinality
- * @param columns the column cardinality
- * @return a Matrix
- */
- protected abstract Matrix matrixLike(int rows, int columns);
-
- @Override
- public Vector viewPart(int offset, int length) {
- if (offset < 0) {
- throw new IndexException(offset, size);
- }
- if (offset + length > size) {
- throw new IndexException(offset + length, size);
- }
- return new VectorView(this, offset, length);
- }
-
- @SuppressWarnings("CloneDoesntDeclareCloneNotSupportedException")
- @Override
- public Vector clone() {
- try {
- AbstractVector r = (AbstractVector) super.clone();
- r.size = size;
- r.lengthSquared = lengthSquared;
- return r;
- } catch (CloneNotSupportedException e) {
- throw new IllegalStateException("Can't happen");
- }
- }
-
- @Override
- public Vector divide(double x) {
- if (x == 1.0) {
- return clone();
- }
- Vector result = createOptimizedCopy();
- for (Element element : result.nonZeroes()) {
- element.set(element.get() / x);
- }
- return result;
- }
-
- @Override
- public double dot(Vector x) {
- if (size != x.size()) {
- throw new CardinalityException(size, x.size());
- }
- if (this == x) {
- return getLengthSquared();
- }
- return aggregate(x, Functions.PLUS, Functions.MULT);
- }
-
- protected double dotSelf() {
- return aggregate(Functions.PLUS, Functions.pow(2));
- }
-
- @Override
- public double get(int index) {
- if (index < 0 || index >= size) {
- throw new IndexException(index, size);
- }
- return getQuick(index);
- }
-
- @Override
- public Element getElement(int index) {
- return new LocalElement(index);
- }
-
- @Override
- public Vector normalize() {
- return divide(Math.sqrt(getLengthSquared()));
- }
-
- @Override
- public Vector normalize(double power) {
- return divide(norm(power));
- }
-
- @Override
- public Vector logNormalize() {
- return logNormalize(2.0, Math.sqrt(getLengthSquared()));
- }
-
- @Override
- public Vector logNormalize(double power) {
- return logNormalize(power, norm(power));
- }
-
- public Vector logNormalize(double power, double normLength) {
- // we can special case certain powers
- if (Double.isInfinite(power) || power <= 1.0) {
- throw new IllegalArgumentException("Power must be > 1 and < infinity");
- } else {
- double denominator = normLength * Math.log(power);
- Vector result = createOptimizedCopy();
- for (Element element : result.nonZeroes()) {
- element.set(Math.log1p(element.get()) / denominator);
- }
- return result;
- }
- }
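
Each non-zero x is mapped to log1p(x) / (normLength * ln(power)), that is, log base power of (1 + x) scaled by the chosen norm, which is why power must lie strictly between 1 and infinity. A tiny numeric check of that formula (plain Java, illustrative only):

    public class LogNormalizeSketch {
      public static void main(String[] args) {
        double power = 2.0, normLength = 1.0, x = 1.0;
        // Same expression as the loop body above.
        double damped = Math.log1p(x) / (normLength * Math.log(power));
        System.out.println(damped); // ln(2) / ln(2) = 1.0
      }
    }
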
-
- @Override
- public double norm(double power) {
- if (power < 0.0) {
- throw new IllegalArgumentException("Power must be >= 0");
- }
- // We can special case certain powers.
- if (Double.isInfinite(power)) {
- return aggregate(Functions.MAX, Functions.ABS);
- } else if (power == 2.0) {
- return Math.sqrt(getLengthSquared());
- } else if (power == 1.0) {
- double result = 0.0;
- Iterator<Element> iterator = this.iterateNonZero();
- while (iterator.hasNext()) {
- result += Math.abs(iterator.next().get());
- }
- return result;
- // TODO: this should ideally be used, but it's slower.
- // return aggregate(Functions.PLUS, Functions.ABS);
- } else if (power == 0.0) {
- return getNumNonZeroElements();
- } else {
- return Math.pow(aggregate(Functions.PLUS, Functions.pow(power)), 1.0 / power);
- }
- }
-
- @Override
- public double getLengthSquared() {
- if (lengthSquared >= 0.0) {
- return lengthSquared;
- }
- return lengthSquared = dotSelf();
- }
-
- @Override
- public void invalidateCachedLength() {
- lengthSquared = -1;
- }
-
- @Override
- public double getDistanceSquared(Vector that) {
- if (size != that.size()) {
- throw new CardinalityException(size, that.size());
- }
- double thisLength = getLengthSquared();
- double thatLength = that.getLengthSquared();
- double dot = dot(that);
- double distanceEstimate = thisLength + thatLength - 2 * dot;
- if (distanceEstimate > 1.0e-3 * (thisLength + thatLength)) {
- // The vectors are far enough from each other that the formula is accurate.
- return Math.max(distanceEstimate, 0);
- } else {
- return aggregate(that, Functions.PLUS, Functions.MINUS_SQUARED);
- }
- }
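
The two branches trade speed for accuracy: ||x - y||^2 = ||x||^2 + ||y||^2 - 2*x.y needs only one dot product when the squared lengths are cached, but it cancels catastrophically when x and y are nearly equal, so the code falls back to summing (x_i - y_i)^2 directly in that regime. A scalar demonstration of the cancellation (plain Java, illustrative only):

    public class CancellationSketch {
      public static void main(String[] args) {
        double x = 1.0e8;
        double y = 1.0e8 + 1.0e-4;
        // Exact squared distance is 1.0e-8.
        double viaFormula = x * x + y * y - 2 * x * y; // cancellation: the answer is lost
        double direct = (x - y) * (x - y);             // accurate: ~1.0e-8
        System.out.println(viaFormula + " vs " + direct);
      }
    }
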
-
- @Override
- public double maxValue() {
- if (size == 0) {
- return Double.NEGATIVE_INFINITY;
- }
- return aggregate(Functions.MAX, Functions.IDENTITY);
- }
-
- @Override
- public int maxValueIndex() {
- int result = -1;
- double max = Double.NEGATIVE_INFINITY;
- int nonZeroElements = 0;
- Iterator<Element> iter = this.iterateNonZero();
- while (iter.hasNext()) {
- nonZeroElements++;
- Element element = iter.next();
- double tmp = element.get();
- if (tmp > max) {
- max = tmp;
- result = element.index();
- }
- }
-    // if the maximum of the non-zero elements is negative and the vector is sparse, then any
-    // unfilled element (0.0) is the maxValue, hence we need to
-    // find one of those elements
- if (nonZeroElements < size && max < 0.0) {
- for (Element element : all()) {
- if (element.get() == 0.0) {
- return element.index();
- }
- }
- }
- return result;
- }
-
- @Override
- public double minValue() {
- if (size == 0) {
- return Double.POSITIVE_INFINITY;
- }
- return aggregate(Functions.MIN, Functions.IDENTITY);
- }
-
- @Override
- public int minValueIndex() {
- int result = -1;
- double min = Double.POSITIVE_INFINITY;
- int nonZeroElements = 0;
- Iterator<Element> iter = this.iterateNonZero();
- while (iter.hasNext()) {
- nonZeroElements++;
- Element element = iter.next();
- double tmp = element.get();
- if (tmp < min) {
- min = tmp;
- result = element.index();
- }
- }
-    // if the minimum of the non-zero elements is positive and the vector is sparse, then any
-    // unfilled element (0.0) is the minValue, hence we need to
-    // find one of those elements
- if (nonZeroElements < size && min > 0.0) {
- for (Element element : all()) {
- if (element.get() == 0.0) {
- return element.index();
- }
- }
- }
- return result;
- }
-
- @Override
- public Vector plus(double x) {
- Vector result = createOptimizedCopy();
- if (x == 0.0) {
- return result;
- }
- return result.assign(Functions.plus(x));
- }
-
- @Override
- public Vector plus(Vector that) {
- if (size != that.size()) {
- throw new CardinalityException(size, that.size());
- }
- return createOptimizedCopy().assign(that, Functions.PLUS);
- }
-
- @Override
- public Vector minus(Vector that) {
- if (size != that.size()) {
- throw new CardinalityException(size, that.size());
- }
- return createOptimizedCopy().assign(that, Functions.MINUS);
- }
-
- @Override
- public void set(int index, double value) {
- if (index < 0 || index >= size) {
- throw new IndexException(index, size);
- }
- setQuick(index, value);
- }
-
- @Override
- public void incrementQuick(int index, double increment) {
- setQuick(index, getQuick(index) + increment);
- }
-
- @Override
- public Vector times(double x) {
- if (x == 0.0) {
- return like();
- }
- return createOptimizedCopy().assign(Functions.mult(x));
- }
-
- /**
-   * Copies the current vector in the most efficient fashion available. Used by immutable methods like plus(), minus().
-   * Use this instead of vector.like().assign(vector). A subclass can choose to override this method.
- *
- * @return a copy of the current vector.
- */
- protected Vector createOptimizedCopy() {
- return createOptimizedCopy(this);
- }
-
- private static Vector createOptimizedCopy(Vector vector) {
- Vector result;
- if (vector.isDense()) {
- result = vector.like().assign(vector, Functions.SECOND_LEFT_ZERO);
- } else {
- result = vector.clone();
- }
- return result;
- }
-
- @Override
- public Vector times(Vector that) {
- if (size != that.size()) {
- throw new CardinalityException(size, that.size());
- }
-
- if (this.getNumNondefaultElements() <= that.getNumNondefaultElements()) {
- return createOptimizedCopy(this).assign(that, Functions.MULT);
- } else {
- return createOptimizedCopy(that).assign(this, Functions.MULT);
- }
- }
-
- @Override
- public double zSum() {
- return aggregate(Functions.PLUS, Functions.IDENTITY);
- }
-
- @Override
- public int getNumNonZeroElements() {
- int count = 0;
- Iterator<Element> it = iterateNonZero();
- while (it.hasNext()) {
- if (it.next().get() != 0.0) {
- count++;
- }
- }
- return count;
- }
-
- @Override
- public Vector assign(double value) {
- Iterator<Element> it;
- if (value == 0.0) {
- // Make all the non-zero values 0.
- it = iterateNonZero();
- while (it.hasNext()) {
- it.next().set(value);
- }
- } else {
- if (isSequentialAccess() && !isAddConstantTime()) {
-        // Update all the non-zero values and queue the updates for the zero values.
- // The vector will become dense.
- it = iterator();
- OrderedIntDoubleMapping updates = new OrderedIntDoubleMapping();
- while (it.hasNext()) {
- Element element = it.next();
- if (element.get() == 0.0) {
- updates.set(element.index(), value);
- } else {
- element.set(value);
- }
- }
- mergeUpdates(updates);
- } else {
- for (int i = 0; i < size; ++i) {
- setQuick(i, value);
- }
- }
- }
- invalidateCachedLength();
- return this;
- }
-
- @Override
- public Vector assign(double[] values) {
- if (size != values.length) {
- throw new CardinalityException(size, values.length);
- }
- if (isSequentialAccess() && !isAddConstantTime()) {
- OrderedIntDoubleMapping updates = new OrderedIntDoubleMapping();
- Iterator<Element> it = iterator();
- while (it.hasNext()) {
- Element element = it.next();
- int index = element.index();
- if (element.get() == 0.0) {
- updates.set(index, values[index]);
- } else {
- element.set(values[index]);
- }
- }
- mergeUpdates(updates);
- } else {
- for (int i = 0; i < size; ++i) {
- setQuick(i, values[i]);
- }
- }
- invalidateCachedLength();
- return this;
- }
-
- @Override
- public Vector assign(Vector other) {
- return assign(other, Functions.SECOND);
- }
-
- @Override
- public Vector assign(DoubleDoubleFunction f, double y) {
- Iterator<Element> iterator = f.apply(0, y) == 0 ? iterateNonZero() : iterator();
- while (iterator.hasNext()) {
- Element element = iterator.next();
- element.set(f.apply(element.get(), y));
- }
- invalidateCachedLength();
- return this;
- }
-
- @Override
- public Vector assign(DoubleFunction f) {
- Iterator<Element> iterator = !f.isDensifying() ? iterateNonZero() : iterator();
- while (iterator.hasNext()) {
- Element element = iterator.next();
- element.set(f.apply(element.get()));
- }
- invalidateCachedLength();
- return this;
- }
-
- @Override
- public Vector assign(Vector other, DoubleDoubleFunction function) {
- if (size != other.size()) {
- throw new CardinalityException(size, other.size());
- }
- VectorBinaryAssign.assignBest(this, other, function);
- invalidateCachedLength();
- return this;
- }
-
- @Override
- public Matrix cross(Vector other) {
- Matrix result = matrixLike(size, other.size());
- Iterator<Vector.Element> it = iterateNonZero();
- while (it.hasNext()) {
- Vector.Element e = it.next();
- int row = e.index();
- result.assignRow(row, other.times(getQuick(row)));
- }
- return result;
- }
-
- @Override
- public final int size() {
- return size;
- }
-
- @Override
- public String asFormatString() {
- return toString();
- }
-
- @Override
- public int hashCode() {
- int result = size;
- Iterator<Element> iter = iterateNonZero();
- while (iter.hasNext()) {
- Element ele = iter.next();
- result += ele.index() * RandomUtils.hashDouble(ele.get());
- }
- return result;
- }
-
- /**
- * Determines whether this {@link Vector} represents the same logical vector as another
- * object. Two {@link Vector}s are equal (regardless of implementation) if the value at
- * each index is the same, and the cardinalities are the same.
- */
- @Override
- public boolean equals(Object o) {
- if (this == o) {
- return true;
- }
- if (!(o instanceof Vector)) {
- return false;
- }
- Vector that = (Vector) o;
- return size == that.size() && aggregate(that, Functions.PLUS, Functions.MINUS_ABS) == 0.0;
- }
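
Equality is structural: aggregate(that, PLUS, MINUS_ABS) sums |x_i - y_i| over all coordinates, so two vectors are equal exactly when the sizes match and every entry matches, regardless of the backing implementation. A hedged sketch, assuming RandomAccessSparseVector is the usual Mahout sparse vector type:

    import org.apache.mahout.math.DenseVector;
    import org.apache.mahout.math.RandomAccessSparseVector;
    import org.apache.mahout.math.Vector;

    public class VectorEqualsSketch {
      public static void main(String[] args) {
        Vector dense = new DenseVector(new double[]{0, 5, 0});
        Vector sparse = new RandomAccessSparseVector(3);
        sparse.setQuick(1, 5);
        System.out.println(dense.equals(sparse)); // true: same size, same values
      }
    }
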
-
- @Override
- public String toString() {
- return toString(null);
- }
-
- public String toString(String[] dictionary) {
- StringBuilder result = new StringBuilder();
- result.append('{');
- for (int index = 0; index < size; index++) {
- double value = getQuick(index);
- if (value != 0.0) {
- result.append(dictionary != null && dictionary.length > index ? dictionary[index] : index);
- result.append(':');
- result.append(value);
- result.append(',');
- }
- }
- if (result.length() > 1) {
- result.setCharAt(result.length() - 1, '}');
- } else {
- result.append('}');
- }
- return result.toString();
- }
-
- /**
- * toString() implementation for sparse vectors via {@link #nonZeroes()} method
- * @return String representation of the vector
- */
- public String sparseVectorToString() {
- Iterator<Element> it = iterateNonZero();
- if (!it.hasNext()) {
- return "{}";
- }
- else {
- StringBuilder result = new StringBuilder();
- result.append('{');
- while (it.hasNext()) {
- Vector.Element e = it.next();
- result.append(e.index());
- result.append(':');
- result.append(e.get());
- result.append(',');
- }
- result.setCharAt(result.length() - 1, '}');
- return result.toString();
- }
- }
-
- protected final class LocalElement implements Element {
- int index;
-
- LocalElement(int index) {
- this.index = index;
- }
-
- @Override
- public double get() {
- return getQuick(index);
- }
-
- @Override
- public int index() {
- return index;
- }
-
- @Override
- public void set(double value) {
- setQuick(index, value);
- }
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/Algebra.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/Algebra.java b/math/src/main/java/org/apache/mahout/math/Algebra.java
deleted file mode 100644
index 3049057..0000000
--- a/math/src/main/java/org/apache/mahout/math/Algebra.java
+++ /dev/null
@@ -1,73 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math;
-
-public final class Algebra {
-
- private Algebra() {
- }
-
-  public static Vector mult(Matrix m, Vector v) {
-    // m is numRows x numCols, so v must have numCols entries for m * v to be defined.
-    if (m.numCols() != v.size()) {
-      throw new CardinalityException(m.numCols(), v.size());
-    }
-    // Use a dense vector for the moment.
-    Vector result = new DenseVector(m.numRows());
-
- for (int i = 0; i < m.numRows(); i++) {
- result.set(i, m.viewRow(i).dot(v));
- }
-
- return result;
- }
-
- /** Returns sqrt(a^2 + b^2) without under/overflow. */
- public static double hypot(double a, double b) {
- double r;
- if (Math.abs(a) > Math.abs(b)) {
- r = b / a;
- r = Math.abs(a) * Math.sqrt(1 + r * r);
- } else if (b != 0) {
- r = a / b;
- r = Math.abs(b) * Math.sqrt(1 + r * r);
- } else {
- r = 0.0;
- }
- return r;
- }
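
Rewriting sqrt(a^2 + b^2) as |a| * sqrt(1 + (b/a)^2) keeps the squared intermediate at most 1, so nothing large is ever squared. The naive form overflows long before the true result does (plain Java, illustrative only):

    public class HypotSketch {
      public static void main(String[] args) {
        double a = 1.0e200, b = 1.0e200;
        System.out.println(Math.sqrt(a * a + b * b)); // Infinity: a*a overflows
        double r = b / a;                             // the rescaling used above
        System.out.println(Math.abs(a) * Math.sqrt(1 + r * r)); // ~1.414e200
      }
    }
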
-
- /**
- * Compute Maximum Absolute Row Sum Norm of input Matrix m
- * http://mathworld.wolfram.com/MaximumAbsoluteRowSumNorm.html
- */
- public static double getNorm(Matrix m) {
- double max = 0.0;
- for (int i = 0; i < m.numRows(); i++) {
-      double sum = 0.0;
-      Vector cv = m.viewRow(i);
-      for (int j = 0; j < cv.size(); j++) {
-        sum += Math.abs(cv.getQuick(j));
- }
- if (sum > max) {
- max = sum;
- }
- }
- return max;
- }
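
This is the matrix infinity norm: the maximum over rows i of sum_j |a_ij|. A quick hedged check, assuming the DenseMatrix(double[][]) constructor:

    import org.apache.mahout.math.Algebra;
    import org.apache.mahout.math.DenseMatrix;
    import org.apache.mahout.math.Matrix;

    public class RowSumNormSketch {
      public static void main(String[] args) {
        Matrix m = new DenseMatrix(new double[][]{{1, -2}, {3, 4}});
        // Absolute row sums: |1| + |-2| = 3 and |3| + |4| = 7, so the norm is 7.
        System.out.println(Algebra.getNorm(m)); // 7.0
      }
    }
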
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/Arrays.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/Arrays.java b/math/src/main/java/org/apache/mahout/math/Arrays.java
deleted file mode 100644
index 802ffb7..0000000
--- a/math/src/main/java/org/apache/mahout/math/Arrays.java
+++ /dev/null
@@ -1,662 +0,0 @@
-/*
-Copyright 1999 CERN - European Organization for Nuclear Research.
-Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
-is hereby granted without fee, provided that the above copyright notice appear in all copies and
-that both that copyright notice and this permission notice appear in supporting documentation.
-CERN makes no representations about the suitability of this software for any purpose.
-It is provided "as is" without expressed or implied warranty.
-*/
-package org.apache.mahout.math;
-
-/**
- * Array manipulations; complements <tt>java.util.Arrays</tt>.
- *
- * @see java.util.Arrays
- * @see org.apache.mahout.math.Sorting
- *
- */
-public final class Arrays {
-
- private Arrays() {
- }
-
- /**
- * Ensures that a given array can hold up to <tt>minCapacity</tt> elements.
- *
- * Returns the identical array if it can hold at least the number of elements specified. Otherwise, returns a new
- * array with increased capacity containing the same elements, ensuring that it can hold at least the number of
- * elements specified by the minimum capacity argument.
- *
- * @param minCapacity the desired minimum capacity.
- */
- public static byte[] ensureCapacity(byte[] array, int minCapacity) {
- int oldCapacity = array.length;
- byte[] newArray;
- if (minCapacity > oldCapacity) {
- int newCapacity = (oldCapacity * 3) / 2 + 1;
- if (newCapacity < minCapacity) {
- newCapacity = minCapacity;
- }
-
- newArray = new byte[newCapacity];
- System.arraycopy(array, 0, newArray, 0, oldCapacity);
- } else {
- newArray = array;
- }
- return newArray;
- }
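
Growth is geometric: an undersized array is replaced by one of length max(minCapacity, 3*old/2 + 1), which amortizes a sequence of appends to constant time per element. The remaining overloads below apply the same rule to each primitive type, so one sketch of the growth schedule covers the whole family (plain Java, illustrative only):

    public class GrowthSketch {
      public static void main(String[] args) {
        int capacity = 10;
        for (int i = 0; i < 5; i++) {
          capacity = capacity * 3 / 2 + 1; // same rule as ensureCapacity above
          System.out.println(capacity);    // 16, 25, 38, 58, 88
        }
      }
    }
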
-
- /**
- * Ensures that a given array can hold up to <tt>minCapacity</tt> elements.
- *
- * Returns the identical array if it can hold at least the number of elements specified. Otherwise, returns a new
- * array with increased capacity containing the same elements, ensuring that it can hold at least the number of
- * elements specified by the minimum capacity argument.
- *
- * @param minCapacity the desired minimum capacity.
- */
- public static char[] ensureCapacity(char[] array, int minCapacity) {
- int oldCapacity = array.length;
- char[] newArray;
- if (minCapacity > oldCapacity) {
- int newCapacity = (oldCapacity * 3) / 2 + 1;
- if (newCapacity < minCapacity) {
- newCapacity = minCapacity;
- }
-
- newArray = new char[newCapacity];
- System.arraycopy(array, 0, newArray, 0, oldCapacity);
- } else {
- newArray = array;
- }
- return newArray;
- }
-
- /**
- * Ensures that a given array can hold up to <tt>minCapacity</tt> elements.
- *
- * Returns the identical array if it can hold at least the number of elements specified. Otherwise, returns a new
- * array with increased capacity containing the same elements, ensuring that it can hold at least the number of
- * elements specified by the minimum capacity argument.
- *
- * @param minCapacity the desired minimum capacity.
- */
- public static double[] ensureCapacity(double[] array, int minCapacity) {
- int oldCapacity = array.length;
- double[] newArray;
- if (minCapacity > oldCapacity) {
- int newCapacity = (oldCapacity * 3) / 2 + 1;
- if (newCapacity < minCapacity) {
- newCapacity = minCapacity;
- }
-
- newArray = new double[newCapacity];
- //for (int i = oldCapacity; --i >= 0; ) newArray[i] = array[i];
- System.arraycopy(array, 0, newArray, 0, oldCapacity);
- } else {
- newArray = array;
- }
- return newArray;
- }
-
- /**
- * Ensures that a given array can hold up to <tt>minCapacity</tt> elements.
- *
- * Returns the identical array if it can hold at least the number of elements specified. Otherwise, returns a new
- * array with increased capacity containing the same elements, ensuring that it can hold at least the number of
- * elements specified by the minimum capacity argument.
- *
- * @param minCapacity the desired minimum capacity.
- */
- public static float[] ensureCapacity(float[] array, int minCapacity) {
- int oldCapacity = array.length;
- float[] newArray;
- if (minCapacity > oldCapacity) {
- int newCapacity = (oldCapacity * 3) / 2 + 1;
- if (newCapacity < minCapacity) {
- newCapacity = minCapacity;
- }
-
- newArray = new float[newCapacity];
- System.arraycopy(array, 0, newArray, 0, oldCapacity);
- } else {
- newArray = array;
- }
- return newArray;
- }
-
- /**
- * Ensures that a given array can hold up to <tt>minCapacity</tt> elements.
- *
- * Returns the identical array if it can hold at least the number of elements specified. Otherwise, returns a new
- * array with increased capacity containing the same elements, ensuring that it can hold at least the number of
- * elements specified by the minimum capacity argument.
- *
- * @param minCapacity the desired minimum capacity.
- */
- public static int[] ensureCapacity(int[] array, int minCapacity) {
- int oldCapacity = array.length;
- int[] newArray;
- if (minCapacity > oldCapacity) {
- int newCapacity = (oldCapacity * 3) / 2 + 1;
- if (newCapacity < minCapacity) {
- newCapacity = minCapacity;
- }
-
- newArray = new int[newCapacity];
- System.arraycopy(array, 0, newArray, 0, oldCapacity);
- } else {
- newArray = array;
- }
- return newArray;
- }
-
- /**
- * Ensures that a given array can hold up to <tt>minCapacity</tt> elements.
- *
- * Returns the identical array if it can hold at least the number of elements specified. Otherwise, returns a new
- * array with increased capacity containing the same elements, ensuring that it can hold at least the number of
- * elements specified by the minimum capacity argument.
- *
- * @param minCapacity the desired minimum capacity.
- */
- public static long[] ensureCapacity(long[] array, int minCapacity) {
- int oldCapacity = array.length;
- long[] newArray;
- if (minCapacity > oldCapacity) {
- int newCapacity = (oldCapacity * 3) / 2 + 1;
- if (newCapacity < minCapacity) {
- newCapacity = minCapacity;
- }
-
- newArray = new long[newCapacity];
- System.arraycopy(array, 0, newArray, 0, oldCapacity);
- } else {
- newArray = array;
- }
- return newArray;
- }
-
- /**
- * Ensures that a given array can hold up to <tt>minCapacity</tt> elements.
- *
- * Returns the identical array if it can hold at least the number of elements specified. Otherwise, returns a new
- * array with increased capacity containing the same elements, ensuring that it can hold at least the number of
- * elements specified by the minimum capacity argument.
- *
- * @param minCapacity the desired minimum capacity.
- */
- public static Object[] ensureCapacity(Object[] array, int minCapacity) {
- int oldCapacity = array.length;
- Object[] newArray;
- if (minCapacity > oldCapacity) {
- int newCapacity = (oldCapacity * 3) / 2 + 1;
- if (newCapacity < minCapacity) {
- newCapacity = minCapacity;
- }
-
- newArray = new Object[newCapacity];
- System.arraycopy(array, 0, newArray, 0, oldCapacity);
- } else {
- newArray = array;
- }
- return newArray;
- }
-
- /**
- * Ensures that a given array can hold up to <tt>minCapacity</tt> elements.
- *
- * Returns the identical array if it can hold at least the number of elements specified. Otherwise, returns a new
- * array with increased capacity containing the same elements, ensuring that it can hold at least the number of
- * elements specified by the minimum capacity argument.
- *
- * @param minCapacity the desired minimum capacity.
- */
- public static short[] ensureCapacity(short[] array, int minCapacity) {
- int oldCapacity = array.length;
- short[] newArray;
- if (minCapacity > oldCapacity) {
- int newCapacity = (oldCapacity * 3) / 2 + 1;
- if (newCapacity < minCapacity) {
- newCapacity = minCapacity;
- }
-
- newArray = new short[newCapacity];
- System.arraycopy(array, 0, newArray, 0, oldCapacity);
- } else {
- newArray = array;
- }
- return newArray;
- }
-
- /**
- * Ensures that a given array can hold up to <tt>minCapacity</tt> elements.
- *
- * Returns the identical array if it can hold at least the number of elements specified. Otherwise, returns a new
- * array with increased capacity containing the same elements, ensuring that it can hold at least the number of
- * elements specified by the minimum capacity argument.
- *
- * @param minCapacity the desired minimum capacity.
- */
- public static boolean[] ensureCapacity(boolean[] array, int minCapacity) {
- int oldCapacity = array.length;
- boolean[] newArray;
- if (minCapacity > oldCapacity) {
- int newCapacity = (oldCapacity * 3) / 2 + 1;
- if (newCapacity < minCapacity) {
- newCapacity = minCapacity;
- }
-
- newArray = new boolean[newCapacity];
- System.arraycopy(array, 0, newArray, 0, oldCapacity);
- } else {
- newArray = array;
- }
- return newArray;
- }
-
- /**
- * Returns a string representation of the specified array. The string representation consists of a list of the
-   * array's elements, enclosed in square brackets (<tt>"[]"</tt>). Adjacent elements are separated by the characters
- * <tt>", "</tt> (comma and space).
- *
- * @return a string representation of the specified array.
- */
- public static String toString(byte[] array) {
- StringBuilder buf = new StringBuilder();
- buf.append('[');
- int maxIndex = array.length - 1;
- for (int i = 0; i <= maxIndex; i++) {
- buf.append(array[i]);
- if (i < maxIndex) {
- buf.append(", ");
- }
- }
- buf.append(']');
- return buf.toString();
- }
-
- /**
- * Returns a string representation of the specified array. The string representation consists of a list of the
-   * array's elements, enclosed in square brackets (<tt>"[]"</tt>). Adjacent elements are separated by the characters
- * <tt>", "</tt> (comma and space).
- *
- * @return a string representation of the specified array.
- */
- public static String toString(char[] array) {
- StringBuilder buf = new StringBuilder();
- buf.append('[');
- int maxIndex = array.length - 1;
- for (int i = 0; i <= maxIndex; i++) {
- buf.append(array[i]);
- if (i < maxIndex) {
- buf.append(", ");
- }
- }
- buf.append(']');
- return buf.toString();
- }
-
- /**
- * Returns a string representation of the specified array. The string representation consists of a list of the
-   * array's elements, enclosed in square brackets (<tt>"[]"</tt>). Adjacent elements are separated by the characters
- * <tt>", "</tt> (comma and space).
- *
- * @return a string representation of the specified array.
- */
- public static String toString(double[] array) {
- StringBuilder buf = new StringBuilder();
- buf.append('[');
- int maxIndex = array.length - 1;
- for (int i = 0; i <= maxIndex; i++) {
- buf.append(array[i]);
- if (i < maxIndex) {
- buf.append(", ");
- }
- }
- buf.append(']');
- return buf.toString();
- }
-
- /**
- * Returns a string representation of the specified array. The string representation consists of a list of the
-   * array's elements, enclosed in square brackets (<tt>"[]"</tt>). Adjacent elements are separated by the characters
- * <tt>", "</tt> (comma and space).
- *
- * @return a string representation of the specified array.
- */
- public static String toString(float[] array) {
- StringBuilder buf = new StringBuilder();
- buf.append('[');
- int maxIndex = array.length - 1;
- for (int i = 0; i <= maxIndex; i++) {
- buf.append(array[i]);
- if (i < maxIndex) {
- buf.append(", ");
- }
- }
- buf.append(']');
- return buf.toString();
- }
-
- /**
- * Returns a string representation of the specified array. The string representation consists of a list of the
-   * array's elements, enclosed in square brackets (<tt>"[]"</tt>). Adjacent elements are separated by the characters
- * <tt>", "</tt> (comma and space).
- *
- * @return a string representation of the specified array.
- */
- public static String toString(int[] array) {
- StringBuilder buf = new StringBuilder();
- buf.append('[');
- int maxIndex = array.length - 1;
- for (int i = 0; i <= maxIndex; i++) {
- buf.append(array[i]);
- if (i < maxIndex) {
- buf.append(", ");
- }
- }
- buf.append(']');
- return buf.toString();
- }
-
- /**
- * Returns a string representation of the specified array. The string representation consists of a list of the
- * array's elements, enclosed in square brackets (<tt>"[]"</tt>). Adjacent elements are separated by the characters
- * <tt>", "</tt> (comma and space).
- *
- * @return a string representation of the specified array.
- */
- public static String toString(long[] array) {
- StringBuilder buf = new StringBuilder();
- buf.append('[');
- int maxIndex = array.length - 1;
- for (int i = 0; i <= maxIndex; i++) {
- buf.append(array[i]);
- if (i < maxIndex) {
- buf.append(", ");
- }
- }
- buf.append(']');
- return buf.toString();
- }
-
- /**
- * Returns a string representation of the specified array. The string representation consists of a list of the
- * array's elements, enclosed in square brackets (<tt>"[]"</tt>). Adjacent elements are separated by the characters
- * <tt>", "</tt> (comma and space).
- *
- * @return a string representation of the specified array.
- */
- public static String toString(Object[] array) {
- StringBuilder buf = new StringBuilder();
- buf.append('[');
- int maxIndex = array.length - 1;
- for (int i = 0; i <= maxIndex; i++) {
- buf.append(array[i]);
- if (i < maxIndex) {
- buf.append(", ");
- }
- }
- buf.append(']');
- return buf.toString();
- }
-
- /**
- * Returns a string representation of the specified array. The string representation consists of a list of the
- * array's elements, enclosed in square brackets (<tt>"[]"</tt>). Adjacent elements are separated by the characters
- * <tt>", "</tt> (comma and space).
- *
- * @return a string representation of the specified array.
- */
- public static String toString(short[] array) {
- StringBuilder buf = new StringBuilder();
- buf.append('[');
- int maxIndex = array.length - 1;
- for (int i = 0; i <= maxIndex; i++) {
- buf.append(array[i]);
- if (i < maxIndex) {
- buf.append(", ");
- }
- }
- buf.append(']');
- return buf.toString();
- }
-
- /**
- * Returns a string representation of the specified array. The string representation consists of a list of the
- * array's elements, enclosed in square brackets (<tt>"[]"</tt>). Adjacent elements are separated by the characters
- * <tt>", "</tt> (comma and space).
- *
- * @return a string representation of the specified array.
- */
- public static String toString(boolean[] array) {
- StringBuilder buf = new StringBuilder();
- buf.append('[');
- int maxIndex = array.length - 1;
- for (int i = 0; i <= maxIndex; i++) {
- buf.append(array[i]);
- if (i < maxIndex) {
- buf.append(", ");
- }
- }
- buf.append(']');
- return buf.toString();
- }
-
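For reference, all of these overloads produce the same bracketed, comma-separated format; a minimal sketch, assuming a static import of the int overload above:

    int[] a = {1, 2, 3};
    String s = toString(a);          // "[1, 2, 3]"
    String e = toString(new int[0]); // "[]"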
- /**
- * Ensures that the specified array cannot hold more than <tt>maxCapacity</tt> elements. An application can use this
- * operation to minimize array storage. <p> Returns the identical array if <tt>array.length &lt;= maxCapacity</tt>.
- * Otherwise, returns a new array with a length of <tt>maxCapacity</tt> containing the first <tt>maxCapacity</tt>
- * elements of <tt>array</tt>.
- *
- * @param maxCapacity the desired maximum capacity.
- */
- public static byte[] trimToCapacity(byte[] array, int maxCapacity) {
- if (array.length > maxCapacity) {
- byte[] oldArray = array;
- array = new byte[maxCapacity];
- System.arraycopy(oldArray, 0, array, 0, maxCapacity);
- }
- return array;
- }
-
- /**
- * Ensures that the specified array cannot hold more than <tt>maxCapacity</tt> elements. An application can use this
- * operation to minimize array storage. <p> Returns the identical array if <tt>array.length &lt;= maxCapacity</tt>.
- * Otherwise, returns a new array with a length of <tt>maxCapacity</tt> containing the first <tt>maxCapacity</tt>
- * elements of <tt>array</tt>.
- *
- * @param maxCapacity the desired maximum capacity.
- */
- public static char[] trimToCapacity(char[] array, int maxCapacity) {
- if (array.length > maxCapacity) {
- char[] oldArray = array;
- array = new char[maxCapacity];
- System.arraycopy(oldArray, 0, array, 0, maxCapacity);
- }
- return array;
- }
-
- /**
- * Ensures that the specified array cannot hold more than <tt>maxCapacity</tt> elements. An application can use this
- * operation to minimize array storage. <p> Returns the identical array if <tt>array.length &lt;= maxCapacity</tt>.
- * Otherwise, returns a new array with a length of <tt>maxCapacity</tt> containing the first <tt>maxCapacity</tt>
- * elements of <tt>array</tt>.
- *
- * @param maxCapacity the desired maximum capacity.
- */
- public static double[] trimToCapacity(double[] array, int maxCapacity) {
- if (array.length > maxCapacity) {
- double[] oldArray = array;
- array = new double[maxCapacity];
- System.arraycopy(oldArray, 0, array, 0, maxCapacity);
- }
- return array;
- }
-
- /**
- * Ensures that the specified array cannot hold more than <tt>maxCapacity</tt> elements. An application can use this
- * operation to minimize array storage. <p> Returns the identical array if <tt>array.length &lt;= maxCapacity</tt>.
- * Otherwise, returns a new array with a length of <tt>maxCapacity</tt> containing the first <tt>maxCapacity</tt>
- * elements of <tt>array</tt>.
- *
- * @param maxCapacity the desired maximum capacity.
- */
- public static float[] trimToCapacity(float[] array, int maxCapacity) {
- if (array.length > maxCapacity) {
- float[] oldArray = array;
- array = new float[maxCapacity];
- System.arraycopy(oldArray, 0, array, 0, maxCapacity);
- }
- return array;
- }
-
- /**
- * Ensures that the specified array cannot hold more than <tt>maxCapacity</tt> elements. An application can use this
- * operation to minimize array storage. <p> Returns the identical array if <tt>array.length &lt;= maxCapacity</tt>.
- * Otherwise, returns a new array with a length of <tt>maxCapacity</tt> containing the first <tt>maxCapacity</tt>
- * elements of <tt>array</tt>.
- *
- * @param maxCapacity the desired maximum capacity.
- */
- public static int[] trimToCapacity(int[] array, int maxCapacity) {
- if (array.length > maxCapacity) {
- int[] oldArray = array;
- array = new int[maxCapacity];
- System.arraycopy(oldArray, 0, array, 0, maxCapacity);
- }
- return array;
- }
-
- /**
- * Ensures that the specified array cannot hold more than <tt>maxCapacity</tt> elements. An application can use this
- * operation to minimize array storage. <p> Returns the identical array if <tt>array.length &lt;= maxCapacity</tt>.
- * Otherwise, returns a new array with a length of <tt>maxCapacity</tt> containing the first <tt>maxCapacity</tt>
- * elements of <tt>array</tt>.
- *
- * @param maxCapacity the desired maximum capacity.
- */
- public static long[] trimToCapacity(long[] array, int maxCapacity) {
- if (array.length > maxCapacity) {
- long[] oldArray = array;
- array = new long[maxCapacity];
- System.arraycopy(oldArray, 0, array, 0, maxCapacity);
- }
- return array;
- }
-
- /**
- * Ensures that the specified array cannot hold more than <tt>maxCapacity</tt> elements. An application can use this
- * operation to minimize array storage. <p> Returns the identical array if <tt>array.length &lt;= maxCapacity</tt>.
- * Otherwise, returns a new array with a length of <tt>maxCapacity</tt> containing the first <tt>maxCapacity</tt>
- * elements of <tt>array</tt>.
- *
- * @param maxCapacity the desired maximum capacity.
- */
- public static Object[] trimToCapacity(Object[] array, int maxCapacity) {
- if (array.length > maxCapacity) {
- Object[] oldArray = array;
- array = new Object[maxCapacity];
- System.arraycopy(oldArray, 0, array, 0, maxCapacity);
- }
- return array;
- }
-
- /**
- * Ensures that the specified array cannot hold more than <tt>maxCapacity</tt> elements. An application can use this
- * operation to minimize array storage. <p> Returns the identical array if <tt>array.length &lt;= maxCapacity</tt>.
- * Otherwise, returns a new array with a length of <tt>maxCapacity</tt> containing the first <tt>maxCapacity</tt>
- * elements of <tt>array</tt>.
- *
- * @param maxCapacity the desired maximum capacity.
- */
- public static short[] trimToCapacity(short[] array, int maxCapacity) {
- if (array.length > maxCapacity) {
- short[] oldArray = array;
- array = new short[maxCapacity];
- System.arraycopy(oldArray, 0, array, 0, maxCapacity);
- }
- return array;
- }
-
- /**
- * Ensures that the specified array cannot hold more than <tt>maxCapacity</tt> elements. An application can use this
- * operation to minimize array storage. <p> Returns the identical array if <tt>array.length &lt;= maxCapacity</tt>.
- * Otherwise, returns a new array with a length of <tt>maxCapacity</tt> containing the first <tt>maxCapacity</tt>
- * elements of <tt>array</tt>.
- *
- * @param maxCapacity the desired maximum capacity.
- */
- public static boolean[] trimToCapacity(boolean[] array, int maxCapacity) {
- if (array.length > maxCapacity) {
- boolean[] oldArray = array;
- array = new boolean[maxCapacity];
- System.arraycopy(oldArray, 0, array, 0, maxCapacity);
- }
- return array;
- }
-
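A short usage sketch of the trim semantics above, assuming a static import of the int overload: the original array is returned untouched when it already fits, so callers must use the return value rather than rely on mutation.

    int[] a = {1, 2, 3, 4, 5};
    int[] b = trimToCapacity(a, 3);  // new array {1, 2, 3}
    int[] c = trimToCapacity(a, 10); // same reference as a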
- /**
- * {@link java.util.Arrays#copyOf} compatibility with Java 1.5.
- */
- public static byte[] copyOf(byte[] src, int length) {
- byte[] result = new byte[length];
- System.arraycopy(src, 0, result, 0, Math.min(length, src.length));
- return result;
- }
-
- /**
- * {@link java.util.Arrays#copyOf} compatibility with Java 1.5.
- */
- public static char[] copyOf(char[] src, int length) {
- char[] result = new char[length];
- System.arraycopy(src, 0, result, 0, Math.min(length, src.length));
- return result;
- }
-
- /**
- * {@link java.util.Arrays#copyOf} compatibility with Java 1.5.
- */
- public static short[] copyOf(short[] src, int length) {
- short[] result = new short[length];
- System.arraycopy(src, 0, result, 0, Math.min(length, src.length));
- return result;
- }
-
- /**
- * {@link java.util.Arrays#copyOf} compatibility with Java 1.5.
- */
- public static int[] copyOf(int[] src, int length) {
- int[] result = new int[length];
- System.arraycopy(src, 0, result, 0, Math.min(length, src.length));
- return result;
- }
-
- /**
- * {@link java.util.Arrays#copyOf} compatibility with Java 1.5.
- */
- public static float[] copyOf(float[] src, int length) {
- float[] result = new float[length];
- System.arraycopy(src, 0, result, 0, Math.min(length, src.length));
- return result;
- }
-
- /**
- * {@link java.util.Arrays#copyOf} compatibility with Java 1.5.
- */
- public static double[] copyOf(double[] src, int length) {
- double[] result = new double[length];
- System.arraycopy(src, 0, result, 0, Math.min(length, src.length));
- return result;
- }
-
- /**
- * {@link java.util.Arrays#copyOf} compatibility with Java 1.5.
- */
- public static long[] copyOf(long[] src, int length) {
- long[] result = new long[length];
- System.arraycopy(src, 0, result, 0, Math.min(length, src.length));
- return result;
- }
-}
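These copyOf helpers behave like java.util.Arrays.copyOf on Java 6+: the result is truncated or zero-padded to the requested length. A sketch, assuming a static import of the int overload above:

    int[] src = {1, 2, 3};
    int[] grown = copyOf(src, 5); // {1, 2, 3, 0, 0}
    int[] cut = copyOf(src, 2);   // {1, 2}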

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/BinarySearch.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/BinarySearch.java b/math/src/main/java/org/apache/mahout/math/BinarySearch.java
deleted file mode 100644
index ddb04a7..0000000
--- a/math/src/main/java/org/apache/mahout/math/BinarySearch.java
+++ /dev/null
@@ -1,403 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math;
-
-import java.util.Comparator;
-
-public final class BinarySearch {
-
- private BinarySearch() {}
-
- /**
- * Performs a binary search for the specified element in the specified
- * ascending sorted array. Searching in an unsorted array has an undefined
- * result. It's also undefined which element is found if there are multiple
- * occurrences of the same element.
- *
- * @param array
- * the sorted {@code byte} array to search.
- * @param value
- * the {@code byte} element to find.
- * @param from
- * the first index to search, inclusive.
- * @param to
- * the last index to search, inclusive.
- * @return the non-negative index of the element, or a negative index which is
- * {@code -index - 1} where the element would be inserted.
- */
- public static int binarySearchFromTo(byte[] array, byte value, int from, int to) {
- int mid = -1;
- while (from <= to) {
- mid = (from + to) >>> 1;
- if (value > array[mid]) {
- from = mid + 1;
- } else if (value == array[mid]) {
- return mid;
- } else {
- to = mid - 1;
- }
- }
- if (mid < 0) {
- return -1;
- }
-
- return -mid - (value < array[mid] ? 1 : 2);
- }
-
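The negative return value above encodes the insertion point as -(insertionPoint) - 1, so a miss is still useful. A sketch using the byte variant:

    byte[] sorted = {10, 20, 30};
    int r = BinarySearch.binarySearchFromTo(sorted, (byte) 25, 0, sorted.length - 1);
    if (r < 0) {
      int insertAt = -r - 1; // 2: 25 belongs between 20 and 30
    }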
- /**
- * Performs a binary search for the specified element in the specified
- * ascending sorted array. Searching in an unsorted array has an undefined
- * result. It's also undefined which element is found if there are multiple
- * occurrences of the same element.
- *
- * @param array
- * the sorted {@code char} array to search.
- * @param value
- * the {@code char} element to find.
- * @param from
- * the first index to search, inclusive.
- * @param to
- * the last index to search, inclusive.
- * @return the non-negative index of the element, or a negative index which is
- * {@code -index - 1} where the element would be inserted.
- */
- public static int binarySearchFromTo(char[] array, char value, int from, int to) {
- int mid = -1;
- while (from <= to) {
- mid = (from + to) >>> 1;
- if (value > array[mid]) {
- from = mid + 1;
- } else if (value == array[mid]) {
- return mid;
- } else {
- to = mid - 1;
- }
- }
- if (mid < 0) {
- return -1;
- }
- return -mid - (value < array[mid] ? 1 : 2);
- }
-
- /**
- * Performs a binary search for the specified element in the specified
- * ascending sorted array. Searching in an unsorted array has an undefined
- * result. It's also undefined which element is found if there are multiple
- * occurrences of the same element.
- *
- * @param array
- * the sorted {@code double} array to search.
- * @param value
- * the {@code double} element to find.
- * @param from
- * the first index to search, inclusive.
- * @param to
- * the last index to search, inclusive.
- * @return the non-negative index of the element, or a negative index which is
- * {@code -index - 1} where the element would be inserted.
- */
- public static int binarySearchFromTo(double[] array, double value, int from, int to) {
- long longBits = Double.doubleToLongBits(value);
- int mid = -1;
- while (from <= to) {
- mid = (from + to) >>> 1;
- if (lessThan(array[mid], value)) {
- from = mid + 1;
- } else if (longBits == Double.doubleToLongBits(array[mid])) {
- return mid;
- } else {
- to = mid - 1;
- }
- }
- if (mid < 0) {
- return -1;
- }
- return -mid - (lessThan(value, array[mid]) ? 1 : 2);
- }
-
- /**
- * Performs a binary search for the specified element in the specified
- * ascending sorted array. Searching in an unsorted array has an undefined
- * result. It's also undefined which element is found if there are multiple
- * occurrences of the same element.
- *
- * @param array
- * the sorted {@code float} array to search.
- * @param value
- * the {@code float} element to find.
- * @param from
- * the first index to search, inclusive.
- * @param to
- * the last index to search, inclusive.
- * @return the non-negative index of the element, or a negative index which is
- * {@code -index - 1} where the element would be inserted.
- */
- public static int binarySearchFromTo(float[] array, float value, int from, int to) {
- int intBits = Float.floatToIntBits(value);
- int mid = -1;
- while (from <= to) {
- mid = (from + to) >>> 1;
- if (lessThan(array[mid], value)) {
- from = mid + 1;
- } else if (intBits == Float.floatToIntBits(array[mid])) {
- return mid;
- } else {
- to = mid - 1;
- }
- }
- if (mid < 0) {
- return -1;
- }
- return -mid - (lessThan(value, array[mid]) ? 1 : 2);
- }
-
- /**
- * Performs a binary search for the specified element in the specified
- * ascending sorted array. Searching in an unsorted array has an undefined
- * result. It's also undefined which element is found if there are multiple
- * occurrences of the same element.
- *
- * @param array
- * the sorted {@code int} array to search.
- * @param value
- * the {@code int} element to find.
- * @param from
- * the first index to search, inclusive.
- * @param to
- * the last index to search, inclusive.
- * @return the non-negative index of the element, or a negative index which is
- * {@code -index - 1} where the element would be inserted.
- */
- public static int binarySearchFromTo(int[] array, int value, int from, int to) {
- int mid = -1;
- while (from <= to) {
- mid = (from + to) >>> 1;
- if (value > array[mid]) {
- from = mid + 1;
- } else if (value == array[mid]) {
- return mid;
- } else {
- to = mid - 1;
- }
- }
- if (mid < 0) {
- return -1;
- }
- return -mid - (value < array[mid] ? 1 : 2);
- }
-
- /**
- * Performs a binary search for the specified element in the specified
- * ascending sorted array. Searching in an unsorted array has an undefined
- * result. It's also undefined which element is found if there are multiple
- * occurrences of the same element.
- *
- * @param array
- * the sorted {@code long} array to search.
- * @param value
- * the {@code long} element to find.
- * @param from
- * the first index to sort, inclusive.
- * @param to
- * the last index to sort, inclusive.
- * @return the non-negative index of the element, or a negative index which is
- * {@code -index - 1} where the element would be inserted.
- */
- public static int binarySearchFromTo(long[] array, long value, int from, int to) {
- int mid = -1;
- while (from <= to) {
- mid = (from + to) >>> 1;
- if (value > array[mid]) {
- from = mid + 1;
- } else if (value == array[mid]) {
- return mid;
- } else {
- to = mid - 1;
- }
- }
- if (mid < 0) {
- return -1;
- }
- return -mid - (value < array[mid] ? 1 : 2);
- }
-
- /**
- * Performs a binary search for the specified element in the specified
- * ascending sorted array. Searching in an unsorted array has an undefined
- * result. It's also undefined which element is found if there are multiple
- * occurrences of the same element.
- *
- * @param array
- * the sorted {@code Object} array to search.
- * @param object
- * the {@code Object} element to find
- * @param from
- * the first index to search, inclusive.
- * @param to
- * the last index to search, inclusive.
- * @return the non-negative index of the element, or a negative index which is
- * {@code -index - 1} where the element would be inserted.
- *
- */
- public static <T extends Comparable<T>> int binarySearchFromTo(T[] array, T object, int from, int to) {
- if (array.length == 0) {
- return -1;
- }
-
- int mid = 0;
- int result = 0;
- while (from <= to) {
- mid = (from + to) >>> 1;
- if ((result = array[mid].compareTo(object)) < 0) {
- from = mid + 1;
- } else if (result == 0) {
- return mid;
- } else {
- to = mid - 1;
- }
- }
- return -mid - (result >= 0 ? 1 : 2);
- }
-
- /**
- * Performs a binary search for the specified element in the specified
- * ascending sorted array using the {@code Comparator} to compare elements.
- * Searching in an unsorted array has an undefined result. It's also undefined
- * which element is found if there are multiple occurrences of the same
- * element.
- *
- * @param array
- * the sorted array to search
- * @param object
- * the element to find
- * @param from
- * the first index to search, inclusive.
- * @param to
- * the last index to search, inclusive.
- * @param comparator
- * the {@code Comparator} used to compare the elements.
- * @return the non-negative index of the element, or a negative index which is
- *         {@code -index - 1} where the element would be inserted.
- */
- public static <T> int binarySearchFromTo(T[] array, T object, int from, int to, Comparator<? super T> comparator) {
- int mid = 0;
- int result = 0;
- while (from <= to) {
- mid = (from + to) >>> 1;
- if ((result = comparator.compare(array[mid], object)) < 0) {
- from = mid + 1;
- } else if (result == 0) {
- return mid;
- } else {
- to = mid - 1;
- }
- }
- return -mid - (result >= 0 ? 1 : 2);
- }
-
- /**
- * Performs a binary search for the specified element in the specified
- * ascending sorted array. Searching in an unsorted array has an undefined
- * result. It's also undefined which element is found if there are multiple
- * occurrences of the same element.
- *
- * @param array
- * the sorted {@code short} array to search.
- * @param value
- * the {@code short} element to find.
- * @param from
- * the first index to search, inclusive.
- * @param to
- * the last index to search, inclusive.
- * @return the non-negative index of the element, or a negative index which is
- * {@code -index - 1} where the element would be inserted.
- */
- public static int binarySearchFromTo(short[] array, short value, int from, int to) {
- int mid = -1;
- while (from <= to) {
- mid = (from + to) >>> 1;
- if (value > array[mid]) {
- from = mid + 1;
- } else if (value == array[mid]) {
- return mid;
- } else {
- to = mid - 1;
- }
- }
- if (mid < 0) {
- return -1;
- }
- return -mid - (value < array[mid] ? 1 : 2);
- }
-
- private static boolean lessThan(double double1, double double2) {
- // A slightly specialized version of
- // Double.compare(double1, double2) < 0.
-
- // Non-zero and non-NaN checking.
- if (double1 < double2) {
- return true;
- }
- if (double1 > double2) {
- return false;
- }
- if (double1 == double2 && double1 != 0.0) {
- return false;
- }
-
- // NaNs are equal to other NaNs and larger than any other double.
- if (Double.isNaN(double1)) {
- return false;
- }
- if (Double.isNaN(double2)) {
- return true;
- }
-
- // Deal with +0.0 and -0.0.
- long d1 = Double.doubleToRawLongBits(double1);
- long d2 = Double.doubleToRawLongBits(double2);
- return d1 < d2;
- }
-
- private static boolean lessThan(float float1, float float2) {
- // A slightly specialized version of Float.compare(float1, float2) < 0.
-
- // Non-zero and non-NaN checking.
- if (float1 < float2) {
- return true;
- }
- if (float1 > float2) {
- return false;
- }
- if (float1 == float2 && float1 != 0.0f) {
- return false;
- }
-
- // NaNs are equal to other NaNs and larger than any other float
- if (Float.isNaN(float1)) {
- return false;
- }
- if (Float.isNaN(float2)) {
- return true;
- }
-
- // Deal with +0.0 and -0.0
- int f1 = Float.floatToRawIntBits(float1);
- int f2 = Float.floatToRawIntBits(float2);
- return f1 < f2;
- }
-}
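The two lessThan helpers reproduce the total ordering of Double.compare and Float.compare: -0.0 sorts below +0.0, and NaN sorts above everything while being equal to other NaNs. Equivalently:

    System.out.println(Double.compare(-0.0, 0.0) < 0);          // true
    System.out.println(Double.compare(Double.NaN, 1.0) > 0);    // true
    System.out.println(Double.compare(Double.NaN, Double.NaN)); // 0 (equal)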

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/CardinalityException.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/CardinalityException.java b/math/src/main/java/org/apache/mahout/math/CardinalityException.java
deleted file mode 100644
index 04e7602..0000000
--- a/math/src/main/java/org/apache/mahout/math/CardinalityException.java
+++ /dev/null
@@ -1,30 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math;
-
-/**
- * Exception thrown when there is a cardinality mismatch in matrix or vector operations.
- * For example, vectors of differing cardinality cannot be added.
- */
-public class CardinalityException extends IllegalArgumentException {
-
- public CardinalityException(int expected, int cardinality) {
- super("Required cardinality " + expected + " but got " + cardinality);
- }
-
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java-templates/org/apache/mahout/math/map/OpenObjectValueTypeHashMap.java.t
----------------------------------------------------------------------
diff --git a/math/src/main/java-templates/org/apache/mahout/math/map/OpenObjectValueTypeHashMap.java.t b/math/src/main/java-templates/org/apache/mahout/math/map/OpenObjectValueTypeHashMap.java.t
deleted file mode 100644
index 924c7e2..0000000
--- a/math/src/main/java-templates/org/apache/mahout/math/map/OpenObjectValueTypeHashMap.java.t
+++ /dev/null
@@ -1,567 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*
-Copyright © 1999 CERN - European Organization for Nuclear Research.
-Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
-is hereby granted without fee, provided that the above copyright notice appear in all copies and
-that both that copyright notice and this permission notice appear in supporting documentation.
-CERN makes no representations about the suitability of this software for any purpose.
-It is provided "as is" without expressed or implied warranty.
-*/
-package org.apache.mahout.math.map;
-
-import java.util.Arrays;
-import java.util.List;
-
-import org.apache.mahout.math.function.Object${valueTypeCap}Procedure;
-import org.apache.mahout.math.function.ObjectProcedure;
-import org.apache.mahout.math.list.${valueTypeCap}ArrayList;
-
-/**
- * Open hash map from Object keys to ${valueType} values.
- **/
-public class OpenObject${valueTypeCap}HashMap<T> extends AbstractObject${valueTypeCap}Map<T> {
- protected static final byte FREE = 0;
- protected static final byte FULL = 1;
- protected static final byte REMOVED = 2;
- protected static final Object NO_KEY_VALUE = null;
-
- /** The hash table keys. */
- private Object[] table;
-
- /** The hash table values. */
- private ${valueType}[] values;
-
- /** The state of each hash table entry (FREE, FULL, REMOVED). */
- private byte[] state;
-
- /** The number of table entries in state==FREE. */
- private int freeEntries;
-
-
- /** Constructs an empty map with default capacity and default load factors. */
- public OpenObject${valueTypeCap}HashMap() {
- this(DEFAULT_CAPACITY);
- }
-
- /**
- * Constructs an empty map with the specified initial capacity and default load factors.
- *
- * @param initialCapacity the initial capacity of the map.
- * @throws IllegalArgumentException if the initial capacity is less than zero.
- */
- public OpenObject${valueTypeCap}HashMap(int initialCapacity) {
- this(initialCapacity, DEFAULT_MIN_LOAD_FACTOR, DEFAULT_MAX_LOAD_FACTOR);
- }
-
- /**
- * Constructs an empty map with the specified initial capacity and the specified minimum and maximum load factor.
- *
- * @param initialCapacity the initial capacity.
- * @param minLoadFactor the minimum load factor.
- * @param maxLoadFactor the maximum load factor.
- * @throws IllegalArgumentException if <tt>initialCapacity < 0 || (minLoadFactor < 0.0 || minLoadFactor >= 1.0) ||
- * (maxLoadFactor <= 0.0 || maxLoadFactor >= 1.0) || (minLoadFactor >=
- * maxLoadFactor)</tt>.
- */
- public OpenObject${valueTypeCap}HashMap(int initialCapacity, double minLoadFactor, double maxLoadFactor) {
- setUp(initialCapacity, minLoadFactor, maxLoadFactor);
- }
-
- /** Removes all (key,value) associations from the receiver. Implicitly calls <tt>trimToSize()</tt>. */
- @Override
- public void clear() {
- Arrays.fill(this.state, FREE);
- Arrays.fill(this.table, null);
-
- distinct = 0;
- freeEntries = table.length; // delta
- trimToSize();
- }
-
- /**
- * Returns a deep copy of the receiver.
- *
- * @return a deep copy of the receiver.
- */
- @Override
- @SuppressWarnings("unchecked")
- public Object clone() {
- OpenObject${valueTypeCap}HashMap copy = (OpenObject${valueTypeCap}HashMap) super.clone();
- copy.table = copy.table.clone();
- copy.values = copy.values.clone();
- copy.state = copy.state.clone();
- return copy;
- }
-
- /**
- * Returns <tt>true</tt> if the receiver contains the specified key.
- *
- * @return <tt>true</tt> if the receiver contains the specified key.
- */
- @Override
- public boolean containsKey(T key) {
- return indexOfKey(key) >= 0;
- }
-
- /**
- * Returns <tt>true</tt> if the receiver contains the specified value.
- *
- * @return <tt>true</tt> if the receiver contains the specified value.
- */
- @Override
- public boolean containsValue(${valueType} value) {
- return indexOfValue(value) >= 0;
- }
-
- /**
- * Ensures that the receiver can hold at least the specified number of associations without needing to allocate new
- * internal memory. If necessary, allocates new internal memory and increases the capacity of the receiver. <p> This
- * method never need be called; it is for performance tuning only. Calling this method before <tt>put()</tt>ing a
- * large number of associations boosts performance, because the receiver will grow only once instead of potentially
- * many times, and hash collisions become less probable.
- *
- * @param minCapacity the desired minimum capacity.
- */
- @Override
- public void ensureCapacity(int minCapacity) {
- if (table.length < minCapacity) {
- int newCapacity = nextPrime(minCapacity);
- rehash(newCapacity);
- }
- }
-
- /**
- * Applies a procedure to each key of the receiver, if any. Note: Iterates over the keys in no particular order.
- * Subclasses can define a particular order, for example, "sorted by key". All methods which <i>can</i> be expressed
- * in terms of this method (most methods can) <i>must guarantee</i> to use the <i>same</i> order defined by this
- * method, even if it is no particular order. This is necessary so that, for example, methods <tt>keys</tt> and
- * <tt>values</tt> will yield association pairs, not two uncorrelated lists.
- *
- * @param procedure the procedure to be applied. Stops iteration if the procedure returns <tt>false</tt>, otherwise
- * continues.
- * @return <tt>false</tt> if the procedure stopped before all keys were iterated over, <tt>true</tt> otherwise.
- */
- @Override
- @SuppressWarnings("unchecked")
- public boolean forEachKey(ObjectProcedure<T> procedure) {
- for (int i = table.length; i-- > 0;) {
- if (state[i] == FULL && !procedure.apply((T)table[i])) {
- return false;
- }
- }
- return true;
- }
-
- /**
- * Applies a procedure to each (key,value) pair of the receiver, if any. Iteration order is guaranteed to be
- * <i>identical</i> to the order used by method {@link #forEachKey(ObjectProcedure)}.
- *
- * @param procedure the procedure to be applied. Stops iteration if the procedure returns <tt>false</tt>, otherwise
- * continues.
- * @return <tt>false</tt> if the procedure stopped before all keys were iterated over, <tt>true</tt> otherwise.
- */
- @Override
- @SuppressWarnings("unchecked")
- public boolean forEachPair(Object${valueTypeCap}Procedure<T> procedure) {
- for (int i = table.length; i-- > 0;) {
- if (state[i] == FULL && !procedure.apply((T)table[i], values[i])) {
- return false;
- }
- }
- return true;
- }
-
- /**
- * Returns the value associated with the specified key. It is often a good idea to first check with
- * {@link #containsKey(Object)} whether the given key has a value associated or not,
- * i.e. whether there exists an association for the given key or not.
- *
- * @param key the key to be searched for.
- * @return the value associated with the specified key; <tt>0</tt> if no such key is present.
- */
- @Override
- public ${valueType} get(T key) {
- final int i = indexOfKey(key);
- if (i < 0) {
- return 0;
- } //not contained
- return values[i];
- }
-
- /**
- * @param key the key to be added to the receiver.
- * @return the index where the key would need to be inserted, if it is not already contained. Returns -index-1 if the
- * key is already contained at slot index. Therefore, if the returned index < 0, then it is already contained
- * at slot -index-1. If the returned index >= 0, then it is NOT already contained and should be inserted at
- * slot index.
- */
- protected int indexOfInsertion(T key) {
- final int length = table.length;
-
- final int hash = key.hashCode() & 0x7FFFFFFF;
- int i = hash % length;
- int decrement = hash % (length - 2); // double hashing, see http://www.eece.unm.edu/faculty/heileman/hash/node4.html
- //int decrement = (hash / length) % length;
- if (decrement == 0) {
- decrement = 1;
- }
-
- // stop if we find a removed or free slot, or if we find the key itself
- // do NOT skip over removed slots (yes, open addressing is like that...)
- while (state[i] == FULL && !equalsMindTheNull(table[i], key)) {
- i -= decrement;
- //hashCollisions++;
- if (i < 0) {
- i += length;
- }
- }
-
- if (state[i] == REMOVED) {
- // stop if we find a free slot, or if we find the key itself.
- // do skip over removed slots (yes, open addressing is like that...)
- // assertion: there is at least one FREE slot.
- final int j = i;
- while (state[i] != FREE && (state[i] == REMOVED || !equalsMindTheNull(table[i], key))) {
- i -= decrement;
- //hashCollisions++;
- if (i < 0) {
- i += length;
- }
- }
- if (state[i] == FREE) {
- i = j;
- }
- }
-
-
- if (state[i] == FULL) {
- // key already contained at slot i.
- // return a negative number identifying the slot.
- return -i - 1;
- }
- // not already contained, should be inserted at slot i.
- // return a number >= 0 identifying the slot.
- return i;
- }
-
- /**
- * @param key the key to be searched in the receiver.
- * @return the index where the key is contained in the receiver, returns -1 if the key was not found.
- */
- protected int indexOfKey(T key) {
- final int length = table.length;
-
- final int hash = key.hashCode() & 0x7FFFFFFF;
- int i = hash % length;
- int decrement = hash % (length - 2); // double hashing, see http://www.eece.unm.edu/faculty/heileman/hash/node4.html
- //int decrement = (hash / length) % length;
- if (decrement == 0) {
- decrement = 1;
- }
-
- // stop if we find a free slot, or if we find the key itself.
- // do skip over removed slots (yes, open addressing is like that...)
- while (state[i] != FREE && (state[i] == REMOVED || !equalsMindTheNull(table[i], key))) {
- i -= decrement;
- //hashCollisions++;
- if (i < 0) {
- i += length;
- }
- }
-
- if (state[i] == FREE) {
- return -1;
- } // not found
- return i; //found, return index where key is contained
- }
-
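A worked sketch of the double-hashing probe used by indexOfInsertion and indexOfKey, with hypothetical numbers: for table length 11 and hash 7, the step is 7 % 9 = 7, so the probe visits 7, 0, 4, 8, 1, ... wrapping by adding the length whenever the index goes negative.

    int length = 11;
    int hash = 7;
    int i = hash % length;               // 7
    int decrement = hash % (length - 2); // 7
    if (decrement == 0) {
      decrement = 1;
    }
    i -= decrement; if (i < 0) { i += length; } // 0
    i -= decrement; if (i < 0) { i += length; } // 4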
- /**
- * @param value the value to be searched in the receiver.
- * @return the index where the value is contained in the receiver, returns -1 if the value was not found.
- */
- protected int indexOfValue(${valueType} value) {
- ${valueType}[] val = values;
- byte[] stat = state;
-
- for (int i = stat.length; --i >= 0;) {
- if (stat[i] == FULL && val[i] == value) {
- return i;
- }
- }
-
- return -1; // not found
- }
-
- /**
- * Fills all keys contained in the receiver into the specified list. Fills the list, starting at index 0. After this
- * call returns, the specified list has a new size that equals <tt>this.size()</tt>.
- * This method can be used
- * to iterate over the keys of the receiver.
- *
- * @param list the list to be filled, can have any size.
- */
- @Override
- @SuppressWarnings("unchecked")
- public void keys(List<T> list) {
- list.clear();
-
- for (int i = table.length; i-- > 0;) {
- if (state[i] == FULL) {
- list.add((T)table[i]);
- }
- }
- }
-
- /**
- * Fills all pairs satisfying a given condition into the specified lists. Fills into the lists, starting at index 0.
- * After this call returns, the specified lists both have a new size: the number of pairs satisfying the condition.
- * <p> <b>Example:</b> <br>
- * <pre>
- * Object${valueTypeCap}Procedure<T> condition = new Object${valueTypeCap}Procedure<T>() { // match even values only
- * public boolean apply(T key, ${valueType} value) { return value%2==0; }
- * }
- * keys = (8,7,6), values = (1,2,2) --> keyList = (6,8), valueList = (2,1)
- * </pre>
- *
- * @param condition the condition to be matched. Takes the current key as first and the current value as second
- * argument.
- * @param keyList the list to be filled with keys, can have any size.
- * @param valueList the list to be filled with values, can have any size.
- */
- @Override
- @SuppressWarnings("unchecked")
- public void pairsMatching(Object${valueTypeCap}Procedure<T> condition,
- List<T> keyList,
- ${valueTypeCap}ArrayList valueList) {
- keyList.clear();
- valueList.clear();
-
- for (int i = table.length; i-- > 0;) {
- if (state[i] == FULL && condition.apply((T)table[i], values[i])) {
- keyList.add((T)table[i]);
- valueList.add(values[i]);
- }
- }
- }
-
- /**
- * Associates the given key with the given value. Replaces any old <tt>(key,someOtherValue)</tt> association, if
- * existing.
- *
- * @param key the key the value shall be associated with.
- * @param value the value to be associated.
- * @return <tt>true</tt> if the receiver did not already contain such a key; <tt>false</tt> if the receiver did
- * already contain such a key - the new value has now replaced the formerly associated value.
- */
- @Override
- public boolean put(T key, ${valueType} value) {
- int i = indexOfInsertion(key);
- if (i < 0) { //already contained
- i = -i - 1;
- this.values[i] = value;
- return false;
- }
-
- if (this.distinct > this.highWaterMark) {
- int newCapacity = chooseGrowCapacity(this.distinct + 1, this.minLoadFactor, this.maxLoadFactor);
- rehash(newCapacity);
- return put(key, value);
- }
-
- this.table[i] = key;
- this.values[i] = value;
- if (this.state[i] == FREE) {
- this.freeEntries--;
- }
- this.state[i] = FULL;
- this.distinct++;
-
- if (this.freeEntries < 1) { //delta
- int newCapacity = chooseGrowCapacity(this.distinct + 1, this.minLoadFactor, this.maxLoadFactor);
- rehash(newCapacity);
- }
-
- return true;
- }
-
- @Override
- public ${valueType} adjustOrPutValue(T key, ${valueType} newValue, ${valueType} incrValue) {
- int i = indexOfInsertion(key);
- if (i < 0) { //already contained
- i = -i - 1;
- this.values[i] += incrValue;
- return this.values[i];
- } else {
- put(key, newValue);
- return newValue;
- }
- }
-
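A usage sketch of put/adjustOrPutValue, assuming the double specialization (OpenObjectDoubleHashMap) generated from this template:

    OpenObjectDoubleHashMap<String> counts = new OpenObjectDoubleHashMap<String>();
    counts.adjustOrPutValue("apple", 1.0, 1.0); // absent: stores 1.0
    counts.adjustOrPutValue("apple", 1.0, 1.0); // present: increments to 2.0
    double n = counts.get("apple");             // 2.0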
- /**
- * Rehashes the contents of the receiver into a new table with a smaller or larger capacity. This method is called
- * automatically when the number of keys in the receiver exceeds the high water mark or falls below the low water
- * mark.
- */
- @SuppressWarnings("unchecked")
- protected void rehash(int newCapacity) {
- int oldCapacity = table.length;
- //if (oldCapacity == newCapacity) return;
-
- Object[] oldTable = table;
- ${valueType}[] oldValues = values;
- byte[] oldState = state;
-
- this.table = new Object[newCapacity];
- this.values = new ${valueType}[newCapacity];
- this.state = new byte[newCapacity];
-
- this.lowWaterMark = chooseLowWaterMark(newCapacity, this.minLoadFactor);
- this.highWaterMark = chooseHighWaterMark(newCapacity, this.maxLoadFactor);
-
- this.freeEntries = newCapacity - this.distinct; // delta
-
- for (int i = oldCapacity; i-- > 0;) {
- if (oldState[i] == FULL) {
- Object element = oldTable[i];
- int index = indexOfInsertion((T)element);
- this.table[index] = element;
- this.values[index] = oldValues[i];
- this.state[index] = FULL;
- }
- }
- }
-
- /**
- * Removes the given key with its associated element from the receiver, if present.
- *
- * @param key the key to be removed from the receiver.
- * @return <tt>true</tt> if the receiver contained the specified key, <tt>false</tt> otherwise.
- */
- @Override
- public boolean removeKey(T key) {
- int i = indexOfKey(key);
- if (i < 0) {
- return false;
- } // key not contained
-
- this.state[i] = REMOVED;
- //this.values[i]=0; // delta
- this.distinct--;
-
- if (this.distinct < this.lowWaterMark) {
- int newCapacity = chooseShrinkCapacity(this.distinct, this.minLoadFactor, this.maxLoadFactor);
- rehash(newCapacity);
- }
-
- return true;
- }
-
- /**
- * Initializes the receiver.
- *
- * @param initialCapacity the initial capacity of the receiver.
- * @param minLoadFactor the minLoadFactor of the receiver.
- * @param maxLoadFactor the maxLoadFactor of the receiver.
- * @throws IllegalArgumentException if <tt>initialCapacity < 0 || (minLoadFactor < 0.0 || minLoadFactor >= 1.0) ||
- * (maxLoadFactor <= 0.0 || maxLoadFactor >= 1.0) || (minLoadFactor >=
- * maxLoadFactor)</tt>.
- */
- @Override
- protected final void setUp(int initialCapacity, double minLoadFactor, double maxLoadFactor) {
- int capacity = initialCapacity;
- super.setUp(capacity, minLoadFactor, maxLoadFactor);
- capacity = nextPrime(capacity);
- if (capacity == 0) {
- capacity = 1;
- } // open addressing needs at least one FREE slot at any time.
-
- this.table = new Object[capacity];
- this.values = new ${valueType}[capacity];
- this.state = new byte[capacity];
-
- // memory will be exhausted long before this pathological case happens, anyway.
- this.minLoadFactor = minLoadFactor;
- if (capacity == PrimeFinder.LARGEST_PRIME) {
- this.maxLoadFactor = 1.0;
- } else {
- this.maxLoadFactor = maxLoadFactor;
- }
-
- this.distinct = 0;
- this.freeEntries = capacity; // delta
-
- // lowWaterMark will be established upon first expansion.
- // establishing it now (upon instance construction) would immediately make the table shrink upon first put(...).
- // After all the idea of an "initialCapacity" implies violating lowWaterMarks when an object is young.
- // See ensureCapacity(...)
- this.lowWaterMark = 0;
- this.highWaterMark = chooseHighWaterMark(capacity, this.maxLoadFactor);
- }
-
- /**
- * Trims the capacity of the receiver to be the receiver's current size. Releases any superfluous internal memory. An
- * application can use this operation to minimize the storage of the receiver.
- */
- @Override
- public void trimToSize() {
- // * 1.2 because open addressing's performance exponentially degrades beyond that point
- // so that even rehashing the table can take very long
- int newCapacity = nextPrime((int) (1 + 1.2 * size()));
- if (table.length > newCapacity) {
- rehash(newCapacity);
- }
- }
-
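The 1.2 factor above deliberately leaves roughly 17% of slots free after trimming, because open-addressed probing degrades sharply as the table approaches full. A sketch of the arithmetic:

    int size = 100;
    int requested = (int) (1 + 1.2 * size); // 121 slots before nextPrime()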
- /**
- * Fills all values contained in the receiver into the specified list. Fills the list, starting at index 0. After this
- * call returns, the specified list has a new size that equals <tt>this.size()</tt>.
- * <p> This method can be used
- * to iterate over the values of the receiver.
- *
- * @param list the list to be filled, can have any size.
- */
- @Override
- public void values(${valueTypeCap}ArrayList list) {
- list.setSize(distinct);
- ${valueType}[] elements = list.elements();
-
- int j = 0;
- for (int i = state.length; i-- > 0;) {
- if (state[i] == FULL) {
- elements[j++] = values[i];
- }
- }
- }
-
- /**
- * Access for unit tests.
- * @param capacity
- * @param minLoadFactor
- * @param maxLoadFactor
- */
- protected void getInternalFactors(int[] capacity,
- double[] minLoadFactor,
- double[] maxLoadFactor) {
- capacity[0] = table.length;
- minLoadFactor[0] = this.minLoadFactor;
- maxLoadFactor[0] = this.maxLoadFactor;
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java-templates/org/apache/mahout/math/set/AbstractKeyTypeSet.java.t
----------------------------------------------------------------------
diff --git a/math/src/main/java-templates/org/apache/mahout/math/set/AbstractKeyTypeSet.java.t b/math/src/main/java-templates/org/apache/mahout/math/set/AbstractKeyTypeSet.java.t
deleted file mode 100644
index 2b451b7..0000000
--- a/math/src/main/java-templates/org/apache/mahout/math/set/AbstractKeyTypeSet.java.t
+++ /dev/null
@@ -1,181 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.mahout.math.set;
-
-import org.apache.mahout.math.function.${keyTypeCap}Procedure;
-import org.apache.mahout.math.list.${keyTypeCap}ArrayList;
-import java.util.Arrays;
-import java.nio.IntBuffer;
-
-public abstract class Abstract${keyTypeCap}Set extends AbstractSet {
-
- /**
- * Returns <tt>true</tt> if the receiver contains the specified key.
- *
- * @return <tt>true</tt> if the receiver contains the specified key.
- */
- public boolean contains(final ${keyType} key) {
- return !forEachKey(
- new ${keyTypeCap}Procedure() {
- @Override
- public boolean apply(${keyType} iterKey) {
- return (key != iterKey);
- }
- }
- );
- }
-
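The contains implementation above inverts forEachKey: the procedure returns false (stop) exactly when it sees the key, so an early abort means "found". A sketch, assuming the int specialization (AbstractIntSet / IntProcedure):

    AbstractIntSet set = new OpenIntHashSet();
    set.add(5);
    boolean found = !set.forEachKey(new IntProcedure() {
      @Override
      public boolean apply(int k) {
        return k != 5; // returning false stops iteration: key found
      }
    });
    // found == true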
- /**
- * Returns a deep copy of the receiver; uses <code>clone()</code> and casts the result.
- *
- * @return a deep copy of the receiver.
- */
- public Abstract${keyTypeCap}Set copy() {
- return (Abstract${keyTypeCap}Set) clone();
- }
-
- public boolean equals(Object obj) {
- if (obj == this) {
- return true;
- }
-
- if (!(obj instanceof Abstract${keyTypeCap}Set)) {
- return false;
- }
- final Abstract${keyTypeCap}Set other = (Abstract${keyTypeCap}Set) obj;
- if (other.size() != size()) {
- return false;
- }
-
- return
- forEachKey(
- new ${keyTypeCap}Procedure() {
- @Override
- public boolean apply(${keyType} key) {
- return other.contains(key);
- }
- }
- );
- }
-
- public int hashCode() {
- final int[] buf = new int[size()];
- forEachKey(
- new ${keyTypeCap}Procedure() {
- int i = 0;
-
- @Override
- public boolean apply(${keyType} iterKey) {
- buf[i++] = HashUtils.hash(iterKey);
- return true;
- }
- }
- );
- Arrays.sort(buf);
- return IntBuffer.wrap(buf).hashCode();
- }
-
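Sorting the per-key hashes before combining them makes the hashCode above independent of slot layout, as equals() requires. A sketch of the invariant:

    int[] a = {3, 1, 2};
    int[] b = {2, 3, 1};
    java.util.Arrays.sort(a);
    java.util.Arrays.sort(b);
    boolean same = java.nio.IntBuffer.wrap(a).hashCode()
        == java.nio.IntBuffer.wrap(b).hashCode(); // true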
- /**
- * Applies a procedure to each key of the receiver, if any. Note: Iterates over the keys in no particular order.
- * Subclasses can define a particular order, for example, "sorted by key". All methods which <i>can</i> be expressed
- * in terms of this method (most methods can) <i>must guarantee</i> to use the <i>same</i> order defined by this
- * method, even if it is no particular order. This is necessary so that, for example, methods <tt>keys</tt> and
- * <tt>values</tt> will yield association pairs, not two uncorrelated lists.
- *
- * @param procedure the procedure to be applied. Stops iteration if the procedure returns <tt>false</tt>, otherwise
- * continues.
- * @return <tt>false</tt> if the procedure stopped before all keys were iterated over, <tt>true</tt> otherwise.
- */
- public abstract boolean forEachKey(${keyTypeCap}Procedure procedure);
-
- /**
- * Returns a list filled with all keys contained in the receiver. The returned list has a size that equals
- * <tt>this.size()</tt>. Iteration order is guaranteed to be <i>identical</i> to the order used by method {@link
- * #forEachKey(${keyTypeCap}Procedure)}. <p> This method can be used to iterate over the keys of the receiver.
- *
- * @return the keys.
- */
- public ${keyTypeCap}ArrayList keys() {
- ${keyTypeCap}ArrayList list = new ${keyTypeCap}ArrayList(size());
- keys(list);
- return list;
- }
-
- /**
- * Fills all keys contained in the receiver into the specified list. Fills the list, starting at index 0. After this
- * call returns, the specified list has a new size that equals <tt>this.size()</tt>. Iteration order is guaranteed to
- * be <i>identical</i> to the order used by method {@link #forEachKey(${keyTypeCap}Procedure)}.
- * <p> This method can be used to
- * iterate over the keys of the receiver.
- *
- * @param list the list to be filled, can have any size.
- */
- public void keys(final ${keyTypeCap}ArrayList list) {
- list.clear();
- forEachKey(
- new ${keyTypeCap}Procedure() {
- @Override
- public boolean apply(${keyType} key) {
- list.add(key);
- return true;
- }
- }
- );
- }
- /**
- * Associates the given key with the given value. Replaces any old <tt>(key,someOtherValue)</tt> association, if
- * existing.
- *
- * @param key the key the value shall be associated with.
- * @return <tt>true</tt> if the receiver did not already contain such a key; <tt>false</tt> if the receiver did
- * already contain such a key - the new value has now replaced the formerly associated value.
- */
- public abstract boolean add(${keyType} key);
-
- /**
- * Removes the given key with its associated element from the receiver, if present.
- *
- * @param key the key to be removed from the receiver.
- * @return <tt>true</tt> if the receiver contained the specified key, <tt>false</tt> otherwise.
- */
- public abstract boolean remove(${keyType} key);
-
- /**
- * Returns a string representation of the receiver, containing the String representation of each key. Keys are
- * listed in iteration order; the ascending sort is currently disabled.
- */
- public String toString() {
- ${keyTypeCap}ArrayList theKeys = keys();
- //theKeys.sort();
-
- StringBuilder buf = new StringBuilder();
- buf.append('[');
- int maxIndex = theKeys.size() - 1;
- for (int i = 0; i <= maxIndex; i++) {
- ${keyType} key = theKeys.get(i);
- buf.append(String.valueOf(key));
- if (i < maxIndex) {
- buf.append(", ");
- }
- }
- buf.append(']');
- return buf.toString();
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java-templates/org/apache/mahout/math/set/OpenKeyTypeHashSet.java.t
----------------------------------------------------------------------
diff --git a/math/src/main/java-templates/org/apache/mahout/math/set/OpenKeyTypeHashSet.java.t b/math/src/main/java-templates/org/apache/mahout/math/set/OpenKeyTypeHashSet.java.t
deleted file mode 100644
index 8c4c0f0..0000000
--- a/math/src/main/java-templates/org/apache/mahout/math/set/OpenKeyTypeHashSet.java.t
+++ /dev/null
@@ -1,423 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.mahout.math.set;
-
-import java.util.Arrays;
-
-import org.apache.mahout.math.function.${keyTypeCap}Procedure;
-import org.apache.mahout.math.list.${keyTypeCap}ArrayList;
-import org.apache.mahout.math.map.HashFunctions;
-import org.apache.mahout.math.map.PrimeFinder;
-
-/**
- * Open hash set of ${keyType} items.
- **/
-public class Open${keyTypeCap}HashSet extends Abstract${keyTypeCap}Set {
- protected static final byte FREE = 0;
- protected static final byte FULL = 1;
- protected static final byte REMOVED = 2;
-#if (${keyTypeFloating} == 'true')
-#set ($noKeyComment = "${keyTypeCap}.NaN")
- protected static final ${keyType} NO_KEY_VALUE = ${keyTypeCap}.NaN;
-#else
-#set ($noKeyComment = "0")
- protected static final ${keyType} NO_KEY_VALUE = 0;
-#end
-
- /** The hash table keys. */
- private ${keyType}[] table;
-
- /** The state of each hash table entry (FREE, FULL, REMOVED). */
- private byte[] state;
-
- /** The number of table entries in state==FREE. */
- private int freeEntries;
-
-
- /** Constructs an empty map with default capacity and default load factors. */
- public Open${keyTypeCap}HashSet() {
- this(DEFAULT_CAPACITY);
- }
-
- /**
- * Constructs an empty map with the specified initial capacity and default load factors.
- *
- * @param initialCapacity the initial capacity of the map.
- * @throws IllegalArgumentException if the initial capacity is less than zero.
- */
- public Open${keyTypeCap}HashSet(int initialCapacity) {
- this(initialCapacity, DEFAULT_MIN_LOAD_FACTOR, DEFAULT_MAX_LOAD_FACTOR);
- }
-
- /**
- * Constructs an empty map with the specified initial capacity and the specified minimum and maximum load factor.
- *
- * @param initialCapacity the initial capacity.
- * @param minLoadFactor the minimum load factor.
- * @param maxLoadFactor the maximum load factor.
- * @throws IllegalArgumentException if <tt>initialCapacity < 0 || (minLoadFactor < 0.0 || minLoadFactor >= 1.0) ||
- * (maxLoadFactor <= 0.0 || maxLoadFactor >= 1.0) || (minLoadFactor >=
- * maxLoadFactor)</tt>.
- */
- public Open${keyTypeCap}HashSet(int initialCapacity, double minLoadFactor, double maxLoadFactor) {
- setUp(initialCapacity, minLoadFactor, maxLoadFactor);
- }
-
- /** Removes all values associations from the receiver. Implicitly calls <tt>trimToSize()</tt>. */
- @Override
- public void clear() {
- Arrays.fill(this.state, FREE);
- distinct = 0;
- freeEntries = table.length; // delta
- trimToSize();
- }
-
- /**
- * Returns a deep copy of the receiver.
- *
- * @return a deep copy of the receiver.
- */
- @Override
- public Object clone() {
- Open${keyTypeCap}HashSet copy = (Open${keyTypeCap}HashSet) super.clone();
- copy.table = copy.table.clone();
- copy.state = copy.state.clone();
- return copy;
- }
-
- /**
- * Returns <tt>true</tt> if the receiver contains the specified key.
- *
- * @return <tt>true</tt> if the receiver contains the specified key.
- */
- @Override
- public boolean contains(${keyType} key) {
- return indexOfKey(key) >= 0;
- }
-
- /**
- * Ensures that the receiver can hold at least the specified number of associations without needing to allocate new
- * internal memory. If necessary, allocates new internal memory and increases the capacity of the receiver. <p> This
- * method never need be called; it is for performance tuning only. Calling this method before <tt>add()</tt>ing a
- * large number of associations boosts performance, because the receiver will grow only once instead of potentially
- * many times, and hash collisions become less probable.
- *
- * @param minCapacity the desired minimum capacity.
- */
- @Override
- public void ensureCapacity(int minCapacity) {
- if (table.length < minCapacity) {
- int newCapacity = nextPrime(minCapacity);
- rehash(newCapacity);
- }
- }
-
- /**
- * Applies a procedure to each key of the receiver, if any. Note: Iterates over the keys in no particular order.
- * Subclasses can define a particular order, for example, "sorted by key". All methods which <i>can</i> be expressed
- * in terms of this method (most methods can) <i>must guarantee</i> to use the <i>same</i> order defined by this
- * method, even if it is no particular order. This is necessary so that, for example, methods <tt>keys</tt> and
- * <tt>values</tt> will yield association pairs, not two uncorrelated lists.
- *
- * @param procedure the procedure to be applied. Stops iteration if the procedure returns <tt>false</tt>, otherwise
- * continues.
- * @return <tt>false</tt> if the procedure stopped before all keys were iterated over, <tt>true</tt> otherwise.
- */
- @Override
- public boolean forEachKey(${keyTypeCap}Procedure procedure) {
- for (int i = table.length; i-- > 0;) {
- if (state[i] == FULL) {
- if (!procedure.apply(table[i])) {
- return false;
- }
- }
- }
- return true;
- }
-
- /**
- * @param key the key to be added to the receiver.
- * @return the index where the key would need to be inserted, if it is not already contained. Returns -index-1 if the
- * key is already contained at slot index. Therefore, if the returned index < 0, then it is already contained
- * at slot -index-1. If the returned index >= 0, then it is NOT already contained and should be inserted at
- * slot index.
- */
- protected int indexOfInsertion(${keyType} key) {
- final int length = table.length;
-
- final int hash = HashFunctions.hash(key) & 0x7FFFFFFF;
- int i = hash % length;
- int decrement = hash % (length - 2); // double hashing, see http://www.eece.unm.edu/faculty/heileman/hash/node4.html
- //int decrement = (hash / length) % length;
- if (decrement == 0) {
- decrement = 1;
- }
-
- // stop if we find a removed or free slot, or if we find the key itself
- // do NOT skip over removed slots (yes, open addressing is like that...)
- while (state[i] == FULL && table[i] != key) {
- i -= decrement;
- //hashCollisions++;
- if (i < 0) {
- i += length;
- }
- }
-
- if (state[i] == REMOVED) {
- // stop if we find a free slot, or if we find the key itself.
- // do skip over removed slots (yes, open addressing is like that...)
- // assertion: there is at least one FREE slot.
- final int j = i;
- while (state[i] != FREE && (state[i] == REMOVED || table[i] != key)) {
- i -= decrement;
- //hashCollisions++;
- if (i < 0) {
- i += length;
- }
- }
- if (state[i] == FREE) {
- i = j;
- }
- }
-
-
- if (state[i] == FULL) {
- // key already contained at slot i.
- // return a negative number identifying the slot.
- return -i - 1;
- }
- // not already contained, should be inserted at slot i.
- // return a number >= 0 identifying the slot.
- return i;
- }
-
- /**
- * @param key the key to be searched in the receiver.
- * @return the index where the key is contained in the receiver, returns -1 if the key was not found.
- */
- protected int indexOfKey(${keyType} key) {
- final int length = table.length;
-
- final int hash = HashFunctions.hash(key) & 0x7FFFFFFF;
- int i = hash % length;
- int decrement = hash % (length - 2); // double hashing, see http://www.eece.unm.edu/faculty/heileman/hash/node4.html
- //int decrement = (hash / length) % length;
- if (decrement == 0) {
- decrement = 1;
- }
-
- // stop if we find a free slot, or if we find the key itself.
- // do skip over removed slots (yes, open addressing is like that...)
- while (state[i] != FREE && (state[i] == REMOVED || table[i] != key)) {
- i -= decrement;
- //hashCollisions++;
- if (i < 0) {
- i += length;
- }
- }
-
- if (state[i] == FREE) {
- return -1;
- } // not found
- return i; //found, return index where key is contained
- }
-
- /**
- * Fills all keys contained in the receiver into the specified list. Fills the list, starting at index 0. After this
- * call returns the specified list has a new size that equals <tt>this.size()</tt>. Iteration order is guaranteed to
- * be <i>identical</i> to the order used by method {@link #forEachKey(${keyTypeCap}Procedure)}.
- * <p> This method can be used
- * to iterate over the keys of the receiver.
- *
- * @param list the list to be filled, can have any size.
- */
- @Override
- public void keys(${keyTypeCap}ArrayList list) {
- list.setSize(distinct);
- ${keyType} [] elements = list.elements();
-
- int j = 0;
- for (int i = table.length; i-- > 0;) {
- if (state[i] == FULL) {
- elements[j++] = table[i];
- }
- }
- }
-
- /**
- * Adds the given key to the receiver, if not already present.
- *
- * @param key the key to be added to the receiver.
- * @return <tt>true</tt> if the receiver did not already contain the key; <tt>false</tt> if it did, in which
- * case the receiver is left unchanged.
- */
- @Override
- public boolean add(${keyType} key) {
- int i = indexOfInsertion(key);
- if (i < 0) { //already contained
- //i = -i - 1;
- return false;
- }
-
- if (this.distinct > this.highWaterMark) {
- int newCapacity = chooseGrowCapacity(this.distinct + 1, this.minLoadFactor, this.maxLoadFactor);
- rehash(newCapacity);
- return add(key);
- }
-
- this.table[i] = key;
- if (this.state[i] == FREE) {
- this.freeEntries--;
- }
- this.state[i] = FULL;
- this.distinct++;
-
- if (this.freeEntries < 1) { //delta
- int newCapacity = chooseGrowCapacity(this.distinct + 1, this.minLoadFactor, this.maxLoadFactor);
- rehash(newCapacity);
- }
-
- return true;
- }
-
- /**
- * Rehashes the contents of the receiver into a new table with a smaller or larger capacity. This method is called
- * automatically when the number of keys in the receiver exceeds the high water mark or falls below the low water
- * mark.
- */
- protected void rehash(int newCapacity) {
- int oldCapacity = table.length;
- //if (oldCapacity == newCapacity) return;
-
- ${keyType}[] oldTable = table;
- byte[] oldState = state;
-
- this.table = new ${keyType}[newCapacity];
- this.state = new byte[newCapacity];
-
- this.lowWaterMark = chooseLowWaterMark(newCapacity, this.minLoadFactor);
- this.highWaterMark = chooseHighWaterMark(newCapacity, this.maxLoadFactor);
-
- this.freeEntries = newCapacity - this.distinct; // delta
-
- for (int i = oldCapacity; i-- > 0;) {
- if (oldState[i] == FULL) {
- ${keyType} element = oldTable[i];
- int index = indexOfInsertion(element);
- this.table[index] = element;
- this.state[index] = FULL;
- }
- }
- }
-
- /**
- * Removes the given key with its associated element from the receiver, if present.
- *
- * @param key the key to be removed from the receiver.
- * @return <tt>true</tt> if the receiver contained the specified key, <tt>false</tt> otherwise.
- */
- @Override
- public boolean remove(${keyType} key) {
- int i = indexOfKey(key);
- if (i < 0) {
- return false;
- } // key not contained
-
- this.state[i] = REMOVED;
- this.distinct--;
-
- if (this.distinct < this.lowWaterMark) {
- int newCapacity = chooseShrinkCapacity(this.distinct, this.minLoadFactor, this.maxLoadFactor);
- rehash(newCapacity);
- }
-
- return true;
- }
-
- /**
- * Initializes the receiver.
- *
- * @param initialCapacity the initial capacity of the receiver.
- * @param minLoadFactor the minLoadFactor of the receiver.
- * @param maxLoadFactor the maxLoadFactor of the receiver.
- * @throws IllegalArgumentException if <tt>initialCapacity < 0 || (minLoadFactor < 0.0 || minLoadFactor >= 1.0) ||
- * (maxLoadFactor <= 0.0 || maxLoadFactor >= 1.0) || (minLoadFactor >=
- * maxLoadFactor)</tt>.
- */
- @Override
- final protected void setUp(int initialCapacity, double minLoadFactor, double maxLoadFactor) {
- int capacity = initialCapacity;
- super.setUp(capacity, minLoadFactor, maxLoadFactor);
- capacity = nextPrime(capacity);
- if (capacity == 0) {
- capacity = 1;
- } // open addressing needs at least one FREE slot at any time.
-
- this.table = new ${keyType}[capacity];
- this.state = new byte[capacity];
-
- this.minLoadFactor = minLoadFactor;
- // memory will be exhausted long before this pathological case happens, anyway.
- if (capacity == PrimeFinder.LARGEST_PRIME) {
- this.maxLoadFactor = 1.0;
- } else {
- this.maxLoadFactor = maxLoadFactor;
- }
-
- this.distinct = 0;
- this.freeEntries = capacity; // delta
-
- // lowWaterMark will be established upon first expansion.
- // establishing it now (upon instance construction) would immediately make the table shrink upon first put(...).
- // After all the idea of an "initialCapacity" implies violating lowWaterMarks when an object is young.
- // See ensureCapacity(...)
- this.lowWaterMark = 0;
- this.highWaterMark = chooseHighWaterMark(capacity, this.maxLoadFactor);
- }
-
- /**
- * Trims the capacity of the receiver to be the receiver's current size. Releases any superfluous internal memory. An
- * application can use this operation to minimize the storage of the receiver.
- */
- @Override
- public void trimToSize() {
- // * 1.2 because open addressing's performance exponentially degrades beyond that point
- // so that even rehashing the table can take very long
- int newCapacity = nextPrime((int) (1 + 1.2 * size()));
- if (table.length > newCapacity) {
- rehash(newCapacity);
- }
- }
-
- /**
- * Access for unit tests.
- * @param capacity single-element array that receives the current table capacity.
- * @param minLoadFactor single-element array that receives the minimum load factor.
- * @param maxLoadFactor single-element array that receives the maximum load factor.
- */
- protected void getInternalFactors(int[] capacity,
- double[] minLoadFactor,
- double[] maxLoadFactor) {
- capacity[0] = table.length;
- minLoadFactor[0] = this.minLoadFactor;
- maxLoadFactor[0] = this.maxLoadFactor;
- }
-}
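
For reference, indexOfKey/indexOfInsertion above implement plain double hashing over a
prime-sized open-addressed table. A minimal standalone sketch of the same probe follows
(hypothetical class and names, int keys, no Mahout dependencies):

    final class DoubleHashProbe {
      static final byte FREE = 0, FULL = 1, REMOVED = 2;

      /** Returns the slot holding key, or -1 if absent. table.length must be a prime >= 3. */
      static int indexOf(int key, int[] table, byte[] state) {
        int length = table.length;
        int hash = key & 0x7FFFFFFF;              // force non-negative, as in the deleted code
        int i = hash % length;
        int decrement = hash % (length - 2);      // second hash; must never be 0
        if (decrement == 0) {
          decrement = 1;
        }
        // Skip over REMOVED slots; stop on a FREE slot or on the key itself.
        while (state[i] != FREE && (state[i] == REMOVED || table[i] != key)) {
          i -= decrement;
          if (i < 0) {
            i += length;
          }
        }
        return state[i] == FREE ? -1 : i;
      }
    }
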

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/collections/Arithmetic.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/collections/Arithmetic.java b/math/src/main/java/org/apache/mahout/collections/Arithmetic.java
deleted file mode 100644
index 18e3200..0000000
--- a/math/src/main/java/org/apache/mahout/collections/Arithmetic.java
+++ /dev/null
@@ -1,489 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-/*
-Copyright 1999 CERN - European Organization for Nuclear Research.
-Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
-is hereby granted without fee, provided that the above copyright notice appear in all copies and
-that both that copyright notice and this permission notice appear in supporting documentation.
-CERN makes no representations about the suitability of this software for any purpose.
-It is provided "as is" without expressed or implied warranty.
-*/
-package org.apache.mahout.collections;
-
-/**
- * Arithmetic functions.
- */
-public final class Arithmetic extends Constants {
- // for method STIRLING_CORRECTION(...)
- private static final double[] STIRLING_CORRECTION = {
- 0.0,
- 8.106146679532726e-02, 4.134069595540929e-02,
- 2.767792568499834e-02, 2.079067210376509e-02,
- 1.664469118982119e-02, 1.387612882307075e-02,
- 1.189670994589177e-02, 1.041126526197209e-02,
- 9.255462182712733e-03, 8.330563433362871e-03,
- 7.573675487951841e-03, 6.942840107209530e-03,
- 6.408994188004207e-03, 5.951370112758848e-03,
- 5.554733551962801e-03, 5.207655919609640e-03,
- 4.901395948434738e-03, 4.629153749334029e-03,
- 4.385560249232324e-03, 4.166319691996922e-03,
- 3.967954218640860e-03, 3.787618068444430e-03,
- 3.622960224683090e-03, 3.472021382978770e-03,
- 3.333155636728090e-03, 3.204970228055040e-03,
- 3.086278682608780e-03, 2.976063983550410e-03,
- 2.873449362352470e-03, 2.777674929752690e-03,
- };
-
- // for method logFactorial(...)
- // log(k!) for k = 0, ..., 29
- private static final double[] LOG_FACTORIALS = {
- 0.00000000000000000, 0.00000000000000000, 0.69314718055994531,
- 1.79175946922805500, 3.17805383034794562, 4.78749174278204599,
- 6.57925121201010100, 8.52516136106541430, 10.60460290274525023,
- 12.80182748008146961, 15.10441257307551530, 17.50230784587388584,
- 19.98721449566188615, 22.55216385312342289, 25.19122118273868150,
- 27.89927138384089157, 30.67186010608067280, 33.50507345013688888,
- 36.39544520803305358, 39.33988418719949404, 42.33561646075348503,
- 45.38013889847690803, 48.47118135183522388, 51.60667556776437357,
- 54.78472939811231919, 58.00360522298051994, 61.26170176100200198,
- 64.55753862700633106, 67.88974313718153498, 71.25703896716800901
- };
-
- // k! for k = 0, ..., 20
- private static final long[] LONG_FACTORIALS = {
- 1L,
- 1L,
- 2L,
- 6L,
- 24L,
- 120L,
- 720L,
- 5040L,
- 40320L,
- 362880L,
- 3628800L,
- 39916800L,
- 479001600L,
- 6227020800L,
- 87178291200L,
- 1307674368000L,
- 20922789888000L,
- 355687428096000L,
- 6402373705728000L,
- 121645100408832000L,
- 2432902008176640000L
- };
-
- // k! for k = 21, ..., 170
- private static final double[] DOUBLE_FACTORIALS = {
- 5.109094217170944E19,
- 1.1240007277776077E21,
- 2.585201673888498E22,
- 6.204484017332394E23,
- 1.5511210043330984E25,
- 4.032914611266057E26,
- 1.0888869450418352E28,
- 3.048883446117138E29,
- 8.841761993739701E30,
- 2.652528598121911E32,
- 8.222838654177924E33,
- 2.6313083693369355E35,
- 8.68331761881189E36,
- 2.952327990396041E38,
- 1.0333147966386144E40,
- 3.719933267899013E41,
- 1.3763753091226346E43,
- 5.23022617466601E44,
- 2.0397882081197447E46,
- 8.15915283247898E47,
- 3.34525266131638E49,
- 1.4050061177528801E51,
- 6.041526306337384E52,
- 2.6582715747884495E54,
- 1.196222208654802E56,
- 5.502622159812089E57,
- 2.5862324151116827E59,
- 1.2413915592536068E61,
- 6.082818640342679E62,
- 3.0414093201713376E64,
- 1.5511187532873816E66,
- 8.06581751709439E67,
- 4.274883284060024E69,
- 2.308436973392413E71,
- 1.2696403353658264E73,
- 7.109985878048632E74,
- 4.052691950487723E76,
- 2.350561331282879E78,
- 1.386831185456898E80,
- 8.32098711274139E81,
- 5.075802138772246E83,
- 3.146997326038794E85,
- 1.9826083154044396E87,
- 1.2688693218588414E89,
- 8.247650592082472E90,
- 5.443449390774432E92,
- 3.6471110918188705E94,
- 2.48003554243683E96,
- 1.7112245242814127E98,
- 1.1978571669969892E100,
- 8.504785885678624E101,
- 6.123445837688612E103,
- 4.470115461512686E105,
- 3.307885441519387E107,
- 2.4809140811395404E109,
- 1.8854947016660506E111,
- 1.451830920282859E113,
- 1.1324281178206295E115,
- 8.94618213078298E116,
- 7.15694570462638E118,
- 5.797126020747369E120,
- 4.7536433370128435E122,
- 3.94552396972066E124,
- 3.314240134565354E126,
- 2.8171041143805494E128,
- 2.4227095383672744E130,
- 2.107757298379527E132,
- 1.854826422573984E134,
- 1.6507955160908465E136,
- 1.4857159644817605E138,
- 1.3520015276784033E140,
- 1.2438414054641305E142,
- 1.156772507081641E144,
- 1.0873661566567426E146,
- 1.0329978488239061E148,
- 9.916779348709491E149,
- 9.619275968248216E151,
- 9.426890448883248E153,
- 9.332621544394415E155,
- 9.332621544394418E157,
- 9.42594775983836E159,
- 9.614466715035125E161,
- 9.902900716486178E163,
- 1.0299016745145631E166,
- 1.0813967582402912E168,
- 1.1462805637347086E170,
- 1.2265202031961373E172,
- 1.324641819451829E174,
- 1.4438595832024942E176,
- 1.5882455415227423E178,
- 1.7629525510902457E180,
- 1.974506857221075E182,
- 2.2311927486598138E184,
- 2.543559733472186E186,
- 2.925093693493014E188,
- 3.393108684451899E190,
- 3.96993716080872E192,
- 4.6845258497542896E194,
- 5.574585761207606E196,
- 6.689502913449135E198,
- 8.094298525273444E200,
- 9.875044200833601E202,
- 1.2146304367025332E205,
- 1.506141741511141E207,
- 1.882677176888926E209,
- 2.3721732428800483E211,
- 3.0126600184576624E213,
- 3.856204823625808E215,
- 4.974504222477287E217,
- 6.466855489220473E219,
- 8.471580690878813E221,
- 1.1182486511960037E224,
- 1.4872707060906847E226,
- 1.99294274616152E228,
- 2.690472707318049E230,
- 3.6590428819525483E232,
- 5.0128887482749884E234,
- 6.917786472619482E236,
- 9.615723196941089E238,
- 1.3462012475717523E241,
- 1.8981437590761713E243,
- 2.6953641378881633E245,
- 3.8543707171800694E247,
- 5.550293832739308E249,
- 8.047926057471989E251,
- 1.1749972043909107E254,
- 1.72724589045464E256,
- 2.5563239178728637E258,
- 3.8089226376305687E260,
- 5.7133839564458575E262,
- 8.627209774233244E264,
- 1.3113358856834527E267,
- 2.0063439050956838E269,
- 3.0897696138473515E271,
- 4.789142901463393E273,
- 7.471062926282892E275,
- 1.1729568794264134E278,
- 1.8532718694937346E280,
- 2.946702272495036E282,
- 4.714723635992061E284,
- 7.590705053947223E286,
- 1.2296942187394494E289,
- 2.0044015765453032E291,
- 3.287218585534299E293,
- 5.423910666131583E295,
- 9.003691705778434E297,
- 1.5036165148649983E300,
- 2.5260757449731988E302,
- 4.2690680090047056E304,
- 7.257415615308004E306
- };
-
- /** Makes this class non-instantiable, but still lets others inherit from it. */
- Arithmetic() {
- }
-
- /**
- * Efficiently returns the binomial coefficient, often also referred to as
- * "n over k" or "n choose k". The binomial coefficient is defined as
- * <tt>(n * n-1 * ... * n-k+1 ) / ( 1 * 2 * ... * k )</tt>.
- * <ul> <li><tt>k&lt;0</tt>: <tt>0</tt>.</li>
- * <li><tt>k==0</tt>: <tt>1</tt>.</li>
- * <li><tt>k==1</tt>: <tt>n</tt>.</li>
- * <li>else: <tt>(n * n-1 * ... * n-k+1 ) / ( 1 * 2 * ... * k)</tt>.</li>
- * </ul>
- *
- * @param n
- * @param k
- * @return the binomial coefficient.
- */
- public static double binomial(double n, long k) {
- if (k < 0) {
- return 0;
- }
- if (k == 0) {
- return 1;
- }
- if (k == 1) {
- return n;
- }
-
- // binomial(n,k) = (n * n-1 * ... * n-k+1 ) / ( 1 * 2 * ... * k )
- double a = n - k + 1;
- double b = 1;
- double binomial = 1;
- for (long i = k; i-- > 0;) {
- binomial *= (a++) / (b++);
- }
- return binomial;
- }
-
- /**
- * Efficiently returns the binomial coefficient, often also referred to as "n over k" or "n choose k". The binomial
- * coefficient is defined as <ul> <li><tt>k&lt;0</tt>: <tt>0</tt>. <li><tt>k==0 || k==n</tt>: <tt>1</tt>. <li><tt>k==1 || k==n-1</tt>:
- * <tt>n</tt>. <li>else: <tt>(n * n-1 * ... * n-k+1 ) / ( 1 * 2 * ... * k )</tt>. </ul>
- *
- * @return the binomial coefficient.
- */
- public static double binomial(long n, long k) {
- if (k < 0) {
- return 0;
- }
- if (k == 0 || k == n) {
- return 1;
- }
- if (k == 1 || k == n - 1) {
- return n;
- }
-
- // try quick version and see whether we get numeric overflows.
- // factorial(..) is O(1); requires no loop; only a table lookup.
- if (n > k) {
- int max = LONG_FACTORIALS.length + DOUBLE_FACTORIALS.length;
- if (n < max) { // if (n! < inf && k! < inf)
- double n_fac = factorial((int) n);
- double k_fac = factorial((int) k);
- double n_minus_k_fac = factorial((int) (n - k));
- double nk = n_minus_k_fac * k_fac;
- if (nk != Double.POSITIVE_INFINITY) { // no numeric overflow?
- // now this is completely safe and accurate
- return n_fac / nk;
- }
- }
- if (k > n / 2) {
- k = n - k;
- } // quicker
- }
-
- // binomial(n,k) = (n * n-1 * ... * n-k+1 ) / ( 1 * 2 * ... * k )
- long a = n - k + 1;
- long b = 1;
- double binomial = 1;
- for (long i = k; i-- > 0;) {
- binomial *= (double) a++ / (b++);
- }
- return binomial;
- }
-
- /**
- * Returns the smallest <code>long &gt;= value</code>.
- * Examples: {@code 1.0 -> 1, 1.2 -> 2, 1.9 -> 2}. This
- * method is safer than using <code>(long) Math.ceil(value)</code>, because of possible rounding error.
- */
- public static long ceil(double value) {
- return Math.round(Math.ceil(value));
- }
-
- /**
- * Evaluates the series of Chebyshev polynomials Ti at argument x/2. The series is given by
- * <pre>
- * N-1
- * - '
- * y = &gt; coef[i] T (x/2)
- * - i
- * i=0
- * </pre>
- * Coefficients are stored in reverse order, i.e. the zero order term is last in the array. Note N is the number of
- * coefficients, not the order. <p> If coefficients are for the interval a to b, x must have been transformed to
- * {@code x -> 2(2x - b - a)/(b-a)} before entering the routine. This maps x from (a, b) to (-1, 1), over which the Chebyshev
- * polynomials are defined. <p> If the coefficients are for the inverted interval, in which (a, b) is mapped to (1/b,
- * 1/a), the transformation required is {@code x -> 2(2ab/x - b - a)/(b-a)}. If b is infinity, this becomes {@code x -> 4a/x - 1}.
- * <p> SPEED: <p> Taking advantage of the recurrence properties of the Chebyshev polynomials, the routine requires one
- * more addition per loop than evaluating a nested polynomial of the same degree.
- *
- * @param x argument to the polynomial.
- * @param coef the coefficients of the polynomial.
- * @param N the number of coefficients.
- */
- public static double chbevl(double x, double[] coef, int N) {
-
- int p = 0;
-
- double b0 = coef[p++];
- double b1 = 0.0;
- int i = N - 1;
-
- double b2;
- do {
- b2 = b1;
- b1 = b0;
- b0 = x * b1 - b2 + coef[p++];
- } while (--i > 0);
-
- return 0.5 * (b0 - b2);
- }
-
- /**
- * Returns the factorial <tt>k!</tt> in O(1) via table lookup.
- *
- * @param k must hold <tt>k &gt;= 0</tt>.
- */
- private static double factorial(int k) {
- if (k < 0) {
- throw new IllegalArgumentException();
- }
-
- int length1 = LONG_FACTORIALS.length;
- if (k < length1) {
- return LONG_FACTORIALS[k];
- }
-
- int length2 = DOUBLE_FACTORIALS.length;
- if (k < length1 + length2) {
- return DOUBLE_FACTORIALS[k - length1];
- } else {
- return Double.POSITIVE_INFINITY;
- }
- }
-
- /**
- * Returns the largest <code>long &lt;= value</code>.
- * Examples: {@code 1.0 -> 1, 1.2 -> 1, 1.9 -> 1} and {@code 2.0 -> 2, 2.2 -> 2, 2.9 -> 2}.
- * This method is safer than using <code>(long) Math.floor(value)</code>, because of possible rounding error.
- */
- public static long floor(double value) {
- return Math.round(Math.floor(value));
- }
-
- /** Returns <tt>log<sub>base</sub>value</tt>. */
- public static double log(double base, double value) {
- return Math.log(value) / Math.log(base);
- }
-
- /** Returns <tt>log<sub>10</sub>value</tt>. */
- public static double log10(double value) {
- // 1.0 / Math.log(10) == 0.43429448190325176
- return Math.log(value) * 0.43429448190325176;
- }
-
- /** Returns <tt>log<sub>2</sub>value</tt>. */
- public static double log2(double value) {
- // 1.0 / Math.log(2) == 1.4426950408889634
- return Math.log(value) * 1.4426950408889634;
- }
-
- /**
- * Returns <tt>log(k!)</tt>. Tries to avoid overflows. For <tt>k&lt;30</tt> simply looks up a table in O(1). For
- * <tt>k&gt;=30</tt> uses Stirling's approximation.
- *
- * @param k must hold <tt>k &gt;= 0</tt>.
- */
- public static double logFactorial(int k) {
- if (k >= 30) {
-
- double r = 1.0 / k;
- double rr = r * r;
- double C7 = -5.95238095238095238e-04;
- double C5 = 7.93650793650793651e-04;
- double C3 = -2.77777777777777778e-03;
- double C1 = 8.33333333333333333e-02;
- double C0 = 9.18938533204672742e-01;
- return (k + 0.5) * Math.log(k) - k + C0 + r * (C1 + rr * (C3 + rr * (C5 + rr * C7)));
- } else {
- return LOG_FACTORIALS[k];
- }
- }
-
- /**
- * Returns the factorial <tt>k!</tt> in O(1) via table lookup.
- *
- * @param k must hold {@code k >= 0 && k < 21}
- */
- public static long longFactorial(int k) {
- if (k < 0) {
- throw new IllegalArgumentException("Negative k");
- }
-
- if (k < LONG_FACTORIALS.length) {
- return LONG_FACTORIALS[k];
- }
- throw new IllegalArgumentException("Overflow");
- }
-
- /**
- * Returns the Stirling correction: the correction term of the Stirling approximation for <tt>log(k!)</tt>
- * (series in 1/k, or table values for small k), with int parameter k.
- * <pre>
- * log k! = (k + 1/2)log(k + 1) - (k + 1) + (1/2)log(2Pi) + STIRLING_CORRECTION(k + 1)
- * log k! = (k + 1/2)log(k)     -  k      + (1/2)log(2Pi) + STIRLING_CORRECTION(k)
- * </pre>
- */
- public static double stirlingCorrection(int k) {
-
- if (k > 30) {
- double r = 1.0 / k;
- double rr = r * r;
- double C7 = -5.95238095238095238e-04; // -1/1680
- double C5 = 7.93650793650793651e-04; // +1/1260
- double C3 = -2.77777777777777778e-03; // -1/360
- double C1 = 8.33333333333333333e-02; // +1/12
- return r * (C1 + rr * (C3 + rr * (C5 + rr * C7)));
- } else {
- return STIRLING_CORRECTION[k];
- }
- }
-
-}
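
The long/long binomial above guards against overflow by interleaving multiplications and
divisions, so intermediate values stay near the magnitude of the result. A self-contained
sketch of that core loop, assuming n >= 0 (the deleted code also covers other ranges via
its factorial tables):

    static double binomial(long n, long k) {
      if (k < 0 || k > n) {
        return 0;
      }
      if (k == 0 || k == n) {
        return 1;
      }
      if (k > n / 2) {
        k = n - k;                       // symmetry: C(n, k) == C(n, n - k)
      }
      long a = n - k + 1;
      long b = 1;
      double result = 1;
      for (long i = k; i-- > 0;) {
        result *= (double) a++ / b++;    // interleave * and / to limit growth
      }
      return result;                     // e.g. binomial(52, 5) == 2598960.0
    }
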

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/collections/Constants.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/collections/Constants.java b/math/src/main/java/org/apache/mahout/collections/Constants.java
deleted file mode 100644
index 007bd3f..0000000
--- a/math/src/main/java/org/apache/mahout/collections/Constants.java
+++ /dev/null
@@ -1,75 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*
-Copyright 1999 CERN - European Organization for Nuclear Research.
-Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
-is hereby granted without fee, provided that the above copyright notice appear in all copies and
-that both that copyright notice and this permission notice appear in supporting documentation.
-CERN makes no representations about the suitability of this software for any purpose.
-It is provided "as is" without expressed or implied warranty.
-*/
-package org.apache.mahout.collections;
-
-/**
- * Defines some useful constants.
- */
-public class Constants {
- /*
- * machine constants
- */
- protected static final double MACHEP = 1.11022302462515654042E-16;
- protected static final double MAXLOG = 7.09782712893383996732E2;
- protected static final double MINLOG = -7.451332191019412076235E2;
- protected static final double MAXGAM = 171.624376956302725;
- protected static final double SQTPI = 2.50662827463100050242E0;
- protected static final double SQRTH = 7.07106781186547524401E-1;
- protected static final double LOGPI = 1.14472988584940017414;
-
- protected static final double BIG = 4.503599627370496e15;
- protected static final double BIGINV = 2.22044604925031308085e-16;
-
-
- /*
- * MACHEP = 1.38777878078144567553E-17 2**-56
- * MAXLOG = 8.8029691931113054295988E1 log(2**127)
- * MINLOG = -8.872283911167299960540E1 log(2**-128)
- * MAXNUM = 1.701411834604692317316873e38 2**127
- *
- * For IEEE arithmetic (IBMPC):
- * MACHEP = 1.11022302462515654042E-16 2**-53
- * MAXLOG = 7.09782712893383996843E2 log(2**1024)
- * MINLOG = -7.08396418532264106224E2 log(2**-1022)
- * MAXNUM = 1.7976931348623158E308 2**1024
- *
- * The global symbols for mathematical constants are
- * PI = 3.14159265358979323846 pi
- * PIO2 = 1.57079632679489661923 pi/2
- * PIO4 = 7.85398163397448309616E-1 pi/4
- * SQRT2 = 1.41421356237309504880 sqrt(2)
- * SQRTH = 7.07106781186547524401E-1 sqrt(2)/2
- * LOG2E = 1.4426950408889634073599 1/log(2)
- * SQ2OPI = 7.9788456080286535587989E-1 sqrt( 2/pi )
- * LOGE2 = 6.93147180559945309417E-1 log(2)
- * LOGSQ2 = 3.46573590279972654709E-1 log(2)/2
- * THPIO4 = 2.35619449019234492885 3*pi/4
- * TWOOPI = 6.36619772367581343075535E-1 2/pi
- */
- protected Constants() {}
-}
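
Several of the constants above can be cross-checked against the JDK: MACHEP is half the
ULP of 1.0 (i.e. 2^-53) and MAXLOG is log(Double.MAX_VALUE). A quick verification sketch:

    double machep = Math.ulp(1.0) / 2;             // 1.1102230246251565E-16, cf. MACHEP
    double maxlog = Math.log(Double.MAX_VALUE);    // 709.782712893384,       cf. MAXLOG
    double sqtpi  = Math.sqrt(2 * Math.PI);        // 2.5066282746310002,     cf. SQTPI
    double sqrth  = Math.sqrt(2) / 2;              // 0.7071067811865476,     cf. SQRTH
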

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/common/RandomUtils.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/common/RandomUtils.java b/math/src/main/java/org/apache/mahout/common/RandomUtils.java
deleted file mode 100644
index ba71292..0000000
--- a/math/src/main/java/org/apache/mahout/common/RandomUtils.java
+++ /dev/null
@@ -1,100 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.common;
-
-import java.util.Collections;
-import java.util.Map;
-import java.util.Random;
-import java.util.WeakHashMap;
-
-import com.google.common.primitives.Longs;
-import org.apache.commons.math3.primes.Primes;
-
-/**
- * <p>
- * The source of randomness for the whole project. This lets us make all randomness in the project
- * predictable, if desired, for when we run unit tests, which should be repeatable.
- * </p>
- */
-public final class RandomUtils {
-
- /** The largest prime less than 2<sup>31</sup>-1 that is the smaller of a twin prime pair. */
- public static final int MAX_INT_SMALLER_TWIN_PRIME = 2147482949;
-
- private static final Map<RandomWrapper,Boolean> INSTANCES =
- Collections.synchronizedMap(new WeakHashMap<RandomWrapper,Boolean>());
-
- private static boolean testSeed = false;
-
- private RandomUtils() { }
-
- public static void useTestSeed() {
- testSeed = true;
- synchronized (INSTANCES) {
- for (RandomWrapper rng : INSTANCES.keySet()) {
- rng.resetToTestSeed();
- }
- }
- }
-
- public static RandomWrapper getRandom() {
- RandomWrapper random = new RandomWrapper();
- if (testSeed) {
- random.resetToTestSeed();
- }
- INSTANCES.put(random, Boolean.TRUE);
- return random;
- }
-
- public static Random getRandom(long seed) {
- RandomWrapper random = new RandomWrapper(seed);
- INSTANCES.put(random, Boolean.TRUE);
- return random;
- }
-
- /** @return what {@link Double#hashCode()} would return for the same value */
- public static int hashDouble(double value) {
- return Longs.hashCode(Double.doubleToLongBits(value));
- }
-
- /** @return what {@link Float#hashCode()} would return for the same value */
- public static int hashFloat(float value) {
- return Float.floatToIntBits(value);
- }
-
- /**
- * <p>
- * Finds "twin primes": numbers p and p+2 such that both are prime. Returns p+2, the larger twin of the
- * smallest such pair whose smaller twin p is greater than or equal to n.
- * </p>
- */
- public static int nextTwinPrime(int n) {
- if (n > MAX_INT_SMALLER_TWIN_PRIME) {
- throw new IllegalArgumentException();
- }
- if (n <= 3) {
- return 5;
- }
- int next = Primes.nextPrime(n);
- while (!Primes.isPrime(next + 2)) {
- next = Primes.nextPrime(next + 4);
- }
- return next + 2;
- }
-
-}
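
nextTwinPrime above searches with commons-math3's Primes; inlined, the same search looks
like this (a sketch, assuming only the commons-math3 dependency already used by the
deleted file):

    import org.apache.commons.math3.primes.Primes;

    static int nextTwinPrime(int n) {
      int p = Primes.nextPrime(Math.max(n, 3));   // smallest prime >= max(n, 3)
      while (!Primes.isPrime(p + 2)) {
        p = Primes.nextPrime(p + 2);              // advance to the next candidate pair
      }
      return p + 2;                               // the larger twin, as above
    }
    // nextTwinPrime(5) == 7 (pair 5/7); nextTwinPrime(8) == 13 (pair 11/13)
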

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/common/RandomWrapper.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/common/RandomWrapper.java b/math/src/main/java/org/apache/mahout/common/RandomWrapper.java
deleted file mode 100644
index 802291b..0000000
--- a/math/src/main/java/org/apache/mahout/common/RandomWrapper.java
+++ /dev/null
@@ -1,105 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.common;
-
-import org.apache.commons.math3.random.MersenneTwister;
-import org.apache.commons.math3.random.RandomGenerator;
-
-import java.util.Random;
-
-public final class RandomWrapper extends Random {
-
- private static final long STANDARD_SEED = 0xCAFEDEADBEEFBABEL;
-
- private final RandomGenerator random;
-
- RandomWrapper() {
- random = new MersenneTwister();
- random.setSeed(System.currentTimeMillis() + System.identityHashCode(random));
- }
-
- RandomWrapper(long seed) {
- random = new MersenneTwister(seed);
- }
-
- @Override
- public void setSeed(long seed) {
- // The java.util.Random constructor calls this before the delegate is constructed,
- // so guard against null; the result of that early call does not matter for our
- // purposes.
- if (random != null) {
- random.setSeed(seed);
- }
- }
-
- void resetToTestSeed() {
- setSeed(STANDARD_SEED);
- }
-
- public RandomGenerator getRandomGenerator() {
- return random;
- }
-
- @Override
- protected int next(int bits) {
- // Ugh, can't delegate this method -- it's protected
- // Callers can't use it and other methods are delegated, so shouldn't matter
- throw new UnsupportedOperationException();
- }
-
- @Override
- public void nextBytes(byte[] bytes) {
- random.nextBytes(bytes);
- }
-
- @Override
- public int nextInt() {
- return random.nextInt();
- }
-
- @Override
- public int nextInt(int n) {
- return random.nextInt(n);
- }
-
- @Override
- public long nextLong() {
- return random.nextLong();
- }
-
- @Override
- public boolean nextBoolean() {
- return random.nextBoolean();
- }
-
- @Override
- public float nextFloat() {
- return random.nextFloat();
- }
-
- @Override
- public double nextDouble() {
- return random.nextDouble();
- }
-
- @Override
- public double nextGaussian() {
- return random.nextGaussian();
- }
-
-}
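
Taken together, the two deleted classes gave the project repeatable randomness: a
java.util.Random facade over commons-math's MersenneTwister, with a global test-seed
switch. A sketch of how they were typically used:

    import java.util.Random;
    import org.apache.mahout.common.RandomUtils;

    public class SeedDemo {
      public static void main(String[] args) {
        // A fixed seed yields the same stream on every run.
        Random r = RandomUtils.getRandom(42L);
        System.out.println(r.nextDouble());

        // Test harnesses call this once; every live RandomWrapper is then
        // reset to the fixed STANDARD_SEED above.
        RandomUtils.useTestSeed();
      }
    }
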
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/MMul.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/MMul.scala b/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/MMul.scala
deleted file mode 100644
index f9bda8a..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/MMul.scala
+++ /dev/null
@@ -1,295 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.scalabindings
-
-import org.apache.mahout.math._
-import org.apache.mahout.math.flavor.{BackEnum, TraversingStructureEnum}
-import org.apache.mahout.math.function.Functions
-import RLikeOps._
-import org.apache.mahout.logging._
-import org.apache.mahout.math.backend.incore.MMulSolver
-
-import scala.collection.JavaConversions._
-
-object MMul extends MMulSolver {
-
- private final implicit val log = getLog(MMul.getClass)
-
- override def apply(a: Matrix, b: Matrix, r: Option[Matrix]): Matrix = {
-
- require(a.ncol == b.nrow, "Incompatible matrix sizes in matrix multiplication.")
-
- val (af, bf) = (a.getFlavor, b.getFlavor)
- val backs = (af.getBacking, bf.getBacking)
- val sd = (af.getStructure, densityAnalysis(a), bf.getStructure, densityAnalysis(b))
-
- val alg: MMulAlg = backs match {
-
- // Both operands are jvm memory backs.
- case (BackEnum.JVMMEM, BackEnum.JVMMEM) ⇒
-
- sd match {
-
- // Multiplication cases by a diagonal matrix.
- case (TraversingStructureEnum.VECTORBACKED, _, TraversingStructureEnum.COLWISE, _)
- if a.isInstanceOf[DiagonalMatrix] ⇒ jvmDiagCW
- case (TraversingStructureEnum.VECTORBACKED, _, TraversingStructureEnum.SPARSECOLWISE, _)
- if a.isInstanceOf[DiagonalMatrix] ⇒ jvmDiagCW
- case (TraversingStructureEnum.VECTORBACKED, _, TraversingStructureEnum.ROWWISE, _)
- if a.isInstanceOf[DiagonalMatrix] ⇒ jvmDiagRW
- case (TraversingStructureEnum.VECTORBACKED, _, TraversingStructureEnum.SPARSEROWWISE, _)
- if a.isInstanceOf[DiagonalMatrix] ⇒ jvmDiagRW
-
- case (TraversingStructureEnum.COLWISE, _, TraversingStructureEnum.VECTORBACKED, _)
- if b.isInstanceOf[DiagonalMatrix] ⇒ jvmCWDiag
- case (TraversingStructureEnum.SPARSECOLWISE, _, TraversingStructureEnum.VECTORBACKED, _)
- if b.isInstanceOf[DiagonalMatrix] ⇒ jvmCWDiag
- case (TraversingStructureEnum.ROWWISE, _, TraversingStructureEnum.VECTORBACKED, _)
- if b.isInstanceOf[DiagonalMatrix] ⇒ jvmRWDiag
- case (TraversingStructureEnum.SPARSEROWWISE, _, TraversingStructureEnum.VECTORBACKED, _)
- if b.isInstanceOf[DiagonalMatrix] ⇒ jvmRWDiag
-
- // Dense-dense cases
- case (TraversingStructureEnum.ROWWISE, true, TraversingStructureEnum.COLWISE, true) if a eq b.t ⇒ jvmDRWAAt
- case (TraversingStructureEnum.ROWWISE, true, TraversingStructureEnum.COLWISE, true) if a.t eq b ⇒ jvmDRWAAt
- case (TraversingStructureEnum.ROWWISE, true, TraversingStructureEnum.COLWISE, true) ⇒ jvmRWCW
- case (TraversingStructureEnum.ROWWISE, true, TraversingStructureEnum.ROWWISE, true) ⇒ jvmRWRW
- case (TraversingStructureEnum.COLWISE, true, TraversingStructureEnum.COLWISE, true) ⇒ jvmCWCW
- case (TraversingStructureEnum.COLWISE, true, TraversingStructureEnum.ROWWISE, true) if a eq b.t ⇒ jvmDCWAAt
- case (TraversingStructureEnum.COLWISE, true, TraversingStructureEnum.ROWWISE, true) if a.t eq b ⇒ jvmDCWAAt
- case (TraversingStructureEnum.COLWISE, true, TraversingStructureEnum.ROWWISE, true) ⇒ jvmCWRW
-
- // Sparse row matrix x sparse row matrix (array of vectors)
- case (TraversingStructureEnum.ROWWISE, false, TraversingStructureEnum.ROWWISE, false) ⇒ jvmSparseRWRW
- case (TraversingStructureEnum.ROWWISE, false, TraversingStructureEnum.COLWISE, false) ⇒ jvmSparseRWCW
- case (TraversingStructureEnum.COLWISE, false, TraversingStructureEnum.ROWWISE, false) ⇒ jvmSparseCWRW
- case (TraversingStructureEnum.COLWISE, false, TraversingStructureEnum.COLWISE, false) ⇒ jvmSparseCWCW
-
- // Sparse matrix x sparse matrix (hashtable of vectors)
- case (TraversingStructureEnum.SPARSEROWWISE, false, TraversingStructureEnum.SPARSEROWWISE, false) ⇒
- jvmSparseRowRWRW
- case (TraversingStructureEnum.SPARSEROWWISE, false, TraversingStructureEnum.SPARSECOLWISE, false) ⇒
- jvmSparseRowRWCW
- case (TraversingStructureEnum.SPARSECOLWISE, false, TraversingStructureEnum.SPARSEROWWISE, false) ⇒
- jvmSparseRowCWRW
- case (TraversingStructureEnum.SPARSECOLWISE, false, TraversingStructureEnum.SPARSECOLWISE, false) ⇒
- jvmSparseRowCWCW
-
- // Sparse matrix x non-like
- case (TraversingStructureEnum.SPARSEROWWISE, false, TraversingStructureEnum.ROWWISE, _) ⇒ jvmSparseRowRWRW
- case (TraversingStructureEnum.SPARSEROWWISE, false, TraversingStructureEnum.COLWISE, _) ⇒ jvmSparseRowRWCW
- case (TraversingStructureEnum.SPARSECOLWISE, false, TraversingStructureEnum.ROWWISE, _) ⇒ jvmSparseRowCWRW
- case (TraversingStructureEnum.SPARSECOLWISE, false, TraversingStructureEnum.COLWISE, _) ⇒ jvmSparseCWCW
- case (TraversingStructureEnum.ROWWISE, _, TraversingStructureEnum.SPARSEROWWISE, false) ⇒ jvmSparseRWRW
- case (TraversingStructureEnum.ROWWISE, _, TraversingStructureEnum.SPARSECOLWISE, false) ⇒ jvmSparseRWCW
- case (TraversingStructureEnum.COLWISE, _, TraversingStructureEnum.SPARSEROWWISE, false) ⇒ jvmSparseCWRW
- case (TraversingStructureEnum.COLWISE, _, TraversingStructureEnum.SPARSECOLWISE, false) ⇒ jvmSparseRowCWCW
-
- // Everything else including at least one sparse LHS or RHS argument
- case (TraversingStructureEnum.ROWWISE, false, TraversingStructureEnum.ROWWISE, _) ⇒ jvmSparseRWRW
- case (TraversingStructureEnum.ROWWISE, false, TraversingStructureEnum.COLWISE, _) ⇒ jvmSparseRWCW
- case (TraversingStructureEnum.COLWISE, false, TraversingStructureEnum.ROWWISE, _) ⇒ jvmSparseCWRW
- case (TraversingStructureEnum.COLWISE, false, TraversingStructureEnum.COLWISE, _) ⇒ jvmSparseCWCW2flips
-
- // Sparse methods are only effective if the first argument is sparse, so we need to do a swap.
- case (_, _, _, false) ⇒ (a, b, r) ⇒ apply(b.t, a.t, r.map {_.t}).t
-
- // Default jvm-jvm case.
- case _ ⇒ jvmRWCW
- }
- }
-
- alg(a, b, r)
- }
-
- type MMulAlg = MMBinaryFunc
-
- @inline
- private def jvmRWCW(a: Matrix, b: Matrix, r: Option[Matrix] = None): Matrix = {
-
- require(r.forall(mxR ⇒ mxR.nrow == a.nrow && mxR.ncol == b.ncol))
- val (m, n) = (a.nrow, b.ncol)
-
- val mxR = r.getOrElse(if (densityAnalysis(a)) a.like(m, n) else b.like(m, n))
-
- for (row ← 0 until mxR.nrow; col ← 0 until mxR.ncol) {
- // this vector-vector should be sort of optimized, right?
- mxR(row, col) = a(row, ::) dot b(::, col)
- }
- mxR
- }
-
-
- @inline
- private def jvmRWRW(a: Matrix, b: Matrix, r: Option[Matrix] = None): Matrix = {
-
- // A bit hackish: currently this relies on the fact that like() produces a row-wise matrix (unverified).
- val bclone = b.like(b.ncol, b.nrow).t
- for (brow ← b) bclone(brow.index(), ::) := brow
-
- require(bclone.getFlavor.getStructure == TraversingStructureEnum.COLWISE || bclone.getFlavor.getStructure ==
- TraversingStructureEnum.SPARSECOLWISE, "COL wise conversion assumption of RHS is wrong, do over this code.")
-
- jvmRWCW(a, bclone, r)
- }
-
- private def jvmCWCW(a: Matrix, b: Matrix, r: Option[Matrix] = None): Matrix = {
- jvmRWRW(b.t, a.t, r.map(_.t)).t
- }
-
- private def jvmCWRW(a: Matrix, b: Matrix, r: Option[Matrix] = None): Matrix = {
- // This is a primary contender with Outer Prod sum algo.
- // Here, we force-reorient both matrices and run RWCW.
- // A bit hackish: currently this relies on the fact that clone always produces a row-wise matrix (unverified).
- val aclone = a.cloned
-
- require(aclone.getFlavor.getStructure == TraversingStructureEnum.ROWWISE || aclone.getFlavor.getStructure ==
- TraversingStructureEnum.SPARSEROWWISE, "Row wise conversion assumption of RHS is wrong, do over this code.")
-
- jvmRWRW(aclone, b, r)
- }
-
- private def jvmSparseRWRW(a: Matrix, b: Matrix, r: Option[Matrix] = None): Matrix = {
- val mxR = r.getOrElse(b.like(a.nrow, b.ncol))
-
- // This is basically almost the algorithm from SparseMatrix.times
- for (arow ← a; ael ← arow.nonZeroes)
- mxR(arow.index(), ::).assign(b(ael.index, ::), Functions.plusMult(ael))
-
- mxR
- }
-
- private def jvmSparseRowRWRW(a: Matrix, b: Matrix, r: Option[Matrix] = None): Matrix = {
- val mxR = r.getOrElse(b.like(a.nrow, b.ncol))
- for (arow ← a.iterateNonEmpty(); ael ← arow.vector.nonZeroes)
- mxR(arow.index(), ::).assign(b(ael.index, ::), Functions.plusMult(ael))
-
- mxR
- }
-
- private def jvmSparseRowCWCW(a: Matrix, b: Matrix, r: Option[Matrix] = None) =
- jvmSparseRowRWRW(b.t, a.t, r.map(_.t)).t
-
- private def jvmSparseRowCWCW2flips(a: Matrix, b: Matrix, r: Option[Matrix] = None) =
- jvmSparseRowRWRW(a cloned, b cloned, r)
-
- private def jvmSparseRowRWCW(a: Matrix, b: Matrix, r: Option[Matrix]) =
- jvmSparseRowRWRW(a, b cloned, r)
-
-
- private def jvmSparseRowCWRW(a: Matrix, b: Matrix, r: Option[Matrix]) =
- jvmSparseRowRWRW(a cloned, b, r)
-
- private def jvmSparseRWCW(a: Matrix, b: Matrix, r: Option[Matrix] = None) =
- jvmSparseRWRW(a, b.cloned, r)
-
- private def jvmSparseCWRW(a: Matrix, b: Matrix, r: Option[Matrix] = None) =
- jvmSparseRWRW(a cloned, b, r)
-
- private def jvmSparseCWCW(a: Matrix, b: Matrix, r: Option[Matrix] = None) =
- jvmSparseRWRW(b.t, a.t, r.map(_.t)).t
-
- private def jvmSparseCWCW2flips(a: Matrix, b: Matrix, r: Option[Matrix] = None) =
- jvmSparseRWRW(a cloned, b cloned, r)
-
- private def jvmDiagRW(diagm:Matrix, b:Matrix, r:Option[Matrix] = None):Matrix = {
- val mxR = r.getOrElse(b.like(diagm.nrow, b.ncol))
-
- for (del ← diagm.diagv.nonZeroes())
- mxR(del.index, ::).assign(b(del.index, ::), Functions.plusMult(del))
-
- mxR
- }
-
- private def jvmDiagCW(diagm: Matrix, b: Matrix, r: Option[Matrix] = None): Matrix = {
- val mxR = r.getOrElse(b.like(diagm.nrow, b.ncol))
- for (bcol ← b.t) mxR(::, bcol.index()) := bcol * diagm.diagv
- mxR
- }
-
- private def jvmCWDiag(a: Matrix, diagm: Matrix, r: Option[Matrix] = None) =
- jvmDiagRW(diagm, a.t, r.map {_.t}).t
-
- private def jvmRWDiag(a: Matrix, diagm: Matrix, r: Option[Matrix] = None) =
- jvmDiagCW(diagm, a.t, r.map {_.t}).t
-
-
- /** Dense column-wise AA' */
- private def jvmDCWAAt(a:Matrix, b:Matrix, r:Option[Matrix] = None) = {
- // a.t must be equiv. to b. Cloning must rewrite to row-wise.
- jvmDRWAAt(a.cloned,null,r)
- }
-
- /** Dense Row-wise AA' */
- private def jvmDRWAAt(a:Matrix, b:Matrix, r:Option[Matrix] = None) = {
- // a.t must be equiv to b.
-
- debug("AAt computation detected.")
-
- // Check dimensions if result is supplied.
- require(r.forall(mxR ⇒ mxR.nrow == a.nrow && mxR.ncol == a.nrow))
-
- val mxR = r.getOrElse(a.like(a.nrow, a.nrow))
-
- // This is symmetric computation. Compile upper triangular first.
- for (row ← 0 until mxR.nrow) {
- // diagonal value
- mxR(row, row) = a(row, ::).aggregate(Functions.PLUS, Functions.SQUARE)
-
- for ( col ← row + 1 until mxR.ncol) {
- // this vector-vector should be sort of optimized, right?
- val v = a(row, ::) dot a(col, ::)
-
- mxR(row, col) = v
- mxR(col,row) = v
- }
- }
-
- mxR
- }
-
- private def jvmOuterProdSum(a: Matrix, b: Matrix, r: Option[Matrix] = None): Matrix = {
-
- // This may already be laid out for outer product computation, which may be faster than
- // reorienting both matrices; needs checking.
- val (m, n) = (a.nrow, b.ncol)
-
- // Prefer col-wise result iff a is dense and b is sparse. In all other cases default to row-wise.
- val preferColWiseR = densityAnalysis(a) && !densityAnalysis(b)
-
- val mxR = r.getOrElse {
- (densityAnalysis(a), preferColWiseR) match {
- case (false, false) ⇒ b.like(m, n)
- case (false, true) ⇒ b.like(n, m).t
- case (true, false) ⇒ a.like(m, n)
- case (true, true) ⇒ a.like(n, m).t
- }
- }
-
- // Loop outer products
- if (preferColWiseR) {
- // this means B is sparse and A is not, so we need to iterate over b values and update R columns with +=
- // one at a time.
- for ((acol, brow) ← a.t.zip(b); bel ← brow.nonZeroes) mxR(::, bel.index()) += bel * acol
- } else {
- for ((acol, brow) ← a.t.zip(b); ael ← acol.nonZeroes()) mxR(ael.index(), ::) += ael * brow
- }
-
- mxR
- }
-}
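
Most of the dispatch above chooses between two loop orders: a dot-product form (row of A
against column of B, cf. jvmRWCW) and an outer-product accumulation that can skip zeros
(cf. jvmOuterProdSum). A dense plain-array sketch of both, independent of Mahout's
Matrix API:

    static double[][] dotForm(double[][] a, double[][] b) {
      int m = a.length, k = a[0].length, n = b[0].length;
      double[][] r = new double[m][n];
      for (int i = 0; i < m; i++)
        for (int j = 0; j < n; j++)
          for (int p = 0; p < k; p++)
            r[i][j] += a[i][p] * b[p][j];          // row(i) dot col(j)
      return r;
    }

    static double[][] outerForm(double[][] a, double[][] b) {
      int m = a.length, k = a[0].length, n = b[0].length;
      double[][] r = new double[m][n];
      for (int p = 0; p < k; p++)                  // sum of k outer products
        for (int i = 0; i < m; i++)
          if (a[i][p] != 0.0)                      // skipping zeros is the payoff
            for (int j = 0; j < n; j++)
              r[i][j] += a[i][p] * b[p][j];
      return r;
    }
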

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/MahoutCollections.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/MahoutCollections.scala b/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/MahoutCollections.scala
deleted file mode 100644
index 8251b3a..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/MahoutCollections.scala
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.scalabindings
-
-import org.apache.mahout.math.Vector
-
-class MahoutVectorInterfaces(v: Vector) {
- /** Convert to Array[Double] */
- def toArray: Array[Double] = {
- val a = new Array[Double](v.size)
- for (i <- 0 until v.size){
- a(i) = v.get(i)
- }
- a
- }
-
- /** Convert to Map[Int, Double] */
- def toMap: Map[Int, Double] = {
- import collection.JavaConverters._
- val ms = collection.mutable.Map[Int, Double]()
- for (e <- v.nonZeroes().asScala) {
- ms += (e.index -> e.get)
- }
- ms.toMap
- }
-
-}
-
-object MahoutCollections {
- implicit def v2scalaish(v: Vector) = new MahoutVectorInterfaces(v)
-}
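
For Java callers, the two conversions above reduce to straightforward loops over the
Vector API (a sketch using only methods already referenced in this patch):

    import java.util.HashMap;
    import java.util.Map;
    import org.apache.mahout.math.Vector;

    static double[] toArray(Vector v) {
      double[] a = new double[v.size()];
      for (int i = 0; i < v.size(); i++) {
        a[i] = v.get(i);
      }
      return a;
    }

    static Map<Integer, Double> toMap(Vector v) {
      Map<Integer, Double> m = new HashMap<>();
      for (Vector.Element e : v.nonZeroes()) {    // sparse iteration, as in toMap above
        m.put(e.index(), e.get());
      }
      return m;
    }
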

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/MatlabLikeMatrixOps.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/MatlabLikeMatrixOps.scala b/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/MatlabLikeMatrixOps.scala
deleted file mode 100644
index 13d80ea..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/MatlabLikeMatrixOps.scala
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.mahout.math.scalabindings
-
-import org.apache.mahout.math.{Vector, Matrix}
-import scala.collection.JavaConversions._
-import RLikeOps._
-
-class MatlabLikeMatrixOps(_m: Matrix) extends MatrixOps(_m) {
-
- /**
- * matrix-matrix multiplication
- * @param that
- * @return
- */
- def *(that: Matrix) = m.times(that)
-
- /**
- * matrix-vector multiplication
- * @param that
- * @return
- */
- def *(that: Vector) = m.times(that)
-
- /**
- * Hadamard product
- *
- * @param that
- * @return
- */
-
- private[math] def *@(that: Matrix) = cloned *= that
-
- private[math] def *@(that: Double) = cloned *= that
-
- /**
- * In-place Hadamard product. We avoid assign here so that sparse
- * operations can be optimized; for the Hadamard product this
- * optimization really can be done.
- * @param that
- */
- private[math] def *@=(that: Matrix) = {
- m.zip(that).foreach(t => t._1.vector *= t._2.vector)
- m
- }
-
- private[math] def *@=(that: Double) = {
- m.foreach(_.vector() *= that)
- m
- }
-}
-
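
The in-place variant above zips rows rather than delegating to Matrix.assign, so that
sparse row vectors can be multiplied efficiently. For comparison, a naive Java sketch of
the same operation via assign with Functions.MULT (assuming the usual Matrix API):

    import org.apache.mahout.math.Matrix;
    import org.apache.mahout.math.function.Functions;

    static Matrix hadamard(Matrix a, Matrix b) {
      return a.clone().assign(b, Functions.MULT);  // like *@ : leaves a untouched
    }

    static Matrix hadamardInPlace(Matrix a, Matrix b) {
      return a.assign(b, Functions.MULT);          // like *@= : mutates a (naive form)
    }
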

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/MatlabLikeOps.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/MatlabLikeOps.scala b/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/MatlabLikeOps.scala
deleted file mode 100644
index 8304af7..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/MatlabLikeOps.scala
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.scalabindings
-
-import org.apache.mahout.math.{Vector, MatrixTimesOps, Matrix}
-
-/**
- * Matlab-like operators. Declare <code>import MatlabLikeOps._</code> to enable.
- *
- * (This option is mutually exclusive with other translations such as RLikeOps.)
- */
-object MatlabLikeOps {
-
- implicit def v2vOps(v: Vector) = new MatlabLikeVectorOps(v)
-
- implicit def times2timesOps(m: MatrixTimesOps) = new MatlabLikeTimesOps(m)
-
- implicit def m2mOps(m: Matrix) = new MatlabLikeMatrixOps(m)
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/MatlabLikeTimesOps.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/MatlabLikeTimesOps.scala b/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/MatlabLikeTimesOps.scala
deleted file mode 100644
index 9af179a..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/MatlabLikeTimesOps.scala
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.scalabindings
-
-import org.apache.mahout.math.{Matrix, MatrixTimesOps}
-
-class MatlabLikeTimesOps(m: MatrixTimesOps) {
-
- def :*(that: Matrix) = m.timesRight(that)
-
- def *:(that: Matrix) = m.timesLeft(that)
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/MatlabLikeVectorOps.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/MatlabLikeVectorOps.scala b/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/MatlabLikeVectorOps.scala
deleted file mode 100644
index ca3573f..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/MatlabLikeVectorOps.scala
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.scalabindings
-
-import org.apache.mahout.math.Vector
-import org.apache.mahout.math.function.Functions
-import RLikeOps._
-
-/**
- * Matlab-like operators.
- *
- * For now, all element-wise operators are declared private to the math package,
- * since we are still discussing the best way to replace Matlab's element-wise
- * '.*' syntax, which is not directly available in the Scala DSL.
- *
- * @param _v
- */
-class MatlabLikeVectorOps(_v: Vector) extends VectorOps(_v) {
-
- /** Elementwise *= */
- private[math] def *@=(that: Vector) = v.assign(that, Functions.MULT)
-
- /** Elementwise /= */
- private[math] def /@=(that: Vector) = v.assign(that, Functions.DIV)
-
- /** Elementwise *= */
- private[math] def *@=(that: Double) = v.assign(Functions.MULT, that)
-
- /** Elementwise /= */
- private[math] def /@=(that: Double) = v.assign(Functions.DIV, that)
-
- /** Elementwise right-associative /= */
- private[math] def /@=:(that: Double) = v.assign(Functions.INV).assign(Functions.MULT, that)
-
- /** Elementwise right-associative /= */
- private[math] def /@=:(that: Vector) = v.assign(Functions.INV).assign(that, Functions.MULT)
-
- /** Elementwise * */
- private[math] def *@(that: Vector) = cloned *= that
-
- /** Elementwise * */
- private[math] def *@(that: Double) = cloned *= that
-
- /** Elementwise / */
- private[math] def /@(that: Vector) = cloned /= that
-
- /** Elementwise / */
- private[math] def /@(that: Double) = cloned /= that
-
- /** Elementwise right-associative / */
- private[math] def /@:(that: Double) = that /=: v.cloned
-
- /** Elementwise right-associative / */
- private[math] def /@:(that: Vector) = that.cloned /= v
-
-
-}
\ No newline at end of file
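
Because the elementwise operators above are declared private[math], any usage sketch has to live inside the org.apache.mahout.math package itself; the object and method names here are illustrative.

package org.apache.mahout.math

import org.apache.mahout.math.scalabindings._

private[math] object MatlabVecSketch {
  def demo(): Vector = {
    val v = dvec(1, 2, 3)
    val w = dvec(4, 5, 6)
    // *@ clones v, then multiplies elementwise: (4.0, 10.0, 18.0)
    new MatlabLikeVectorOps(v) *@ w
  }
}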

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/MatrixOps.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/MatrixOps.scala b/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/MatrixOps.scala
deleted file mode 100644
index f3be285..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/MatrixOps.scala
+++ /dev/null
@@ -1,332 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.scalabindings
-
-import org.apache.mahout.math.flavor.TraversingStructureEnum
-import org.apache.mahout.math.function.{DoubleFunction, Functions, VectorFunction}
-import org.apache.mahout.math.{Matrices, Matrix, QRDecomposition, Vector}
-
-import scala.collection.JavaConversions._
-import scala.collection._
-import scala.math._
-
-class MatrixOps(val m: Matrix) {
-
- import MatrixOps._
-
- // We need this for some functions below (but it would break some functions above)
- import RLikeOps.v2vOps
-
- def nrow = m.rowSize()
-
- def ncol = m.columnSize()
-
- /**
- * Warning: this creates a clone (as in mx * -1); in many applications the
- * in-place negation `mx *= -1` may be a far better choice.
- */
- def unary_- = cloned.assign(Functions.NEGATE)
-
- def +=(that: Matrix) = m.assign(that, Functions.PLUS)
-
- def +=:(that:Matrix) = m += that
-
- def +=:(that:Double) = m += that
-
- def -=(that: Matrix) = m.assign(that, Functions.MINUS)
-
- def +=(that: Double) = m.assign(new DoubleFunction {
- def apply(x: Double): Double = x + that
- })
-
- def -=(that: Double) = +=(-that)
-
- def -=:(that: Double) = m.assign(Functions.minus(that))
-
- /** A := B - A which is -(A - B) */
- def -=:(that: Matrix) = m.assign(that, Functions.chain(Functions.NEGATE, Functions.MINUS))
-
- def +(that: Matrix) = cloned += that
-
- def -(that: Matrix) = cloned -= that
-
- def -:(that: Matrix) = that - m
-
- // m.plus(that)?
-
- def +(that: Double) = cloned += that
-
- def +:(that:Double) = cloned += that
-
- def -(that: Double) = cloned -= that
-
- def -:(that: Double) = that -=: cloned
-
- def norm = math.sqrt(m.aggregate(Functions.PLUS, Functions.SQUARE))
-
- def pnorm(p: Int) = pow(m.aggregate(Functions.PLUS, Functions.chain(Functions.ABS, Functions.pow(p))), 1.0 / p)
-
- def apply(row: Int, col: Int) = m.get(row, col)
-
- def update(row: Int, col: Int, that: Double): Matrix = {
- m.setQuick(row, col, that)
- m
- }
-
- def update(rowRange: Range, colRange: Range, that: Double) = apply(rowRange, colRange) := that
-
- def update(row: Int, colRange: Range, that: Double) = apply(row, colRange) := that
-
- def update(rowRange: Range, col: Int, that: Double) = apply(rowRange, col) := that
-
- def update(rowRange: Range, colRange: Range, that: Matrix) = apply(rowRange, colRange) := that
-
- def update(row: Int, colRange: Range, that: Vector) = apply(row, colRange) := that
-
- def update(rowRange: Range, col: Int, that: Vector) = apply(rowRange, col) := that
-
-
- def apply(rowRange: Range, colRange: Range): Matrix = {
-
- if (rowRange == :: &&
- colRange == ::) return m
-
- val rr = if (rowRange == ::) 0 until m.nrow
- else rowRange
- val cr = if (colRange == ::) 0 until m.ncol
- else colRange
-
- m.viewPart(rr.start, rr.length, cr.start, cr.length)
-
- }
-
- def apply(row: Int, colRange: Range): Vector = {
- var r = m.viewRow(row)
- if (colRange != ::) r = r.viewPart(colRange.start, colRange.length)
- r
- }
-
- def apply(rowRange: Range, col: Int): Vector = {
- var c = m.viewColumn(col)
- if (rowRange != ::) c = c.viewPart(rowRange.start, rowRange.length)
- c
- }
-
- /**
- * Apply a function element-wise without side-effects to the argument (creates a new matrix).
- *
- * @param f element-wise function "value" ⇒ "new value"
- * @param evalZeros Do we have to process zero elements? true, false, auto: if auto, we will test
- * the supplied function for `f(0) != 0`, and depending on the result, will
- * decide if we want evaluation for zero elements. WARNING: the AUTO setting
- * may not always work correctly for functions that are meant to run in a specific
- * backend context, or non-deterministic functions, such as {-1,0,1} random
- * generators.
- * @return a new matrix with the element-wise function applied.
- */
- def apply(f: Double ⇒ Double, evalZeros: AutoBooleanEnum.T): Matrix = {
- val ezeros = evalZeros match {
- case AutoBooleanEnum.TRUE ⇒ true
- case AutoBooleanEnum.FALSE ⇒ false
- case AutoBooleanEnum.AUTO ⇒ f(0) != 0
- }
- if (ezeros) m.cloned := f else m.cloned ::= f
- }
-
- /**
- * Apply a function element-wise without side-effects to the argument (creates a new matrix).
- *
- * @param f element-wise function (row, column, value) ⇒ "new value"
- * @param evalZeros Do we have to process zero elements? true, false, auto: if auto, we will test
- * the supplied function for `f(0) != 0`, and depending on the result, will
- * decide if we want evaluation for zero elements. WARNING: the AUTO setting
- * may not always work correctly for functions that are meant to run in a specific
- * backend context, or non-deterministic functions, such as {-1,0,1} random
- * generators.
- * @return a new matrix with the element-wise function applied.
- */
- def apply(f: (Int, Int, Double) ⇒ Double, evalZeros: AutoBooleanEnum.T): Matrix = {
- val ezeros = evalZeros match {
- case AutoBooleanEnum.TRUE ⇒ true
- case AutoBooleanEnum.FALSE ⇒ false
- case AutoBooleanEnum.AUTO ⇒ f(0,0,0) != 0
- }
- if (ezeros) m.cloned := f else m.cloned ::= f
- }
-
- /** A version of function apply with default AUTO treatment of `evalZeros`. */
- def apply(f: Double ⇒ Double): Matrix = apply(f, AutoBooleanEnum.AUTO)
-
- /** A version of function apply with default AUTO treatment of `evalZeros`. */
- def apply(f: (Int, Int, Double) ⇒ Double): Matrix = apply(f, AutoBooleanEnum.AUTO)
-
-
- /**
- * Warning: This provides read-only view only.
- * In most cases that's what one wants. To get a copy,
- * use <code>m.t cloned</code>
- *
- * @return transposed view
- */
- def t = Matrices.transposedView(m)
-
- def det = m.determinant()
-
- def sum = m.zSum()
-
- def :=(that: Matrix) = m.assign(that)
-
- /**
- * Assigning from a row-wise collection of vectors
- *
- * @param that a row-wise collection of vectors to assign from
- */
- def :=(that: TraversableOnce[Vector]) = {
- var row = 0
- that.foreach(v => {
- m.assignRow(row, v)
- row += 1
- })
- }
-
- def :=(that: Double) = m.assign(that)
-
- def :=(f: (Int, Int, Double) => Double): Matrix = {
- import RLikeOps._
- m.getFlavor.getStructure match {
- case TraversingStructureEnum.COLWISE | TraversingStructureEnum.SPARSECOLWISE =>
- for (col <- t; el <- col.all) el := f(el.index, col.index, el)
- case default =>
- for (row <- m; el <- row.all) el := f(row.index, el.index, el)
- }
- m
- }
-
- /** Functional assign with (Double) => Double */
- def :=(f: (Double) => Double): Matrix = {
- import RLikeOps._
- m.getFlavor.getStructure match {
- case TraversingStructureEnum.COLWISE | TraversingStructureEnum.SPARSECOLWISE =>
- for (col <- t; el <- col.all) el := f(el)
- case default =>
- for (row <- m; el <- row.all) el := f(el)
- }
- m
- }
-
- /** Sparse assign: iterate and assign over non-zeros only */
- def ::=(f: (Int, Int, Double) => Double): Matrix = {
-
- import RLikeOps._
-
- m.getFlavor.getStructure match {
- case TraversingStructureEnum.COLWISE | TraversingStructureEnum.SPARSECOLWISE =>
- for (col <- t; el <- col.nonZeroes) el := f(el.index, col.index, el)
- case default =>
- for (row <- m; el <- row.nonZeroes) el := f(row.index, el.index, el)
- }
- m
- }
-
- /** Sparse function assign: iterate and assign over non-zeros only */
- def ::=(f: (Double) => Double): Matrix = {
-
- import RLikeOps._
-
- m.getFlavor.getStructure match {
- case TraversingStructureEnum.COLWISE | TraversingStructureEnum.SPARSECOLWISE =>
- for (col <- t; el <- col.nonZeroes) el := f(el)
- case default =>
- for (row <- m; el <- row.nonZeroes) el := f(el)
- }
- m
- }
-
- def cloned: Matrix = m.like := m
-
- /**
- * Ideally, we would probably want to override equals(). But that is not
- * possible without modifying the AbstractMatrix implementation in Mahout,
- * which would require discussion with the Mahout team.
- *
- * @param that
- * @return
- */
- def equiv(that: Matrix) =
-
- // Warning: TODO: This would actually create empty objects in SparseMatrix. Should really implement
- // merge-type comparison strategy using iterateNonEmpty.
- that != null &&
- nrow == that.nrow &&
- m.view.zip(that).forall(t => {
- t._1.equiv(t._2)
- })
-
- def nequiv(that: Matrix) = !equiv(that)
-
- def ===(that: Matrix) = equiv(that)
-
- def !==(that: Matrix) = nequiv(that)
-
- /**
- * test if rank == min(nrow,ncol).
- *
- * @return
- */
- def isFullRank: Boolean =
- new QRDecomposition(if (nrow < ncol) m t else m cloned).hasFullRank
-
- def colSums() = m.aggregateColumns(vectorSumFunc)
-
- def rowSums() = m.aggregateRows(vectorSumFunc)
-
- def colMeans() = if (m.nrow == 0) colSums() else colSums() /= m.nrow
-
- def rowMeans() = if (m.ncol == 0) rowSums() else rowSums() /= m.ncol
-
- /* Diagonal */
- def diagv: Vector = m.viewDiagonal()
-
- /* Diagonal assignment */
- def diagv_=(that: Vector) = diagv := that
-
- /* Diagonal assignment */
- def diagv_=(that: Double) = diagv := that
-
- /* Row and Column non-zero element counts */
- def numNonZeroElementsPerColumn() = m.aggregateColumns(vectorCountNonZeroElementsFunc)
-
- def numNonZeroElementsPerRow() = m.aggregateRows(vectorCountNonZeroElementsFunc)
-}
-
-object MatrixOps {
-
- import RLikeOps.v2vOps
-
- implicit def m2ops(m: Matrix): MatrixOps = new MatrixOps(m)
-
- private def vectorSumFunc = new VectorFunction {
- def apply(f: Vector): Double = f.sum
- }
-
- private def vectorCountNonZeroElementsFunc = new VectorFunction {
- //def apply(f: Vector): Double = f.aggregate(Functions.PLUS, Functions.notEqual(0))
- def apply(f: Vector): Double = f.getNumNonZeroElements().toDouble
- }
-
-}
\ No newline at end of file
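
A short sketch of the slicing and aggregation sugar defined in this file, using illustrative values (RLikeOps, later in this commit, pulls these methods in through RLikeMatrixOps):

import org.apache.mahout.math.scalabindings._
import RLikeOps._

object MatrixOpsSketch extends App {
  val m = dense((1, 2, 3), (4, 5, 6))
  m(0, ::) := 0.0          // ranged update: zero out the first row
  println(m.colSums())     // (4.0, 5.0, 6.0) after the update
  println(m.t.nrow)        // 3 -- t is a transposed view, not a copy
}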

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeDoubleScalarOps.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeDoubleScalarOps.scala b/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeDoubleScalarOps.scala
deleted file mode 100644
index a1e9377..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeDoubleScalarOps.scala
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.scalabindings
-
-import org.apache.mahout.math._
-
-class RLikeDoubleScalarOps(val x:Double) extends AnyVal{
-
- import RLikeOps._
-
- def +(that:Matrix) = that + x
-
- def +(that:Vector) = that + x
-
- def *(that:Matrix) = that * x
-
- def *(that:Vector) = that * x
-
- def -(that:Matrix) = x -: that
-
- def -(that:Vector) = x -: that
-
- def /(that:Matrix) = x /: that
-
- def /(that:Vector) = x /: that
-
- def cbind(that:Matrix) = {
- val mx = that.like(that.nrow, that.ncol + 1)
- mx(::, 1 until mx.ncol) := that
- if (x != 0.0) mx(::, 0) := x
- mx
- }
-
- def rbind(that: Matrix) = {
- val mx = that.like(that.nrow + 1, that.ncol)
- mx(1 until mx.nrow, ::) := that
- if (x != 0.0) mx(0, ::) := x
- mx
- }
-
- def c(that: Vector): Vector = {
- val cv = that.like(that.length + 1)
- cv(1 until cv.length) := that
- cv(0) = x
- cv
- }
-
-}
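
The point of RLikeDoubleScalarOps is scalar-on-the-left arithmetic, which plain Double does not support against Mahout vectors and matrices. A sketch with illustrative values:

import org.apache.mahout.math.scalabindings._
import RLikeOps._

object ScalarOpsSketch extends App {
  val v = dvec(1, 2, 4)
  println(1.0 / v)   // elementwise reciprocal: (1.0, 0.5, 0.25)
  println(5.0 - v)   // (4.0, 3.0, 1.0)
  println(2.0 * v)   // (2.0, 4.0, 8.0)
}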

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeMatrixOps.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeMatrixOps.scala b/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeMatrixOps.scala
deleted file mode 100644
index 3ba6ce0..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeMatrixOps.scala
+++ /dev/null
@@ -1,172 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.mahout.math.scalabindings
-
-import org.apache.mahout.math.function.Functions
-import org.apache.mahout.math.{Matrix, Vector}
-
-import scala.collection.JavaConversions._
-import RLikeOps._
-import org.apache.mahout.math.backend.RootSolverFactory
-import org.apache.mahout.math.scalabindings._
-
-
-class RLikeMatrixOps(m: Matrix) extends MatrixOps(m) {
-
- /** Structure-optimized mmul */
-
- implicit var solverOperator: opMMulSolver = _
-
- // get the solver matching the implicit variable solverOperator
- def mmulSolver = RootSolverFactory.getOperator
-
- def %*%(that: Matrix) = mmulSolver(m, that, None)
-
- def :%*%(that:Matrix) = %*%(that)
-
- def %*%:(that: Matrix) = that :%*% m
-
- /**
- * The "legacy" matrix-matrix multiplication.
- *
- * @param that right hand operand
- * @return matrix multiplication result
- * @deprecated use %*%
- */
- def %***%(that: Matrix) = m.times(that)
-
- /**
- * matrix-vector multiplication
- * @param that
- * @return
- */
- def %*%(that: Vector) = m.times(that)
-
- /**
- * Hadamard product
- *
- * @param that
- * @return
- */
-
- def *(that: Matrix) = cloned *= that
-
- def *(that: Double) = cloned *= that
-
- def *:(that:Double) = cloned *= that
-
- def /(that: Matrix) = cloned /= that
-
- def /:(that: Matrix) = that / m
-
- def /(that: Double) = cloned /= that
-
- /** 1.0 /: A is equivalent to R's 1.0/A */
- def /:(that: Double) = that /=: cloned
-
- /**
- * In-place Hadamard product. We probably don't want to use assign here:
- * in the case of the Hadamard product, optimizing for sparse operations
- * really can be done.
- * @param that
- */
- def *=(that: Matrix) = {
- m.assign(that, Functions.MULT)
- m
- }
-
- /** A *=: B is equivalent to B *= A. Included for completeness. */
- def *=:(that: Matrix) = m *= that
-
- /** Elementwise division */
- def /=(that: Matrix) = {
- m.zip(that).foreach(t ⇒ t._1.vector() /= t._2.vector)
- m
- }
-
- def *=(that: Double) = {
- m.foreach(_.vector() *= that)
- m
- }
-
- /** 5.0 *=: A is equivalent to A *= 5.0. Included for completeness. */
- def *=:(that: Double) = m *= that
-
- def /=(that: Double) = {
- m ::= { x ⇒ x / that }
- m
- }
-
- /** 1.0 /=: A is equivalent to A = 1.0/A in R */
- def /=:(that: Double) = {
- if (that != 0.0) m := { x ⇒ that / x }
- m
- }
-
- def ^=(that: Double) = {
- that match {
- // Special handling of x ^ 2 and x ^ 0.5: we want consistent handling of x ^ 2 and x * x, since
- // the pow(x, 2) function returns results different from x * x, but much of the code uses the
- // two interchangeably. Without this, things like NaN entries can appear on the main diagonal
- // of a distance matrix.
- case 2.0 ⇒ m ::= { x ⇒ x * x }
- case 0.5 ⇒ m ::= math.sqrt _
- case _ ⇒ m ::= { x ⇒ math.pow(x, that) }
- }
- }
-
- def ^(that: Double) = m.cloned ^= that
-
- def cbind(that: Matrix): Matrix = {
- require(m.nrow == that.nrow)
- if (m.ncol > 0) {
- if (that.ncol > 0) {
- val mx = m.like(m.nrow, m.ncol + that.ncol)
- mx(::, 0 until m.ncol) := m
- mx(::, m.ncol until mx.ncol) := that
- mx
- } else m
- } else that
- }
-
- def cbind(that: Double): Matrix = {
- val mx = m.like(m.nrow, m.ncol + 1)
- mx(::, 0 until m.ncol) := m
- if (that != 0.0) mx(::, m.ncol) := that
- mx
- }
-
- def rbind(that: Matrix): Matrix = {
- require(m.ncol == that.ncol)
- if (m.nrow > 0) {
- if (that.nrow > 0) {
- val mx = m.like(m.nrow + that.nrow, m.ncol)
- mx(0 until m.nrow, ::) := m
- mx(m.nrow until mx.nrow, ::) := that
- mx
- } else m
- } else that
- }
-
- def rbind(that: Double): Matrix = {
- val mx = m.like(m.nrow + 1, m.ncol)
- mx(0 until m.nrow, ::) := m
- if (that != 0.0) mx(m.nrow, ::) := that
- mx
- }
-}
-
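
A sketch of the R-like matrix operators above, with illustrative values; note that %*% resolves its multiplication kernel through RootSolverFactory at runtime, so this assumes a working solver on the classpath:

import org.apache.mahout.math.scalabindings._
import RLikeOps._

object RLikeMatrixSketch extends App {
  val a = dense((1, 2), (3, 4))
  val b = dense((0, 1), (1, 0))
  println(a %*% b)    // matrix product: ((2, 1), (4, 3))
  println(a * b)      // Hadamard product: ((0, 2), (3, 0))
  println(a ^ 2)      // elementwise square, special-cased to x * x
  println(a cbind b)  // 2 x 4 column-bound matrix
}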

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeOps.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeOps.scala b/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeOps.scala
deleted file mode 100644
index a6f9f5b..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeOps.scala
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.scalabindings
-
-import org.apache.mahout.math.{Vector, Matrix}
-
-/**
- * R-like operators. Declare <code>import RLikeOps._</code> to enable.
- */
-object RLikeOps {
-
- implicit def double2Scalar(x:Double) = new RLikeDoubleScalarOps(x)
-
- implicit def v2vOps(v: Vector) = new RLikeVectorOps(v)
-
- implicit def el2elOps(el: Vector.Element) = new ElementOps(el)
-
- implicit def el2Double(el:Vector.Element) = el.get()
-
- implicit def m2mOps(m: Matrix) = new RLikeMatrixOps(m)
-
-
-}
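
Beyond the operator wrappers, RLikeOps also lets a Vector.Element stand in for its double value via el2Double. An illustrative sketch:

import scala.collection.JavaConversions._
import org.apache.mahout.math.scalabindings._
import RLikeOps._

object ElementSketch extends App {
  val v = svec((0, 1.0) :: (2, 5.0) :: Nil, cardinality = 4)
  var s = 0.0
  for (el <- v.nonZeroes) s += el   // el is implicitly read as el.get
  println(s)                        // 6.0
}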

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeVectorOps.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeVectorOps.scala b/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeVectorOps.scala
deleted file mode 100644
index 394795f..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeVectorOps.scala
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.scalabindings
-
-import org.apache.mahout.math.Vector
-import org.apache.mahout.math.function.Functions
-import RLikeOps._
-
-/**
- * R-like operators
- *
- * @param _v
- */
-class RLikeVectorOps(_v: Vector) extends VectorOps(_v) {
-
- /** Elementwise *= */
- def *=(that: Vector) = v.assign(that, Functions.MULT)
-
- def *=:(that:Vector) = *=(that)
-
- /** Elementwise /= */
- def /=(that: Vector) = v.assign(that, Functions.DIV)
-
- /** Elementwise *= */
- def *=(that: Double) = v.assign(Functions.MULT, that)
-
- def *=:(that: Double) = *=(that)
-
- /** Elementwise /= */
- def /=(that: Double) = v.assign(Functions.DIV, that)
-
- /** Elementwise right-associative /= */
- def /=:(that: Double) = v.assign(Functions.INV).assign(Functions.MULT, that)
-
- /** Elementwise right-associative /= */
- def /=:(that: Vector) = v.assign(Functions.INV).assign(that, Functions.MULT)
-
- /** Elementwise * */
- def *(that: Vector) = cloned *= that
-
- /** Elementwise * */
- def *(that: Double) = cloned *= that
-
- /** Elementwise * */
- def *:(that: Double) = cloned *= that
-
- /** Elementwise / */
- def /(that: Vector) = cloned /= that
-
- /** Elementwise / */
- def /(that: Double) = cloned /= that
-
- /** Elementwise right-associative / */
- def /:(that: Double) = that /=: v.cloned
-
- /** Elementwise right-associative / */
- def /:(that: Vector) = that.cloned /= v
-
- def ^=(that: Double) = that match {
- // Special handling of x ^ 2 and x ^ 0.5: we want consistent handling of x ^ 2 and x * x, since
- // the pow(x, 2) function returns results different from x * x, but much of the code uses the
- // two interchangeably. Without this, things like NaN entries can appear on the main diagonal
- // of a distance matrix.
- case 2.0 ⇒ v.assign(Functions.SQUARE)
- case 0.5 ⇒ v.assign(Functions.SQRT)
- case _ ⇒ v.assign (Functions.POW, that)
- }
-
- def ^=(that: Vector) = v.assign(that, Functions.POW)
-
- def ^(that: Double) = v.cloned ^= that
-
- def ^(that: Vector) = v.cloned ^= that
-
- def c(that: Vector) = {
- if (v.length > 0) {
- if (that.length > 0) {
- val cv = v.like(v.length + that.length)
- cv(0 until v.length) := v
- cv(v.length until cv.length) := that
- cv
- } else v
- } else that
- }
-
- def c(that: Double) = {
- val cv = v.like(v.length + 1)
- cv(0 until v.length) := v
- cv(v.length) = that
- cv
- }
-
- def mean = sum / length
-
-}
\ No newline at end of file
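
A sketch of the vector operators above with illustrative values, including the special-cased power operator and R-style concatenation:

import org.apache.mahout.math.scalabindings._
import RLikeOps._

object RLikeVectorSketch extends App {
  val v = dvec(1, 2, 3)
  val w = dvec(4, 5)
  println(v * v)      // elementwise: (1.0, 4.0, 9.0)
  println(v ^ 2.0)    // same result; 2.0 is special-cased to SQUARE
  println(v c w)      // concatenation: (1.0, 2.0, 3.0, 4.0, 5.0)
  println(v.mean)     // 2.0
}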

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/VectorOps.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/VectorOps.scala b/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/VectorOps.scala
deleted file mode 100644
index 30311b8..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/VectorOps.scala
+++ /dev/null
@@ -1,174 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.scalabindings
-
-import org.apache.mahout.math._
-import scala.collection.JavaConversions._
-import org.apache.mahout.math.function.Functions
-
-/**
- * Syntactic sugar for mahout vectors
- * @param v Mahout vector
- */
-class VectorOps(private[scalabindings] val v: Vector) {
-
- import RLikeOps._
-
- def apply(i: Int) = v.get(i)
-
- def update(i: Int, that: Double) = v.setQuick(i, that)
-
- /** Warning: we only support consecutive views, step is not supported directly */
- def apply(r: Range) = if (r == ::) v else v.viewPart(r.start, r.length * r.step)
-
- def update(r: Range, that: Vector) = apply(r) := that
-
- /** R-like synonyms for java methods on vectors */
- def sum = v.zSum()
-
- def min = v.minValue()
-
- def max = v.maxValue()
-
- def :=(that: Vector): Vector = {
-
- // The assign op in Mahout requires the same
- // cardinality between vectors. We want to relax
- // that here and only require v to have _at least_
- // as large a cardinality as "that".
- if (that.length == v.size())
- v.assign(that)
- else if (that.length < v.size) {
- v.assign(0.0)
- that.nonZeroes().foreach(t => v.setQuick(t.index, t.get))
- v
- } else throw new IllegalArgumentException("Assigner's cardinality less than assignee's")
- }
-
- def :=(that: Double): Vector = v.assign(that)
-
- /** Functional assignment for a function with index and x */
- def :=(f: (Int, Double) => Double): Vector = {
- for (i <- 0 until length) v(i) = f(i, v(i))
- v
- }
-
- /** Functional assignment for a function with just x (e.g. v := math.exp _) */
- def :=(f:(Double)=>Double):Vector = {
- for (i <- 0 until length) v(i) = f(v(i))
- v
- }
-
- /** Sparse iteration functional assignment using function receiving index and x */
- def ::=(f: (Int, Double) => Double): Vector = {
- for (el <- v.nonZeroes) el := f(el.index, el.get)
- v
- }
-
- /** Sparse iteration functional assignment using a function receiving just x */
- def ::=(f: (Double) => Double): Vector = {
- for (el <- v.nonZeroes) el := f(el.get)
- v
- }
-
- def equiv(that: Vector) =
- length == that.length &&
- v.all.view.zip(that.all).forall(t => t._1.get == t._2.get)
-
- def ===(that: Vector) = equiv(that)
-
- def !==(that: Vector) = nequiv(that)
-
- def nequiv(that: Vector) = !equiv(that)
-
- def unary_- = cloned.assign(Functions.NEGATE)
-
- def +=(that: Vector) = v.assign(that, Functions.PLUS)
-
- def +=:(that: Vector) = +=(that)
-
- def -=(that: Vector) = v.assign(that, Functions.MINUS)
-
- def +=(that: Double) = v.assign(Functions.PLUS, that)
-
- def +=:(that: Double) = +=(that)
-
- def -=(that: Double) = +=(-that)
-
- def -=:(that: Vector) = v.assign(Functions.NEGATE).assign(that, Functions.PLUS)
-
- def -=:(that: Double) = v.assign(Functions.NEGATE).assign(Functions.PLUS, that)
-
- def +(that: Vector) = cloned += that
-
- def -(that: Vector) = cloned -= that
-
- def -:(that: Vector) = that.cloned -= v
-
- def +(that: Double) = cloned += that
-
- def +:(that: Double) = cloned += that
-
- def -(that: Double) = cloned -= that
-
- def -:(that: Double) = that -=: v.cloned
-
- def length = v.size()
-
- def cloned: Vector = v.like := v
-
- def sqrt = v.cloned.assign(Functions.SQRT)
-
- /** Convert to a single column matrix */
- def toColMatrix: Matrix = {
- import RLikeOps._
- v match {
-
- case vd: Vector if vd.isDense => dense(vd).t
- case srsv: RandomAccessSparseVector => new SparseColumnMatrix(srsv.length, 1, Array(srsv))
- case _ => sparse(v).t
- }
- }
-
-}
-
-class ElementOps(private[scalabindings] val el: Vector.Element) {
- import RLikeOps._
-
- def update(v: Double): Double = { el.set(v); v }
-
- def :=(that: Double) = update(that)
-
- def *(that: Vector.Element): Double = el.get * that.get
-
- def *(that: Vector): Vector = el.get * that
-
- def +(that: Vector.Element): Double = el.get + that.get
-
- def +(that: Vector) :Vector = el.get + that
-
- def /(that: Vector.Element): Double = el.get / that.get
-
- def /(that:Vector):Vector = el.get / that
-
- def -(that: Vector.Element): Double = el.get - that.get
-
- def -(that: Vector) :Vector = el.get - that
-
-}
\ No newline at end of file
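
The two assignment flavors above differ in what they visit: ':=' touches every cell, '::=' only the non-zeros. A sketch with illustrative values:

import org.apache.mahout.math.scalabindings._
import RLikeOps._

object VectorAssignSketch extends App {
  val v = svec((0, 2.0) :: (3, 4.0) :: Nil, cardinality = 5)
  v ::= { x => x * 10 }      // sparse assignment: non-zeros only -> (20, 0, 0, 40, 0)
  v := { (i, x) => x + i }   // dense functional assignment over every cell
  println(v)                 // (20.0, 1.0, 2.0, 43.0, 4.0)
}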

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/package.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/package.scala b/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/package.scala
deleted file mode 100644
index 4115091..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/package.scala
+++ /dev/null
@@ -1,477 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math
-
-import org.apache.mahout.math.solver.EigenDecomposition
-
-import collection._
-import scala.util.Random
-
-/**
- * Mahout matrices and vectors' scala syntactic sugar
- */
-package object scalabindings {
-
-
- // Reserved "ALL" range
- final val `::`: Range = null
-
- // values for stochastic sparsityAnalysis
- final val z95 = 1.959964
- final val z80 = 1.281552
- final val maxSamples = 500
- final val minSamples = 15
-
- // Some enums
- object AutoBooleanEnum extends Enumeration {
- type T = Value
- val TRUE, FALSE, AUTO = Value
- }
-
- implicit def seq2Vector(s: TraversableOnce[AnyVal]) =
- new DenseVector(s.map(_.asInstanceOf[Number].doubleValue()).toArray)
-
- implicit def tuple2TravOnce2svec[V <: AnyVal](sdata: TraversableOnce[(Int, V)]) = svec(sdata)
-
- implicit def t1vec(s: Tuple1[AnyVal]): Vector = prod2Vec(s)
-
- implicit def t2vec(s: Tuple2[AnyVal, AnyVal]): Vector = prod2Vec(s)
-
- implicit def t3vec(s: Tuple3[AnyVal, AnyVal, AnyVal]): Vector = prod2Vec(s)
-
- implicit def t4vec(s: Tuple4[AnyVal, AnyVal, AnyVal, AnyVal]): Vector = prod2Vec(s)
-
- implicit def t5vec(s: Tuple5[AnyVal, AnyVal, AnyVal, AnyVal, AnyVal]): Vector = prod2Vec(s)
-
- implicit def t6vec(s: Tuple6[AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal]): Vector = prod2Vec(s)
-
- implicit def t7vec(s: Tuple7[AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal]): Vector = prod2Vec(s)
-
- implicit def t8vec(s: Tuple8[AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal]): Vector = prod2Vec(s)
-
- implicit def t9vec(s: Tuple9[AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal]): Vector =
- prod2Vec(s)
-
- implicit def t10vec(s: Tuple10[AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal])
- : Vector = prod2Vec(s)
-
- implicit def t11vec(s: Tuple11[AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal
- , AnyVal])
- : Vector = prod2Vec(s)
-
- implicit def t12vec(s: Tuple12[AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal
- , AnyVal, AnyVal])
- : Vector = prod2Vec(s)
-
- implicit def t13vec(s: Tuple13[AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal
- , AnyVal, AnyVal, AnyVal])
- : Vector = prod2Vec(s)
-
- implicit def t14vec(s: Tuple14[AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal
- , AnyVal, AnyVal, AnyVal, AnyVal])
- : Vector = prod2Vec(s)
-
- implicit def t15vec(s: Tuple15[AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal
- , AnyVal, AnyVal, AnyVal, AnyVal, AnyVal])
- : Vector = prod2Vec(s)
-
- implicit def t16vec(s: Tuple16[AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal
- , AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal])
- : Vector = prod2Vec(s)
-
- implicit def t17vec(s: Tuple17[AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal
- , AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal])
- : Vector = prod2Vec(s)
-
- implicit def t18vec(s: Tuple18[AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal
- , AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal])
- : Vector = prod2Vec(s)
-
- implicit def t19vec(s: Tuple19[AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal
- , AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal])
- : Vector = prod2Vec(s)
-
- implicit def t20vec(s: Tuple20[AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal
- , AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal])
- : Vector = prod2Vec(s)
-
- implicit def t21vec(s: Tuple21[AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal
- , AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal])
- : Vector = prod2Vec(s)
-
- implicit def t22vec(s: Tuple22[AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal
- , AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal, AnyVal])
- : Vector = prod2Vec(s)
-
-
- def prod2Vec(s: Product) = new DenseVector(s.productIterator.
- map(_.asInstanceOf[Number].doubleValue()).toArray)
-
- def diagv(v: Vector): DiagonalMatrix = new DiagonalMatrix(v)
-
- def diag(v: Double, size: Int): DiagonalMatrix =
- new DiagonalMatrix(new DenseVector(Array.fill(size)(v)))
-
- def eye(size: Int) = new DiagonalMatrix(1.0, size)
-
- /**
- * Create dense matrix out of inline arguments -- rows -- which can be tuples,
- * iterables of Double, or just a single Number (for columnar vectors).
- * @param rows
- * @tparam R
- * @return
- */
- def dense[R](rows: R*): DenseMatrix = {
- import RLikeOps._
- val data = for (r ← rows) yield {
- r match {
- case n: Number ⇒ Array(n.doubleValue())
- case t: Vector ⇒ Array.tabulate(t.length)(t(_))
- case t: Array[Double] ⇒ t
- case t: Iterable[_] ⇒
- t.head match {
- case ss: Double ⇒ t.asInstanceOf[Iterable[Double]].toArray
- case vv: Vector ⇒
- val m = new DenseMatrix(t.size, t.head.asInstanceOf[Vector].length)
- t.asInstanceOf[Iterable[Vector]].view.zipWithIndex.foreach {
- case (v, idx) ⇒ m(idx, ::) := v
- }
- return m
- }
- case t: Product ⇒ t.productIterator.map(_.asInstanceOf[Number].doubleValue()).toArray
- case t: Array[Array[Double]] ⇒ if (rows.size == 1)
- return new DenseMatrix(t)
- else
- throw new IllegalArgumentException(
- "double[][] data parameter can be the only argument for dense()")
- case t: Array[Vector] ⇒
- val m = new DenseMatrix(t.size, t.head.length)
- t.view.zipWithIndex.foreach {
- case (v, idx) ⇒ m(idx, ::) := v
- }
- return m
- case _ ⇒ throw new IllegalArgumentException("unsupported type in the inline Matrix initializer")
- }
- }
- new DenseMatrix(data.toArray)
- }
-
- /**
- * Default initializers are always row-wise.
- * Create a sparse matrix,
- * e.g. {{{
- *
- * m = sparse(
- * (0,5)::(9,3)::Nil,
- * (2,3.5)::(7,8)::Nil
- * )
- *
- * }}}
- *
- * @param rows
- * @return
- */
-
- def sparse(rows: Vector*): SparseRowMatrix = {
- import RLikeOps._
- val nrow = rows.size
- val ncol = rows.map(_.size()).max
- val m = new SparseRowMatrix(nrow, ncol)
- m := rows.map { row ⇒
- if (row.length < ncol) {
- val newRow = row.like(ncol)
- newRow(0 until row.length) := row
- newRow
- }
- else row
- }
- m
-
- }
-
- /**
- * Create a sparse vector out of a list of tuple2's.
- * @param sdata (index, value) pairs
- * @param cardinality vector cardinality; -1 (the default) means infer it from the data
- * @return
- */
- def svec(sdata: TraversableOnce[(Int, AnyVal)], cardinality: Int = -1) = {
- val required = if (sdata.nonEmpty) sdata.map(_._1).max + 1 else 0
- var tmp = -1
- if (cardinality < 0) {
- tmp = required
- } else if (cardinality < required) {
- throw new IllegalArgumentException(s"Required cardinality %required but got %cardinality")
- } else {
- tmp = cardinality
- }
- val initialCapacity = sdata.size
- val sv = new RandomAccessSparseVector(tmp, initialCapacity)
- sdata.foreach(t ⇒ sv.setQuick(t._1, t._2.asInstanceOf[Number].doubleValue()))
- sv
- }
-
- def dvec(fromV: Vector) = new DenseVector(fromV)
-
- def dvec(ddata: TraversableOnce[Double]) = new DenseVector(ddata.toArray)
-
- def dvec(numbers: Number*) = new DenseVector(numbers.map(_.doubleValue()).toArray)
-
- def chol(m: Matrix, pivoting: Boolean = false) = new CholeskyDecomposition(m, pivoting)
-
- /**
- * computes SVD
- * @param m svd input
- * @return (U,V, singular-values-vector)
- */
- def svd(m: Matrix) = {
- val svdObj = new SingularValueDecomposition(m)
- (svdObj.getU, svdObj.getV, new DenseVector(svdObj.getSingularValues))
- }
-
- /**
- * Computes Eigendecomposition of a symmetric matrix
- * @param m symmetric input matrix
- * @return (V, eigen-values-vector)
- */
- def eigen(m: Matrix) = {
- val ed = new EigenDecomposition(m, true)
- (ed.getV, ed.getRealEigenvalues)
- }
-
-
- /**
- * More general version of eigen decomposition
- * @param m
- * @param symmetric
- * @return (V, eigenvalues-real-vector, eigenvalues-imaginary-vector)
- */
- def eigenFull(m: Matrix, symmetric: Boolean = true) = {
- val ed = new EigenDecomposition(m, symmetric)
- (ed.getV, ed.getRealEigenvalues, ed.getImagEigenvalues)
- }
-
- /**
- * QR.
- *
- * Right now Mahout's QR seems to use its argument for in-place transformations,
- * so the matrix content gets messed up afterwards. Hence we force cloning of the
- * argument before passing it to Mahout's QR, to keep the expected semantics.
- * @param m
- * @return (Q,R)
- */
- def qr(m: Matrix) = {
- import MatrixOps._
- val qrdec = new QRDecomposition(m cloned)
- (qrdec.getQ, qrdec.getR)
- }
-
- /**
- * Solution <tt>X</tt> of <tt>A*X = B</tt> using QR-Decomposition, where <tt>A</tt> is a square, non-singular matrix.
- *
- * @param a
- * @param b
- * @return (X)
- */
- def solve(a: Matrix, b: Matrix): Matrix = {
- import MatrixOps._
- if (a.nrow != a.ncol) {
- throw new IllegalArgumentException("supplied matrix A is not square")
- }
- val qr = new QRDecomposition(a cloned)
- if (!qr.hasFullRank) {
- throw new IllegalArgumentException("supplied matrix A is singular")
- }
- qr.solve(b)
- }
-
- /**
- * Solution <tt>A^{-1}</tt> of <tt>A*A^{-1} = I</tt> using QR-Decomposition, where <tt>A</tt> is a square,
- * non-singular matrix. Here only for compatibility with R semantics.
- *
- * @param a
- * @return (A^{-1})
- */
- def solve(a: Matrix): Matrix = {
- import MatrixOps._
- solve(a, eye(a.nrow))
- }
-
- /**
- * Solution <tt>x</tt> of <tt>A*x = b</tt> using QR-Decomposition, where <tt>A</tt> is a square, non-singular matrix.
- *
- * @param a
- * @param b
- * @return (x)
- */
- def solve(a: Matrix, b: Vector): Vector = {
- import RLikeOps._
- val x = solve(a, b.toColMatrix)
- x(::, 0)
- }
-
- ///////////////////////////////////////////////////////////
- // Elementwise unary functions. These create clones to avoid side effects. For
- // efficiency reasons one may want to do in-place expression assignments instead, e.g.
- //
- // m := exp _
-
- import RLikeOps._
- import scala.math._
-
- def mexp(m: Matrix): Matrix = m.cloned := exp _
-
- def vexp(v: Vector): Vector = v.cloned := exp _
-
- def mlog(m: Matrix): Matrix = m.cloned := log _
-
- def vlog(v: Vector): Vector = v.cloned := log _
-
- def mabs(m: Matrix): Matrix = m.cloned ::= (abs(_: Double))
-
- def vabs(v: Vector): Vector = v.cloned ::= (abs(_: Double))
-
- def msqrt(m: Matrix): Matrix = m.cloned ::= sqrt _
-
- def vsqrt(v: Vector): Vector = v.cloned ::= sqrt _
-
- def msignum(m: Matrix): Matrix = m.cloned ::= (signum(_: Double))
-
- def vsignum(v: Vector): Vector = v.cloned ::= (signum(_: Double))
-
- //////////////////////////////////////////////////////////
- // operation funcs
-
-
- /** Matrix-matrix unary func */
- type MMUnaryFunc = (Matrix, Option[Matrix]) ⇒ Matrix
- /** Binary matrix-matrix operations which may save result in-place, optionally */
- type MMBinaryFunc = (Matrix, Matrix, Option[Matrix]) ⇒ Matrix
- type MVBinaryFunc = (Matrix, Vector, Option[Matrix]) ⇒ Matrix
- type VMBinaryFunc = (Vector, Matrix, Option[Matrix]) ⇒ Matrix
- type MDBinaryFunc = (Matrix, Double, Option[Matrix]) ⇒ Matrix
-
- trait opMMulSolver extends MMBinaryFunc {
-
- }
-
- /////////////////////////////////////
- // Miscellaneous in-core utilities
-
- /**
- * Compute column-wise means and variances.
- *
- * @return colMeans → colVariances
- */
- def colMeanVars(mxA:Matrix): (Vector, Vector) = {
- val mu = mxA.colMeans()
- val variance = (mxA * mxA colMeans) -= mu ^ 2
- mu → variance
- }
-
- /**
- * Compute column-wise means and stdevs.
- * @param mxA input
- * @return colMeans → colStdevs
- */
- def colMeanStdevs(mxA:Matrix) = {
- val (mu, variance) = colMeanVars(mxA)
- mu → (variance ::= math.sqrt _)
- }
-
- /** Compute the squared-distance matrix. We assume data points are row-wise, similar to R's dist(). */
- def sqDist(mxX: Matrix): Matrix = {
-
- val s = mxX ^ 2 rowSums
-
- (mxX %*% mxX.t) := { (r, c, x) ⇒ s(r) + s(c) - 2 * x}
- }
-
- /**
- * Pairwise squared distance computation.
- * @param mxX X, m x d
- * @param mxY Y, n x d
- * @return pairwise squared distances of row-wise data points in X and Y (m x n)
- */
- def sqDist(mxX: Matrix, mxY: Matrix): Matrix = {
-
- val s = mxX ^ 2 rowSums
-
- val t = mxY ^ 2 rowSums
-
- // D = s*1' + 1*t' - 2XY'
- (mxX %*% mxY.t) := { (r, c, d) ⇒ s(r) + t(c) - 2.0 * d}
- }
-
- def dist(mxX: Matrix): Matrix = sqDist(mxX) := sqrt _
-
- def dist(mxX: Matrix, mxY: Matrix): Matrix = sqDist(mxX, mxY) := sqrt _
-
- /**
- * Check the density of an in-core matrix based on supplied criteria.
- * Returns true if we think mx is denser than threshold with at least 80% confidence.
- *
- * @param mx The matrix to check density of.
- * @param threshold the threshold of non-zero elements above which we consider a Matrix Dense
- */
- def densityAnalysis(mx: Matrix, threshold: Double = 0.25): Boolean = {
-
- require(threshold >= 0.0 && threshold <= 1.0)
- var n = minSamples
- var mean = 0.0
- val rnd = new Random()
- val dimm = mx.nrow
- val dimn = mx.ncol
- val pq = threshold * (1 - threshold)
-
- for (s ← 0 until minSamples) {
- if (mx(rnd.nextInt(dimm), rnd.nextInt(dimn)) != 0.0) mean += 1
- }
- mean /= minSamples
- val iv = z80 * math.sqrt(pq / n)
-
- if (mean < threshold - iv) return false // sparse
- else if (mean > threshold + iv) return true // dense
-
- while (n < maxSamples) {
- // Determine the upper bound we may need for n to likely resolve the uncertainty. Here, we use
- // the confidence interval formula, but solved for n.
- val ivNeeded = math.abs(threshold - mean) max 1e-11
-
- val stderr = ivNeeded / z80
- val nNeeded = (math.ceil(pq / (stderr * stderr)).toInt max n min maxSamples) - n
-
- var meanNext = 0.0
- for (s ← 0 until nNeeded) {
- if (mx(rnd.nextInt(dimm), rnd.nextInt(dimn)) != 0.0) meanNext += 1
- }
- mean = (n * mean + meanNext) / (n + nNeeded)
- n += nNeeded
-
- // Are we good now?
- val iv = z80 * math.sqrt(pq / n)
- if (mean < threshold - iv) return false // sparse
- else if (mean > threshold + iv) return true // dense
- }
-
- mean > threshold // if (mean > threshold) dense
-
- }
-
-
-
-}
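
A sketch of the in-core helpers defined in this package object, with illustrative values chosen so the expected outputs are easy to verify by hand:

import org.apache.mahout.math.scalabindings._
import RLikeOps._

object InCoreSketch extends App {
  val a = dense((2, 0), (0, 4))
  println(solve(a, dvec(2, 8)))   // QR-based solve of A * x = b: (1.0, 2.0)

  val x = dense((0, 0), (3, 4))   // two row-wise points, distance 5 apart
  println(sqDist(x))              // squared distances: 0 on diagonal, 25 off
}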

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/nlp/tfidf/TFIDF.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/nlp/tfidf/TFIDF.scala b/math-scala/src/main/scala/org/apache/mahout/nlp/tfidf/TFIDF.scala
deleted file mode 100644
index c75ff20..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/nlp/tfidf/TFIDF.scala
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.nlp.tfidf
-
-trait TermWeight {
-
- /**
- * @param tf term freq
- * @param df doc freq
- * @param length Length of the document
- * @param numDocs the total number of docs
- */
- def calculate(tf: Int, df: Int, length: Int, numDocs: Int): Double
-}
-
-
-class TFIDF extends TermWeight {
-
- /**
- * Calculate TF-IDF weight.
- *
- * Lucene 4.6's DefaultSimilarity TF-IDF calculation uses the formula:
- *
- * sqrt(termFreq) * (log(numDocs / (docFreq + 1)) + 1.0)
- *
- * Note: this is consistent with the MapReduce seq2sparse implementation of TF-IDF weights
- * and is slightly different from Spark MLlib's TF-IDF calculation which is implemented as:
- *
- * termFreq * log((numDocs + 1.0) / (docFreq + 1.0))
- *
- * @param tf term freq
- * @param df doc freq
- * @param length Length of the document - UNUSED
- * @param numDocs the total number of docs
- * @return The TF-IDF weight as calculated by Lucene 4.6's DefaultSimilarity
- */
- def calculate(tf: Int, df: Int, length: Int, numDocs: Int): Double = {
-
- // Lucene 4.6 DefaultSimilarity's TF-IDF is implemented as:
- // sqrt(tf) * (log(numDocs / (df + 1)) + 1)
- math.sqrt(tf) * (math.log(numDocs / (df + 1).toDouble) + 1.0)
- }
-}
-
-class MLlibTFIDF extends TermWeight {
-
- /**
- * Calculate TF-IDF weight with IDF formula used by Spark MLlib's IDF:
- *
- * termFreq * log((numDocs + 1.0) / (docFreq + 1.0))
- *
- * Use this weight if working with MLLib vectorized documents.
- *
- * Note: this is not consistent with the MapReduce seq2sparse implementation of TF-IDF weights
- * which is implemented using Lucene DefaultSimilarity's TF-IDF calculation:
- *
- * sqrt(termFreq) * (log(numDocs / (docFreq + 1)) + 1.0)
- *
- * @param tf term freq
- * @param df doc freq
- * @param length Length of the document - UNUSED
- * @param numDocs the total number of docs
- * @return The TF-IDF weight as calculated by Spark MLlib's IDF
- */
- def calculate(tf: Int, df: Int, length: Int, numDocs: Int): Double = {
-
- // Spark MLLib's TF-IDF weight is implemented as:
- // termFreq * log((numDocs + 1.0) / (docFreq + 1.0))
- tf * math.log((numDocs + 1.0) / (df + 1).toDouble)
- }
-}
-
-class TF extends TermWeight {
-
- /**
- * For TF Weight simply return the absolute TF.
- *
- * Note: We do not use Lucene 4.6's DefaultSimilarity's TF calculation here
- * which returns:
- *
- * sqrt(tf)
- *
- * This is consistent with the MapReduce seq2sparse implementation of TF weights.
- *
- * @param tf term freq
- * @param df doc freq - UNUSED
- * @param length Length of the document - UNUSED
- * @param numDocs the total number of docs - UNUSED
- * @return The weight, based on term frequency only (= the tf param)
- */
- def calculate(tf: Int, df: Int = -Int.MaxValue, length: Int = -Int.MaxValue, numDocs: Int = -Int.MaxValue): Double = {
- tf
- }
-}
-
-
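
The two weightings differ as the comments above describe; with tf = 4, df = 9 and a 100-document corpus (illustrative numbers, length is unused):

import org.apache.mahout.nlp.tfidf._

object TfIdfSketch extends App {
  val lucene: TermWeight = new TFIDF
  // sqrt(4) * (ln(100 / 10) + 1) = 2 * (ln 10 + 1) ~= 6.61
  println(lucene.calculate(tf = 4, df = 9, length = 0, numDocs = 100))

  val mllib: TermWeight = new MLlibTFIDF
  // 4 * ln(101 / 10) ~= 9.25
  println(mllib.calculate(tf = 4, df = 9, length = 0, numDocs = 100))
}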

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/util/IOUtilsScala.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/util/IOUtilsScala.scala b/math-scala/src/main/scala/org/apache/mahout/util/IOUtilsScala.scala
deleted file mode 100644
index b61bea4..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/util/IOUtilsScala.scala
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.util
-
-import org.apache.mahout.logging._
-import collection._
-import java.io.Closeable
-
-object IOUtilsScala {
-
- private final implicit val log = getLog(IOUtilsScala.getClass)
-
- /**
- * Try to close every resource in the sequence, in order.
- *
- * Report all encountered exceptions to the log.
- *
- * Rethrow only the last exception (if any).
- * @param closeables
- */
- def close(closeables: Seq[Closeable]) = {
-
- var lastThr: Option[Throwable] = None
- closeables.foreach { c =>
- try {
- c.close()
- } catch {
- case t: Throwable =>
- error(t.getMessage, t)
- lastThr = Some(t)
- }
- }
-
- // Rethrow most recent close exception (can throw only one)
- lastThr.foreach(throw _)
- }
-
- /**
- * Same as [[IOUtilsScala.close()]] but does not re-throw any exceptions.
- * @param closeables
- */
- def closeQuietly(closeables: Seq[Closeable]) = {
- try {
- close(closeables)
- } catch {
- case t: Throwable => // NOP
- }
- }
-}
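
A sketch of the two close helpers; the in-memory streams are illustrative stand-ins for real resources:

import java.io.{ByteArrayInputStream, Closeable}
import org.apache.mahout.util.IOUtilsScala

object CloseSketch extends App {
  val resources: Seq[Closeable] = Seq(
    new ByteArrayInputStream(Array[Byte](1, 2, 3)),
    new ByteArrayInputStream(Array[Byte](4, 5, 6))
  )
  // close() logs every failure and rethrows only the last one;
  // closeQuietly() logs failures and swallows them all.
  IOUtilsScala.closeQuietly(resources)
}
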
r***@apache.org
2018-06-27 14:51:43 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java-templates/org/apache/mahout/math/map/AbstractKeyTypeValueTypeMap.java.t
----------------------------------------------------------------------
diff --git a/math/src/main/java-templates/org/apache/mahout/math/map/AbstractKeyTypeValueTypeMap.java.t b/math/src/main/java-templates/org/apache/mahout/math/map/AbstractKeyTypeValueTypeMap.java.t
deleted file mode 100644
index 16297cc..0000000
--- a/math/src/main/java-templates/org/apache/mahout/math/map/AbstractKeyTypeValueTypeMap.java.t
+++ /dev/null
@@ -1,509 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*
-Copyright © 1999 CERN - European Organization for Nuclear Research.
-Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
-is hereby granted without fee, provided that the above copyright notice appear in all copies and
-that both that copyright notice and this permission notice appear in supporting documentation.
-CERN makes no representations about the suitability of this software for any purpose.
-It is provided "as is" without expressed or implied warranty.
-*/
-package org.apache.mahout.math.map;
-
-import java.nio.IntBuffer;
-import java.util.Arrays;
-
-import org.apache.mahout.math.Sorting;
-import org.apache.mahout.math.Swapper;
-import org.apache.mahout.math.set.HashUtils;
-import org.apache.mahout.math.function.${keyTypeCap}${valueTypeCap}Procedure;
-import org.apache.mahout.math.function.${keyTypeCap}Procedure;
-import org.apache.mahout.math.list.${keyTypeCap}ArrayList;
-#if (${keyType} != ${valueType})
-import org.apache.mahout.math.list.${valueTypeCap}ArrayList;
-#end
-import org.apache.mahout.math.function.IntComparator;
-#if (${valueTypeFloating} == 'true')
-import org.apache.mahout.math.function.${valueTypeCap}Function;
-#end
-
-import org.apache.mahout.math.set.AbstractSet;
-
-public abstract class Abstract${keyTypeCap}${valueTypeCap}Map extends AbstractSet {
-
- /**
- * Returns <tt>true</tt> if the receiver contains the specified key.
- *
- * @return <tt>true</tt> if the receiver contains the specified key.
- */
- public boolean containsKey(final ${keyType} key) {
- return !forEachKey(
- new ${keyTypeCap}Procedure() {
- @Override
- public boolean apply(${keyType} iterKey) {
- return key != iterKey;
- }
- }
- );
- }
-
- /**
- * Returns <tt>true</tt> if the receiver contains the specified value.
- *
- * @return <tt>true</tt> if the receiver contains the specified value.
- */
- public boolean containsValue(final ${valueType} value) {
- return !forEachPair(
- new ${keyTypeCap}${valueTypeCap}Procedure() {
- @Override
- public boolean apply(${keyType} iterKey, ${valueType} iterValue) {
- return (value != iterValue);
- }
- }
- );
- }
-
- /**
- * Returns a deep copy of the receiver; uses <code>clone()</code> and casts the result.
- *
- * @return a deep copy of the receiver.
- */
- public Abstract${keyTypeCap}${valueTypeCap}Map copy() {
- return (Abstract${keyTypeCap}${valueTypeCap}Map) clone();
- }
-
- /**
- * Compares the specified object with this map for equality. Returns <tt>true</tt> if the given object is also a map
- * and the two maps represent the same mappings. More formally, two maps <tt>m1</tt> and <tt>m2</tt> represent the
- * same mappings iff
- * <pre>
- * m1.forEachPair(
- * new ${keyTypeCap}${valueTypeCap}Procedure() {
- * public boolean apply(${keyType} key, ${valueType} value) {
- * return m2.containsKey(key) && m2.get(key) == value;
- * }
- * }
- * )
- * &&
- * m2.forEachPair(
- * new ${keyTypeCap}${valueTypeCap}Procedure() {
- * public boolean apply(${keyType} key, ${valueType} value) {
- * return m1.containsKey(key) && m1.get(key) == value;
- * }
- * }
- * );
- * </pre>
- *
- * This implementation first checks if the specified object is this map; if so it returns <tt>true</tt>. Then, it
- * checks if the specified object is a map whose size is identical to the size of this map; if not, it returns
- * <tt>false</tt>. If so, it applies the iteration as described above.
- *
- * @param obj object to be compared for equality with this map.
- * @return <tt>true</tt> if the specified object is equal to this map.
- */
- public boolean equals(Object obj) {
- if (obj == this) {
- return true;
- }
-
- if (!(obj instanceof Abstract${keyTypeCap}${valueTypeCap}Map)) {
- return false;
- }
- final Abstract${keyTypeCap}${valueTypeCap}Map other = (Abstract${keyTypeCap}${valueTypeCap}Map) obj;
- if (other.size() != size()) {
- return false;
- }
-
- return
- forEachPair(
- new ${keyTypeCap}${valueTypeCap}Procedure() {
- @Override
- public boolean apply(${keyType} key, ${valueType} value) {
- return other.containsKey(key) && other.get(key) == value;
- }
- }
- )
- &&
- other.forEachPair(
- new ${keyTypeCap}${valueTypeCap}Procedure() {
- @Override
- public boolean apply(${keyType} key, ${valueType} value) {
- return containsKey(key) && get(key) == value;
- }
- }
- );
- }
-
- public int hashCode() {
- final int[] buf = new int[size()];
- forEachPair(
- new ${keyTypeCap}${valueTypeCap}Procedure() {
- int i = 0;
-
- @Override
- public boolean apply(${keyType} key, ${valueType} value) {
- buf[i++] = HashUtils.hash(key) ^ HashUtils.hash(value);
- return true;
- }
- }
- );
- Arrays.sort(buf);
- return IntBuffer.wrap(buf).hashCode();
- }
-
- /**
- * Applies a procedure to each key of the receiver, if any. Note: Iterates over the keys in no particular order.
- * Subclasses can define a particular order, for example, "sorted by key". All methods which <i>can</i> be expressed
- * in terms of this method (most methods can) <i>must guarantee</i> to use the <i>same</i> order defined by this
- * method, even if it is no particular order. This is necessary so that, for example, methods <tt>keys</tt> and
- * <tt>values</tt> will yield association pairs, not two uncorrelated lists.
- *
- * @param procedure the procedure to be applied. Stops iteration if the procedure returns <tt>false</tt>, otherwise
- * continues.
- * @return <tt>false</tt> if the procedure stopped before all keys were iterated over, <tt>true</tt> otherwise.
- */
- public abstract boolean forEachKey(${keyTypeCap}Procedure procedure);
-
- /**
- * Applies a procedure to each (key,value) pair of the receiver, if any. Iteration order is guaranteed to be
- * <i>identical</i> to the order used by method {@link #forEachKey(${keyTypeCap}Procedure)}.
- *
- * @param procedure the procedure to be applied. Stops iteration if the procedure returns <tt>false</tt>, otherwise
- * continues.
- * @return <tt>false</tt> if the procedure stopped before all keys were iterated over, <tt>true</tt> otherwise.
- */
- public boolean forEachPair(final ${keyTypeCap}${valueTypeCap}Procedure procedure) {
- return forEachKey(
- new ${keyTypeCap}Procedure() {
- @Override
- public boolean apply(${keyType} key) {
- return procedure.apply(key, get(key));
- }
- }
- );
- }
-
- /**
- * Returns the value associated with the specified key. It is often a good idea to first check with {@link
- * #containsKey(${keyType})} whether the given key has an associated value, i.e. whether an association
- * exists for the given key.
- *
- * @param key the key to be searched for.
- * @return the value associated with the specified key; <tt>0</tt> if no such key is present.
- */
- public abstract ${valueType} get(${keyType} key);
-
- /**
- * Returns a list filled with all keys contained in the receiver. The returned list has a size that equals
- * <tt>this.size()</tt>. Iteration order is guaranteed to be <i>identical</i> to the order used by method {@link
- * #forEachKey(${keyTypeCap}Procedure)}. <p> This method can be used to iterate over the keys of the receiver.
- *
- * @return the keys.
- */
- public ${keyTypeCap}ArrayList keys() {
- ${keyTypeCap}ArrayList list = new ${keyTypeCap}ArrayList(size());
- keys(list);
- return list;
- }
-
- /**
- * Fills all keys contained in the receiver into the specified list. Fills the list, starting at index 0. After this
- * call returns the specified list has a new size that equals <tt>this.size()</tt>. Iteration order is guaranteed to
- * be <i>identical</i> to the order used by method {@link #forEachKey(${keyTypeCap}Procedure)}. <p> This method can be used to
- * iterate over the keys of the receiver.
- *
- * @param list the list to be filled, can have any size.
- */
- public void keys(final ${keyTypeCap}ArrayList list) {
- list.clear();
- forEachKey(
- new ${keyTypeCap}Procedure() {
- @Override
- public boolean apply(${keyType} key) {
- list.add(key);
- return true;
- }
- }
- );
- }
-
- /**
- * Fills all keys <i>sorted ascending by their associated value</i> into the specified list. Fills into the list,
- * starting at index 0. After this call returns the specified list has a new size that equals <tt>this.size()</tt>.
- * Primary sort criterion is "value", secondary sort criterion is "key". This means that if any two values are equal,
- * the smaller key comes first. <p> <b>Example:</b> <br> <tt>keys = (8,7,6), values = (1,2,2) --> keyList =
- * (8,6,7)</tt>
- *
- * @param keyList the list to be filled, can have any size.
- */
- public void keysSortedByValue(${keyTypeCap}ArrayList keyList) {
- pairsSortedByValue(keyList, new ${valueTypeCap}ArrayList(size()));
- }
-
- /**
- * Fills all pairs satisfying a given condition into the specified lists. Fills into the lists, starting at index 0.
- * After this call returns the specified lists both have a new size, the number of pairs satisfying the condition.
- * Iteration order is guaranteed to be <i>identical</i> to the order used by method
- * {@link #forEachKey(${keyTypeCap}Procedure)}.
- * <p> <b>Example:</b> <br>
- * <pre>
- * IntIntProcedure condition = new IntIntProcedure() { // match even keys only
- * public boolean apply(int key, int value) { return key%2==0; }
- * };
- * keys = (8,7,6), values = (1,2,2) --> keyList = (6,8), valueList = (2,1)
- * </pre>
- *
- * @param condition the condition to be matched. Takes the current key as first and the current value as second
- * argument.
- * @param keyList the list to be filled with keys, can have any size.
- * @param valueList the list to be filled with values, can have any size.
- */
- public void pairsMatching(final ${keyTypeCap}${valueTypeCap}Procedure condition,
- final ${keyTypeCap}ArrayList keyList,
- final ${valueTypeCap}ArrayList valueList) {
- keyList.clear();
- valueList.clear();
-
- forEachPair(
- new ${keyTypeCap}${valueTypeCap}Procedure() {
- @Override
- public boolean apply(${keyType} key, ${valueType} value) {
- if (condition.apply(key, value)) {
- keyList.add(key);
- valueList.add(value);
- }
- return true;
- }
- }
- );
- }
-
- /**
- * Fills all keys and values <i>sorted ascending by key</i> into the specified lists. Fills into the lists, starting
- * at index 0. After this call returns the specified lists both have a new size that equals <tt>this.size()</tt>. <p>
- * <b>Example:</b> <br> <tt>keys = (8,7,6), values = (1,2,2) --> keyList = (6,7,8), valueList = (2,2,1)</tt>
- *
- * @param keyList the list to be filled with keys, can have any size.
- * @param valueList the list to be filled with values, can have any size.
- */
- public void pairsSortedByKey(${keyTypeCap}ArrayList keyList, ${valueTypeCap}ArrayList valueList) {
- keys(keyList);
- keyList.sort();
- valueList.setSize(keyList.size());
- for (int i = keyList.size(); --i >= 0;) {
- valueList.setQuick(i, get(keyList.getQuick(i)));
- }
- }
-
- /**
- * Fills all keys and values <i>sorted ascending by value</i> into the specified lists. Fills into the lists, starting
- * at index 0. After this call returns the specified lists both have a new size that equals <tt>this.size()</tt>.
- * Primary sort criterion is "value", secondary sort criterion is "key". This means that if any two values are equal,
- * the smaller key comes first. <p> <b>Example:</b> <br> <tt>keys = (8,7,6), values = (1,2,2) --> keyList = (8,6,7),
- * valueList = (1,2,2)</tt>
- *
- * @param keyList the list to be filled with keys, can have any size.
- * @param valueList the list to be filled with values, can have any size.
- */
- public void pairsSortedByValue(${keyTypeCap}ArrayList keyList, ${valueTypeCap}ArrayList valueList) {
- keys(keyList);
- values(valueList);
-
- final ${keyType}[] k = keyList.elements();
- final ${valueType}[] v = valueList.elements();
- Swapper swapper = new Swapper() {
- @Override
- public void swap(int a, int b) {
- ${valueType} t1 = v[a];
- v[a] = v[b];
- v[b] = t1;
- ${keyType} t2 = k[a];
- k[a] = k[b];
- k[b] = t2;
- }
- };
-
- IntComparator comp = new IntComparator() {
- @Override
- public int compare(int a, int b) {
- return v[a] < v[b] ? -1 : v[a] > v[b] ? 1 : (k[a] < k[b] ? -1 : (k[a] == k[b] ? 0 : 1));
- }
- };
-
- Sorting.quickSort(0, keyList.size(), comp, swapper);
- }
-
- /**
- * Associates the given key with the given value. Replaces any old <tt>(key,someOtherValue)</tt> association, if
- * existing.
- *
- * @param key the key the value shall be associated with.
- * @param value the value to be associated.
- * @return <tt>true</tt> if the receiver did not already contain such a key; <tt>false</tt> if the receiver did
- * already contain such a key - the new value has now replaced the formerly associated value.
- */
- public abstract boolean put(${keyType} key, ${valueType} value);
-
- /**
- * Removes the given key with its associated element from the receiver, if present.
- *
- * @param key the key to be removed from the receiver.
- * @return <tt>true</tt> if the receiver contained the specified key, <tt>false</tt> otherwise.
- */
- public abstract boolean removeKey(${keyType} key);
-
- /**
- * Returns a string representation of the receiver, containing the String representation of each key-value pair,
- * in the order used by {@link #forEachKey(${keyTypeCap}Procedure)}.
- */
- public String toString() {
- ${keyTypeCap}ArrayList theKeys = keys();
- //theKeys.sort();
-
- StringBuilder buf = new StringBuilder();
- buf.append('[');
- int maxIndex = theKeys.size() - 1;
- for (int i = 0; i <= maxIndex; i++) {
- ${keyType} key = theKeys.get(i);
- buf.append(String.valueOf(key));
- buf.append("->");
- buf.append(String.valueOf(get(key)));
- if (i < maxIndex) {
- buf.append(", ");
- }
- }
- buf.append(']');
- return buf.toString();
- }
-
- /**
- * Returns a string representation of the receiver, containing the String representation of each key-value pair,
- * sorted ascending by value.
- */
- public String toStringByValue() {
- ${keyTypeCap}ArrayList theKeys = new ${keyTypeCap}ArrayList();
- keysSortedByValue(theKeys);
-
- StringBuilder buf = new StringBuilder();
- buf.append('[');
- int maxIndex = theKeys.size() - 1;
- for (int i = 0; i <= maxIndex; i++) {
- ${keyType} key = theKeys.get(i);
- buf.append(String.valueOf(key));
- buf.append("->");
- buf.append(String.valueOf(get(key)));
- if (i < maxIndex) {
- buf.append(", ");
- }
- }
- buf.append(']');
- return buf.toString();
- }
-
- /**
- * Returns a list filled with all values contained in the receiver. The returned list has a size that equals
- * <tt>this.size()</tt>. Iteration order is guaranteed to be <i>identical</i> to the order used by method {@link
- * #forEachKey(${keyTypeCap}Procedure)}. <p> This method can be used to iterate over the values of the receiver.
- *
- * @return the values.
- */
- public ${valueTypeCap}ArrayList values() {
- ${valueTypeCap}ArrayList list = new ${valueTypeCap}ArrayList(size());
- values(list);
- return list;
- }
-
- /**
- * Fills all values contained in the receiver into the specified list. Fills the list, starting at index 0. After this
- * call returns the specified list has a new size that equals <tt>this.size()</tt>. Iteration order is guaranteed to
- * be <i>identical</i> to the order used by method {@link #forEachKey(${keyTypeCap}Procedure)}.
- * <p> This method can be used to
- * iterate over the values of the receiver.
- *
- * @param list the list to be filled, can have any size.
- */
- public void values(final ${valueTypeCap}ArrayList list) {
- list.clear();
- forEachKey(
- new ${keyTypeCap}Procedure() {
- @Override
- public boolean apply(${keyType} key) {
- list.add(get(key));
- return true;
- }
- }
- );
- }
-
- #if (${valueTypeFloating} == 'true')
- /**
- * Assigns the result of a function to each value; <tt>v[i] = function(v[i])</tt>.
- *
- * @param function a function object taking as argument the current association's value.
- */
- public void assign(final ${valueTypeCap}Function function) {
- copy().forEachPair(
- new ${keyTypeCap}${valueTypeCap}Procedure() {
- @Override
- public boolean apply(${keyType} key, ${valueType} value) {
- put(key, function.apply(value));
- return true;
- }
- }
- );
- }
-
- /**
- * Clears the receiver, then adds all (key,value) pairs of <tt>other</tt> to it.
- *
- * @param other the other map to be copied into the receiver.
- */
- public void assign(Abstract${keyTypeCap}${valueTypeCap}Map other) {
- clear();
- other.forEachPair(
- new ${keyTypeCap}${valueTypeCap}Procedure() {
- @Override
- public boolean apply(${keyType} key, ${valueType} value) {
- put(key, value);
- return true;
- }
- }
- );
- }
- #end
-
- /**
- * Check the map for a key. If present, add an increment to the value. If absent,
- * store a specified value.
- * @param key the key.
- * @param newValue the value to store if the key is not currently in the map.
- * @param incrValue the value to be added to the current value in the map.
- **/
- public ${valueType} adjustOrPutValue(${keyType} key, ${valueType} newValue, ${valueType} incrValue) {
- boolean present = containsKey(key);
- if (present) {
- newValue = (${valueType})(get(key) + incrValue);
- put(key, newValue);
- } else {
- put(key, newValue);
- }
- return newValue;
- }
-}
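
Each of these .java.t Velocity templates expands into one concrete class per key/value type pair, so the
deleted abstract map above backs generated classes such as OpenIntDoubleHashMap. A minimal usage sketch,
assuming the int/double instantiation and the adjustOrPutValue/forEachPair methods shown in the template:

import org.apache.mahout.math.function.IntDoubleProcedure;
import org.apache.mahout.math.map.OpenIntDoubleHashMap;

public class WordLengthHistogram {
  public static void main(String[] args) {
    OpenIntDoubleHashMap counts = new OpenIntDoubleHashMap();
    for (String w : new String[] {"open", "hash", "map", "demo"}) {
      // adjustOrPutValue: store 1.0 on first sight, otherwise add 1.0 to the stored value
      counts.adjustOrPutValue(w.length(), 1.0, 1.0);
    }
    // forEachPair visits pairs in the same (unspecified) order as forEachKey
    counts.forEachPair(new IntDoubleProcedure() {
      @Override
      public boolean apply(int key, double value) {
        System.out.println(key + " -> " + value);
        return true; // keep iterating
      }
    });
  }
}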

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java-templates/org/apache/mahout/math/map/AbstractObjectValueTypeMap.java.t
----------------------------------------------------------------------
diff --git a/math/src/main/java-templates/org/apache/mahout/math/map/AbstractObjectValueTypeMap.java.t b/math/src/main/java-templates/org/apache/mahout/math/map/AbstractObjectValueTypeMap.java.t
deleted file mode 100644
index 15778be..0000000
--- a/math/src/main/java-templates/org/apache/mahout/math/map/AbstractObjectValueTypeMap.java.t
+++ /dev/null
@@ -1,516 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*
-Copyright © 1999 CERN - European Organization for Nuclear Research.
-Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
-is hereby granted without fee, provided that the above copyright notice appear in all copies and
-that both that copyright notice and this permission notice appear in supporting documentation.
-CERN makes no representations about the suitability of this software for any purpose.
-It is provided "as is" without expressed or implied warranty.
-*/
-package org.apache.mahout.math.map;
-
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-import java.nio.IntBuffer;
-import java.util.Arrays;
-
-import org.apache.mahout.math.set.HashUtils;
-import org.apache.mahout.math.Sorting;
-import org.apache.mahout.math.Swapper;
-import org.apache.mahout.math.function.Object${valueTypeCap}Procedure;
-import org.apache.mahout.math.function.ObjectProcedure;
-import org.apache.mahout.math.list.${valueTypeCap}ArrayList;
-import org.apache.mahout.math.function.IntComparator;
-#if (${valueTypeFloating} == 'true')
-import org.apache.mahout.math.function.${valueTypeCap}Function;
-#end
-import org.apache.mahout.math.set.AbstractSet;
-
-public abstract class AbstractObject${valueTypeCap}Map<T> extends AbstractSet {
-
- /**
- * Returns <tt>true</tt> if the receiver contains the specified key.
- *
- * @return <tt>true</tt> if the receiver contains the specified key.
- */
- public boolean containsKey(final T key) {
- return !forEachKey(
- new ObjectProcedure<T>() {
- @Override
- public boolean apply(T iterKey) {
- return (key != iterKey);
- }
- }
- );
- }
-
- /**
- * Returns <tt>true</tt> if the receiver contains the specified value.
- *
- * @return <tt>true</tt> if the receiver contains the specified value.
- */
- public boolean containsValue(final ${valueType} value) {
- return !forEachPair(
- new Object${valueTypeCap}Procedure<T>() {
- @Override
- public boolean apply(T iterKey, ${valueType} iterValue) {
- return (value != iterValue);
- }
- }
- );
- }
-
- /**
- * Returns a deep copy of the receiver; uses <code>clone()</code> and casts the result.
- *
- * @return a deep copy of the receiver.
- */
- @SuppressWarnings("unchecked") // seemingly unavoidable.
- public AbstractObject${valueTypeCap}Map<T> copy() {
- return (AbstractObject${valueTypeCap}Map<T>) clone();
- }
-
- /**
- * Compares the specified object with this map for equality. Returns <tt>true</tt> if the given object is also a map
- * and the two maps represent the same mappings. More formally, two maps <tt>m1</tt> and <tt>m2</tt> represent the
- * same mappings iff
- * <pre>
- * m1.forEachPair(
- * new Object${valueTypeCap}Procedure<T>() {
- * public boolean apply(T key, ${valueType} value) {
- * return m2.containsKey(key) && m2.get(key) == value;
- * }
- * }
- * )
- * &&
- * m2.forEachPair(
- * new Object${valueTypeCap}Procedure<T>() {
- * public boolean apply(T key, ${valueType} value) {
- * return m1.containsKey(key) && m1.get(key) == value;
- * }
- * }
- * );
- * </pre>
- *
- * This implementation first checks if the specified object is this map; if so it returns <tt>true</tt>. Then, it
- * checks if the specified object is a map whose size is identical to the size of this map; if not, it returns
- * <tt>false</tt>. If so, it applies the iteration as described above.
- *
- * @param obj object to be compared for equality with this map.
- * @return <tt>true</tt> if the specified object is equal to this map.
- */
- @SuppressWarnings("unchecked")
- public boolean equals(Object obj) {
- if (obj == this) {
- return true;
- }
-
- if (!(obj instanceof AbstractObject${valueTypeCap}Map)) {
- return false;
- }
- final AbstractObject${valueTypeCap}Map other = (AbstractObject${valueTypeCap}Map) obj;
- if (other.size() != size()) {
- return false;
- }
-
- return
- forEachPair(
- new Object${valueTypeCap}Procedure<T>() {
- @Override
- public boolean apply(T key, ${valueType} value) {
- return other.containsKey(key) && other.get(key) == value;
- }
- }
- )
- &&
- other.forEachPair(
- new Object${valueTypeCap}Procedure<T>() {
- @Override
- public boolean apply(T key, ${valueType} value) {
- return containsKey(key) && get(key) == value;
- }
- }
- );
- }
-
- public int hashCode() {
- final int[] buf = new int[size()];
- forEachPair(
- new Object${valueTypeCap}Procedure<T>() {
- int i = 0;
-
- @Override
- public boolean apply(Object key, ${valueType} value) {
- buf[i++] = key.hashCode() ^ HashUtils.hash(value);
- return true;
- }
- }
- );
- Arrays.sort(buf);
- return IntBuffer.wrap(buf).hashCode();
- }
-
- /**
- * Applies a procedure to each key of the receiver, if any. Note: Iterates over the keys in no particular order.
- * Subclasses can define a particular order, for example, "sorted by key". All methods which <i>can</i> be expressed
- * in terms of this method (most methods can) <i>must guarantee</i> to use the <i>same</i> order defined by this
- * method, even if it is no particular order. This is necessary so that, for example, methods <tt>keys</tt> and
- * <tt>values</tt> will yield association pairs, not two uncorrelated lists.
- *
- * @param procedure the procedure to be applied. Stops iteration if the procedure returns <tt>false</tt>, otherwise
- * continues.
- * @return <tt>false</tt> if the procedure stopped before all keys were iterated over, <tt>true</tt> otherwise.
- */
- public abstract boolean forEachKey(ObjectProcedure<T> procedure);
-
- /**
- * Applies a procedure to each (key,value) pair of the receiver, if any. Iteration order is guaranteed to be
- * <i>identical</i> to the order used by method {@link #forEachKey(ObjectProcedure)}.
- *
- * @param procedure the procedure to be applied. Stops iteration if the procedure returns <tt>false</tt>, otherwise
- * continues.
- * @return <tt>false</tt> if the procedure stopped before all keys were iterated over, <tt>true</tt> otherwise.
- */
- public boolean forEachPair(final Object${valueTypeCap}Procedure<T> procedure) {
- return forEachKey(
- new ObjectProcedure<T>() {
- @Override
- public boolean apply(T key) {
- return procedure.apply(key, get(key));
- }
- }
- );
- }
-
- /**
- * Returns the value associated with the specified key. It is often a good idea to first check with {@link
- * #containsKey(Object)} whether the given key has an associated value, i.e. whether an association
- * exists for the given key.
- *
- * @param key the key to be searched for.
- * @return the value associated with the specified key; <tt>0</tt> if no such key is present.
- */
- public abstract ${valueType} get(T key);
-
- /**
- * Returns a list filled with all keys contained in the receiver. The returned list has a size that equals
- * <tt>this.size()</tt>. Iteration order is guaranteed to be <i>identical</i> to the order used by method {@link
- * #forEachKey(ObjectProcedure)}. <p> This method can be used to iterate over the keys of the receiver.
- *
- * @return the keys.
- */
- public List<T> keys() {
- List<T> list = new ArrayList<T>(size());
- keys(list);
- return list;
- }
-
- /**
- * Fills all keys contained in the receiver into the specified list. Fills the list, starting at index 0. After this
- * call returns the specified list has a new size that equals <tt>this.size()</tt>. Iteration order is guaranteed to
- * be <i>identical</i> to the order used by method {@link #forEachKey(ObjectProcedure)}. <p> This method can be used to
- * iterate over the keys of the receiver.
- *
- * @param list the list to be filled, can have any size.
- */
- public void keys(final List<T> list) {
- list.clear();
- forEachKey(
- new ObjectProcedure<T>() {
- @Override
- public boolean apply(T key) {
- list.add(key);
- return true;
- }
- }
- );
- }
-
- /**
- * Fills all keys <i>sorted ascending by their associated value</i> into the specified list. Fills into the list,
- * starting at index 0. After this call returns the specified list has a new size that equals <tt>this.size()</tt>.
- * The sort criterion is "value". Unlike the primitive-key maps, keys need not be comparable here, so keys with
- * equal values are left in no particular order. <p> <b>Example:</b> <br> <tt>keys = (8,7,6), values = (1,2,2) -->
- * keyList = (8,6,7) or (8,7,6)</tt>
- *
- * @param keyList the list to be filled, can have any size.
- */
- public void keysSortedByValue(List<T> keyList) {
- pairsSortedByValue(keyList, new ${valueTypeCap}ArrayList(size()));
- }
-
- /**
- * Fills all pairs satisfying a given condition into the specified lists. Fills into the lists, starting at index 0.
- * After this call returns the specified lists both have a new size, the number of pairs satisfying the condition.
- * Iteration order is guaranteed to be <i>identical</i> to the order used by method
- * {@link #forEachKey(ObjectProcedure)}.
- * <p> <b>Example:</b> <br>
- * <pre>
- * IntIntProcedure condition = new IntIntProcedure() { // match even keys only
- * public boolean apply(int key, int value) { return key%2==0; }
- * };
- * keys = (8,7,6), values = (1,2,2) --> keyList = (6,8), valueList = (2,1)
- * </pre>
- *
- * @param condition the condition to be matched. Takes the current key as first and the current value as second
- * argument.
- * @param keyList the list to be filled with keys, can have any size.
- * @param valueList the list to be filled with values, can have any size.
- */
- public void pairsMatching(final Object${valueTypeCap}Procedure<T> condition,
- final List<T> keyList,
- final ${valueTypeCap}ArrayList valueList) {
- keyList.clear();
- valueList.clear();
-
- forEachPair(
- new Object${valueTypeCap}Procedure<T>() {
- @Override
- public boolean apply(T key, ${valueType} value) {
- if (condition.apply(key, value)) {
- keyList.add(key);
- valueList.add(value);
- }
- return true;
- }
- }
- );
- }
-
- /**
- * Fills all keys and values <i>sorted ascending by key</i> into the specified lists. Fills into the lists, starting
- * at index 0. After this call returns the specified lists both have a new size that equals <tt>this.size()</tt>. <p>
- * <b>Example:</b> <br> <tt>keys = (8,7,6), values = (1,2,2) --> keyList = (6,7,8), valueList = (2,2,1)</tt>
- *
- * @param keyList the list to be filled with keys, can have any size.
- * @param valueList the list to be filled with values, can have any size.
- */
- @SuppressWarnings("unchecked")
- public void pairsSortedByKey(List<T> keyList, ${valueTypeCap}ArrayList valueList) {
- keys(keyList);
- if (keyList.isEmpty()) {
- return;
- }
- T k = keyList.get(0);
- // some people would demand a more complex type hierarchy here ...
- if (!(k instanceof Comparable)) {
- throw new UnsupportedOperationException("The key type for this map does not implement Comparable");
- }
- Collections.sort((List) keyList);
- valueList.setSize(keyList.size());
- for (int i = keyList.size(); --i >= 0;) {
- valueList.setQuick(i, get(keyList.get(i)));
- }
- }
-
- /**
- * Fills all keys and values <i>sorted ascending by value</i> into the specified lists. Fills into the lists, starting
- * at index 0. After this call returns the specified lists both have a new size that equals <tt>this.size()</tt>.
- * The sort criterion is "value". Unlike the primitive-key maps, keys need not be comparable here, so pairs with
- * equal values are left in no particular order. <p> <b>Example:</b> <br> <tt>keys = (8,7,6), values = (1,2,2) -->
- * keyList = (8,6,7) or (8,7,6), valueList = (1,2,2)</tt>
- *
- * @param keyList the list to be filled with keys, can have any size.
- * @param valueList the list to be filled with values, can have any size.
- */
- public void pairsSortedByValue(final List<T> keyList, ${valueTypeCap}ArrayList valueList) {
- keys(keyList);
- values(valueList);
-
- final ${valueType}[] v = valueList.elements();
- Swapper swapper = new Swapper() {
- @Override
- public void swap(int a, int b) {
- ${valueType} t1 = v[a];
- v[a] = v[b];
- v[b] = t1;
- T t2 = keyList.get(a);
- keyList.set(a, keyList.get(b));
- keyList.set(b, t2);
- }
- };
-
- IntComparator comp = new IntComparator() {
- @Override
- public int compare(int a, int b) {
- return v[a] < v[b] ? -1 : v[a] > v[b] ? 1 : 0;
- }
- };
-
- Sorting.quickSort(0, keyList.size(), comp, swapper);
- }
-
- /**
- * Associates the given key with the given value. Replaces any old <tt>(key,someOtherValue)</tt> association, if
- * existing.
- *
- * @param key the key the value shall be associated with.
- * @param value the value to be associated.
- * @return <tt>true</tt> if the receiver did not already contain such a key; <tt>false</tt> if the receiver did
- * already contain such a key - the new value has now replaced the formerly associated value.
- */
- public abstract boolean put(T key, ${valueType} value);
-
- /**
- * Removes the given key with its associated element from the receiver, if present.
- *
- * @param key the key to be removed from the receiver.
- * @return <tt>true</tt> if the receiver contained the specified key, <tt>false</tt> otherwise.
- */
- public abstract boolean removeKey(T key);
-
- /**
- * Returns a string representation of the receiver, containing the String representation of each key-value pair,
- * in the order used by {@link #forEachKey(ObjectProcedure)}.
- */
- public String toString() {
- List<T> theKeys = keys();
-
- StringBuilder buf = new StringBuilder();
- buf.append('[');
- int maxIndex = theKeys.size() - 1;
- for (int i = 0; i <= maxIndex; i++) {
- T key = theKeys.get(i);
- buf.append(String.valueOf(key));
- buf.append("->");
- buf.append(String.valueOf(get(key)));
- if (i < maxIndex) {
- buf.append(", ");
- }
- }
- buf.append(']');
- return buf.toString();
- }
-
- /**
- * Returns a string representation of the receiver, containing the String representation of each key-value pair,
- * sorted ascending by value.
- */
- public String toStringByValue() {
- List<T> theKeys = new ArrayList<T>();
- keysSortedByValue(theKeys);
-
- StringBuilder buf = new StringBuilder();
- buf.append('[');
- int maxIndex = theKeys.size() - 1;
- for (int i = 0; i <= maxIndex; i++) {
- T key = theKeys.get(i);
- buf.append(String.valueOf(key));
- buf.append("->");
- buf.append(String.valueOf(get(key)));
- if (i < maxIndex) {
- buf.append(", ");
- }
- }
- buf.append(']');
- return buf.toString();
- }
-
- /**
- * Returns a list filled with all values contained in the receiver. The returned list has a size that equals
- * <tt>this.size()</tt>. Iteration order is guaranteed to be <i>identical</i> to the order used by method {@link
- * #forEachKey(ObjectProcedure)}. <p> This method can be used to iterate over the values of the receiver.
- *
- * @return the values.
- */
- public ${valueTypeCap}ArrayList values() {
- ${valueTypeCap}ArrayList list = new ${valueTypeCap}ArrayList(size());
- values(list);
- return list;
- }
-
- /**
- * Fills all values contained in the receiver into the specified list. Fills the list, starting at index 0. After this
- * call returns the specified list has a new size that equals <tt>this.size()</tt>. Iteration order is guaranteed to
- * be <i>identical</i> to the order used by method {@link #forEachKey(ObjectProcedure)}. <p> This method can be used to
- * iterate over the values of the receiver.
- *
- * @param list the list to be filled, can have any size.
- */
- public void values(final ${valueTypeCap}ArrayList list) {
- list.clear();
- forEachKey(
- new ObjectProcedure<T>() {
- @Override
- public boolean apply(T key) {
- list.add(get(key));
- return true;
- }
- }
- );
- }
-
- #if (${valueTypeFloating} == 'true')
- /**
- * Assigns the result of a function to each value; <tt>v[i] = function(v[i])</tt>.
- *
- * @param function a function object taking as argument the current association's value.
- */
- public void assign(final ${valueTypeCap}Function function) {
- copy().forEachPair(
- new Object${valueTypeCap}Procedure<T>() {
- @Override
- public boolean apply(T key, ${valueType} value) {
- put(key, function.apply(value));
- return true;
- }
- }
- );
- }
-
- /**
- * Clears the receiver, then adds all (key,value) pairs of <tt>other</tt> to it.
- *
- * @param other the other map to be copied into the receiver.
- */
- public void assign(AbstractObject${valueTypeCap}Map<T> other) {
- clear();
- other.forEachPair(
- new Object${valueTypeCap}Procedure<T>() {
- @Override
- public boolean apply(T key, ${valueType} value) {
- put(key, value);
- return true;
- }
- }
- );
- }
- #end
-
- /**
- * Check the map for a key. If present, add an increment to the value. If absent,
- * store a specified value.
- * @param key the key.
- * @param newValue the value to store if the key is not currently in the map.
- * @param incrValue the value to be added to the current value in the map.
- **/
- public ${valueType} adjustOrPutValue(T key, ${valueType} newValue, ${valueType} incrValue) {
- boolean present = containsKey(key);
- if (present) {
- newValue = (${valueType})(get(key) + incrValue);
- put(key, newValue);
- } else {
- put(key, newValue);
- }
- return newValue;
- }
-}
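
The object-keyed variant above generates classes like OpenObjectIntHashMap. A sketch of the word-count idiom
its adjustOrPutValue and keysSortedByValue methods support (class and method names per the template; as noted
above, the relative order of keys with tied counts is unspecified):

import java.util.ArrayList;
import java.util.List;

import org.apache.mahout.math.map.OpenObjectIntHashMap;

public class WordCount {
  public static void main(String[] args) {
    OpenObjectIntHashMap<String> counts = new OpenObjectIntHashMap<String>();
    for (String w : "to be or not to be".split(" ")) {
      counts.adjustOrPutValue(w, 1, 1); // store 1 if absent, else increment by 1
    }
    // keysSortedByValue fills the list ascending by count
    List<String> byFrequency = new ArrayList<String>();
    counts.keysSortedByValue(byFrequency);
    System.out.println(byFrequency); // e.g. [or, not, to, be]
  }
}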

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java-templates/org/apache/mahout/math/map/OpenKeyTypeObjectHashMap.java.t
----------------------------------------------------------------------
diff --git a/math/src/main/java-templates/org/apache/mahout/math/map/OpenKeyTypeObjectHashMap.java.t b/math/src/main/java-templates/org/apache/mahout/math/map/OpenKeyTypeObjectHashMap.java.t
deleted file mode 100644
index 18ff8da..0000000
--- a/math/src/main/java-templates/org/apache/mahout/math/map/OpenKeyTypeObjectHashMap.java.t
+++ /dev/null
@@ -1,548 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*
-Copyright © 1999 CERN - European Organization for Nuclear Research.
-Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
-is hereby granted without fee, provided that the above copyright notice appear in all copies and
-that both that copyright notice and this permission notice appear in supporting documentation.
-CERN makes no representations about the suitability of this software for any purpose.
-It is provided "as is" without expressed or implied warranty.
-*/
-package org.apache.mahout.math.map;
-
-import java.util.Arrays;
-import java.util.List;
-
-import org.apache.mahout.math.function.${keyTypeCap}ObjectProcedure;
-import org.apache.mahout.math.function.${keyTypeCap}Procedure;
-import org.apache.mahout.math.list.${keyTypeCap}ArrayList;
-
-public class Open${keyTypeCap}ObjectHashMap<T> extends Abstract${keyTypeCap}ObjectMap<T> {
-
- private static final byte FREE = 0;
- private static final byte FULL = 1;
- private static final byte REMOVED = 2;
-
- /** The hash table keys. */
- private ${keyType}[] table;
-
- /** The hash table values. */
- private T[] values;
-
- /** The state of each hash table entry (FREE, FULL, REMOVED). */
- private byte[] state;
-
- /** The number of table entries in state==FREE. */
- private int freeEntries;
-
- /** Constructs an empty map with default capacity and default load factors. */
- public Open${keyTypeCap}ObjectHashMap() {
- this(DEFAULT_CAPACITY);
- }
-
- /**
- * Constructs an empty map with the specified initial capacity and default load factors.
- *
- * @param initialCapacity the initial capacity of the map.
- * @throws IllegalArgumentException if the initial capacity is less than zero.
- */
- public Open${keyTypeCap}ObjectHashMap(int initialCapacity) {
- this(initialCapacity, DEFAULT_MIN_LOAD_FACTOR, DEFAULT_MAX_LOAD_FACTOR);
- }
-
- /**
- * Constructs an empty map with the specified initial capacity and the specified minimum and maximum load factor.
- *
- * @param initialCapacity the initial capacity.
- * @param minLoadFactor the minimum load factor.
- * @param maxLoadFactor the maximum load factor.
- * @throws IllegalArgumentException if <tt>initialCapacity < 0 || (minLoadFactor < 0.0 || minLoadFactor >= 1.0) ||
- * (maxLoadFactor <= 0.0 || maxLoadFactor >= 1.0) || (minLoadFactor >=
- * maxLoadFactor)</tt>.
- */
- public Open${keyTypeCap}ObjectHashMap(int initialCapacity, double minLoadFactor, double maxLoadFactor) {
- setUp(initialCapacity, minLoadFactor, maxLoadFactor);
- }
-
- /** Removes all (key,value) associations from the receiver. Implicitly calls <tt>trimToSize()</tt>. */
- @Override
- public void clear() {
- Arrays.fill(state, FREE);
- Arrays.fill(values, null); // delta
-
- this.distinct = 0;
- this.freeEntries = table.length; // delta
- trimToSize();
- }
-
- /**
- * Returns a deep copy of the receiver.
- *
- * @return a deep copy of the receiver.
- */
- @SuppressWarnings("unchecked")
- @Override
- public Open${keyTypeCap}ObjectHashMap<T> clone() {
- Open${keyTypeCap}ObjectHashMap<T> copy = (Open${keyTypeCap}ObjectHashMap<T>) super.clone();
- copy.table = copy.table.clone();
- copy.values = copy.values.clone();
- copy.state = copy.state.clone();
- return copy;
- }
-
- /**
- * Returns <tt>true</tt> if the receiver contains the specified key.
- *
- * @return <tt>true</tt> if the receiver contains the specified key.
- */
- @Override
- public boolean containsKey(${keyType} key) {
- return indexOfKey(key) >= 0;
- }
-
- /**
- * Returns <tt>true</tt> if the receiver contains the specified value.
- *
- * @return <tt>true</tt> if the receiver contains the specified value.
- */
- @Override
- public boolean containsValue(T value) {
- return indexOfValue(value) >= 0;
- }
-
- /**
- * Ensures that the receiver can hold at least the specified number of associations without needing to allocate new
- * internal memory. If necessary, allocates new internal memory and increases the capacity of the receiver. <p> This
- * method need never be called; it is for performance tuning only. Calling this method before <tt>put()</tt>ing a
- * large number of associations boosts performance, because the receiver will grow only once instead of potentially
- * many times, and hash collisions become less probable.
- *
- * @param minCapacity the desired minimum capacity.
- */
- @Override
- public void ensureCapacity(int minCapacity) {
- if (table.length < minCapacity) {
- int newCapacity = nextPrime(minCapacity);
- rehash(newCapacity);
- }
- }
-
- /**
- * Applies a procedure to each key of the receiver, if any. Note: Iterates over the keys in no particular order.
- * Subclasses can define a particular order, for example, "sorted by key". All methods which <i>can</i> be expressed
- * in terms of this method (most methods can) <i>must guarantee</i> to use the <i>same</i> order defined by this
- * method, even if it is no particular order. This is necessary so that, for example, methods <tt>keys</tt> and
- * <tt>values</tt> will yield association pairs, not two uncorrelated lists.
- *
- * @param procedure the procedure to be applied. Stops iteration if the procedure returns <tt>false</tt>, otherwise
- * continues.
- * @return <tt>false</tt> if the procedure stopped before all keys were iterated over, <tt>true</tt> otherwise.
- */
- @Override
- public boolean forEachKey(${keyTypeCap}Procedure procedure) {
- for (int i = table.length; i-- > 0;) {
- if (state[i] == FULL && !procedure.apply(table[i])) {
- return false;
- }
- }
- return true;
- }
-
- /**
- * Applies a procedure to each (key,value) pair of the receiver, if any. Iteration order is guaranteed to be
- * <i>identical</i> to the order used by method {@link #forEachKey(${keyTypeCap}Procedure)}.
- *
- * @param procedure the procedure to be applied. Stops iteration if the procedure returns <tt>false</tt>, otherwise
- * continues.
- * @return <tt>false</tt> if the procedure stopped before all keys were iterated over, <tt>true</tt> otherwise.
- */
- @Override
- public boolean forEachPair(${keyTypeCap}ObjectProcedure<T> procedure) {
- for (int i = table.length; i-- > 0;) {
- if (state[i] == FULL && !procedure.apply(table[i], values[i])) {
- return false;
- }
- }
- return true;
- }
-
- /**
- * Returns the value associated with the specified key. It is often a good idea to first check with
- * containsKey(${keyType}) whether the given key has an associated value, i.e. whether an association
- * exists for the given key.
- *
- * @param key the key to be searched for.
- * @return the value associated with the specified key; <tt>null</tt> if no such key is present.
- */
- @Override
- public T get(${keyType} key) {
- final int i = indexOfKey(key);
- if (i < 0) {
- return null;
- } //not contained
- return values[i];
- }
-
- /**
- * @param key the key to be added to the receiver.
- * @return the index where the key would need to be inserted, if it is not already contained. Returns -index-1 if the
- * key is already contained at slot index. Therefore, if the returned index < 0, then it is already contained
- * at slot -index-1. If the returned index >= 0, then it is NOT already contained and should be inserted at
- * slot index.
- */
- protected int indexOfInsertion(${keyType} key) {
- final int length = table.length;
-
- final int hash = HashFunctions.hash(key) & 0x7FFFFFFF;
- int i = hash % length;
- int decrement = hash % (length - 2); // double hashing, see http://www.eece.unm.edu/faculty/heileman/hash/node4.html
- //int decrement = (hash / length) % length;
- if (decrement == 0) {
- decrement = 1;
- }
-
- // stop if we find a removed or free slot, or if we find the key itself
- // do NOT skip over removed slots (yes, open addressing is like that...)
- while (state[i] == FULL && table[i] != key) {
- i -= decrement;
- //hashCollisions++;
- if (i < 0) {
- i += length;
- }
- }
-
- if (state[i] == REMOVED) {
- // stop if we find a free slot, or if we find the key itself.
- // do skip over removed slots (yes, open addressing is like that...)
- // assertion: there is at least one FREE slot.
- final int j = i;
- while (state[i] != FREE && (state[i] == REMOVED || table[i] != key)) {
- i -= decrement;
- //hashCollisions++;
- if (i < 0) {
- i += length;
- }
- }
- if (state[i] == FREE) {
- i = j;
- }
- }
-
-
- if (state[i] == FULL) {
- // key already contained at slot i.
- // return a negative number identifying the slot.
- return -i - 1;
- }
- // not already contained, should be inserted at slot i.
- // return a number >= 0 identifying the slot.
- return i;
- }
-
- /**
- * @param key the key to be searched in the receiver.
- * @return the index where the key is contained in the receiver, returns -1 if the key was not found.
- */
- protected int indexOfKey(${keyType} key) {
- final int length = table.length;
-
- final int hash = HashFunctions.hash(key) & 0x7FFFFFFF;
- int i = hash % length;
- int decrement = hash % (length - 2); // double hashing, see http://www.eece.unm.edu/faculty/heileman/hash/node4.html
- //int decrement = (hash / length) % length;
- if (decrement == 0) {
- decrement = 1;
- }
-
- // stop if we find a free slot, or if we find the key itself.
- // do skip over removed slots (yes, open addressing is like that...)
- while (state[i] != FREE && (state[i] == REMOVED || table[i] != key)) {
- i -= decrement;
- //hashCollisions++;
- if (i < 0) {
- i += length;
- }
- }
-
- if (state[i] == FREE) {
- return -1;
- } // not found
- return i; //found, return index where key is contained
- }
-
- /**
- * @param value the value to be searched in the receiver.
- * @return the index where the value is contained in the receiver, returns -1 if the value was not found.
- */
- protected int indexOfValue(T value) {
- T[] val = values;
- byte[] stat = state;
-
- for (int i = stat.length; --i >= 0;) {
- if (stat[i] == FULL && val[i] == value) {
- return i;
- }
- }
-
- return -1; // not found
- }
-
- /**
- * Fills all keys contained in the receiver into the specified list. Fills the list, starting at index 0. After this
- * call returns the specified list has a new size that equals <tt>this.size()</tt>. Iteration order is guaranteed to
- * be <i>identical</i> to the order used by method {@link #forEachKey(${keyTypeCap}Procedure)}. <p> This method can be used to
- * iterate over the keys of the receiver.
- *
- * @param list the list to be filled, can have any size.
- */
- @Override
- public void keys(${keyTypeCap}ArrayList list) {
- list.setSize(distinct);
- ${keyType}[] elements = list.elements();
-
- int j = 0;
- for (int i = table.length; i-- > 0;) {
- if (state[i] == FULL) {
- elements[j++] = table[i];
- }
- }
- }
-
- /**
- * Fills all pairs satisfying a given condition into the specified lists. Fills into the lists, starting at index 0.
- * After this call returns the specified lists both have a new size, the number of pairs satisfying the condition.
- * Iteration order is guaranteed to be <i>identical</i> to the order used by method {@link #forEachKey(${keyTypeCap}Procedure)}.
- * <p> <b>Example:</b> <br>
- * <pre>
- * ${keyTypeCap}ObjectProcedure condition = new ${keyTypeCap}ObjectProcedure() { // match even keys only
- * public boolean apply(${keyType} key, Object value) { return key%2==0; }
- * };
- * keys = (8,7,6), values = (1,2,2) --> keyList = (6,8), valueList = (2,1)
- * </pre>
- *
- * @param condition the condition to be matched. Takes the current key as first and the current value as second
- * argument.
- * @param keyList the list to be filled with keys, can have any size.
- * @param valueList the list to be filled with values, can have any size.
- */
- @Override
- public void pairsMatching(${keyTypeCap}ObjectProcedure<T> condition,
- ${keyTypeCap}ArrayList keyList,
- List<T> valueList) {
- keyList.clear();
- valueList.clear();
-
- for (int i = table.length; i-- > 0;) {
- if (state[i] == FULL && condition.apply(table[i], values[i])) {
- keyList.add(table[i]);
- valueList.add(values[i]);
- }
- }
- }
-
- /**
- * Associates the given key with the given value. Replaces any old <tt>(key,someOtherValue)</tt> association, if
- * existing.
- *
- * @param key the key the value shall be associated with.
- * @param value the value to be associated.
- * @return <tt>true</tt> if the receiver did not already contain such a key; <tt>false</tt> if the receiver did
- * already contain such a key - the new value has now replaced the formerly associated value.
- */
- @Override
- public boolean put(${keyType} key, T value) {
- int i = indexOfInsertion(key);
- if (i < 0) { //already contained
- i = -i - 1;
- this.values[i] = value;
- return false;
- }
-
- if (this.distinct > this.highWaterMark) {
- int newCapacity = chooseGrowCapacity(this.distinct + 1, this.minLoadFactor, this.maxLoadFactor);
- rehash(newCapacity);
- return put(key, value);
- }
-
- this.table[i] = key;
- this.values[i] = value;
- if (this.state[i] == FREE) {
- this.freeEntries--;
- }
- this.state[i] = FULL;
- this.distinct++;
-
- if (this.freeEntries < 1) { //delta
- int newCapacity = chooseGrowCapacity(this.distinct + 1, this.minLoadFactor, this.maxLoadFactor);
- rehash(newCapacity);
- }
-
- return true;
- }
-
- /**
- * Rehashes the contents of the receiver into a new table with a smaller or larger capacity. This method is called
- * automatically when the number of keys in the receiver exceeds the high water mark or falls below the low water
- * mark.
- */
- @SuppressWarnings("unchecked")
- protected void rehash(int newCapacity) {
- int oldCapacity = table.length;
- //if (oldCapacity == newCapacity) return;
-
- ${keyType}[] oldTable = table;
- T[] oldValues = values;
- byte[] oldState = state;
-
- this.table = new ${keyType}[newCapacity];
- this.values = (T[]) new Object[newCapacity];
- this.state = new byte[newCapacity];
-
- this.lowWaterMark = chooseLowWaterMark(newCapacity, this.minLoadFactor);
- this.highWaterMark = chooseHighWaterMark(newCapacity, this.maxLoadFactor);
-
- this.freeEntries = newCapacity - this.distinct; // delta
-
- for (int i = oldCapacity; i-- > 0;) {
- if (oldState[i] == FULL) {
- ${keyType} element = oldTable[i];
- int index = indexOfInsertion(element);
- this.table[index] = element;
- this.values[index] = oldValues[i];
- this.state[index] = FULL;
- }
- }
- }
-
- /**
- * Removes the given key with its associated element from the receiver, if present.
- *
- * @param key the key to be removed from the receiver.
- * @return <tt>true</tt> if the receiver contained the specified key, <tt>false</tt> otherwise.
- */
- @Override
- public boolean removeKey(${keyType} key) {
- int i = indexOfKey(key);
- if (i < 0) {
- return false;
- } // key not contained
-
- this.state[i] = REMOVED;
- this.values[i] = null; // delta
- this.distinct--;
-
- if (this.distinct < this.lowWaterMark) {
- int newCapacity = chooseShrinkCapacity(this.distinct, this.minLoadFactor, this.maxLoadFactor);
- rehash(newCapacity);
- }
-
- return true;
- }
-
- /**
- * Initializes the receiver.
- *
- * @param initialCapacity the initial capacity of the receiver.
- * @param minLoadFactor the minLoadFactor of the receiver.
- * @param maxLoadFactor the maxLoadFactor of the receiver.
- * @throws IllegalArgumentException if <tt>initialCapacity < 0 || (minLoadFactor < 0.0 || minLoadFactor >= 1.0) ||
- * (maxLoadFactor <= 0.0 || maxLoadFactor >= 1.0) || (minLoadFactor >=
- * maxLoadFactor)</tt>.
- */
- @SuppressWarnings("unchecked")
- @Override
- protected final void setUp(int initialCapacity, double minLoadFactor, double maxLoadFactor) {
- int capacity = initialCapacity;
- super.setUp(capacity, minLoadFactor, maxLoadFactor);
- capacity = nextPrime(capacity);
- if (capacity == 0) {
- capacity = 1;
- } // open addressing needs at least one FREE slot at any time.
-
- this.table = new ${keyType}[capacity];
- this.values = (T[]) new Object[capacity];
- this.state = new byte[capacity];
-
- // memory will be exhausted long before this pathological case happens, anyway.
- this.minLoadFactor = minLoadFactor;
- if (capacity == PrimeFinder.LARGEST_PRIME) {
- this.maxLoadFactor = 1.0;
- } else {
- this.maxLoadFactor = maxLoadFactor;
- }
-
- this.distinct = 0;
- this.freeEntries = capacity; // delta
-
- // lowWaterMark will be established upon first expansion.
- // establishing it now (upon instance construction) would immediately make the table shrink upon first put(...).
- // After all the idea of an "initialCapacity" implies violating lowWaterMarks when an object is young.
- // See ensureCapacity(...)
- this.lowWaterMark = 0;
- this.highWaterMark = chooseHighWaterMark(capacity, this.maxLoadFactor);
- }
-
- /**
- * Trims the capacity of the receiver to be the receiver's current size. Releases any superfluous internal memory. An
- * application can use this operation to minimize the storage of the receiver.
- */
- @Override
- public void trimToSize() {
- // * 1.2 because open addressing's performance exponentially degrades beyond that point
- // so that even rehashing the table can take very long
- int newCapacity = nextPrime((int) (1 + 1.2 * size()));
- if (table.length > newCapacity) {
- rehash(newCapacity);
- }
- }
-
- /**
- * Fills all values contained in the receiver into the specified list. Fills the list, starting at index 0. After this
- * call returns the specified list has a new size that equals <tt>this.size()</tt>.
- * <p> This method can be used to
- * iterate over the values of the receiver.
- *
- * @param list the list to be filled, can have any size.
- */
- @Override
- public void values(List<T> list) {
- list.clear();
-
- for (int i = state.length; i-- > 0;) {
- if (state[i] == FULL) {
- list.add(values[i]);
- }
- }
- }
-
- /**
- * Access for unit tests.
- * @param capacity
- * @param minLoadFactor
- * @param maxLoadFactor
- */
- protected void getInternalFactors(int[] capacity,
- double[] minLoadFactor,
- double[] maxLoadFactor) {
- capacity[0] = table.length;
- minLoadFactor[0] = this.minLoadFactor;
- maxLoadFactor[0] = this.maxLoadFactor;
- }
-}
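
indexOfInsertion and indexOfKey above both rely on the same double-hashing probe, and that loop reappears in
every generated open hash map. A self-contained sketch of just the probe sequence; every name here is
hypothetical and for illustration only, not part of the Mahout API:

/**
 * Standalone sketch of the double-hashing probe used by indexOfKey above.
 */
public final class DoubleHashProbe {
  private DoubleHashProbe() {}

  /** Returns the first n slots probed for {@code key} in a table of prime {@code length}. */
  public static int[] firstProbes(int key, int length, int n) {
    int hash = key & 0x7FFFFFFF;         // clear the sign bit, as indexOfKey does with HashFunctions.hash(key)
    int i = hash % length;               // initial slot
    int decrement = hash % (length - 2); // second hash; length is prime, so any nonzero step cycles all slots
    if (decrement == 0) {
      decrement = 1;                     // a zero step would loop forever on one slot
    }
    int[] probes = new int[n];
    for (int p = 0; p < n; p++) {
      probes[p] = i;
      i -= decrement;
      if (i < 0) {
        i += length;                     // wrap around, mirroring the loop in indexOfKey
      }
    }
    return probes;
  }

  public static void main(String[] args) {
    // For key 42 in a table of prime size 11: hash=42, start=42%11=9, step=42%9=6 -> 9, 3, 8, 2
    for (int slot : firstProbes(42, 11, 4)) {
      System.out.print(slot + " ");
    }
  }
}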

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java-templates/org/apache/mahout/math/map/OpenKeyTypeValueTypeHashMap.java.t
----------------------------------------------------------------------
diff --git a/math/src/main/java-templates/org/apache/mahout/math/map/OpenKeyTypeValueTypeHashMap.java.t b/math/src/main/java-templates/org/apache/mahout/math/map/OpenKeyTypeValueTypeHashMap.java.t
deleted file mode 100644
index 57d442e..0000000
--- a/math/src/main/java-templates/org/apache/mahout/math/map/OpenKeyTypeValueTypeHashMap.java.t
+++ /dev/null
@@ -1,632 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*
-Copyright © 1999 CERN - European Organization for Nuclear Research.
-Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
-is hereby granted without fee, provided that the above copyright notice appear in all copies and
-that both that copyright notice and this permission notice appear in supporting documentation.
-CERN makes no representations about the suitability of this software for any purpose.
-It is provided "as is" without expressed or implied warranty.
-*/
-package org.apache.mahout.math.map;
-
-import java.util.Arrays;
-import java.util.Iterator;
-import java.util.NoSuchElementException;
-
-import org.apache.mahout.math.function.${keyTypeCap}${valueTypeCap}Procedure;
-import org.apache.mahout.math.function.${keyTypeCap}Procedure;
-import org.apache.mahout.math.list.${keyTypeCap}ArrayList;
-
-#if (${keyType} != ${valueType})
-import org.apache.mahout.math.list.${valueTypeCap}ArrayList;
-#end
-
-
-/**
- * Open hash map from ${keyType} keys to ${valueType} values.
- **/
-public class Open${keyTypeCap}${valueTypeCap}HashMap extends Abstract${keyTypeCap}${valueTypeCap}Map {
- protected static final byte FREE = 0;
- protected static final byte FULL = 1;
- protected static final byte REMOVED = 2;
-#if (${keyTypeFloating} == 'true')
-#set ($noKeyComment = "${keyTypeCap}.NaN")
- protected static final ${keyType} NO_KEY_VALUE = ${keyTypeCap}.NaN;
-#else
-#set ($noKeyComment = "0")
- protected static final ${keyType} NO_KEY_VALUE = 0;
-#end
-
- /** The hash table keys. */
- protected ${keyType}[] table;
-
- /** The hash table values. */
- protected ${valueType}[] values;
-
- /** The state of each hash table entry (FREE, FULL, REMOVED). */
- protected byte[] state;
-
- /** The number of table entries in state==FREE. */
- protected int freeEntries;
-
-
- /** Constructs an empty map with default capacity and default load factors. */
- public Open${keyTypeCap}${valueTypeCap}HashMap() {
- this(DEFAULT_CAPACITY);
- }
-
- /**
- * Constructs an empty map with the specified initial capacity and default load factors.
- *
- * @param initialCapacity the initial capacity of the map.
- * @throws IllegalArgumentException if the initial capacity is less than zero.
- */
- public Open${keyTypeCap}${valueTypeCap}HashMap(int initialCapacity) {
- this(initialCapacity, DEFAULT_MIN_LOAD_FACTOR, DEFAULT_MAX_LOAD_FACTOR);
- }
-
- /**
- * Constructs an empty map with the specified initial capacity and the specified minimum and maximum load factor.
- *
- * @param initialCapacity the initial capacity.
- * @param minLoadFactor the minimum load factor.
- * @param maxLoadFactor the maximum load factor.
- * @throws IllegalArgumentException if <tt>initialCapacity < 0 || (minLoadFactor < 0.0 || minLoadFactor >= 1.0) ||
- * (maxLoadFactor <= 0.0 || maxLoadFactor >= 1.0) || (minLoadFactor >=
- * maxLoadFactor)</tt>.
- */
- public Open${keyTypeCap}${valueTypeCap}HashMap(int initialCapacity, double minLoadFactor, double maxLoadFactor) {
- setUp(initialCapacity, minLoadFactor, maxLoadFactor);
- }
-
- /** Removes all (key,value) associations from the receiver. Implicitly calls <tt>trimToSize()</tt>. */
- @Override
- public void clear() {
- Arrays.fill(this.state, FREE);
- distinct = 0;
- freeEntries = table.length; // delta
- trimToSize();
- }
-
- /**
- * Returns a deep copy of the receiver.
- *
- * @return a deep copy of the receiver.
- */
- @Override
- public Object clone() {
- Open${keyTypeCap}${valueTypeCap}HashMap copy = (Open${keyTypeCap}${valueTypeCap}HashMap) super.clone();
- copy.table = copy.table.clone();
- copy.values = copy.values.clone();
- copy.state = copy.state.clone();
- return copy;
- }
-
- /**
- * Returns <tt>true</tt> if the receiver contains the specified key.
- *
- * @return <tt>true</tt> if the receiver contains the specified key.
- */
- @Override
- public boolean containsKey(${keyType} key) {
- return indexOfKey(key) >= 0;
- }
-
- /**
- * Returns <tt>true</tt> if the receiver contains the specified value.
- *
- * @return <tt>true</tt> if the receiver contains the specified value.
- */
- @Override
- public boolean containsValue(${valueType} value) {
- return indexOfValue(value) >= 0;
- }
-
- /**
- * Ensures that the receiver can hold at least the specified number of associations without needing to allocate new
- * internal memory. If necessary, allocates new internal memory and increases the capacity of the receiver. <p> This
- * method never needs to be called; it is for performance tuning only. Calling this method before <tt>put()</tt>ing a
- * large number of associations boosts performance, because the receiver will grow only once instead of potentially
- * many times, and hash collisions become less probable.
- *
- * @param minCapacity the desired minimum capacity.
- */
- @Override
- public void ensureCapacity(int minCapacity) {
- if (table.length < minCapacity) {
- int newCapacity = nextPrime(minCapacity);
- rehash(newCapacity);
- }
- }
-
- /**
- * Applies a procedure to each key of the receiver, if any. Note: Iterates over the keys in no particular order.
- * Subclasses can define a particular order, for example, "sorted by key". All methods which <i>can</i> be expressed
- * in terms of this method (most methods can) <i>must guarantee</i> to use the <i>same</i> order defined by this
- * method, even if that order is unspecified. This is necessary so that, for example, methods <tt>keys</tt> and
- * <tt>values</tt> will yield association pairs, not two uncorrelated lists.
- *
- * @param procedure the procedure to be applied. Stops iteration if the procedure returns <tt>false</tt>, otherwise
- * continues.
- * @return <tt>false</tt> if the procedure stopped before all keys were iterated over, <tt>true</tt> otherwise.
- */
- @Override
- public boolean forEachKey(${keyTypeCap}Procedure procedure) {
- for (int i = table.length; i-- > 0;) {
- if (state[i] == FULL && !procedure.apply(table[i])) {
- return false;
- }
- }
- return true;
- }
-
- /**
- * Applies a procedure to each (key,value) pair of the receiver, if any. Iteration order is guaranteed to be
- * <i>identical</i> to the order used by method {@link #forEachKey(${keyTypeCap}Procedure)}.
- *
- * @param procedure the procedure to be applied. Stops iteration if the procedure returns <tt>false</tt>, otherwise
- * continues.
- * @return <tt>false</tt> if the procedure stopped before all keys were iterated over, <tt>true</tt> otherwise.
- */
- @Override
- public boolean forEachPair(${keyTypeCap}${valueTypeCap}Procedure procedure) {
- for (int i = table.length; i-- > 0;) {
- if (state[i] == FULL && !procedure.apply(table[i], values[i])) {
- return false;
- }
- }
- return true;
- }
-
- /**
- * Returns the value associated with the specified key. It is often a good idea to first check with
- * containsKey(${keyType}) whether an association exists for the given key, because a return value of <tt>0</tt> is
- * ambiguous between "no such key" and "key mapped to zero".
- *
- * @param key the key to be searched for.
- * @return the value associated with the specified key; <tt>0</tt> if no such key is present.
- */
- @Override
- public ${valueType} get(${keyType} key) {
- final int i = indexOfKey(key);
- if (i < 0) {
- return 0;
- } //not contained
- return values[i];
- }
-
- /**
- * @param key the key to be added to the receiver.
- * @return the index where the key would need to be inserted, if it is not already contained. Returns -index-1 if the
- * key is already contained at slot index. Therefore, if the returned index < 0, then it is already contained
- * at slot -index-1. If the returned index >= 0, then it is NOT already contained and should be inserted at
- * slot index.
- */
- protected int indexOfInsertion(${keyType} key) {
- final int length = table.length;
-
- final int hash = HashFunctions.hash(key) & 0x7FFFFFFF;
- int i = hash % length;
- int decrement = hash % (length - 2); // double hashing, see http://www.eece.unm.edu/faculty/heileman/hash/node4.html
- //int decrement = (hash / length) % length;
- if (decrement == 0) {
- decrement = 1;
- }
-
- // stop if we find a removed or free slot, or if we find the key itself
- // do NOT skip over removed slots (yes, open addressing is like that...)
- while (state[i] == FULL && table[i] != key) {
- i -= decrement;
- //hashCollisions++;
- if (i < 0) {
- i += length;
- }
- }
-
- if (state[i] == REMOVED) {
- // stop if we find a free slot, or if we find the key itself.
- // do skip over removed slots (yes, open addressing is like that...)
- // assertion: there is at least one FREE slot.
- final int j = i;
- while (state[i] != FREE && (state[i] == REMOVED || table[i] != key)) {
- i -= decrement;
- //hashCollisions++;
- if (i < 0) {
- i += length;
- }
- }
- if (state[i] == FREE) {
- i = j;
- }
- }
-
-
- if (state[i] == FULL) {
- // key already contained at slot i.
- // return a negative number identifying the slot.
- return -i - 1;
- }
- // not already contained, should be inserted at slot i.
- // return a number >= 0 identifying the slot.
- return i;
- }
-
- /**
- * @param key the key to be searched in the receiver.
- * @return the index where the key is contained in the receiver, returns -1 if the key was not found.
- */
- protected int indexOfKey(${keyType} key) {
- final int length = table.length;
-
- final int hash = HashFunctions.hash(key) & 0x7FFFFFFF;
- int i = hash % length;
- int decrement = hash % (length - 2); // double hashing, see http://www.eece.unm.edu/faculty/heileman/hash/node4.html
- //int decrement = (hash / length) % length;
- if (decrement == 0) {
- decrement = 1;
- }
-
- // stop if we find a free slot, or if we find the key itself.
- // do skip over removed slots (yes, open addressing is like that...)
- while (state[i] != FREE && (state[i] == REMOVED || table[i] != key)) {
- i -= decrement;
- //hashCollisions++;
- if (i < 0) {
- i += length;
- }
- }
-
- if (state[i] == FREE) {
- return -1;
- } // not found
- return i; //found, return index where key is contained
- }
-
- /**
- * @param value the value to be searched in the receiver.
- * @return the index where the value is contained in the receiver, returns -1 if the value was not found.
- */
- protected int indexOfValue(${valueType} value) {
- ${valueType}[] val = values;
- byte[] stat = state;
-
- for (int i = stat.length; --i >= 0;) {
- if (stat[i] == FULL && val[i] == value) {
- return i;
- }
- }
-
- return -1; // not found
- }
-
- /**
- * Fills all keys contained in the receiver into the specified list. Fills the list, starting at index 0. After this
- * call returns, the specified list has a new size that equals <tt>this.size()</tt>. Iteration order is guaranteed to
- * be <i>identical</i> to the order used by method {@link #forEachKey(${keyTypeCap}Procedure)}.
- * <p> This method can be used
- * to iterate over the keys of the receiver.
- *
- * @param list the list to be filled, can have any size.
- */
- @Override
- public void keys(${keyTypeCap}ArrayList list) {
- list.setSize(distinct);
- ${keyType} [] elements = list.elements();
-
- int j = 0;
- for (int i = table.length; i-- > 0;) {
- if (state[i] == FULL) {
- elements[j++] = table[i];
- }
- }
- }
-
- public Iterator<MapElement> iterator() {
- return new MapIterator();
- }
-
- public final class MapElement {
- private int offset = -1;
- int seen = 0;
-
- boolean advanceOffset() {
- offset++;
- while (offset < state.length && state[offset] != FULL) {
- offset++;
- }
- if (offset < state.length) {
- seen++;
- }
- return offset < state.length;
- }
-
- public ${valueType} get() {
- return values[offset];
- }
-
- public ${keyType} index() {
- return table[offset];
- }
-
- public void set(${valueType} value) {
- values[offset] = value;
- }
- }
-
- public final class MapIterator implements Iterator<MapElement> {
- private final MapElement element = new MapElement();
-
- private MapIterator() { }
-
- @Override
- public boolean hasNext() {
- return element.seen < distinct;
- }
-
- @Override
- public MapElement next() {
- if (element.advanceOffset()) {
- return element;
- }
- throw new NoSuchElementException();
- }
-
- @Override
- public void remove() {
- throw new UnsupportedOperationException();
- }
- }
-
- /**
- * Fills all pairs satisfying a given condition into the specified lists. Fills into the lists, starting at index 0.
- * After this call returns, the specified lists both have a new size: the number of pairs satisfying the condition.
- * Iteration order is guaranteed to be <i>identical</i> to the order used by method {@link
- * #forEachKey(${keyTypeCap}Procedure)}. <p> <b>Example:</b> <br>
- * <pre>
- * ${keyTypeCap}${valueTypeCap}Procedure condition = new ${keyTypeCap}${valueTypeCap}Procedure() { // match even keys only
- * public boolean apply(${keyType} key, ${valueType} value) { return key%2==0; }
- * };
- * keys = (8,7,6), values = (1,2,2) --> keyList = (6,8), valueList = (2,1)
- * </pre>
- *
- * @param condition the condition to be matched. Takes the current key as first and the current value as second
- * argument.
- * @param keyList the list to be filled with keys, can have any size.
- * @param valueList the list to be filled with values, can have any size.
- */
- @Override
- public void pairsMatching(${keyTypeCap}${valueTypeCap}Procedure condition,
- ${keyTypeCap}ArrayList keyList,
- ${valueTypeCap}ArrayList valueList) {
- keyList.clear();
- valueList.clear();
-
- for (int i = table.length; i-- > 0;) {
- if (state[i] == FULL && condition.apply(table[i], values[i])) {
- keyList.add(table[i]);
- valueList.add(values[i]);
- }
- }
- }
-
- /**
- * Associates the given key with the given value. Replaces any old <tt>(key,someOtherValue)</tt> association, if
- * existing.
- *
- * @param key the key the value shall be associated with.
- * @param value the value to be associated.
- * @return <tt>true</tt> if the receiver did not already contain such a key; <tt>false</tt> if the receiver did
- * already contain such a key - the new value has now replaced the formerly associated value.
- */
- @Override
- public boolean put(${keyType} key, ${valueType} value) {
- int i = indexOfInsertion(key);
- if (i < 0) { // already contained
- i = -i - 1;
- this.values[i] = value;
- return false;
- }
-
- if (this.distinct > this.highWaterMark) {
- int newCapacity = chooseGrowCapacity(this.distinct + 1, this.minLoadFactor, this.maxLoadFactor);
- rehash(newCapacity);
- return put(key, value);
- }
-
- this.table[i] = key;
- this.values[i] = value;
- if (this.state[i] == FREE) {
- this.freeEntries--;
- }
- this.state[i] = FULL;
- this.distinct++;
-
- if (this.freeEntries < 1) { //delta
- int newCapacity = chooseGrowCapacity(this.distinct + 1, this.minLoadFactor, this.maxLoadFactor);
- rehash(newCapacity);
- }
-
- return true;
- }
-
- @Override
- public ${valueType} adjustOrPutValue(${keyType} key, ${valueType} newValue, ${valueType} incrValue) {
- int i = indexOfInsertion(key);
- if (i < 0) { //already contained
- i = -i - 1;
- this.values[i] += incrValue;
- return this.values[i];
- } else {
- put(key, newValue);
- return newValue;
- }
- }
-
- /**
- * Rehashes the contents of the receiver into a new table with a smaller or larger capacity. This method is called
- * automatically when the number of keys in the receiver exceeds the high water mark or falls below the low water
- * mark.
- */
- protected void rehash(int newCapacity) {
- int oldCapacity = table.length;
- //if (oldCapacity == newCapacity) return;
-
- ${keyType}[] oldTable = table;
- ${valueType}[] oldValues = values;
- byte[] oldState = state;
-
- this.table = new ${keyType}[newCapacity];
- this.values = new ${valueType}[newCapacity];
- this.state = new byte[newCapacity];
-
- this.lowWaterMark = chooseLowWaterMark(newCapacity, this.minLoadFactor);
- this.highWaterMark = chooseHighWaterMark(newCapacity, this.maxLoadFactor);
-
- this.freeEntries = newCapacity - this.distinct; // delta
-
- for (int i = oldCapacity; i-- > 0;) {
- if (oldState[i] == FULL) {
- ${keyType} element = oldTable[i];
- int index = indexOfInsertion(element);
- this.table[index] = element;
- this.values[index] = oldValues[i];
- this.state[index] = FULL;
- }
- }
- }
-
- /**
- * Removes the given key with its associated element from the receiver, if present.
- *
- * @param key the key to be removed from the receiver.
- * @return <tt>true</tt> if the receiver contained the specified key, <tt>false</tt> otherwise.
- */
- @Override
- public boolean removeKey(${keyType} key) {
- int i = indexOfKey(key);
- if (i < 0) {
- return false;
- } // key not contained
-
- this.state[i] = REMOVED;
- //this.values[i]=0; // delta
- this.distinct--;
-
- if (this.distinct < this.lowWaterMark) {
- int newCapacity = chooseShrinkCapacity(this.distinct, this.minLoadFactor, this.maxLoadFactor);
- rehash(newCapacity);
- }
-
- return true;
- }
-
- /**
- * Initializes the receiver.
- *
- * @param initialCapacity the initial capacity of the receiver.
- * @param minLoadFactor the minLoadFactor of the receiver.
- * @param maxLoadFactor the maxLoadFactor of the receiver.
- * @throws IllegalArgumentException if <tt>initialCapacity < 0 || (minLoadFactor < 0.0 || minLoadFactor >= 1.0) ||
- * (maxLoadFactor <= 0.0 || maxLoadFactor >= 1.0) || (minLoadFactor >=
- * maxLoadFactor)</tt>.
- */
- @Override
- protected final void setUp(int initialCapacity, double minLoadFactor, double maxLoadFactor) {
- int capacity = initialCapacity;
- super.setUp(capacity, minLoadFactor, maxLoadFactor);
- capacity = nextPrime(capacity);
- if (capacity == 0) {
- capacity = 1;
- } // open addressing needs at least one FREE slot at any time.
-
- this.table = new ${keyType}[capacity];
- this.values = new ${valueType}[capacity];
- this.state = new byte[capacity];
-
- // memory will be exhausted long before this pathological case happens, anyway.
- this.minLoadFactor = minLoadFactor;
- if (capacity == PrimeFinder.LARGEST_PRIME) {
- this.maxLoadFactor = 1.0;
- } else {
- this.maxLoadFactor = maxLoadFactor;
- }
-
- this.distinct = 0;
- this.freeEntries = capacity; // delta
-
- // lowWaterMark will be established upon first expansion.
- // establishing it now (upon instance construction) would immediately make the table shrink upon first put(...).
- // After all the idea of an "initialCapacity" implies violating lowWaterMarks when an object is young.
- // See ensureCapacity(...)
- this.lowWaterMark = 0;
- this.highWaterMark = chooseHighWaterMark(capacity, this.maxLoadFactor);
- }
-
- /**
- * Trims the capacity of the receiver to be the receiver's current size. Releases any superfluous internal memory. An
- * application can use this operation to minimize the storage of the receiver.
- */
- @Override
- public void trimToSize() {
- // * 1.2 because open addressing's performance exponentially degrades beyond that point
- // so that even rehashing the table can take very long
- int newCapacity = nextPrime((int) (1 + 1.2 * size()));
- if (table.length > newCapacity) {
- rehash(newCapacity);
- }
- }
-
- /**
- * Fills all values contained in the receiver into the specified list. Fills the list, starting at index 0. After this
- * call returns the specified list has a new size that equals <tt>this.size()</tt>. Iteration order is guaranteed to
- * be <i>identical</i> to the order used by method {@link #forEachKey(${keyTypeCap}Procedure)}.
- * <p> This method can be used
- * to iterate over the values of the receiver.
- *
- * @param list the list to be filled, can have any size.
- */
- @Override
- public void values(${valueTypeCap}ArrayList list) {
- list.setSize(distinct);
- ${valueType}[] elements = list.elements();
-
- int j = 0;
- for (int i = state.length; i-- > 0;) {
- if (state[i] == FULL) {
- elements[j++] = values[i];
- }
- }
- }
-
- /**
- * Access for unit tests.
- * @param capacity single-element array that receives the current table capacity.
- * @param minLoadFactor single-element array that receives the minimum load factor.
- * @param maxLoadFactor single-element array that receives the maximum load factor.
- */
- protected void getInternalFactors(int[] capacity,
- double[] minLoadFactor,
- double[] maxLoadFactor) {
- capacity[0] = table.length;
- minLoadFactor[0] = this.minLoadFactor;
- maxLoadFactor[0] = this.maxLoadFactor;
- }
-}
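
For reference, the template above is expanded by the build into concrete primitive-keyed maps; the class and functor names below follow that naming scheme for int keys and double values, but are assumptions of this sketch, not part of the diff. A minimal usage sketch in Scala:

    import org.apache.mahout.math.map.OpenIntDoubleHashMap
    import org.apache.mahout.math.function.IntDoubleProcedure

    val counts = new OpenIntDoubleHashMap()
    counts.put(3, 1.0)                   // new key: returns true
    counts.adjustOrPutValue(3, 1.0, 1.0) // key present: value incremented by incrValue to 2.0
    counts.adjustOrPutValue(7, 1.0, 1.0) // key absent: inserted with newValue 1.0
    assert(counts.get(3) == 2.0 && counts.get(7) == 1.0)
    counts.forEachPair(new IntDoubleProcedure {
      // returning true continues the iteration, per the forEachPair contract above
      def apply(k: Int, v: Double): Boolean = { println(s"$k -> $v"); true }
    })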
r***@apache.org
2018-06-27 14:51:45 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/MathSuite.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/MathSuite.scala b/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/MathSuite.scala
deleted file mode 100644
index 9e93e63..0000000
--- a/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/MathSuite.scala
+++ /dev/null
@@ -1,267 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.scalabindings
-
-import org.apache.log4j.Level
-
-import org.apache.mahout.logging._
-import org.apache.mahout.math._
-import org.apache.mahout.math.scalabindings.RLikeOps._
-import org.apache.mahout.test.MahoutSuite
-import org.scalatest.FunSuite
-
-import scala.math._
-
-class MathSuite extends FunSuite with MahoutSuite {
-
- private final implicit val log = getLog(classOf[MathSuite])
-
- test("chol") {
-
- // try to solve Ax = b with Cholesky:
- // A = LL', so this requires
- // (LL')x = b
- // L'x = (L^-1)b
- // x = (L'^-1)(L^-1)b
-
- val a = dense((1, 2, 3), (2, 3, 4), (3, 4, 5.5))
-
- // make sure it is symmetric positive definite, as the Cholesky factorization requires
- a := a.t %*% a
-
- trace(s"A= \n$a")
-
- val b = dense((9, 8, 7)).t
-
- trace(s"b = \n$b")
-
- // Fails if chol(a, true)
- val ch = chol(a)
-
- trace(s"L = \n${ch.getL}")
-
- trace(s"(L^-1)b =\n${ch.solveLeft(b)}\n")
-
- val x = ch.solveRight(eye(3)) %*% ch.solveLeft(b)
-
- trace(s"x = \n$x")
-
- val axmb = (a %*% x) - b
-
- trace(s"AX - B = \n$axmb")
-
- axmb.norm should be < 1e-10
-
- }
-
- test("chol2") {
-
- val vtv = new DenseSymmetricMatrix(
- Array(
- 0.0021401286568947376, 0.001309251254596442, 0.0016003218703045058,
- 0.001545407014131058, 0.0012772546647977234,
- 0.001747768702674435
- ), true)
-
- printf("V'V=\n%s\n", vtv cloned)
-
- val vblock = dense(
- (0.0012356809018514347, 0.006141139195280868, 8.037742467936037E-4),
- (0.007910767859830255, 0.007989899899005457, 0.006877961936587515),
- (0.007011211118759952, 0.007458865101641882, 0.0048344749320346795),
- (0.006578789899685284, 0.0010812485516549452, 0.0062146270886981655)
- )
-
- val d = diag(15.0, 4)
-
-
- val b = dense(
- 0.36378319648203084,
- 0.3627384439613304,
- 0.2996934112658234)
-
- printf("B=\n%s\n", b)
-
-
- val cholArg = vtv + (vblock.t %*% d %*% vblock) + diag(4e-6, 3)
-
- printf("cholArg=\n%s\n", cholArg)
-
- printf("V'DV=\n%s\n", vblock.t %*% d %*% vblock)
-
- printf("V'V+V'DV=\n%s\n", vtv + (vblock.t %*% d %*% vblock))
-
- val ch = chol(cholArg)
-
- printf("L=\n%s\n", ch.getL)
-
- val x = ch.solveRight(eye(cholArg.nrow)) %*% ch.solveLeft(b)
-
- printf("X=\n%s\n", x)
-
- assert((cholArg %*% x - b).norm < 1e-10)
-
- }
-
- test("qr") {
- val a = dense((1, 2, 3), (2, 3, 6), (3, 4, 5), (4, 7, 8))
- val (q, r) = qr(a)
-
- printf("Q=\n%s\n", q)
- printf("R=\n%s\n", r)
-
- for (i <- 0 until q.ncol; j <- i + 1 until q.ncol)
- assert(abs(q(::, i) dot q(::, j)) < 1e-10)
- }
-
- test("solve matrix-vector") {
- val a = dense((1, 3), (4, 2))
- val b = dvec(11, 14)
- val x = solve(a, b)
-
- val control = dvec(2, 3)
-
- (control - x).norm(2) should be < 1e-10
- }
-
- test("solve matrix-matrix") {
- val a = dense((1, 3), (4, 2))
- val b = dense(11, 14)
- val x = solve(a, b)
-
- val control = dense(2, 3)
-
- (control - x).norm should be < 1e-10
- }
-
- test("solve to obtain inverse") {
- val a = dense((1, 3), (4, 2))
- val x = solve(a)
-
- val identity = a %*% x
-
- val control = eye(identity.ncol)
-
- (control - identity).norm should be < 1e-10
- }
-
- test("solve rejects non-square matrix") {
- intercept[IllegalArgumentException] {
- val a = dense((1, 2, 3), (4, 5, 6))
- val b = dvec(1, 2)
- solve(a, b)
- }
- }
-
- test("solve rejects singular matrix") {
- intercept[IllegalArgumentException] {
- val a = dense((1, 2), (2, 4))
- val b = dvec(1, 2)
- solve(a, b)
- }
- }
-
- test("svd") {
-
- val a = dense((1, 2, 3), (3, 4, 5))
-
- val (u, v, s) = svd(a)
-
- printf("U:\n%s\n", u.toString)
- printf("V:\n%s\n", v.toString)
- printf("Sigma:\n%s\n", s.toString)
-
- val aBar = u %*% diagv(s) %*% v.t
-
- val amab = a - aBar
-
- printf("A-USV'=\n%s\n", amab.toString)
-
- assert(amab.norm < 1e-10)
-
- }
-
- test("random uniform") {
- val omega1 = Matrices.symmetricUniformView(2, 3, 1234)
- val omega2 = Matrices.symmetricUniformView(2, 3, 1234)
-
- val a = sparse(
- 0 -> 1 :: 1 -> 2 :: Nil,
- 0 -> 3 :: 1 -> 4 :: Nil,
- 0 -> 2 :: 1 -> 0.0 :: Nil
- )
-
- val block = a(0 to 0, ::).cloned
- val block2 = a(1 to 1, ::).cloned
-
- (block %*% omega1 - (a %*% omega2)(0 to 0, ::)).norm should be < 1e-7
- (block2 %*% omega1 - (a %*% omega2)(1 to 1, ::)).norm should be < 1e-7
-
- }
-
- test("sqDist(X,Y)") {
- val m = 100
- val n = 300
- val d = 7
- val mxX = Matrices.symmetricUniformView(m, d, 12345).cloned -= 5
- val mxY = Matrices.symmetricUniformView(n, d, 1234).cloned += 10
-
- val mxDsq = sqDist(mxX, mxY)
- val mxDsqControl = new DenseMatrix(m, n) := { (r, c, _) ⇒ (mxX(r, ::) - mxY(c, ::)) ^= 2 sum }
- (mxDsq - mxDsqControl).norm should be < 1e-7
- }
-
- test("sqDist(X)") {
- val m = 100
- val d = 7
- val mxX = Matrices.symmetricUniformView(m, d, 12345).cloned -= 5
-
- val mxDsq = sqDist(mxX)
- val mxDsqControl = sqDist(mxX, mxX)
- (mxDsq - mxDsqControl).norm should be < 1e-7
- }
-
- test("sparsity analysis") {
- setLogLevel(Level.DEBUG)
-
- val m = 500
- val n = 800
- val mxA = new DenseMatrix(m, n)
-
- densityAnalysis(mxA) shouldBe false
- densityAnalysis(mxA, .5) shouldBe false
- densityAnalysis(mxA + 1) shouldBe true
- densityAnalysis(mxA + 1, .95) shouldBe true
-
- for (i ← 0 until m by 5) mxA(i, ::) := 1
- info(s"20% detected as dense?:${densityAnalysis(mxA)}")
- mxA := 0
-
- for (i ← 0 until m by 3) mxA(i, ::) := 1
- info(s"33% detected as dense?:${densityAnalysis(mxA)}")
- mxA := 0
-
- for (i ← 0 until m by 4) mxA(i, ::) := 1
- info(s"25% detected as dense?:${densityAnalysis(mxA)}")
-
- for (i ← 0 until m by 2) mxA(i, ::) := 1
- info(s"50% detected as dense?:${densityAnalysis(mxA)}")
-
- }
-
-}
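
As a quick standalone cross-check of the solver DSL exercised by the suite above (same dense/dvec/solve helpers, imports written out explicitly here):

    import org.apache.mahout.math.scalabindings._
    import org.apache.mahout.math.scalabindings.RLikeOps._

    val a = dense((1, 3), (4, 2))
    val b = dvec(11, 14)
    val x = solve(a, b)
    // expect x = (2, 3): 1*2 + 3*3 = 11 and 4*2 + 2*3 = 14
    assert((x - dvec(2, 3)).norm(2) < 1e-10)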

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/MatlabLikeMatrixOpsSuite.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/MatlabLikeMatrixOpsSuite.scala b/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/MatlabLikeMatrixOpsSuite.scala
deleted file mode 100644
index 547f710..0000000
--- a/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/MatlabLikeMatrixOpsSuite.scala
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.scalabindings
-
-import org.scalatest.FunSuite
-import MatlabLikeOps._
-import scala.Predef._
-import org.apache.mahout.test.MahoutSuite
-
-class MatlabLikeMatrixOpsSuite extends FunSuite with MahoutSuite {
-
- test("multiplication") {
-
- val a = dense((1, 2, 3), (3, 4, 5))
- val b = dense(1, 4, 5)
- val m = a * b
-
- assert(m(0, 0) == 24)
- assert(m(1, 0) == 44)
- println(m.toString)
- }
-
- test("Hadamard") {
- val a = dense(
- (1, 2, 3),
- (3, 4, 5)
- )
- val b = dense(
- (1, 1, 2),
- (2, 1, 1)
- )
-
- val c = a *@ b
-
- printf("C=\n%s\n", c)
-
- assert(c(0, 0) == 1)
- assert(c(1, 2) == 5)
- println(c.toString)
-
- val d = a *@ 5.0
- assert(d(0, 0) == 5)
- assert(d(1, 1) == 20)
-
- a *@= b
- assert(a(0, 0) == 1)
- assert(a(1, 2) == 5)
- println(a.toString)
-
- }
-
-}
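
Note the operator split between the two DSLs: under MatlabLikeOps, * is the matrix product and *@ the elementwise (Hadamard) product, while under RLikeOps (the following suites) %*% is the matrix product and * is elementwise. A short contrast with the operands from the test above:

    import org.apache.mahout.math.scalabindings._
    import org.apache.mahout.math.scalabindings.MatlabLikeOps._

    val a = dense((1, 2, 3), (3, 4, 5)) // 2 x 3
    val b = dense(1, 4, 5)              // scalar rows, i.e. a 3 x 1 column
    val m = a * b                       // matrix product here; a %*% b under RLikeOps
    // m(0, 0) = (1, 2, 3) dot (1, 4, 5) = 1 + 8 + 15 = 24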

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/MatrixOpsSuite.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/MatrixOpsSuite.scala b/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/MatrixOpsSuite.scala
deleted file mode 100644
index 1296d9e..0000000
--- a/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/MatrixOpsSuite.scala
+++ /dev/null
@@ -1,228 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.scalabindings
-
-import org.scalatest.{Matchers, FunSuite}
-import RLikeOps._
-import scala._
-import org.apache.mahout.test.MahoutSuite
-import org.apache.mahout.math.{RandomAccessSparseVector, SequentialAccessSparseVector, Matrices}
-import org.apache.mahout.common.RandomUtils
-
-import scala.util.Random
-
-
-class MatrixOpsSuite extends FunSuite with MahoutSuite {
-
- test("equivalence") {
- val a = dense((1, 2, 3), (3, 4, 5))
- val b = dense((1, 2, 3), (3, 4, 5))
- val c = dense((1, 4, 3), (3, 4, 5))
- assert(a === b)
- assert(a !== c)
- }
-
- test("elementwise plus, minus") {
- val a = dense((1, 2, 3), (3, 4, 5))
- val b = dense((1, 1, 2), (2, 1, 1))
-
- val c = a + b
- assert(c(0, 0) == 2)
- assert(c(1, 2) == 6)
- println(c.toString)
- }
-
- test("matrix, vector slicing") {
-
- val a = dense((1, 2, 3), (3, 4, 5))
-
- assert(a(::, 0).sum == 4)
- assert(a(1, ::).sum == 12)
-
- assert(a(0 to 1, 1 to 2).sum == 14)
-
- // assign to slice-vector
- a(0, 0 to 1) := (3, 5)
- // or
- a(0, 0 to 1) = (3, 5)
-
- assert(a(0, ::).sum == 11)
-
- println(a.toString)
-
- // assign to a slice-matrix
- a(0 to 1, 0 to 1) := dense((1, 1), (2, 2.5))
-
- // or
- a(0 to 1, 0 to 1) = dense((1, 1), (2, 2.5))
-
- println(a)
- println(a.sum)
-
- val b = dense((1, 2, 3), (3, 4, 5))
- b(0, ::) -= dvec(1, 2, 3)
- println(b)
- b(0, ::) should equal(dvec(0, 0, 0))
-
- }
-
- test("assignments") {
-
- val a = dense((1, 2, 3), (3, 4, 5))
-
- val b = a cloned
-
- b(0, 0) = 2.0
-
- printf("B=\n%s\n", b)
-
- assert(math.abs((b - a).norm - 1) < 1e-10)
-
- val e = eye(5)
-
- println(s"I(5)=\n$e")
-
- a(0 to 1, 1 to 2) = dense((3, 2), (2, 3))
- a(0 to 1, 1 to 2) := dense((3, 2), (2, 3))
-
- println(s"a=$a")
-
- a(0 to 1, 1 to 2) := { _ => 45}
- println(s"a=$a")
-
-// a(0 to 1, 1 to 2) ::= { _ => 44}
- println(s"a=$a")
-
- // Sparse assignment to a sparse block
- val c = sparse(0 -> 1 :: Nil, 2 -> 2 :: Nil, 1 -> 5 :: Nil)
- val d = c.cloned
-
- println(s"d=$d")
- d.ncol shouldBe 3
-
- d(::, 1 to 2) ::= { _ => 4}
- println(s"d=$d")
- d(::, 1 to 2).sum shouldBe 8
-
- d ::= {_ => 5}
- d.sum shouldBe 15
-
- val f = c.cloned.t
- f ::= {_ => 6}
- f.sum shouldBe 18
-
- val g = c.cloned
- g(::, 1 until g.nrow) ::= { x => if (x <= 0) 0.0 else 1.0}
- g.sum shouldBe 3
- }
-
- test("functional apply()") {
- val mxA = sparse (
- (1 -> 3) :: (7 -> 7) :: Nil,
- (4 -> 5) :: (5 -> 8) :: Nil
- )
- val mxAControl = mxA cloned
-
- (mxA(x ⇒ x + 1) - (mxAControl + 1)).norm should be < 1e-7
- (mxA(x ⇒ x * 2) - (2 * mxAControl)).norm should be < 1e-7
-
- }
-
- test("sparse") {
-
- val a = sparse((1, 3) :: Nil,
- (0, 2) ::(1, 2.5) :: Nil
- )
- println(a.toString)
- }
-
- test("colSums, rowSums, colMeans, rowMeans, numNonZeroElementsPerColumn") {
- val a = dense(
- (2, 3, 4),
- (3, 4, 5)
- )
-
- a.colSums() should equal(dvec(5, 7, 9))
- a.rowSums() should equal(dvec(9, 12))
- a.colMeans() should equal(dvec(2.5, 3.5, 4.5))
- a.rowMeans() should equal(dvec(3, 4))
- a.numNonZeroElementsPerColumn() should equal(dvec(2,2,2))
- a.numNonZeroElementsPerRow() should equal(dvec(3,3))
-
- }
-
- test("numNonZeroElementsPerColumn and Row") {
- val a = dense(
- (2, 3, 4),
- (3, 4, 5),
- (-5, 0, -1),
- (0, 0, 1)
- )
-
- a.numNonZeroElementsPerColumn() should equal(dvec(3,2,4))
- a.numNonZeroElementsPerRow() should equal(dvec(3,3,2,1))
- }
-
- test("Vector Assignment performance") {
-
- val n = 1000
- val k = (n * 0.1).toInt
- val nIters = 10000
-
- val rnd = RandomUtils.getRandom
-
- val src = new SequentialAccessSparseVector(n)
- for (i <- 0 until k) src(rnd.nextInt(n)) = rnd.nextDouble()
-
- val times = (0 until 50).map { i =>
- val ms = System.currentTimeMillis()
- var j = 0
- while (j < nIters) {
- new SequentialAccessSparseVector(n) := src
- j += 1
- }
- System.currentTimeMillis() - ms
- }.tail // drop the first measurement (JIT warm-up)
-
- val avgTime = times.sum.toDouble / times.size
-
- printf("Average assignment seqSparse2seqSparse time: %.3f ms\n", avgTime)
-
- val times2 = (0 until 50).map { i =>
- val ms = System.currentTimeMillis()
- var j = 0
- while (j < nIters) {
- new SequentialAccessSparseVector(n) := (new RandomAccessSparseVector(n) := src)
- j += 1
- }
- System.currentTimeMillis() - ms
- }.tail // drop the first measurement (JIT warm-up)
-
- val avgTime2 = times2.sum.toDouble / times2.size
-
- printf("Average assignment seqSparse2seqSparse via Random Access Sparse time: %.3f ms\n", avgTime2)
-
- }
-
-
-
-}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/RLikeMatrixOpsSuite.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/RLikeMatrixOpsSuite.scala b/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/RLikeMatrixOpsSuite.scala
deleted file mode 100644
index 6dc8207..0000000
--- a/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/RLikeMatrixOpsSuite.scala
+++ /dev/null
@@ -1,369 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.scalabindings
-
-import java.util
-
-import org.apache.log4j.Level
-import org.apache.mahout.math._
-import org.scalatest.FunSuite
-import RLikeOps._
-import org.apache.mahout.test.MahoutSuite
-import org.apache.mahout.logging._
-import scala.collection.JavaConversions._
-import scala.util.Random
-
-class RLikeMatrixOpsSuite extends FunSuite with MahoutSuite {
-
- test("multiplication") {
-
- val a = dense((1, 2, 3), (3, 4, 5))
- val b = dense(1, 4, 5)
- val m = a %*% b
-
- assert(m(0, 0) == 24)
- assert(m(1, 0) == 44)
- println(m.toString)
- }
-
- test("Hadamard") {
- val a = dense(
- (1, 2, 3),
- (3, 4, 5)
- )
- val b = dense(
- (1, 1, 2),
- (2, 1, 1)
- )
-
- val c = a * b
-
- printf("C=\n%s\n", c)
-
- assert(c(0, 0) == 1)
- assert(c(1, 2) == 5)
- println(c.toString)
-
- val d = a * 5.0
- assert(d(0, 0) == 5)
- assert(d(1, 1) == 20)
-
- a *= b
- assert(a(0, 0) == 1)
- assert(a(1, 2) == 5)
- println(a.toString)
-
- }
-
- test("Uniform view") {
- val mxUnif = Matrices.symmetricUniformView(5000000, 5000000, 1234)
- }
-
- /** Test dsl overloads over scala operations over matrices */
- test ("scalarOps") {
- val a = dense(
- (1, 2, 3),
- (3, 4, 5)
- )
-
- (10 * a - (10 *: a)).norm shouldBe 0
- (10 + a - (10 +: a)).norm shouldBe 0
- (10 - a - (10 -: a)).norm shouldBe 0
- (10 / a - (10 /: a)).norm shouldBe 0
-
- }
-
- test("Multiplication experimental performance") {
-
- getLog(MMul.getClass).setLevel(Level.DEBUG)
-
- val d = 300
- val n = 3
-
- // Dense row-wise
- val mxAd = new DenseMatrix(d, d) := Matrices.gaussianView(d, d, 134) + 1
- val mxBd = new DenseMatrix(d, d) := Matrices.gaussianView(d, d, 134) - 1
-
- val rnd = new Random(1234)
-
- // Sparse rows
- val mxAsr = (new SparseRowMatrix(d,
- d) := { _ => if (rnd.nextDouble() < 0.1) rnd.nextGaussian() + 1 else 0.0 }) cloned
- val mxBsr = (new SparseRowMatrix(d,
- d) := { _ => if (rnd.nextDouble() < 0.1) rnd.nextGaussian() - 1 else 0.0 }) cloned
-
- // Hanging sparse rows
- val mxAs = (new SparseMatrix(d, d) := { _ => if (rnd.nextDouble() < 0.1) rnd.nextGaussian() + 1 else 0.0 }) cloned
- val mxBs = (new SparseMatrix(d, d) := { _ => if (rnd.nextDouble() < 0.1) rnd.nextGaussian() - 1 else 0.0 }) cloned
-
- // DIAGONAL
- val mxD = diagv(dvec(Array.tabulate(d)(_ => rnd.nextGaussian())))
-
- def time(op: => Unit): Long = {
- val ms = System.currentTimeMillis()
- op
- System.currentTimeMillis() - ms
- }
-
-
- // We're not using GPUMMul or OMPMMul in math-scala, so we don't need to worry about
- // switching the multiplication backend in this method.
- def getMmulAvgs(mxA: Matrix, mxB: Matrix, n: Int) = {
-
- var control: Matrix = null
- var mmulVal: Matrix = null
-
- val current = Stream.range(0, n).map { _ => time {control = mxA.times(mxB)} }.sum.toDouble / n
- val experimental = Stream.range(0, n).map { _ => time {mmulVal = MMul(mxA, mxB, None)} }.sum.toDouble / n
- (control - mmulVal).norm should be < 1e-10
- current -> experimental
- }
-
- // Dense matrix tests.
- println(s"Ad %*% Bd: ${getMmulAvgs(mxAd, mxBd, n)}")
- println(s"Ad(::,::) %*% Bd: ${getMmulAvgs(mxAd(0 until mxAd.nrow,::), mxBd, n)}")
- println(s"Ad' %*% Bd: ${getMmulAvgs(mxAd.t, mxBd, n)}")
- println(s"Ad %*% Bd': ${getMmulAvgs(mxAd, mxBd.t, n)}")
- println(s"Ad' %*% Bd': ${getMmulAvgs(mxAd.t, mxBd.t, n)}")
- println(s"Ad'' %*% Bd'': ${getMmulAvgs(mxAd.t.t, mxBd.t.t, n)}")
- println
-
- // Sparse row matrix tests.
- println(s"Asr %*% Bsr: ${getMmulAvgs(mxAsr, mxBsr, n)}")
- println(s"Asr' %*% Bsr: ${getMmulAvgs(mxAsr.t, mxBsr, n)}")
- println(s"Asr %*% Bsr': ${getMmulAvgs(mxAsr, mxBsr.t, n)}")
- println(s"Asr' %*% Bsr': ${getMmulAvgs(mxAsr.t, mxBsr.t, n)}")
- println(s"Asr'' %*% Bsr'': ${getMmulAvgs(mxAsr.t.t, mxBsr.t.t, n)}")
- println
-
- // Sparse matrix tests.
- println(s"Asm %*% Bsm: ${getMmulAvgs(mxAs, mxBs, n)}")
- println(s"Asm' %*% Bsm: ${getMmulAvgs(mxAs.t, mxBs, n)}")
- println(s"Asm %*% Bsm': ${getMmulAvgs(mxAs, mxBs.t, n)}")
- println(s"Asm' %*% Bsm': ${getMmulAvgs(mxAs.t, mxBs.t, n)}")
- println(s"Asm'' %*% Bsm'': ${getMmulAvgs(mxAs.t.t, mxBs.t.t, n)}")
- println
-
- // Mixed sparse matrix tests.
- println(s"Asm %*% Bsr: ${getMmulAvgs(mxAs, mxBsr, n)}")
- println(s"Asm' %*% Bsr: ${getMmulAvgs(mxAs.t, mxBsr, n)}")
- println(s"Asm %*% Bsr': ${getMmulAvgs(mxAs, mxBsr.t, n)}")
- println(s"Asm' %*% Bsr': ${getMmulAvgs(mxAs.t, mxBsr.t, n)}")
- println(s"Asm'' %*% Bsr'': ${getMmulAvgs(mxAs.t.t, mxBsr.t.t, n)}")
- println
-
- println(s"Asr %*% Bsm: ${getMmulAvgs(mxAsr, mxBs, n)}")
- println(s"Asr' %*% Bsm: ${getMmulAvgs(mxAsr.t, mxBs, n)}")
- println(s"Asr %*% Bsm': ${getMmulAvgs(mxAsr, mxBs.t, n)}")
- println(s"Asr' %*% Bsm': ${getMmulAvgs(mxAsr.t, mxBs.t, n)}")
- println(s"Asr'' %*% Bsm'': ${getMmulAvgs(mxAsr.t.t, mxBs.t.t, n)}")
- println
-
- // Mixed dense/sparse
- println(s"Ad %*% Bsr: ${getMmulAvgs(mxAd, mxBsr, n)}")
- println(s"Ad' %*% Bsr: ${getMmulAvgs(mxAd.t, mxBsr, n)}")
- println(s"Ad %*% Bsr': ${getMmulAvgs(mxAd, mxBsr.t, n)}")
- println(s"Ad' %*% Bsr': ${getMmulAvgs(mxAd.t, mxBsr.t, n)}")
- println(s"Ad'' %*% Bsr'': ${getMmulAvgs(mxAd.t.t, mxBsr.t.t, n)}")
- println
-
- println(s"Asr %*% Bd: ${getMmulAvgs(mxAsr, mxBd, n)}")
- println(s"Asr' %*% Bd: ${getMmulAvgs(mxAsr.t, mxBd, n)}")
- println(s"Asr %*% Bd': ${getMmulAvgs(mxAsr, mxBd.t, n)}")
- println(s"Asr' %*% Bd': ${getMmulAvgs(mxAsr.t, mxBd.t, n)}")
- println(s"Asr'' %*% Bd'': ${getMmulAvgs(mxAsr.t.t, mxBd.t.t, n)}")
- println
-
- println(s"Ad %*% Bsm: ${getMmulAvgs(mxAd, mxBs, n)}")
- println(s"Ad' %*% Bsm: ${getMmulAvgs(mxAd.t, mxBs, n)}")
- println(s"Ad %*% Bsm': ${getMmulAvgs(mxAd, mxBs.t, n)}")
- println(s"Ad' %*% Bsm': ${getMmulAvgs(mxAd.t, mxBs.t, n)}")
- println(s"Ad'' %*% Bsm'': ${getMmulAvgs(mxAd.t.t, mxBs.t.t, n)}")
- println
-
- println(s"Asm %*% Bd: ${getMmulAvgs(mxAs, mxBd, n)}")
- println(s"Asm' %*% Bd: ${getMmulAvgs(mxAs.t, mxBd, n)}")
- println(s"Asm %*% Bd': ${getMmulAvgs(mxAs, mxBd.t, n)}")
- println(s"Asm' %*% Bd': ${getMmulAvgs(mxAs.t, mxBd.t, n)}")
- println(s"Asm'' %*% Bd'': ${getMmulAvgs(mxAs.t.t, mxBd.t.t, n)}")
- println
-
- // Diagonal cases
- println(s"Ad %*% D: ${getMmulAvgs(mxAd, mxD, n)}")
- println(s"Asr %*% D: ${getMmulAvgs(mxAsr, mxD, n)}")
- println(s"Asm %*% D: ${getMmulAvgs(mxAs, mxD, n)}")
- println(s"D %*% Ad: ${getMmulAvgs(mxD, mxAd, n)}")
- println(s"D %*% Asr: ${getMmulAvgs(mxD, mxAsr, n)}")
- println(s"D %*% Asm: ${getMmulAvgs(mxD, mxAs, n)}")
- println
-
- println(s"Ad' %*% D: ${getMmulAvgs(mxAd.t, mxD, n)}")
- println(s"Asr' %*% D: ${getMmulAvgs(mxAsr.t, mxD, n)}")
- println(s"Asm' %*% D: ${getMmulAvgs(mxAs.t, mxD, n)}")
- println(s"D %*% Ad': ${getMmulAvgs(mxD, mxAd.t, n)}")
- println(s"D %*% Asr': ${getMmulAvgs(mxD, mxAsr.t, n)}")
- println(s"D %*% Asm': ${getMmulAvgs(mxD, mxAs.t, n)}")
- println
-
- // Self-squared cases
- println(s"Ad %*% Ad': ${getMmulAvgs(mxAd, mxAd.t, n)}")
- println(s"Ad' %*% Ad: ${getMmulAvgs(mxAd.t, mxAd, n)}")
- println(s"Ad' %*% Ad'': ${getMmulAvgs(mxAd.t, mxAd.t.t, n)}")
- println(s"Ad'' %*% Ad': ${getMmulAvgs(mxAd.t.t, mxAd.t, n)}")
-
- }
-
-
- test("elementwise experimental performance") {
-
- val d = 500
- val n = 3
-
- // Dense row-wise
- val mxAd = new DenseMatrix(d, d) := Matrices.gaussianView(d, d, 134) + 1
- val mxBd = new DenseMatrix(d, d) := Matrices.gaussianView(d, d, 134) - 1
-
- val rnd = new Random(1234)
-
- // Sparse rows
- val mxAsr = (new SparseRowMatrix(d,
- d) := { _ => if (rnd.nextDouble() < 0.1) rnd.nextGaussian() + 1 else 0.0 }) cloned
- val mxBsr = (new SparseRowMatrix(d,
- d) := { _ => if (rnd.nextDouble() < 0.1) rnd.nextGaussian() - 1 else 0.0 }) cloned
-
- // Hanging sparse rows
- val mxAs = (new SparseMatrix(d, d) := { _ => if (rnd.nextDouble() < 0.1) rnd.nextGaussian() + 1 else 0.0 }) cloned
- val mxBs = (new SparseMatrix(d, d) := { _ => if (rnd.nextDouble() < 0.1) rnd.nextGaussian() - 1 else 0.0 }) cloned
-
- // DIAGONAL
- val mxD = diagv(dvec(Array.tabulate(d)(_ => rnd.nextGaussian())))
-
- def time(op: => Unit): Long = {
- val ms = System.currentTimeMillis()
- op
- System.currentTimeMillis() - ms
- }
-
- def getEWAvgs(mxA: Matrix, mxB: Matrix, n: Int) = {
-
- var control: Matrix = null
- var mmulVal: Matrix = null
-
- val current = Stream.range(0, n).map { _ => time {control = mxA + mxB} }.sum.toDouble / n
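- // note: control and experimental both time the same expression here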
- val experimental = Stream.range(0, n).map { _ => time {mmulVal = mxA + mxB} }.sum.toDouble / n
- (control - mmulVal).norm should be < 1e-10
- current -> experimental
- }
-
- // Dense matrix tests.
- println(s"Ad + Bd: ${getEWAvgs(mxAd, mxBd, n)}")
- println(s"Ad' + Bd: ${getEWAvgs(mxAd.t, mxBd, n)}")
- println(s"Ad + Bd': ${getEWAvgs(mxAd, mxBd.t, n)}")
- println(s"Ad' + Bd': ${getEWAvgs(mxAd.t, mxBd.t, n)}")
- println(s"Ad'' + Bd'': ${getEWAvgs(mxAd.t.t, mxBd.t.t, n)}")
- println
-
- // Sparse row matrix tests.
- println(s"Asr + Bsr: ${getEWAvgs(mxAsr, mxBsr, n)}")
- println(s"Asr' + Bsr: ${getEWAvgs(mxAsr.t, mxBsr, n)}")
- println(s"Asr + Bsr': ${getEWAvgs(mxAsr, mxBsr.t, n)}")
- println(s"Asr' + Bsr': ${getEWAvgs(mxAsr.t, mxBsr.t, n)}")
- println(s"Asr'' + Bsr'': ${getEWAvgs(mxAsr.t.t, mxBsr.t.t, n)}")
- println
-
- // Sparse matrix tests.
- println(s"Asm + Bsm: ${getEWAvgs(mxAs, mxBs, n)}")
- println(s"Asm' + Bsm: ${getEWAvgs(mxAs.t, mxBs, n)}")
- println(s"Asm + Bsm': ${getEWAvgs(mxAs, mxBs.t, n)}")
- println(s"Asm' + Bsm': ${getEWAvgs(mxAs.t, mxBs.t, n)}")
- println(s"Asm'' + Bsm'': ${getEWAvgs(mxAs.t.t, mxBs.t.t, n)}")
- println
-
- // Mixed sparse matrix tests.
- println(s"Asm + Bsr: ${getEWAvgs(mxAs, mxBsr, n)}")
- println(s"Asm' + Bsr: ${getEWAvgs(mxAs.t, mxBsr, n)}")
- println(s"Asm + Bsr': ${getEWAvgs(mxAs, mxBsr.t, n)}")
- println(s"Asm' + Bsr': ${getEWAvgs(mxAs.t, mxBsr.t, n)}")
- println(s"Asm'' + Bsr'': ${getEWAvgs(mxAs.t.t, mxBsr.t.t, n)}")
- println
-
- println(s"Asr + Bsm: ${getEWAvgs(mxAsr, mxBs, n)}")
- println(s"Asr' + Bsm: ${getEWAvgs(mxAsr.t, mxBs, n)}")
- println(s"Asr + Bsm': ${getEWAvgs(mxAsr, mxBs.t, n)}")
- println(s"Asr' + Bsm': ${getEWAvgs(mxAsr.t, mxBs.t, n)}")
- println(s"Asr'' + Bsm'': ${getEWAvgs(mxAsr.t.t, mxBs.t.t, n)}")
- println
-
- // Mixed dense/sparse
- println(s"Ad + Bsr: ${getEWAvgs(mxAd, mxBsr, n)}")
- println(s"Ad' + Bsr: ${getEWAvgs(mxAd.t, mxBsr, n)}")
- println(s"Ad + Bsr': ${getEWAvgs(mxAd, mxBsr.t, n)}")
- println(s"Ad' + Bsr': ${getEWAvgs(mxAd.t, mxBsr.t, n)}")
- println(s"Ad'' + Bsr'': ${getEWAvgs(mxAd.t.t, mxBsr.t.t, n)}")
- println
-
- println(s"Asr + Bd: ${getEWAvgs(mxAsr, mxBd, n)}")
- println(s"Asr' + Bd: ${getEWAvgs(mxAsr.t, mxBd, n)}")
- println(s"Asr + Bd': ${getEWAvgs(mxAsr, mxBd.t, n)}")
- println(s"Asr' + Bd': ${getEWAvgs(mxAsr.t, mxBd.t, n)}")
- println(s"Asr'' + Bd'': ${getEWAvgs(mxAsr.t.t, mxBd.t.t, n)}")
- println
-
- println(s"Ad + Bsm: ${getEWAvgs(mxAd, mxBs, n)}")
- println(s"Ad' + Bsm: ${getEWAvgs(mxAd.t, mxBs, n)}")
- println(s"Ad + Bsm': ${getEWAvgs(mxAd, mxBs.t, n)}")
- println(s"Ad' + Bsm': ${getEWAvgs(mxAd.t, mxBs.t, n)}")
- println(s"Ad'' + Bsm'': ${getEWAvgs(mxAd.t.t, mxBs.t.t, n)}")
- println
-
- println(s"Asm + Bd: ${getEWAvgs(mxAs, mxBd, n)}")
- println(s"Asm' + Bd: ${getEWAvgs(mxAs.t, mxBd, n)}")
- println(s"Asm + Bd': ${getEWAvgs(mxAs, mxBd.t, n)}")
- println(s"Asm' + Bd': ${getEWAvgs(mxAs.t, mxBd.t, n)}")
- println(s"Asm'' + Bd'': ${getEWAvgs(mxAs.t.t, mxBd.t.t, n)}")
- println
-
- // Diagonal cases
- println(s"Ad + D: ${getEWAvgs(mxAd, mxD, n)}")
- println(s"Asr + D: ${getEWAvgs(mxAsr, mxD, n)}")
- println(s"Asm + D: ${getEWAvgs(mxAs, mxD, n)}")
- println(s"D + Ad: ${getEWAvgs(mxD, mxAd, n)}")
- println(s"D + Asr: ${getEWAvgs(mxD, mxAsr, n)}")
- println(s"D + Asm: ${getEWAvgs(mxD, mxAs, n)}")
- println
-
- println(s"Ad' + D: ${getEWAvgs(mxAd.t, mxD, n)}")
- println(s"Asr' + D: ${getEWAvgs(mxAsr.t, mxD, n)}")
- println(s"Asm' + D: ${getEWAvgs(mxAs.t, mxD, n)}")
- println(s"D + Ad': ${getEWAvgs(mxD, mxAd.t, n)}")
- println(s"D + Asr': ${getEWAvgs(mxD, mxAsr.t, n)}")
- println(s"D + Asm': ${getEWAvgs(mxD, mxAs.t, n)}")
- println
-
- }
-
- test("dense-view-debug") {
- val d = 500
- // Dense row-wise
- val mxAd = new DenseMatrix(d, d) := Matrices.gaussianView(d, d, 134) + 1
- val mxBd = new DenseMatrix(d, d) := Matrices.gaussianView(d, d, 134) - 1
-
- mxAd(0 until mxAd.nrow, ::) %*% mxBd
-
- }
-}
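
The timing pattern shared by the two performance tests above generalizes; a small sketch of the same idea with the warm-up discard made explicit (the avgMs helper is illustrative, not from the suite):

    def time(op: => Unit): Long = {
      val ms = System.currentTimeMillis()
      op
      System.currentTimeMillis() - ms
    }

    // average over n runs, discarding the first so JIT warm-up does not skew the mean
    def avgMs(n: Int)(op: => Unit): Double = {
      val runs = (0 until n).map(_ => time(op))
      runs.tail.sum.toDouble / runs.tail.size
    }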

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/RLikeVectorOpsSuite.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/RLikeVectorOpsSuite.scala b/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/RLikeVectorOpsSuite.scala
deleted file mode 100644
index f17f08a..0000000
--- a/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/RLikeVectorOpsSuite.scala
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.scalabindings
-
-import org.apache.log4j.{BasicConfigurator, Level}
-import org.apache.mahout.logging._
-import org.apache.mahout.math._
-import org.apache.mahout.math.scalabindings.RLikeOps._
-import org.apache.mahout.test.MahoutSuite
-import org.scalatest.FunSuite
-
-class RLikeVectorOpsSuite extends FunSuite with MahoutSuite {
-
- BasicConfigurator.configure()
- private[scalabindings] final implicit val log = getLog(classOf[RLikeVectorOpsSuite])
- setLogLevel(Level.DEBUG)
-
- test("Hadamard") {
- val a: Vector = (1, 2, 3)
- val b = (3, 4, 5)
-
- val c = a * b
- println(c)
- assert(c ===(3, 8, 15))
- }
-
- test("dot-view performance") {
-
- val dv1 = new DenseVector(500) := Matrices.uniformView(1, 500, 1234)(0, ::)
- val dv2 = new DenseVector(500) := Matrices.uniformView(1, 500, 1244)(0, ::)
-
- val nit = 300000
-
- // warm up
- dv1 dot dv2
-
- val dmsStart = System.currentTimeMillis()
- for (i ← 0 until nit)
- dv1 dot dv2
- val dmsMs = System.currentTimeMillis() - dmsStart
-
- val (dvv1, dvv2) = dv1(0 until dv1.length) → dv2(0 until dv2.length)
-
- // Warm up.
- dvv1 dot dvv2
-
- val dvmsStart = System.currentTimeMillis()
- for (i ← 0 until nit)
- dvv1 dot dvv2
- val dvmsMs = System.currentTimeMillis() - dvmsStart
-
- debug(f"dense vector dots:${dmsMs}%.2f ms.")
- debug(f"dense view dots:${dvmsMs}%.2f ms.")
-
- }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/VectorOpsSuite.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/VectorOpsSuite.scala b/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/VectorOpsSuite.scala
deleted file mode 100644
index fe272df..0000000
--- a/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/VectorOpsSuite.scala
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.scalabindings
-
-import org.scalatest.FunSuite
-import org.apache.mahout.math.{SequentialAccessSparseVector, RandomAccessSparseVector, Vector}
-import RLikeOps._
-import org.apache.mahout.test.MahoutSuite
-
-import scala.util.Random
-
-/** VectorOps Suite */
-class VectorOpsSuite extends FunSuite with MahoutSuite {
-
- test("inline create") {
-
- val sparseVec = svec((5 -> 1) :: (10 -> 2.0) :: Nil)
- println(sparseVec)
-
- assert(sparseVec.size() == 11)
-
- val sparseVec2: Vector = (5 -> 1.0) :: (10 -> 2.0) :: Nil
- println(sparseVec2)
-
- val sparseVec3: Vector = new RandomAccessSparseVector(100) := (5 -> 1.0) :: Nil
- println(sparseVec3)
-
- val sparseVec4 = svec((5 -> 1) :: (10 -> 2.0) :: Nil, 100)
- println(sparseVec4)
-
- assert(sparseVec4.size() == 100)
-
- intercept[IllegalArgumentException] {
- val sparseVec5 = svec((5 -> 1) :: (10 -> 2.0) :: Nil, 10)
- }
-
- val denseVec1: Vector = (1.0, 1.1, 1.2)
- println(denseVec1)
-
- val denseVec2 = dvec(1, 0, 1.1, 1.2)
- println(denseVec2)
- }
-
- test("plus minus") {
-
- val a: Vector = (1, 2, 3)
- val b: Vector = (0 -> 3) :: (1 -> 4) :: (2 -> 5) :: Nil
-
- val c = a + b
- val d = b - a
- val e = -b - a
-
- assert(c ===(4, 6, 8))
- assert(d ===(2, 2, 2))
- assert(e ===(-4, -6, -8))
-
- }
-
- test("dot") {
-
- val a: Vector = (1, 2, 3)
- val b = (3, 4, 5)
-
- val c = a dot b
- println(c)
- assert(c == 26)
-
- }
-
- test ("scalarOps") {
- val a = dvec(1 to 5): Vector
-
- 10 * a shouldBe 10 *: a
- 10 + a shouldBe 10 +: a
- 10 - a shouldBe 10 -: a
- 10 / a shouldBe 10 /: a
-
- }
-
- test("sparse assignment") {
-
- val svec = new SequentialAccessSparseVector(30)
- svec(1) = -0.5
- svec(3) = 0.5
- println(svec)
-
- svec(1 until svec.length) ::= ( _ => 0)
- println(svec)
-
- svec.sum shouldBe 0
-
-
- }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/test/scala/org/apache/mahout/nlp/tfidf/TFIDFtestBase.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/test/scala/org/apache/mahout/nlp/tfidf/TFIDFtestBase.scala b/math-scala/src/test/scala/org/apache/mahout/nlp/tfidf/TFIDFtestBase.scala
deleted file mode 100644
index 4635e95..0000000
--- a/math-scala/src/test/scala/org/apache/mahout/nlp/tfidf/TFIDFtestBase.scala
+++ /dev/null
@@ -1,184 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.nlp.tfidf
-
-import org.apache.mahout.math._
-import org.apache.mahout.math.scalabindings._
-import org.apache.mahout.test.DistributedMahoutSuite
-import org.scalatest.{FunSuite, Matchers}
-import scala.collection._
-import RLikeOps._
-import scala.math._
-
-
-trait TFIDFtestBase extends DistributedMahoutSuite with Matchers {
- this: FunSuite =>
-
- val epsilon = 1E-6
-
- val documents: List[(Int, String)] = List(
- (1, "the first document contains 5 terms"),
- (2, "document two document contains 4 terms"),
- (3, "document three three terms"),
- (4, "each document including this document contain the term document"))
-
- def createDictionaryAndDfMaps(documents: List[(Int, String)]): (Map[String, Int], Map[Int, Int]) = {
-
- // count occurrences of each term across the entire corpus
- val dictMap = documents.unzip._2.mkString(" ").toLowerCase.split(" ").groupBy(identity).mapValues(_.length)
-
- // create a dictionary with an index for each term
- val dictIndex = dictMap.zipWithIndex.map(x => x._1._1 -> x._2)
-
- val docFrequencyCount = new Array[Int](dictMap.size)
-
- for (token <- dictMap) {
- for (doc <- documents) {
- // if the document contains the term, increment that term's document-frequency count
- if (doc._2.toLowerCase.split(" ").contains(token._1)) {
- docFrequencyCount(dictIndex(token._1)) += 1
- }
- }
- }
-
- val docFrequencyMap = docFrequencyCount.zipWithIndex.map(x => x._2 -> x._1).toMap
-
- (dictIndex, docFrequencyMap)
- }
-
- def vectorizeDocument(document: String,
- dictionaryMap: Map[String, Int],
- dfMap: Map[Int, Int], weight: TermWeight = new TFIDF): Vector = {
-
- val wordCounts = document.toLowerCase.split(" ").groupBy(identity).mapValues(_.length)
-
- val vec = new RandomAccessSparseVector(dictionaryMap.size)
-
- val totalDFSize = dictionaryMap.size
- val docSize = wordCounts.size
-
- for (word <- wordCounts) {
- val term = word._1
- if (dictionaryMap.contains(term)) {
- val termFreq = word._2
- val dictIndex = dictionaryMap(term)
- val docFreq = dfMap(dictIndex)
- val currentWeight = weight.calculate(termFreq, docFreq.toInt, docSize, totalDFSize.toInt)
- vec(dictIndex) = currentWeight
- }
- }
- vec
- }
-
- test("TF test") {
-
- val (dictionary, dfMap) = createDictionaryAndDfMaps(documents)
-
- val tf: TermWeight = new TF()
-
- val vectorizedDocuments: Matrix = new SparseMatrix(documents.size, dictionary.size)
-
- for (doc <- documents) {
- vectorizedDocuments(doc._1 - 1, ::) := vectorizeDocument(doc._2, dictionary, dfMap, tf)
- }
-
- // corpus:
- // (1, "the first document contains 5 terms"),
- // (2, "document two document contains 4 terms"),
- // (3, "document three three terms"),
- // (4, "each document including this document contain the term document")
-
- // dictionary:
- // (this -> 0, 4 -> 1, three -> 2, document -> 3, two -> 4, term -> 5, 5 -> 6, contain -> 7,
- // each -> 8, first -> 9, terms -> 10, contains -> 11, including -> 12, the -> 13)
-
- // dfMap:
- // (0 -> 1, 5 -> 1, 10 -> 3, 1 -> 1, 6 -> 1, 9 -> 1, 13 -> 2, 2 -> 1, 12 -> 1, 7 -> 1, 3 -> 4,
- // 11 -> 2, 8 -> 1, 4 -> 1)
-
- vectorizedDocuments(0, 0).toInt should be (0)
- vectorizedDocuments(0, 13).toInt should be (1)
- vectorizedDocuments(1, 3).toInt should be (2)
- vectorizedDocuments(3, 3).toInt should be (3)
-
- }
-
-
- test("TFIDF test") {
- val (dictionary, dfMap) = createDictionaryAndDfMaps(documents)
-
- val tfidf: TermWeight = new TFIDF()
-
- val vectorizedDocuments: Matrix = new SparseMatrix(documents.size, dictionary.size)
-
- for (doc <- documents) {
- vectorizedDocuments(doc._1 - 1, ::) := vectorizeDocument(doc._2, dictionary, dfMap, tfidf)
- }
-
- // corpus:
- // (1, "the first document contains 5 terms"),
- // (2, "document two document contains 4 terms"),
- // (3, "document three three terms"),
- // (4, "each document including this document contain the term document")
-
- // dictionary:
- // (this -> 0, 4 -> 1, three -> 2, document -> 3, two -> 4, term -> 5, 5 -> 6, contain -> 7,
- // each -> 8, first -> 9, terms -> 10, contains -> 11, including -> 12, the -> 13)
-
- // dfMap:
- // (0 -> 1, 5 -> 1, 10 -> 3, 1 -> 1, 6 -> 1, 9 -> 1, 13 -> 2, 2 -> 1, 12 -> 1, 7 -> 1, 3 -> 4,
- // 11 -> 2, 8 -> 1, 4 -> 1)
-
- abs(vectorizedDocuments(0, 0) - 0.0) should be < epsilon
- abs(vectorizedDocuments(0, 13) - 2.540445) should be < epsilon
- abs(vectorizedDocuments(1, 3) - 2.870315) should be < epsilon
- abs(vectorizedDocuments(3, 3) - 3.515403) should be < epsilon
- }
-
- test("MLlib TFIDF test") {
- val (dictionary, dfMap) = createDictionaryAndDfMaps(documents)
-
- val tfidf: TermWeight = new MLlibTFIDF()
-
- val vectorizedDocuments: Matrix = new SparseMatrix(documents.size, dictionary.size)
-
- for (doc <- documents) {
- vectorizedDocuments(doc._1 - 1, ::) := vectorizeDocument(doc._2, dictionary, dfMap, tfidf)
- }
-
- // corpus:
- // (1, "the first document contains 5 terms"),
- // (2, "document two document contains 4 terms"),
- // (3, "document three three terms"),
- // (4, "each document including this document contain the term document")
-
-    // dictionary:
- // (this -> 0, 4 -> 1, three -> 2, document -> 3, two -> 4, term -> 5, 5 -> 6, contain -> 7,
- // each -> 8, first -> 9, terms -> 10, contains -> 11, including -> 12, the -> 13)
-
- // dfMap:
- // (0 -> 1, 5 -> 1, 10 -> 3, 1 -> 1, 6 -> 1, 9 -> 1, 13 -> 2, 2 -> 1, 12 -> 1, 7 -> 1, 3 -> 4,
- // 11 -> 2, 8 -> 1, 4 -> 1)
-
- abs(vectorizedDocuments(0, 0) - 0.0) should be < epsilon
- abs(vectorizedDocuments(0, 13) - 1.609437) should be < epsilon
- abs(vectorizedDocuments(1, 3) - 2.197224) should be < epsilon
- abs(vectorizedDocuments(3, 3) - 3.295836) should be < epsilon
- }
-
-}
\ No newline at end of file

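The expected values asserted in the three tests above can be re-derived by hand. The classic TFIDF weights match Lucene's similarity, sqrt(tf) * (1 + ln(n / (df + 1))), and the MLlibTFIDF weights match Spark MLlib's IDF, tf * ln((n + 1) / (df + 1)), where n is the value the tests pass as numDocs (here the dictionary size, 14). A minimal plain-Scala sketch reproducing the asserted numbers; the formulas are inferred from the expected values, not taken from this diff:

object TfidfCheck extends App {
  val n = 14.0 // value the tests pass as numDocs (the dictionary size)

  // Lucene-style classic weight: sqrt(tf) * (1 + ln(n / (df + 1)))
  def tfidf(tf: Int, df: Int): Double =
    math.sqrt(tf) * (1 + math.log(n / (df + 1)))

  // MLlib-style weight: tf * ln((n + 1) / (df + 1))
  def mllibTfidf(tf: Int, df: Int): Double =
    tf * math.log((n + 1) / (df + 1))

  println(tfidf(1, 2))      // "the" in doc 1:      ~2.540445
  println(tfidf(2, 4))      // "document" in doc 2: ~2.870315
  println(tfidf(3, 4))      // "document" in doc 4: ~3.515403
  println(mllibTfidf(1, 2)) // ~1.609437
  println(mllibTfidf(2, 4)) // ~2.197224
  println(mllibTfidf(3, 4)) // ~3.295836
}
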
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/test/scala/org/apache/mahout/test/DistributedMahoutSuite.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/test/scala/org/apache/mahout/test/DistributedMahoutSuite.scala b/math-scala/src/test/scala/org/apache/mahout/test/DistributedMahoutSuite.scala
deleted file mode 100644
index 3538991..0000000
--- a/math-scala/src/test/scala/org/apache/mahout/test/DistributedMahoutSuite.scala
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.test
-
-import org.apache.mahout.math.drm.DistributedContext
-import org.scalatest.{Suite, FunSuite, Matchers}
-
-/**
- * Unit tests that use a distributed context to run
- */
-trait DistributedMahoutSuite extends MahoutSuite { this: Suite =>
- protected implicit var mahoutCtx: DistributedContext
-}

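A concrete suite has to bind the abstract mahoutCtx to an engine. A hypothetical sketch against the Spark bindings; mahoutSparkContext and the suite name below are assumptions for illustration, not part of this change:

import org.apache.mahout.math.drm.DistributedContext
import org.apache.mahout.sparkbindings._
import org.apache.mahout.test.DistributedMahoutSuite
import org.scalatest.FunSuite

class MyDistributedSuite extends FunSuite with DistributedMahoutSuite {
  // Bind the abstract context to a local Spark-backed DistributedContext
  // (mahoutSparkContext assumed from the Spark bindings module).
  protected implicit var mahoutCtx: DistributedContext =
    mahoutSparkContext(masterUrl = "local[2]", appName = "my-suite")
}
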
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/test/scala/org/apache/mahout/test/LoggerConfiguration.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/test/scala/org/apache/mahout/test/LoggerConfiguration.scala b/math-scala/src/test/scala/org/apache/mahout/test/LoggerConfiguration.scala
deleted file mode 100644
index 7a34aa2..0000000
--- a/math-scala/src/test/scala/org/apache/mahout/test/LoggerConfiguration.scala
+++ /dev/null
@@ -1,16 +0,0 @@
-package org.apache.mahout.test
-
-import org.scalatest._
-import org.apache.log4j.{Level, Logger, BasicConfigurator}
-
-trait LoggerConfiguration extends BeforeAndAfterAllConfigMap {
- this: Suite =>
-
- override protected def beforeAll(configMap: ConfigMap): Unit = {
- super.beforeAll(configMap)
- BasicConfigurator.resetConfiguration()
- BasicConfigurator.configure()
- Logger.getRootLogger.setLevel(Level.ERROR)
- Logger.getLogger("org.apache.mahout.math.scalabindings").setLevel(Level.DEBUG)
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/test/scala/org/apache/mahout/test/MahoutSuite.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/test/scala/org/apache/mahout/test/MahoutSuite.scala b/math-scala/src/test/scala/org/apache/mahout/test/MahoutSuite.scala
deleted file mode 100644
index d3b8a38..0000000
--- a/math-scala/src/test/scala/org/apache/mahout/test/MahoutSuite.scala
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.mahout.test
-
-import java.io.File
-import org.scalatest._
-import org.apache.mahout.common.RandomUtils
-
-trait MahoutSuite extends BeforeAndAfterEach with LoggerConfiguration with Matchers {
- this: Suite =>
-
- final val TmpDir = "tmp/"
-
- override protected def beforeEach() {
- super.beforeEach()
- RandomUtils.useTestSeed()
- }
-
- override protected def beforeAll(configMap: ConfigMap) {
- super.beforeAll(configMap)
-
-    // in case an existing tmp dir was left over, clean it before every suite
- deleteDirectory(new File(TmpDir))
- }
-
- override protected def afterEach() {
-
- // clean the tmp dir after every test
- deleteDirectory(new File(TmpDir))
-
- super.afterEach()
- }
-
-  /** Deletes the directory recursively; symlinks are not checked and exceptions are not caught. */
- private def deleteDirectory(path: File): Unit = {
- if (path.isDirectory)
- for (files <- path.listFiles) deleteDirectory(files)
- path.delete
- }
-}

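Note that deleteDirectory follows symlinked directories, so a link placed inside tmp/ would have its target's contents removed as well. A hedged sketch of a variant that deletes a symlink itself without recursing into it, using only java.nio; the helper name is illustrative:

import java.io.File
import java.nio.file.Files

object SafeDelete {
  /** Delete a directory tree, but never recurse into symlinked directories. */
  def deleteDirectorySafely(path: File): Unit = {
    if (path.isDirectory && !Files.isSymbolicLink(path.toPath))
      for (child <- path.listFiles()) deleteDirectorySafely(child)
    path.delete()
  }
}
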
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/pom.xml
----------------------------------------------------------------------
diff --git a/math/pom.xml b/math/pom.xml
deleted file mode 100644
index 9855b9d..0000000
--- a/math/pom.xml
+++ /dev/null
@@ -1,256 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
- <modelVersion>4.0.0</modelVersion>
-
- <parent>
- <groupId>org.apache.mahout</groupId>
- <artifactId>mahout</artifactId>
- <version>0.13.1-SNAPSHOT</version>
- <relativePath>../pom.xml</relativePath>
- </parent>
-
- <artifactId>mahout-math</artifactId>
- <name>Mahout Math</name>
-  <description>High performance scientific and technical computing data structures and methods,
-    mostly based on CERN's Colt Java API
-  </description>
-
- <packaging>jar</packaging>
-
- <!--<build>-->
- <!--<plugins>-->
- <!--&lt;!&ndash; copy jars to top directory, which is MAHOUT_HOME &ndash;&gt;-->
- <!--<plugin>-->
- <!--<artifactId>maven-antrun-plugin</artifactId>-->
- <!--<version>1.4</version>-->
- <!--<executions>-->
- <!--<execution>-->
- <!--<id>copy</id>-->
- <!--<phase>package</phase>-->
- <!--<configuration>-->
- <!--<tasks>-->
- <!--<copy file="target/mahout-math-${version}.jar" tofile="../mahout-math-${version}.jar" />-->
- <!--</tasks>-->
- <!--</configuration>-->
- <!--<goals>-->
- <!--<goal>run</goal>-->
- <!--</goals>-->
- <!--</execution>-->
- <!--</executions>-->
- <!--</plugin>-->
- <!--<plugin>-->
- <!--<groupId>org.apache.mahout</groupId>-->
- <!--<artifactId>mahout-collection-codegen-plugin</artifactId>-->
- <!--<executions>-->
- <!--<execution>-->
- <!--<phase>generate-sources</phase>-->
- <!--<goals>-->
- <!--<goal>generate</goal>-->
- <!--</goals>-->
- <!--<configuration>-->
- <!--&lt;!&ndash;<mainExcludes>&ndash;&gt;-->
- <!--&lt;!&ndash;<mainExclude>**/AbstractBooleanList.java</mainExclude>&ndash;&gt;-->
- <!--&lt;!&ndash;<mainExclude>**/BooleanArrayList.java</mainExclude>&ndash;&gt;-->
- <!--&lt;!&ndash;<mainExclude>**/BooleanBufferConsumer.java</mainExclude>&ndash;&gt;-->
- <!--&lt;!&ndash;</mainExcludes>&ndash;&gt;-->
- <!--&lt;!&ndash;<testExcludes>&ndash;&gt;-->
- <!--&lt;!&ndash;<testExclude>**/BooleanArrayListTest.java</testExclude>&ndash;&gt;-->
- <!--&lt;!&ndash;</testExcludes>&ndash;&gt;-->
- <!--<outputDirectory>${project.build.directory}/generated-sources/mahout</outputDirectory>-->
- <!--<testOutputDirectory>${project.build.directory}/generated-test-sources/mahout</testOutputDirectory>-->
- <!--</configuration>-->
- <!--</execution>-->
- <!--</executions>-->
- <!--</plugin>-->
-
- <!--<plugin>-->
- <!--<groupId>org.codehaus.mojo</groupId>-->
- <!--<artifactId>build-helper-maven-plugin</artifactId>-->
- <!--<executions>-->
- <!--<execution>-->
- <!--<id>add-source</id>-->
- <!--<phase>generate-sources</phase>-->
- <!--<goals>-->
- <!--<goal>add-source</goal>-->
- <!--</goals>-->
- <!--<configuration>-->
- <!--<sources>-->
- <!--<source>${project.build.directory}/generated-sources/mahout</source>-->
- <!--</sources>-->
- <!--</configuration>-->
- <!--</execution>-->
- <!--<execution>-->
- <!--<id>add-test-source</id>-->
- <!--<phase>generate-sources</phase>-->
- <!--<goals>-->
- <!--<goal>add-test-source</goal>-->
- <!--</goals>-->
- <!--<configuration>-->
- <!--<sources>-->
- <!--<source>${project.build.directory}/generated-test-sources/mahout</source>-->
- <!--</sources>-->
- <!--</configuration>-->
- <!--</execution>-->
- <!--</executions>-->
- <!--</plugin>-->
-
- <!--&lt;!&ndash; create test jar so other modules can reuse the math test utility classes. &ndash;&gt;-->
- <!--<plugin>-->
- <!--<groupId>org.apache.maven.plugins</groupId>-->
- <!--<artifactId>maven-jar-plugin</artifactId>-->
- <!--<executions>-->
- <!--<execution>-->
- <!--<goals>-->
- <!--<goal>test-jar</goal>-->
- <!--</goals>-->
- <!--<phase>package</phase>-->
- <!--</execution>-->
- <!--</executions>-->
- <!--</plugin>-->
-
- <!--<plugin>-->
- <!--<artifactId>maven-javadoc-plugin</artifactId>-->
- <!--</plugin>-->
-
- <!--<plugin>-->
- <!--<artifactId>maven-source-plugin</artifactId>-->
- <!--</plugin>-->
-
- <!--<plugin>-->
- <!--<groupId>org.apache.maven.plugins</groupId>-->
- <!--<artifactId>maven-remote-resources-plugin</artifactId>-->
- <!--<configuration>-->
- <!--<appendedResourcesDirectory>../src/main/appended-resources</appendedResourcesDirectory>-->
- <!--<resourceBundles>-->
- <!--<resourceBundle>org.apache:apache-jar-resource-bundle:1.4</resourceBundle>-->
- <!--</resourceBundles>-->
- <!--<supplementalModels>-->
- <!--<supplementalModel>supplemental-models.xml</supplementalModel>-->
- <!--</supplementalModels>-->
- <!--</configuration>-->
- <!--</plugin>-->
-
- <!--&lt;!&ndash;<plugin>&ndash;&gt;-->
- <!--&lt;!&ndash;<groupId>org.apache.maven.plugins</groupId>&ndash;&gt;-->
- <!--&lt;!&ndash;<artifactId>maven-shade-plugin</artifactId>&ndash;&gt;-->
- <!--&lt;!&ndash;<version>3.0.0</version>&ndash;&gt;-->
- <!--&lt;!&ndash;<executions>&ndash;&gt;-->
- <!--&lt;!&ndash;<execution>&ndash;&gt;-->
- <!--&lt;!&ndash;<phase>package</phase>&ndash;&gt;-->
- <!--&lt;!&ndash;<goals>&ndash;&gt;-->
- <!--&lt;!&ndash;<goal>shade</goal>&ndash;&gt;-->
- <!--&lt;!&ndash;</goals>&ndash;&gt;-->
- <!--&lt;!&ndash;<configuration>&ndash;&gt;-->
- <!--&lt;!&ndash;<artifactSet>&ndash;&gt;-->
- <!--&lt;!&ndash;<includes>&ndash;&gt;-->
- <!--&lt;!&ndash;<include>it.unimi.dsi:fastutil</include>&ndash;&gt;-->
- <!--&lt;!&ndash;</includes>&ndash;&gt;-->
- <!--&lt;!&ndash;</artifactSet>&ndash;&gt;-->
- <!--&lt;!&ndash;<relocations>&ndash;&gt;-->
- <!--&lt;!&ndash;<relocation>&ndash;&gt;-->
- <!--&lt;!&ndash;<pattern>it.unimi.dsi.fastutil</pattern>&ndash;&gt;-->
- <!--&lt;!&ndash;<shadedPattern>shaded.it.unimi.dsi.fastutil</shadedPattern>&ndash;&gt;-->
- <!--&lt;!&ndash;</relocation>&ndash;&gt;-->
- <!--&lt;!&ndash;</relocations>&ndash;&gt;-->
- <!--&lt;!&ndash;</configuration>&ndash;&gt;-->
- <!--&lt;!&ndash;</execution>&ndash;&gt;-->
- <!--&lt;!&ndash;</executions>&ndash;&gt;-->
- <!--&lt;!&ndash;</plugin>&ndash;&gt;-->
- <!--&lt;!&ndash; remove jars from top directory on clean &ndash;&gt;-->
- <!--<plugin>-->
- <!--<artifactId>maven-clean-plugin</artifactId>-->
- <!--<version>3.0.0</version>-->
- <!--<configuration>-->
- <!--<filesets>-->
- <!--<fileset>-->
- <!--<directory>../</directory>-->
- <!--<includes>-->
- <!--<include>mahout-math*.jar</include>-->
- <!--</includes>-->
- <!--<followSymlinks>false</followSymlinks>-->
- <!--</fileset>-->
- <!--</filesets>-->
- <!--</configuration>-->
- <!--</plugin>-->
- <!--</plugins>-->
- <!--</build>-->
-
- <dependencies>
-
- <!-- 3rd-party -->
- <dependency>
- <groupId>org.apache.commons</groupId>
- <artifactId>commons-math3</artifactId>
- </dependency>
-
- <dependency>
- <groupId>com.google.guava</groupId>
- <artifactId>guava</artifactId>
- </dependency>
-
- <dependency>
- <groupId>it.unimi.dsi</groupId>
- <artifactId>fastutil</artifactId>
- <version>7.0.12</version>
- </dependency>
-
- <dependency>
- <groupId>org.slf4j</groupId>
- <artifactId>slf4j-api</artifactId>
- </dependency>
-
- <dependency>
- <groupId>org.slf4j</groupId>
- <artifactId>slf4j-jcl</artifactId>
- <scope>test</scope>
- </dependency>
-
- <dependency>
- <groupId>junit</groupId>
- <artifactId>junit</artifactId>
- </dependency>
-
- <dependency>
- <groupId>org.apache.lucene</groupId>
- <artifactId>lucene-test-framework</artifactId>
- <scope>test</scope>
- </dependency>
-
- <dependency>
- <groupId>com.carrotsearch.randomizedtesting</groupId>
- <artifactId>randomizedtesting-runner</artifactId>
- </dependency>
-
- <dependency>
- <groupId>com.tdunning</groupId>
- <artifactId>t-digest</artifactId>
- <version>3.1</version>
- </dependency>
-
- <dependency>
- <groupId>org.easymock</groupId>
- <artifactId>easymock</artifactId>
- <scope>test</scope>
- </dependency>
-
- </dependencies>
-</project>

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java-templates/org/apache/mahout/math/buffer/ValueTypeBufferConsumer.java.t
----------------------------------------------------------------------
diff --git a/math/src/main/java-templates/org/apache/mahout/math/buffer/ValueTypeBufferConsumer.java.t b/math/src/main/java-templates/org/apache/mahout/math/buffer/ValueTypeBufferConsumer.java.t
deleted file mode 100644
index 3077dfd..0000000
--- a/math/src/main/java-templates/org/apache/mahout/math/buffer/ValueTypeBufferConsumer.java.t
+++ /dev/null
@@ -1,42 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-/*
-Copyright 1999 CERN - European Organization for Nuclear Research.
-Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
-is hereby granted without fee, provided that the above copyright notice appear in all copies and
-that both that copyright notice and this permission notice appear in supporting documentation.
-CERN makes no representations about the suitability of this software for any purpose.
-It is provided "as is" without expressed or implied warranty.
-*/
-package org.apache.mahout.math.buffer;
-
-import org.apache.mahout.math.list.${valueTypeCap}ArrayList;
-/**
- * Object that can accept a primitive array list of
- * ${valueType} items.
- **/
-public interface ${valueTypeCap}BufferConsumer {
-
- /**
- * Adds all elements of the specified list to the receiver.
- *
- * @param list the list of which all elements shall be added.
- */
- void addAllOf(${valueTypeCap}ArrayList list);
-}

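With ${valueTypeCap} expanded to Double, this template yields DoubleBufferConsumer. A small sketch of a consumer that drains buffered chunks into a single list; the generated type names are assumed from the template expansion:

import org.apache.mahout.math.buffer.DoubleBufferConsumer
import org.apache.mahout.math.list.DoubleArrayList

object BufferDemo extends App {
  // A consumer that appends every buffered chunk it receives to one sink list.
  val sink = new DoubleArrayList()
  val consumer = new DoubleBufferConsumer {
    def addAllOf(list: DoubleArrayList): Unit =
      for (i <- 0 until list.size()) sink.add(list.get(i))
  }

  val chunk = new DoubleArrayList()
  chunk.add(1.0)
  chunk.add(2.0)
  consumer.addAllOf(chunk)

  println(sink.size()) // 2
}
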
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java-templates/org/apache/mahout/math/function/KeyTypeObjectProcedure.java.t
----------------------------------------------------------------------
diff --git a/math/src/main/java-templates/org/apache/mahout/math/function/KeyTypeObjectProcedure.java.t b/math/src/main/java-templates/org/apache/mahout/math/function/KeyTypeObjectProcedure.java.t
deleted file mode 100644
index 4ecf714..0000000
--- a/math/src/main/java-templates/org/apache/mahout/math/function/KeyTypeObjectProcedure.java.t
+++ /dev/null
@@ -1,50 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.mahout.math.function;
-
-/*
-Copyright 1999 CERN - European Organization for Nuclear Research.
-Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
-is hereby granted without fee, provided that the above copyright notice appear in all copies and
-that both that copyright notice and this permission notice appear in supporting documentation.
-CERN makes no representations about the suitability of this software for any purpose.
-It is provided "as is" without expressed or implied warranty.
-*/
-
-/**
- * Interface that represents a procedure object: a procedure that takes two arguments and does not return a value.
- *
-*/
-public interface ${keyTypeCap}ObjectProcedure<T> {
-
- /**
- * Applies a procedure to two arguments. Optionally can return a boolean flag to inform the object calling the
- * procedure.
- *
- * <p>Example: forEach() methods often use procedure objects. To signal to a forEach() method whether iteration should
- * continue normally or terminate (because for example a matching element has been found), a procedure can return
- * <tt>false</tt> to indicate termination and <tt>true</tt> to indicate continuation.
- *
- * @param first first argument passed to the procedure.
- * @param second second argument passed to the procedure.
- * @return a flag to inform the object calling the procedure.
- */
- boolean apply(${keyType} first, T second);
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java-templates/org/apache/mahout/math/function/KeyTypeProcedure.java.t
----------------------------------------------------------------------
diff --git a/math/src/main/java-templates/org/apache/mahout/math/function/KeyTypeProcedure.java.t b/math/src/main/java-templates/org/apache/mahout/math/function/KeyTypeProcedure.java.t
deleted file mode 100644
index c198353..0000000
--- a/math/src/main/java-templates/org/apache/mahout/math/function/KeyTypeProcedure.java.t
+++ /dev/null
@@ -1,46 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.mahout.math.function;
-
-/*
-Copyright 1999 CERN - European Organization for Nuclear Research.
-Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
-is hereby granted without fee, provided that the above copyright notice appear in all copies and
-that both that copyright notice and this permission notice appear in supporting documentation.
-CERN makes no representations about the suitability of this software for any purpose.
-It is provided "as is" without expressed or implied warranty.
-*/
-
-/**
- * Interface that represents a procedure object: a procedure that takes a single argument and does not return a value.
- *
- */
-public interface ${keyTypeCap}Procedure {
-
- /**
- * Applies a procedure to an argument. Optionally can return a boolean flag to inform the object calling the
- * procedure.
- *
- * <p>Example: forEach() methods often use procedure objects. To signal to a forEach() method whether iteration should
- * continue normally or terminate (because for example a matching element has been found), a procedure can return
- * <tt>false</tt> to indicate termination and <tt>true</tt> to indicate continuation.
- *
- * @param element element passed to the procedure.
- * @return a flag to inform the object calling the procedure.
- */
- boolean apply(${keyType} element);
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java-templates/org/apache/mahout/math/function/KeyTypeValueTypeProcedure.java.t
----------------------------------------------------------------------
diff --git a/math/src/main/java-templates/org/apache/mahout/math/function/KeyTypeValueTypeProcedure.java.t b/math/src/main/java-templates/org/apache/mahout/math/function/KeyTypeValueTypeProcedure.java.t
deleted file mode 100644
index cf7ac22..0000000
--- a/math/src/main/java-templates/org/apache/mahout/math/function/KeyTypeValueTypeProcedure.java.t
+++ /dev/null
@@ -1,49 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.mahout.math.function;
-
-/*
-Copyright 1999 CERN - European Organization for Nuclear Research.
-Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
-is hereby granted without fee, provided that the above copyright notice appear in all copies and
-that both that copyright notice and this permission notice appear in supporting documentation.
-CERN makes no representations about the suitability of this software for any purpose.
-It is provided "as is" without expressed or implied warranty.
-*/
-
-/**
- * Interface that represents a procedure object: a procedure that takes two arguments and does not return a value.
- *
- */
-public interface ${keyTypeCap}${valueTypeCap}Procedure {
-
- /**
- * Applies a procedure to two arguments. Optionally can return a boolean flag to inform the object calling the
- * procedure.
- *
- * <p>Example: forEach() methods often use procedure objects. To signal to a forEach() method whether iteration should
- * continue normally or terminate (because for example a matching element has been found), a procedure can return
- * <tt>false</tt> to indicate termination and <tt>true</tt> to indicate continuation.
- *
- * @param first first argument passed to the procedure.
- * @param second second argument passed to the procedure.
- * @return a flag to inform the object calling the procedure.
- */
- boolean apply(${keyType} first, ${valueType} second);
-}

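With ${keyTypeCap} = Int and ${valueTypeCap} = Double this template yields IntDoubleProcedure. A short sketch of the early-termination contract described above, assuming the generated OpenIntDoubleHashMap collection and its forEachPair method from this module:

import org.apache.mahout.math.function.IntDoubleProcedure
import org.apache.mahout.math.map.OpenIntDoubleHashMap

object ProcedureDemo extends App {
  val m = new OpenIntDoubleHashMap()
  m.put(1, 0.5)
  m.put(2, -1.0)
  m.put(3, 2.0)

  // Returning false terminates the iteration; true continues it.
  // (Iteration order over the hash map is unspecified.)
  m.forEachPair(new IntDoubleProcedure {
    def apply(key: Int, value: Double): Boolean = {
      println(s"$key -> $value")
      value >= 0 // stop at the first negative value
    }
  })
}
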
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java-templates/org/apache/mahout/math/function/ObjectValueTypeProcedure.java.t
----------------------------------------------------------------------
diff --git a/math/src/main/java-templates/org/apache/mahout/math/function/ObjectValueTypeProcedure.java.t b/math/src/main/java-templates/org/apache/mahout/math/function/ObjectValueTypeProcedure.java.t
deleted file mode 100644
index e3576a8..0000000
--- a/math/src/main/java-templates/org/apache/mahout/math/function/ObjectValueTypeProcedure.java.t
+++ /dev/null
@@ -1,49 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.mahout.math.function;
-
-/*
-Copyright 1999 CERN - European Organization for Nuclear Research.
-Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
-is hereby granted without fee, provided that the above copyright notice appear in all copies and
-that both that copyright notice and this permission notice appear in supporting documentation.
-CERN makes no representations about the suitability of this software for any purpose.
-It is provided "as is" without expressed or implied warranty.
-*/
-
-/**
- * Interface that represents a procedure object: a procedure that takes two arguments and does not return a value.
- *
- */
-public interface Object${valueTypeCap}Procedure<T> {
-
- /**
- * Applies a procedure to two arguments. Optionally can return a boolean flag to inform the object calling the
- * procedure.
- *
- * <p>Example: forEach() methods often use procedure objects. To signal to a forEach() method whether iteration should
- * continue normally or terminate (because for example a matching element has been found), a procedure can return
- * <tt>false</tt> to indicate termination and <tt>true</tt> to indicate continuation.
- *
- * @param first first argument passed to the procedure.
- * @param second second argument passed to the procedure.
- * @return a flag to inform the object calling the procedure.
- */
- boolean apply(T first, ${valueType} second);
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java-templates/org/apache/mahout/math/function/ValueTypeComparator.java.t
----------------------------------------------------------------------
diff --git a/math/src/main/java-templates/org/apache/mahout/math/function/ValueTypeComparator.java.t b/math/src/main/java-templates/org/apache/mahout/math/function/ValueTypeComparator.java.t
deleted file mode 100644
index 9f27905..0000000
--- a/math/src/main/java-templates/org/apache/mahout/math/function/ValueTypeComparator.java.t
+++ /dev/null
@@ -1,81 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.mahout.math.function;
-
-/*
-Copyright 1999 CERN - European Organization for Nuclear Research.
-Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
-is hereby granted without fee, provided that the above copyright notice appear in all copies and
-that both that copyright notice and this permission notice appear in supporting documentation.
-CERN makes no representations about the suitability of this software for any purpose.
-It is provided "as is" without expressed or implied warranty.
-*/
-
-/**
- * A comparison function which imposes a <i>total ordering</i> on some collection of elements. Comparators can be
- * passed to a sort method (such as <tt>org.apache.mahout.math.Sorting.quickSort</tt>) to allow precise control over
- * the sort order.<p>
- *
- * Note: It is generally a good idea for comparators to implement <tt>java.io.Serializable</tt>, as they may be used as
- * ordering methods in serializable data structures. In order for the data structure to serialize successfully, the
- * comparator (if provided) must implement <tt>Serializable</tt>.<p>
- *
- * @see java.util.Comparator
- * @see org.apache.mahout.math.Sorting
- */
-public interface ${valueTypeCap}Comparator {
-
- /**
- * Compares its two arguments for order. Returns a negative integer, zero, or a positive integer as the first
- * argument is less than, equal to, or greater than the second.<p>
- *
- * The implementor must ensure that <tt>sgn(compare(x, y)) == -sgn(compare(y, x))</tt> for all <tt>x</tt> and
- * <tt>y</tt>. (This implies that <tt>compare(x, y)</tt> must throw an exception if and only if <tt>compare(y,
- * x)</tt> throws an exception.)<p>
- *
- * The implementor must also ensure that the relation is transitive: <tt>((compare(x, y)&gt;0) &amp;&amp; (compare(y,
- * z)&gt;0))</tt> implies <tt>compare(x, z)&gt;0</tt>.<p>
- *
- * Finally, the implementer must ensure that <tt>compare(x, y)==0</tt> implies that <tt>sgn(compare(x,
- * z))==sgn(compare(y, z))</tt> for all <tt>z</tt>.<p>
- *
- * @return a negative integer, zero, or a positive integer as the first argument is less than, equal to, or greater
- * than the second.
- */
- int compare(${valueType} o1, ${valueType} o2);
-
- /**
- * Indicates whether some other object is &quot;equal to&quot; this Comparator. This method must obey the general
- * contract of <tt>Object.equals(Object)</tt>. Additionally, this method can return <tt>true</tt> <i>only</i> if the
- * specified Object is also a comparator and it imposes the same ordering as this comparator. Thus,
- * <code>comp1.equals(comp2)</code> implies that <tt>sgn(comp1.compare(o1, o2))==sgn(comp2.compare(o1, o2))</tt> for
- * every element <tt>o1</tt> and <tt>o2</tt>.<p>
- *
- * Note that it is <i>always</i> safe <i>not</i> to override <tt>Object.equals(Object)</tt>. However, overriding this
- * method may, in some cases, improve performance by allowing programs to determine that two distinct Comparators
- * impose the same order.
- *
- * @param obj the reference object with which to compare.
- * @return <code>true</code> only if the specified object is also a comparator and it imposes the same ordering as
- * this comparator.
- * @see Object#hashCode()
- */
- boolean equals(Object obj);
-}
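The comment above names org.apache.mahout.math.Sorting.quickSort as a consumer of these comparators. A brief sketch with the generated DoubleComparator; the (array, fromIndex, toIndex, comparator) overload is assumed from that comment:

import org.apache.mahout.math.Sorting
import org.apache.mahout.math.function.DoubleComparator

object ComparatorDemo extends App {
  val a = Array(3.0, 1.0, 2.0)

  // A total ordering that sorts in descending order.
  val descending = new DoubleComparator {
    def compare(o1: Double, o2: Double): Int = java.lang.Double.compare(o2, o1)
  }

  Sorting.quickSort(a, 0, a.length, descending)
  println(a.mkString(", ")) // 3.0, 2.0, 1.0
}
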
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/AbstractBinaryOp.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/AbstractBinaryOp.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/AbstractBinaryOp.scala
deleted file mode 100644
index ba41657..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/AbstractBinaryOp.scala
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.drm.logical
-
-import org.apache.mahout.math.drm.{DistributedContext, DrmLike}
-
-/**
- * Any logical binary operator (such as A + B).
- * <P/>
- *
- * Any logical operator derived from this is also capable of triggering an optimizer checkpoint; hence,
- * it also inherits CheckpointAction.
- * <P/>
- *
- * @tparam A LHS key type
- * @tparam B RHS key type
- * @tparam K result key type
- */
-abstract class AbstractBinaryOp[A, B, K]
- extends CheckpointAction[K] with DrmLike[K] {
-
- protected[drm] var A: DrmLike[A]
-
- protected[drm] var B: DrmLike[B]
-
- lazy val context: DistributedContext = A.context
-
- protected[mahout] def canHaveMissingRows: Boolean = false
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/AbstractUnaryOp.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/AbstractUnaryOp.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/AbstractUnaryOp.scala
deleted file mode 100644
index 6a70aec..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/AbstractUnaryOp.scala
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.drm.logical
-
-import org.apache.mahout.math.drm.{DistributedContext, DrmLike}
-
-/** Abstract unary operator */
-abstract class AbstractUnaryOp[A, K]
- extends CheckpointAction[K] with DrmLike[K] {
-
- protected[mahout] var A: DrmLike[A]
-
- lazy val context: DistributedContext = A.context
-
- override protected[mahout] lazy val canHaveMissingRows: Boolean = A.canHaveMissingRows
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/CheckpointAction.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/CheckpointAction.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/CheckpointAction.scala
deleted file mode 100644
index aa1d8bc..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/CheckpointAction.scala
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.drm.logical
-
-import scala.util.Random
-import org.apache.mahout.math.drm._
-
-/** Implementation of distributed expression checkpoint and optimizer. */
-abstract class CheckpointAction[K] extends DrmLike[K] {
-
- protected[mahout] lazy val partitioningTag: Long = Random.nextLong()
-
-  private[mahout] var cp: Option[CheckpointedDrm[K]] = None
-
-  def isIdenticallyPartitioned(other: DrmLike[_]) =
-    partitioningTag != 0L && partitioningTag == other.partitioningTag
-
- /**
-   * Action operator -- does not necessarily mean a Spark action, but does mean running the BLAS optimizer
-   * and writing down the Spark graph lineage since the last checkpointed DRM.
- */
- def checkpoint(cacheHint: CacheHint.CacheHint): CheckpointedDrm[K] = cp match {
- case None =>
- implicit val cpTag = this.keyClassTag
- val plan = context.optimizerRewrite(this)
- val physPlan = context.toPhysical(plan, cacheHint)
- cp = Some(physPlan)
- physPlan
- case Some(cp) => cp
- }
-
-}
-

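checkpoint() is the user-facing entry point that runs the optimizer over the lineage accumulated so far and caches the resulting physical plan; a second call returns the same cached plan. A hedged usage sketch, assuming a DRM already created on some engine backend:

import org.apache.mahout.math.drm._
import org.apache.mahout.math.drm.RLikeDrmOps._

object CheckpointSketch {
  def example(drmA: DrmLike[Int]): Long = {
    // Builds logical operators only (OpAt, OpAB, ...); nothing runs yet.
    val expr = drmA.t %*% drmA

    // Runs the optimizer rewrite, produces a physical plan, and caches it.
    val cp = expr.checkpoint(CacheHint.MEMORY_ONLY)

    // Later actions reuse the checkpointed result instead of re-planning.
    cp.nrow
  }
}
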
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAB.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAB.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAB.scala
deleted file mode 100644
index e5316a0..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAB.scala
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.drm.logical
-
-import scala.reflect.ClassTag
-import org.apache.mahout.math.drm.DrmLike
-
-/** Logical AB */
-case class OpAB[K](
- override var A: DrmLike[K],
- override var B: DrmLike[Int])
- extends AbstractBinaryOp[K, Int, K] {
-
- assert(A.ncol == B.nrow, "Incompatible operand geometry")
-
- /**
-   * Explicit extraction of the key ClassTag, since traits don't support context bounds; the
-   * concrete implementation knows it.
- */
- override def keyClassTag: ClassTag[K] = A.keyClassTag
-
- /** R-like syntax for number of rows. */
- def nrow: Long = A.nrow
-
- /** R-like syntax for number of columns */
- def ncol: Int = B.ncol
-
- /** Non-zero element count */
- def nNonZero: Long =
- // TODO: for purposes of cost calculation, approximate based on operands
- throw new UnsupportedOperationException
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpABAnyKey.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpABAnyKey.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpABAnyKey.scala
deleted file mode 100644
index 8437cdd..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpABAnyKey.scala
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.drm.logical
-
-import scala.reflect.ClassTag
-import org.apache.mahout.math.drm.DrmLike
-
-/** Logical AB */
-case class OpABAnyKey[B, K](
- override var A: DrmLike[K],
- override var B: DrmLike[B])
- extends AbstractBinaryOp[K, B, K] {
-
- assert(A.ncol == B.nrow, "Incompatible operand geometry")
-
-
- /**
-   * Explicit extraction of the key ClassTag, since traits don't support context bounds; the
-   * concrete implementation knows it.
- */
- override def keyClassTag: ClassTag[K] = A.keyClassTag
-
- /** R-like syntax for number of rows. */
- def nrow: Long = A.nrow
-
- /** R-like syntax for number of columns */
- def ncol: Int = B.ncol
-
- /** Non-zero element count */
- def nNonZero: Long =
- // TODO: for purposes of cost calculation, approximate based on operands
- throw new UnsupportedOperationException
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpABt.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpABt.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpABt.scala
deleted file mode 100644
index 63bd7e1..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpABt.scala
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.drm.logical
-
-import scala.reflect.ClassTag
-import org.apache.mahout.math.drm._
-
-/** Logical AB' */
-case class OpABt[K](
- override var A: DrmLike[K],
- override var B: DrmLike[Int])
- extends AbstractBinaryOp[K,Int,K] {
-
- assert(A.ncol == B.ncol, "Incompatible operand geometry")
-
- /**
-   * Explicit extraction of the key ClassTag, since traits don't support context bounds; the
-   * concrete implementation knows it.
- */
- override lazy val keyClassTag: ClassTag[K] = A.keyClassTag
-
- /** R-like syntax for number of rows. */
- def nrow: Long = A.nrow
-
- /** R-like syntax for number of columns */
- def ncol: Int = safeToNonNegInt(B.nrow)
-
- /** Non-zero element count */
- def nNonZero: Long =
- // TODO: for purposes of cost calculation, approximate based on operands
- throw new UnsupportedOperationException
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAewB.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAewB.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAewB.scala
deleted file mode 100644
index 4bb83d0..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAewB.scala
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.drm.logical
-
-import scala.reflect.ClassTag
-import org.apache.mahout.math.drm.DrmLike
-import scala.util.Random
-
-/** DRM elementwise operator */
-case class OpAewB[K](
- override var A: DrmLike[K],
- override var B: DrmLike[K],
- val op: String
- ) extends AbstractBinaryOp[K, K, K] {
-
-
- assert(A.ncol == B.ncol, "arguments must have same number of columns")
- assert(A.nrow == B.nrow, "arguments must have same number of rows")
- assert(A.keyClassTag == B.keyClassTag, "Arguments of elementwise operators must have the same row key")
-
- override protected[mahout] lazy val partitioningTag: Long =
- if (A.partitioningTag == B.partitioningTag) A.partitioningTag
- else Random.nextLong()
-
- /**
-   * Explicit extraction of the key ClassTag, since traits don't support context bounds; the
-   * concrete implementation knows it.
- */
- override def keyClassTag: ClassTag[K] = A.keyClassTag
-
- /** R-like syntax for number of rows. */
- def nrow: Long = A.nrow
-
- /** R-like syntax for number of columns */
- def ncol: Int = A.ncol
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAewScalar.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAewScalar.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAewScalar.scala
deleted file mode 100644
index 4f08686..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAewScalar.scala
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.drm.logical
-
-import scala.reflect.ClassTag
-import org.apache.mahout.math.drm.DrmLike
-import scala.util.Random
-
-/**
- * Operator denoting expressions like 5.0 - A or A * 5.6
- *
- * @deprecated use [[OpAewUnaryFunc]] instead
- */
-case class OpAewScalar[K](
- override var A: DrmLike[K],
- val scalar: Double,
- val op: String
- ) extends AbstractUnaryOp[K,K] {
-
- override protected[mahout] lazy val partitioningTag: Long =
- if (A.canHaveMissingRows)
- Random.nextLong()
- else A.partitioningTag
-
-  /** Operations like `A + 1` always fix missing rows, so the result cannot have any. */
- override protected[mahout] lazy val canHaveMissingRows: Boolean = false
-
- /**
-   * Explicit extraction of the key ClassTag, since traits don't support context bounds; the
-   * concrete implementation knows it.
- */
- override def keyClassTag: ClassTag[K] = A.keyClassTag
-
- /** R-like syntax for number of rows. */
- def nrow: Long = A.nrow
-
- /** R-like syntax for number of columns */
- def ncol: Int = A.ncol
-
-}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAewUnaryFunc.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAewUnaryFunc.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAewUnaryFunc.scala
deleted file mode 100644
index 0607686..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAewUnaryFunc.scala
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.drm.logical
-
-import scala.reflect.ClassTag
-import org.apache.mahout.math.drm.DrmLike
-import scala.util.Random
-
-case class OpAewUnaryFunc[K](
- override var A: DrmLike[K],
- val f: (Double) => Double,
-    val evalZeros: Boolean = false
- ) extends AbstractUnaryOp[K,K] with TEwFunc {
-
- override protected[mahout] lazy val partitioningTag: Long =
- if (A.canHaveMissingRows)
- Random.nextLong()
- else A.partitioningTag
-
-  /** Operations like `A + 1` always fix missing rows, so the result cannot have any. */
- override protected[mahout] lazy val canHaveMissingRows: Boolean = false
-
- /**
-   * Explicit extraction of the key ClassTag, since traits don't support context bounds; the
-   * concrete implementation knows it.
- */
- override lazy val keyClassTag: ClassTag[K] = A.keyClassTag
-
- /** R-like syntax for number of rows. */
- def nrow: Long = A.nrow
-
- /** R-like syntax for number of columns */
- def ncol: Int = A.ncol
-}
-

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAewUnaryFuncFusion.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAewUnaryFuncFusion.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAewUnaryFuncFusion.scala
deleted file mode 100644
index 19bdc64..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAewUnaryFuncFusion.scala
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.drm.logical
-
-import scala.reflect.ClassTag
-import org.apache.mahout.math.drm.DrmLike
-import scala.util.Random
-
-/**
- * Composition of unary elementwise functions.
- */
-case class OpAewUnaryFuncFusion[K](
- override var A: DrmLike[K],
-    var ff: List[OpAewUnaryFunc[K]] = Nil
- ) extends AbstractUnaryOp[K,K] with TEwFunc {
-
- override protected[mahout] lazy val partitioningTag: Long =
- if (A.canHaveMissingRows)
- Random.nextLong()
- else A.partitioningTag
-
-  /** Operations like `A + 1` always fix missing rows, so the result cannot have any. */
- override protected[mahout] lazy val canHaveMissingRows: Boolean = false
-
- /**
-   * Explicit extraction of the key ClassTag, since traits don't support context bounds; the
-   * concrete implementation knows it.
- */
- override def keyClassTag: ClassTag[K] = A.keyClassTag
-
- /** R-like syntax for number of rows. */
- def nrow: Long = A.nrow
-
- /** R-like syntax for number of columns */
- def ncol: Int = A.ncol
-
- /** Apply to degenerate elements? */
- override def evalZeros: Boolean = ff.exists(_.evalZeros)
-
- /** the function itself */
- override def f: (Double) => Double = {
-
- // Make sure composed collection becomes an attribute of this closure because we will be sending
- // it to the backend.
- val composedFunc = ff.map(_.f)
-
- // Create functional closure and return.
- (x: Double) => (composedFunc :\ x) { case (f, xarg) => f(xarg)}
-
- }
-}
-

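The right fold in f above (:\ is Scala's foldRight operator) applies the composed functions so that the last element of ff runs first: for ff = List(f1, f2) the result is f1(f2(x)). A tiny standalone check of that ordering:

object FusionOrder extends App {
  val f1: Double => Double = _ * 2 // outermost, applied last
  val f2: Double => Double = _ + 1 // innermost, applied first

  val ff = List(f1, f2)

  // Same right fold as OpAewUnaryFuncFusion.f, seeded with x.
  def fused(x: Double): Double = (ff :\ x) { case (f, acc) => f(acc) }

  println(fused(3.0)) // f1(f2(3.0)) = (3.0 + 1) * 2 = 8.0
}
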
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAt.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAt.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAt.scala
deleted file mode 100644
index 59c71bd..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAt.scala
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.drm.logical
-
-import org.apache.mahout.math.drm._
-
-import scala.reflect.ClassTag
-
-/** Logical A' */
-case class OpAt(
- override var A: DrmLike[Int])
- extends AbstractUnaryOp[Int, Int] {
-
- /**
-   * Explicit extraction of the key ClassTag, since traits don't support context-bound access;
-   * the concrete implementation knows it.
- */
- override def keyClassTag = ClassTag.Int
-
- /** R-like syntax for number of rows. */
- def nrow: Long = A.ncol
-
- /** R-like syntax for number of columns */
- def ncol: Int = safeToNonNegInt(A.nrow)
-
- /** A' after simplifications cannot produce missing rows, ever. */
- override protected[mahout] lazy val canHaveMissingRows: Boolean = false
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAtA.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAtA.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAtA.scala
deleted file mode 100644
index 4c01f46..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAtA.scala
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.drm.logical
-
-import scala.reflect.ClassTag
-import org.apache.mahout.math.drm.DrmLike
-
-/** A'A */
-case class OpAtA[K](
- override var A: DrmLike[K]
- ) extends AbstractUnaryOp[K, Int] {
-
- /**
-   * Explicit extraction of the key ClassTag, since traits don't support context-bound access;
-   * the concrete implementation knows it.
- */
- override def keyClassTag = ClassTag.Int
-
- /** R-like syntax for number of rows. */
- def nrow: Long = A.ncol
-
- /** R-like syntax for number of columns */
- def ncol: Int = A.ncol
-
- override protected[mahout] lazy val canHaveMissingRows: Boolean = false
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAtAnyKey.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAtAnyKey.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAtAnyKey.scala
deleted file mode 100644
index b23dca7..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAtAnyKey.scala
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.drm.logical
-
-import scala.reflect.ClassTag
-import org.apache.mahout.math.drm._
-
-/** Logical A' for any row key to support A'A optimizations */
-case class OpAtAnyKey[A](
- override var A: DrmLike[A])
- extends AbstractUnaryOp[A, Int] {
-
- /**
-   * Explicit extraction of the key ClassTag, since traits don't support context-bound access;
-   * the concrete implementation knows it.
- */
- override def keyClassTag = ClassTag.Int
-
- /** R-like syntax for number of rows. */
- def nrow: Long = A.ncol
-
- /** R-like syntax for number of columns */
- def ncol: Int = safeToNonNegInt(A.nrow)
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAtB.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAtB.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAtB.scala
deleted file mode 100644
index 7ec8585..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAtB.scala
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.drm.logical
-
-import scala.reflect.ClassTag
-import org.apache.mahout.math.drm.DrmLike
-
-/** Logical A'B */
-case class OpAtB[A](
- override var A: DrmLike[A],
- override var B: DrmLike[A])
- extends AbstractBinaryOp[A, A, Int] {
-
- assert(A.nrow == B.nrow, "Incompatible operand geometry")
-
- /**
-   * Explicit extraction of the key ClassTag, since traits don't support context-bound access;
-   * the concrete implementation knows it.
- */
- override def keyClassTag = ClassTag.Int
-
- /** R-like syntax for number of rows. */
- def nrow: Long = A.ncol
-
- /** R-like syntax for number of columns */
- def ncol: Int = B.ncol
-
- /** Non-zero element count */
- def nNonZero: Long =
- // TODO: for purposes of cost calculation, approximate based on operands
- throw new UnsupportedOperationException
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAtx.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAtx.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAtx.scala
deleted file mode 100644
index 97b6de1..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAtx.scala
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.drm.logical
-
-import org.apache.mahout.math.Vector
-import org.apache.mahout.math.scalabindings._
-import RLikeOps._
-import org.apache.mahout.math.drm._
-
-import scala.reflect.ClassTag
-
-/** Logical A'x. */
-case class OpAtx(
- override var A: DrmLike[Int],
- val x: Vector
- ) extends AbstractUnaryOp[Int, Int] {
-
- override protected[mahout] lazy val partitioningTag: Long = A.partitioningTag
-
- assert(A.nrow == x.length, "Incompatible operand geometry")
-
- /**
-   * Explicit extraction of the key ClassTag, since traits don't support context-bound access;
-   * the concrete implementation knows it.
- */
- override val keyClassTag = ClassTag.Int
-
- /** R-like syntax for number of rows. */
- def nrow: Long = safeToNonNegInt(A.ncol)
-
- /** R-like syntax for number of columns */
- def ncol: Int = 1
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAx.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAx.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAx.scala
deleted file mode 100644
index d25e0d9..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAx.scala
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.drm.logical
-
-import scala.reflect.ClassTag
-import org.apache.mahout.math.Vector
-import org.apache.mahout.math.scalabindings._
-import RLikeOps._
-import org.apache.mahout.math.drm.DrmLike
-
-/** Logical Ax. */
-case class OpAx[K](
- override var A: DrmLike[K],
- val x: Vector
- ) extends AbstractUnaryOp[K, K] {
-
- override protected[mahout] lazy val partitioningTag: Long = A.partitioningTag
-
- assert(A.ncol == x.length, "Incompatible operand geometry")
-
- /**
-   * Explicit extraction of the key ClassTag, since traits don't support context-bound access;
-   * the concrete implementation knows it.
- */
- override def keyClassTag: ClassTag[K] = A.keyClassTag
-
- /** R-like syntax for number of rows. */
- def nrow: Long = A.nrow
-
- /** R-like syntax for number of columns */
- def ncol: Int = 1
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpCbind.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpCbind.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpCbind.scala
deleted file mode 100644
index cbc20ae..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpCbind.scala
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.drm.logical
-
-import org.apache.mahout.math.drm.DrmLike
-import scala.util.Random
-
-/** cbind() logical operator */
-case class OpCbind[K](
- override var A: DrmLike[K],
- override var B: DrmLike[K]
- ) extends AbstractBinaryOp[K, K, K] {
-
-  assert(A.nrow == B.nrow, "arguments must have the same number of rows")
-  require(A.keyClassTag == B.keyClassTag, "arguments must have the same row key type")
-
- /**
-   * Explicit extraction of the key ClassTag, since traits don't support context-bound access;
-   * the concrete implementation knows it.
- */
- override def keyClassTag = A.keyClassTag
-
- override protected[mahout] lazy val partitioningTag: Long =
- if (A.partitioningTag == B.partitioningTag) A.partitioningTag
- else Random.nextLong()
-
- /** R-like syntax for number of rows. */
- def nrow: Long = A.nrow
-
- /** R-like syntax for number of columns */
- def ncol: Int = A.ncol + B.ncol
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpCbindScalar.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpCbindScalar.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpCbindScalar.scala
deleted file mode 100644
index c3775ed..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpCbindScalar.scala
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.mahout.math.drm.logical
-
-import org.apache.mahout.math.drm.DrmLike
-
-case class OpCbindScalar[K](
- override var A:DrmLike[K],
- var x:Double,
- val leftBind:Boolean ) extends AbstractUnaryOp[K,K] {
-
- override protected[mahout] lazy val canHaveMissingRows: Boolean = false
-
- override protected[mahout] lazy val partitioningTag: Long = A.partitioningTag
-
- /**
-   * Explicit extraction of the key ClassTag, since traits don't support context-bound access;
-   * the concrete implementation knows it.
- */
- override def keyClassTag = A.keyClassTag
-
- /** R-like syntax for number of rows. */
- def nrow: Long = A.nrow
-
- /** R-like syntax for number of columns */
- def ncol: Int = A.ncol + 1
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpMapBlock.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpMapBlock.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpMapBlock.scala
deleted file mode 100644
index 95e690b..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpMapBlock.scala
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.drm.logical
-
-import org.apache.mahout.math.drm.{BlockMapFunc, DrmLike}
-
-import scala.reflect.{ClassTag, classTag}
-import scala.util.Random
-
-case class OpMapBlock[S, R: ClassTag](
- override var A: DrmLike[S],
- val bmf: BlockMapFunc[S, R],
- val _ncol: Int = -1,
- val _nrow: Long = -1,
- identicallyPartitioned:Boolean
- ) extends AbstractUnaryOp[S, R] {
-
- override protected[mahout] lazy val partitioningTag: Long =
- if (identicallyPartitioned) A.partitioningTag else Random.nextLong()
-
- /**
-   * Explicit extraction of the key ClassTag, since traits don't support context-bound access;
-   * the concrete implementation knows it.
- */
- override def keyClassTag = classTag[R]
-
- /** R-like syntax for number of rows. */
- def nrow: Long = if (_nrow >= 0) _nrow else A.nrow
-
- /** R-like syntax for number of columns */
- def ncol: Int = if (_ncol >= 0) _ncol else A.ncol
-
-}
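A hedged sketch of what a BlockMapFunc passed to OpMapBlock looks like in practice, modeled
on the mapBlock usage in the drm package object later in this commit. Here `drmA` is an
assumed DrmLike[Int] operand, and per-block column centering is purely illustrative:

    import org.apache.mahout.math.drm.RLikeDrmOps._
    import org.apache.mahout.math.scalabindings.RLikeOps._

    val centered = drmA.mapBlock() { case (keys, block) =>
      // Center each column of this block in place; the row keys pass through unchanged.
      val mu = block.colMeans()
      keys -> (block := { (r, c, v) => v - mu(c) })
    }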

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpPar.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpPar.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpPar.scala
deleted file mode 100644
index 2402b1f..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpPar.scala
+++ /dev/null
@@ -1,23 +0,0 @@
-package org.apache.mahout.math.drm.logical
-
-import org.apache.mahout.math.drm.DrmLike
-
-/** Parallelism operator */
-case class OpPar[K](
- override var A: DrmLike[K],
- val minSplits: Int = -1,
- val exactSplits: Int = -1)
- extends AbstractUnaryOp[K, K] {
-
- /**
-   * Explicit extraction of the key ClassTag, since traits don't support context-bound access;
-   * the concrete implementation knows it.
- */
- override def keyClassTag = A.keyClassTag
-
- /** R-like syntax for number of rows. */
- def nrow: Long = A.nrow
-
- /** R-like syntax for number of columns */
- def ncol: Int = A.ncol
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpRbind.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpRbind.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpRbind.scala
deleted file mode 100644
index 1c67868..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpRbind.scala
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.drm.logical
-
-import org.apache.mahout.math.drm.DrmLike
-import scala.util.Random
-
-/** rbind() logical operator */
-case class OpRbind[K](
- override var A: DrmLike[K],
- override var B: DrmLike[K]
- ) extends AbstractBinaryOp[K, K, K] {
-
-  assert(A.ncol == B.ncol, "arguments must have the same number of columns")
- require(A.keyClassTag == B.keyClassTag, "arguments of rbind() must have the same row key type")
-
- override protected[mahout] lazy val partitioningTag: Long = Random.nextLong()
-
- /**
-   * Explicit extraction of the key ClassTag, since traits don't support context-bound access;
-   * the concrete implementation knows it.
- */
- override def keyClassTag = A.keyClassTag
-
- /** R-like syntax for number of rows. */
- def nrow: Long = A.nrow + B.nrow
-
- /** R-like syntax for number of columns */
- def ncol: Int = A.ncol
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpRowRange.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpRowRange.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpRowRange.scala
deleted file mode 100644
index c7d3bfa..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpRowRange.scala
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.drm.logical
-
-import org.apache.mahout.math.drm.DrmLike
-
-import scala.reflect.ClassTag
-
-/** Logical row-range slicing */
-case class OpRowRange(
- override var A: DrmLike[Int],
- val rowRange: Range
- ) extends AbstractUnaryOp[Int, Int] {
-
- assert(rowRange.head >= 0 && rowRange.last < A.nrow, "row range out of range")
-
- /**
-   * Explicit extraction of the key ClassTag, since traits don't support context-bound access;
-   * the concrete implementation knows it.
- */
- override val keyClassTag = ClassTag.Int
-
- /** R-like syntax for number of rows. */
- def nrow: Long = rowRange.length
-
- /** R-like syntax for number of columns */
- def ncol: Int = A.ncol
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpTimesLeftMatrix.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpTimesLeftMatrix.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpTimesLeftMatrix.scala
deleted file mode 100644
index 016171d..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpTimesLeftMatrix.scala
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.drm.logical
-
-import org.apache.mahout.math.Matrix
-import org.apache.mahout.math.scalabindings._
-import RLikeOps._
-import org.apache.mahout.math.drm.DrmLike
-
-import scala.reflect.ClassTag
-
-/** Logical Times-left over in-core matrix operand */
-case class OpTimesLeftMatrix(
- left: Matrix,
- override var A: DrmLike[Int]
- ) extends AbstractUnaryOp[Int, Int] {
-
- assert(left.ncol == A.nrow, "Incompatible operand geometry")
-
- /**
-   * Explicit extraction of the key ClassTag, since traits don't support context-bound access;
-   * the concrete implementation knows it.
- */
- override val keyClassTag = ClassTag.Int
-
- /** R-like syntax for number of rows. */
- def nrow: Long = left.nrow
-
- /** R-like syntax for number of columns */
- def ncol: Int = A.ncol
-
- /** Non-zero element count */
- // TODO
- def nNonZero: Long = throw new UnsupportedOperationException
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpTimesRightMatrix.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpTimesRightMatrix.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpTimesRightMatrix.scala
deleted file mode 100644
index 94104bb..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpTimesRightMatrix.scala
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.drm.logical
-
-import org.apache.mahout.math.Matrix
-import org.apache.mahout.math.scalabindings._
-import RLikeOps._
-import org.apache.mahout.math.drm.DrmLike
-
-/** Logical times-right over in-core matrix operand. */
-case class OpTimesRightMatrix[K](
- override var A: DrmLike[K],
- val right: Matrix
- ) extends AbstractUnaryOp[K, K] {
-
- override protected[mahout] lazy val partitioningTag: Long = A.partitioningTag
-
- assert(A.ncol == right.nrow, "Incompatible operand geometry")
-
- /**
-   * Explicit extraction of the key ClassTag, since traits don't support context-bound access;
-   * the concrete implementation knows it.
- */
- override lazy val keyClassTag = A.keyClassTag
-
- /** R-like syntax for number of rows. */
- def nrow: Long = A.nrow
-
- /** R-like syntax for number of columns */
- def ncol: Int = right.ncol
-
- /** Non-zero element count */
- // TODO
- def nNonZero: Long = throw new UnsupportedOperationException
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/TEwFunc.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/TEwFunc.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/TEwFunc.scala
deleted file mode 100644
index 0eb5f65..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/TEwFunc.scala
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.drm.logical
-
-/**
- * Trait denoting logical operators providing elementwise operations that work as unary operators
- * on each element of a matrix.
- */
-trait TEwFunc {
-
-  /** Apply to degenerate elements? */
- def evalZeros: Boolean
-
- /** the function itself */
- def f: (Double) => Double
-
- /**
-   * Self-assignment ok? If yes, it may cause side effects when working off a non-serialized cached
-   * object tree!
- */
- def selfAssignOk: Boolean = false
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/drm/package.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/package.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/package.scala
deleted file mode 100644
index cdec954..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/package.scala
+++ /dev/null
@@ -1,375 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math
-
-import org.apache.mahout.math.drm._
-import org.apache.mahout.math.scalabindings.RLikeOps._
-import org.apache.mahout.math.scalabindings._
-
-import scala.reflect.ClassTag
-import org.apache.mahout.math.drm.logical.OpAewUnaryFunc
-
-import collection._
-
-package object drm {
-
- /** Drm row-wise tuple */
- type DrmTuple[K] = (K, Vector)
-
- /** Drm block-wise tuple: Array of row keys and the matrix block. */
- type BlockifiedDrmTuple[K] = (Array[K], _ <: Matrix)
-
-
- /** Block-map func */
- type BlockMapFunc[S, R] = BlockifiedDrmTuple[S] ⇒ BlockifiedDrmTuple[R]
-
- type BlockMapFunc2[S] = BlockifiedDrmTuple[S] ⇒ Matrix
-
- type BlockReduceFunc = (Matrix, Matrix) ⇒ Matrix
-
- /** CacheHint type */
- // type CacheHint = CacheHint.CacheHint
-
- def safeToNonNegInt(x: Long): Int = {
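-    // Long shift counts are taken mod 64, so << -31 >>> -31 is << 33 >>> 33: the round trip
-    // preserves x only when bits 31..63 are zero, i.e. when x fits in a non-negative Int.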
- assert(x == x << -31 >>> -31, "transformation from long to Int is losing significant bits, or is a negative number")
- x.toInt
- }
-
- /** Broadcast support API */
- def drmBroadcast(m:Matrix)(implicit ctx:DistributedContext):BCast[Matrix] = ctx.drmBroadcast(m)
-
- /** Broadcast support API */
- def drmBroadcast(v:Vector)(implicit ctx:DistributedContext):BCast[Vector] = ctx.drmBroadcast(v)
-
- /** Load DRM from hdfs (as in Mahout DRM format) */
- def drmDfsRead (path: String)(implicit ctx: DistributedContext): CheckpointedDrm[_] = ctx.drmDfsRead(path)
-
-  /** Shortcut for parallelizing matrices with row ordinal indices, ignoring row labels. */
- def drmParallelize(m: Matrix, numPartitions: Int = 1)
- (implicit sc: DistributedContext): CheckpointedDrm[Int] = drmParallelizeWithRowIndices(m, numPartitions)(sc)
-
- /** Parallelize in-core matrix as a distributed matrix, using row ordinal indices as data set keys. */
- def drmParallelizeWithRowIndices(m: Matrix, numPartitions: Int = 1)
- (implicit ctx: DistributedContext): CheckpointedDrm[Int] = ctx.drmParallelizeWithRowIndices(m, numPartitions)
-
-  /** Parallelize in-core matrix as a distributed matrix, using row labels as data set keys. */
- def drmParallelizeWithRowLabels(m: Matrix, numPartitions: Int = 1)
- (implicit ctx: DistributedContext): CheckpointedDrm[String] = ctx.drmParallelizeWithRowLabels(m, numPartitions)
-
-  /** This creates an empty DRM with the specified number of partitions and cardinality. */
- def drmParallelizeEmpty(nrow: Int, ncol: Int, numPartitions: Int = 10)
- (implicit ctx: DistributedContext): CheckpointedDrm[Int] = ctx.drmParallelizeEmpty(nrow, ncol, numPartitions)
-
-  /** Creates an empty DRM with non-trivial height (Long row keys) */
- def drmParallelizeEmptyLong(nrow: Long, ncol: Int, numPartitions: Int = 10)
- (implicit ctx: DistributedContext): CheckpointedDrm[Long] = ctx.drmParallelizeEmptyLong(nrow, ncol, numPartitions)
-
- /** Implicit broadcast -> value conversion. */
- implicit def bcast2val[T](bcast: BCast[T]): T = bcast.value
-
- /** Just throw all engine operations into context as well. */
- implicit def ctx2engine(ctx: DistributedContext): DistributedEngine = ctx.engine
-
- implicit def drm2drmCpOps[K](drm: CheckpointedDrm[K]): CheckpointedOps[K] =
- new CheckpointedOps[K](drm)
-
- /**
-   * We assume that whenever a computational action is invoked without an explicit checkpoint, the
-   * user does not intend caching.
- */
- implicit def drm2Checkpointed[K](drm: DrmLike[K]): CheckpointedDrm[K] = drm.checkpoint(CacheHint.NONE)
-
- /** Implicit conversion to in-core with NONE caching of the result. */
- implicit def drm2InCore[K](drm: DrmLike[K]): Matrix = drm.collect
-
- /** Do vertical concatenation of collection of blockified tuples */
- private[mahout] def rbind[K:ClassTag](blocks: Iterable[BlockifiedDrmTuple[K]]): BlockifiedDrmTuple[K] = {
- assert(blocks.nonEmpty, "rbind: 0 blocks passed in")
- if (blocks.size == 1) {
- // No coalescing required.
- blocks.head
- } else {
- // compute total number of rows in a new block
- val m = blocks.view.map(_._2.nrow).sum
- val n = blocks.head._2.ncol
- val coalescedBlock = blocks.head._2.like(m, n)
- val coalescedKeys = new Array[K](m)
- var row = 0
- for (elem <- blocks.view) {
- val block = elem._2
- val rowEnd = row + block.nrow
- coalescedBlock(row until rowEnd, ::) := block
- elem._1.copyToArray(coalescedKeys, row)
- row = rowEnd
- }
- coalescedKeys -> coalescedBlock
- }
- }
-
- /**
-   * Convert an arbitrarily-keyed matrix to an Int-keyed matrix. Some algebra accepts only
-   * Int-keyed row matrices, so this method helps with the conversion.
- *
- * @param drmX input to be transcoded
- * @param computeMap collect `old key -> int key` map to front-end?
- * @tparam K key type
- * @return Sequentially keyed matrix + (optionally) map from non-int key to [[Int]] key. If the
- * key type is actually Int, then we just return the argument with None for the map,
- * regardless of computeMap parameter.
- */
- def drm2IntKeyed[K](drmX: DrmLike[K], computeMap: Boolean = false): (DrmLike[Int], Option[DrmLike[K]]) =
- drmX.context.engine.drm2IntKeyed(drmX, computeMap)
-
- /**
- * (Optional) Sampling operation. Consistent with Spark semantics of the same.
- * @param drmX
- * @param fraction
- * @param replacement
- * @tparam K
- * @return samples
- */
- def drmSampleRows[K](drmX: DrmLike[K], fraction: Double, replacement: Boolean = false): DrmLike[K] =
- drmX.context.engine.drmSampleRows(drmX, fraction, replacement)
-
- def drmSampleKRows[K](drmX: DrmLike[K], numSamples: Int, replacement: Boolean = false): Matrix =
- drmX.context.engine.drmSampleKRows(drmX, numSamples, replacement)
-
- /**
-   * Convert a sample of a DRM into a tab-separated values (TSV) string to be loaded into an
-   * R data frame for plotting and sketching.
-   * @param drmX - DRM to sample
-   * @param samplePercent - percentage of the DRM's rows to sample for plotting
- * @tparam K
- * @return TSV String
- */
- def drmSampleToTSV[K](drmX: DrmLike[K], samplePercent: Double = 1): String = {
-
- val drmSize = drmX.checkpoint().numRows()
- val sampleRatio: Double = 1.0 * samplePercent / 100
- val numSamples: Int = (drmSize * sampleRatio).toInt
-
- val plotMatrix = drmSampleKRows(drmX, numSamples, replacement = false)
-
-    // Dimensions of the sampled matrix
-    val matrixRows = plotMatrix.numRows()
-    val matrixCols = plotMatrix.numCols()
-
-    // Convert the sampled rows to TSV
- var str = ""
-
- for (i <- 0 until matrixRows) {
- for (j <- 0 until matrixCols) {
- str += plotMatrix(i, j)
- if (j <= matrixCols - 2) {
- str += '\t'
- }
- }
- str += '\n'
- }
-
- str
- }
-
- ///////////////////////////////////////////////////////////
- // Elementwise unary functions on distributed operands.
- def dexp[K](drmA: DrmLike[K]): DrmLike[K] = new OpAewUnaryFunc[K](drmA, math.exp, true)
-
- def dlog[K](drmA: DrmLike[K]): DrmLike[K] = new OpAewUnaryFunc[K](drmA, math.log, true)
-
- def dabs[K](drmA: DrmLike[K]): DrmLike[K] = new OpAewUnaryFunc[K](drmA, math.abs)
-
- def dsqrt[K](drmA: DrmLike[K]): DrmLike[K] = new OpAewUnaryFunc[K](drmA, math.sqrt)
-
- def dsignum[K](drmA: DrmLike[K]): DrmLike[K] = new OpAewUnaryFunc[K](drmA, math.signum)
-
- ///////////////////////////////////////////////////////////
- // Misc. math utilities.
-
- /**
-   * Compute column-wise means and variances -- distributed version.
- *
- * @param drmA Note: will pin input to cache if not yet pinned.
- * @tparam K
- * @return colMeans → colVariances
- */
- def dcolMeanVars[K](drmA: DrmLike[K]): (Vector, Vector) = {
-
- import RLikeDrmOps._
-
- val drmAcp = drmA.checkpoint()
-
- val mu = drmAcp colMeans
-
- // Compute variance using mean(x^2) - mean(x)^2
- val variances = (drmAcp ^ 2 colMeans) -=: mu * mu
-
- mu → variances
- }
-
- /**
-   * Compute column-wise means and standard deviations -- distributed version.
- * @param drmA note: input will be pinned to cache if not yet pinned
- * @return colMeans → colStdevs
- */
- def dcolMeanStdevs[K](drmA: DrmLike[K]): (Vector, Vector) = {
- val (mu, vars) = dcolMeanVars(drmA)
- mu → (vars ::= math.sqrt _)
- }
-
- /**
- * Thin column-wise mean and covariance matrix computation. Same as [[dcolMeanCov()]] but suited for
-   * thin, tall inputs where the covariance matrix can be reduced and finalized in driver memory.
- *
- * @param drmA note: will pin input to cache if not yet pinned.
- * @return mean → covariance matrix (in core)
- */
- def dcolMeanCovThin[K: ClassTag](drmA: DrmLike[K]):(Vector, Matrix) = {
-
- import RLikeDrmOps._
-
- val drmAcp = drmA.checkpoint()
- val mu = drmAcp colMeans
- val mxCov = (drmAcp.t %*% drmAcp).collect /= drmAcp.nrow -= (mu cross mu)
- mu → mxCov
- }
-
- /**
- * Compute COV(X) matrix and mean of row-wise data set. X is presented as row-wise input matrix A.
- *
- * This is a "wide" procedure, covariance matrix is returned as a DRM.
- *
- * @param drmA note: will pin input into cache if not yet pinned.
- * @return mean → covariance DRM
- */
- def dcolMeanCov[K: ClassTag](drmA: DrmLike[K]): (Vector, DrmLike[Int]) = {
-
- import RLikeDrmOps._
-
- implicit val ctx = drmA.context
- val drmAcp = drmA.checkpoint()
-
- val bcastMu = drmBroadcast(drmAcp colMeans)
-
-    // We use the multivariate analogue COV(X) = E(XX') - mu*mu'. In our case E(XX') = (A'A)/A.nrow.
- // Compute E(XX')
- val drmSigma = (drmAcp.t %*% drmAcp / drmAcp.nrow)
-
-    // Subtract mu*mu'. We assume mu*mu' may be too big to be handled by the driver alone, so we
-    // redistribute this operation as well. Hence it may look a bit cryptic.
- .mapBlock() { case (keys, block) ⇒
-
- // Pin mu as vector reference to memory.
- val mu:Vector = bcastMu
-
- keys → (block := { (r, c, v) ⇒ v - mu(keys(r)) * mu(c) })
- }
-
- // return (mu, cov(X) ("bigSigma")).
- (bcastMu: Vector) → drmSigma
- }
-
- /** Distributed Squared distance matrix computation. */
- def dsqDist(drmX: DrmLike[Int]): DrmLike[Int] = {
-
- // This is a specific case of pairwise distances of X and Y.
-
- import RLikeDrmOps._
-
- // Context needed
- implicit val ctx = drmX.context
-
- // Pin to cache if hasn't been pinned yet
- val drmXcp = drmX.checkpoint()
-
-    // Compute row sums of squares
- val s = drmXcp ^ 2 rowSums
-
- val sBcast = drmBroadcast(s)
-
- (drmXcp %*% drmXcp.t)
-
- // Apply second part of the formula as per in-core algorithm
- .mapBlock() { case (keys, block) ⇒
-
- // Slurp broadcast to memory
- val s = sBcast: Vector
-
- // Update in-place
- block := { (r, c, x) ⇒ s(keys(r)) + s(c) - 2 * x}
-
- keys → block
- }
- }
-
-
- /**
- * Compute fold-in distances (distributed version). Here, we use pretty much the same math as with
- * squared distances.
- *
- * D_sq = s*1' + 1*t' - 2*X*Y'
- *
-   * where s is the vector of row sums of the Hadamard product (X, X) and, similarly,
-   * t is the vector of row sums of the Hadamard product (Y, Y).
- *
- * @param drmX m x d row-wise dataset. Pinned to cache if not yet pinned.
- * @param drmY n x d row-wise dataset. Pinned to cache if not yet pinned.
-   * @return m x n pairwise squared distance matrix (between rows of X and Y)
- */
- def dsqDist(drmX: DrmLike[Int], drmY: DrmLike[Int]): DrmLike[Int] = {
-
- import RLikeDrmOps._
-
- implicit val ctx = drmX.context
-
- val drmXcp = drmX.checkpoint()
- val drmYcp = drmY.checkpoint()
-
- val sBcast = drmBroadcast(drmXcp ^ 2 rowSums)
- val tBcast = drmBroadcast(drmYcp ^ 2 rowSums)
-
-    (drmXcp %*% drmYcp.t)
-
- // Apply the rest of the formula
- .mapBlock() { case (keys, block) =>
-
- // Cache broadcast representations in local task variable
- val s = sBcast: Vector
- val t = tBcast: Vector
-
- block := { (r, c, x) => s(keys(r)) + t(c) - 2 * x}
- keys → block
- }
- }
-}
-
-package object indexeddataset {
- /** Load IndexedDataset from text delimited files */
- def indexedDatasetDFSRead(src: String,
- schema: Schema = DefaultIndexedDatasetReadSchema,
- existingRowIDs: Option[BiDictionary] = None)
- (implicit ctx: DistributedContext):
- IndexedDataset = ctx.indexedDatasetDFSRead(src, schema, existingRowIDs)
-
- def indexedDatasetDFSReadElements(src: String,
- schema: Schema = DefaultIndexedDatasetReadSchema,
- existingRowIDs: Option[BiDictionary] = None)
- (implicit ctx: DistributedContext):
- IndexedDataset = ctx.indexedDatasetDFSReadElements(src, schema, existingRowIDs)
-
-}
-
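A standalone numeric check (plain Scala; object and value names are hypothetical, not part
of this commit) of the two identities the package object relies on above: the variance
shortcut Var(x) = E[x^2] - E[x]^2 used by dcolMeanVars, and the squared-distance expansion
||x - y||^2 = s + t - 2*x.y used by dsqDist, where s = sum(x^2) and t = sum(y^2):

    object IdentityCheck extends App {
      // Variance shortcut on a small sample (population variance).
      val xs = Array(1.0, 2.0, 4.0)
      val mean   = xs.sum / xs.length
      val meanSq = xs.map(v => v * v).sum / xs.length
      val direct = xs.map(v => (v - mean) * (v - mean)).sum / xs.length
      println(math.abs((meanSq - mean * mean) - direct) < 1e-12) // true

      // Squared-distance expansion for one pair of rows.
      val x = Array(1.0, 2.0); val y = Array(3.0, 5.0)
      val s = x.map(v => v * v).sum
      val t = y.map(v => v * v).sum
      val dot = x.zip(y).map { case (a, b) => a * b }.sum
      val dSq = x.zip(y).map { case (a, b) => (a - b) * (a - b) }.sum
      println(math.abs((s + t - 2 * dot) - dSq) < 1e-12) // true
    }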

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/indexeddataset/BiMap.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/indexeddataset/BiMap.scala b/math-scala/src/main/scala/org/apache/mahout/math/indexeddataset/BiMap.scala
deleted file mode 100644
index 6c0d432..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/indexeddataset/BiMap.scala
+++ /dev/null
@@ -1,128 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.mahout.math.indexeddataset
-
-import scala.collection.immutable.HashMap
-
-/**
- * Immutable Bi-directional Map.
- * @param m Map to use for forward reference
- * @param i optional reverse map of value to key; one is created lazily if none is provided.
- *          The forward map must contain no duplicate values, or the reversal will fail.
- */
-class BiMap[K, V] (
- private val m: Map[K, V],
- // if this is serialized we allow i to be discarded and recalculated when deserialized
- @transient private var i: Option[BiMap[V, K]] = None
- ) extends Serializable {
-
- // NOTE: make inverse's inverse point back to current BiMap
- // if this is serialized we allow inverse to be discarded and recalculated when deserialized
- @transient lazy val inverse: BiMap[V, K] = {
- if( i == null.asInstanceOf[Option[BiMap[V, K]]] )
- i = None
- i.getOrElse {
- val rev = m.map(_.swap)
- require((rev.size == m.size), "Failed to create reversed map. Cannot have duplicated values.")
- new BiMap(rev, Some(this))
- }
- }
-
- // forces inverse to be calculated in the constructor when deserialized
- // not when first used
- @transient val size_ = inverse.size
-
- def get(k: K): Option[V] = m.get(k)
-
- def getOrElse(k: K, default: => V): V = m.getOrElse(k, default)
-
- def contains(k: K): Boolean = m.contains(k)
-
- def apply(k: K): V = m.apply(k)
-
- /**
- * Converts to a map.
- * @return a map of type immutable.Map[K, V]
- */
- def toMap: Map[K, V] = m
-
- /**
- * Converts to a sequence.
- * @return a sequence containing all elements of this map
- */
- def toSeq: Seq[(K, V)] = m.toSeq
-
- def size: Int = m.size
-
- def take(n: Int) = BiMap(m.take(n))
-
- override def toString = m.toString
-}
-
-object BiMap {
-
- /** Extra constructor from a map */
- def apply[K, V](x: Map[K, V]): BiMap[K, V] = new BiMap(x)
-
-}
-
-/** BiDictionary is a specialized BiMap that has non-negative Ints as values for use as DRM keys */
-class BiDictionary (
- private val m: Map[String, Int],
- @transient private val i: Option[BiMap[Int, String]] = None )
- extends BiMap[String, Int](m, i) {
-
- /**
-   * Create a new BiDictionary with the keys supplied and values ranging from 0 to size - 1
-   * @param keys a sequence of Strings
- */
- def this(keys: Seq[String]) = {
- this(HashMap(keys.view.zipWithIndex: _*))
- }
-
- def merge(
- keys: Seq[String]): BiDictionary = {
-
- var newIDs = List[String]()
-
- for (key <- keys) {
- if (!m.contains(key)) newIDs = key +: newIDs
- }
- if(newIDs.isEmpty) this else new BiDictionary(m ++ HashMap(newIDs.view.zip (Stream from size): _*))
-
- }
-
-}
-
-/** BiDictionary is a specialized BiMap that has non-negative Ints as values for use as DRM keys.
- * The companion object provides modification methods specific to maintaining contiguous Int values
- * and unique String keys */
-object BiDictionary {
-
- /**
- * Append new keys to an existing BiDictionary and return the result. The values will start
-   * at biDi.size and increase to keep the value set contiguous from 0 to size - 1
-   * @param keys new keys to append, not checked for uniqueness so may be dangerous
-   * @param biDi merge keys into this BiDictionary, creating new values by incrementing from the highest Int value
- * @return a BiDictionary with added mappings
- */
- /*def append(keys: Seq[String], biDi: BiDictionary): BiDictionary = {
- val hm = HashMap(keys.view.zip (Stream from biDi.size): _*)
- new BiDictionary(biDi.m ++ hm)
- }*/
-
-}
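A hedged usage sketch of the BiDictionary defined above (behavior inferred from the code;
the labels are hypothetical):

    val dict = new BiDictionary(Seq("user-a", "user-b")) // "user-a" -> 0, "user-b" -> 1
    val row: Int = dict("user-b")                        // 1
    val label: String = dict.inverse(0)                  // "user-a"
    val grown = dict.merge(Seq("user-b", "user-c"))      // only "user-c" is new: mapped to 2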

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/indexeddataset/IndexedDataset.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/indexeddataset/IndexedDataset.scala b/math-scala/src/main/scala/org/apache/mahout/math/indexeddataset/IndexedDataset.scala
deleted file mode 100644
index eeca736..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/indexeddataset/IndexedDataset.scala
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.indexeddataset
-
-import org.apache.mahout.math.drm.{DistributedContext, CheckpointedDrm}
-
-/**
- * Wrap an [[org.apache.mahout.math.drm.DrmLike]] with bidirectional ID mappings [[org.apache.mahout.math.indexeddataset.BiDictionary]]
- * so that user-specified labels/IDs can be stored and mapped to and from the Int IDs used internally by
- * Mahout core code.
- * @todo Often there is no need for both, or perhaps either, dictionary, so save resources by allowing them to be skipped
- * when not needed.
- */
-
-trait IndexedDataset {
- val matrix: CheckpointedDrm[Int]
- val rowIDs: BiDictionary
- val columnIDs: BiDictionary
-
- /**
- * Write a text delimited file(s) with the row and column IDs from dictionaries.
- * @param dest write location, usually a directory
- * @param schema params to control writing
- * @param sc the [[org.apache.mahout.math.drm.DistributedContext]] used to do a distributed write
- */
- def dfsWrite(dest: String, schema: Schema)(implicit sc: DistributedContext): Unit
-
- /** Factory method, creates the extending class and returns a new instance */
- def create(matrix: CheckpointedDrm[Int], rowIDs: BiDictionary, columnIDs: BiDictionary):
- IndexedDataset
-
- /**
- * Adds the equivalent of blank rows to the sparse CheckpointedDrm, which only changes the row cardinality value.
- * No changes are made to the underlying drm.
- * @param n number to use for new row cardinality, should be larger than current
- * @return a new IndexedDataset or extending class with new cardinality
- * @note should be done before any optimizer actions are performed on the matrix or you'll get unpredictable
- * results.
- */
- def newRowCardinality(n: Int): IndexedDataset = {
- // n is validated in matrix
- this.create(matrix.newRowCardinality(n), rowIDs, columnIDs)
- }
-
-}
-

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/indexeddataset/ReaderWriter.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/indexeddataset/ReaderWriter.scala b/math-scala/src/main/scala/org/apache/mahout/math/indexeddataset/ReaderWriter.scala
deleted file mode 100644
index 65c0d8f..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/indexeddataset/ReaderWriter.scala
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.indexeddataset
-
-import org.apache.mahout.math.drm.DistributedContext
-import org.apache.mahout.math.indexeddataset
-
-/**
- * Reader trait is abstract in the sense that the elementReader and rowReader functions must be supplied by an
- * extending trait, which also defines the type to be read.
- * @tparam T type of object to read.
- */
-trait Reader[T]{
-
- val mc: DistributedContext
- val readSchema: Schema
-
- /**
- * Override in extending trait to supply T and perform a parallel read of collection elements
- * @param mc a [[org.apache.mahout.math.drm.DistributedContext]] to read from
-   * @param readSchema map of parameters controlling formatting and how the read is executed
- * @param source list of comma delimited files to read from
- * @param existingRowIDs [[indexeddataset.BiDictionary]] containing row IDs that have already
- * been applied to this collection--used to synchronize row IDs between several
- * collections
- * @return a new collection of type T
- */
- protected def elementReader(
- mc: DistributedContext,
- readSchema: Schema,
- source: String,
- existingRowIDs: Option[BiDictionary] = None): T
-
- /**
- * Override in extending trait to supply T and perform a parallel read of collection rows
- * @param mc a [[org.apache.mahout.math.drm.DistributedContext]] to read from
-   * @param readSchema map of parameters controlling formatting and how the read is executed
- * @param source list of comma delimited files to read from
- * @param existingRowIDs [[indexeddataset.BiDictionary]] containing row IDs that have already
- * been applied to this collection--used to synchronize row IDs between several
- * collections
- * @return a new collection of type T
- */
- protected def rowReader(
- mc: DistributedContext,
- readSchema: Schema,
- source: String,
- existingRowIDs: Option[BiDictionary] = None): T
-
- /**
- * Public method called to perform the element-wise read. Usually no need to override
- * @param source comma delimited URIs to read from
- * @param existingRowIDs a [[indexeddataset.BiDictionary]] containing previously used id mappings--used
-   *                       to synchronize all row IDs in several collections
- * @return a new collection of type T
- */
- def readElementsFrom(
- source: String,
- existingRowIDs: Option[BiDictionary] = None): T =
- elementReader(mc, readSchema, source, existingRowIDs)
-
- /**
- * Public method called to perform the row-wise read. Usually no need to override.
- * @param source comma delimited URIs to read from
- * @param existingRowIDs a [[indexeddataset.BiDictionary]] containing previously used id mappings--used
-   *                       to synchronize all row IDs in several collections
- * @return a new collection of type T
- */
- def readRowsFrom(
- source: String,
- existingRowIDs: Option[BiDictionary] = None): T =
- rowReader(mc, readSchema, source, existingRowIDs)
-}
-
-/**
- * Writer trait is abstract in the sense that the writer method must be supplied by an extending trait,
- * which also defines the type to be written.
- * @tparam T type of object to write, usually a matrix type thing.
- */
-trait Writer[T]{
-
- val mc: DistributedContext
- val sort: Boolean
- val writeSchema: Schema
-
- /**
- * Override to provide writer method
- * @param mc context used to do distributed write
- * @param writeSchema map with params to control format and execution of the write
- * @param dest root directory to write to
- * @param collection usually a matrix like collection to write
- * @param sort flags whether to sort the rows by value descending
- */
- protected def writer(mc: DistributedContext, writeSchema: Schema, dest: String, collection: T, sort: Boolean): Unit
-
- /**
- * Call this method to perform the write, usually no need to override.
- * @param collection what to write
- * @param dest root directory to write to
- */
- def writeTo(collection: T, dest: String) = writer(mc, writeSchema, dest, collection, sort)
-}
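The Reader/Writer pair above is a template-method pattern: the public readElementsFrom /
readRowsFrom / writeTo methods fix the call sequence, while extending traits supply the
protected workers. A minimal standalone sketch of the same shape (plain Scala, hypothetical
types, no DistributedContext):

    trait SimpleReader[T] {
      protected def elementReader(source: String): T
      def readElementsFrom(source: String): T = elementReader(source)
    }

    object CsvFieldReader extends SimpleReader[List[String]] {
      // Toy worker: split a comma-delimited line into fields.
      protected def elementReader(source: String): List[String] = source.split(",").toList
    }

    // CsvFieldReader.readElementsFrom("a,b,c") == List("a", "b", "c")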

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/indexeddataset/Schema.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/indexeddataset/Schema.scala b/math-scala/src/main/scala/org/apache/mahout/math/indexeddataset/Schema.scala
deleted file mode 100644
index b7f120b..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/indexeddataset/Schema.scala
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.indexeddataset
-
-import scala.collection.mutable.HashMap
-
-/**
- * Syntactic sugar for mutable.HashMap[String, Any]
- * @param params list of mappings for instantiation {{{val mySchema = new Schema("one" -> 1, "two" -> "2", ...)}}}
- */
-class Schema(params: Tuple2[String, Any]*) extends HashMap[String, Any] {
- // note: this requires a mutable HashMap; do we care?
- this ++= params
-
- /**
- * Constructor for copying an existing Schema
- * @param schemaToClone return a copy of this Schema
- */
- def this(schemaToClone: Schema){
- this()
- this ++= schemaToClone
- }
-}
-
-// These schemas can be used to keep text input and output fairly standard across Mahout, where an
-// application-specific format is not required. They apply to the formatting of
-// [[org.apache.mahout.math.indexeddataset.IndexedDataset]], which can be used to create a Mahout DRM
-// for DSL ops.
-
-/**
- * Simple default Schema for typical text-delimited element file input.
- * This tells the reader to input elements of the default form: (rowID<comma, tab, or space>columnID
- * <comma, tab, or space>any remaining text on the line is ignored...)
- */
-object DefaultIndexedDatasetElementReadSchema extends Schema(
- "delim" -> "[,\t ]", //comma, tab or space
- "filter" -> "",
- "rowIDColumn" -> 0,
- "columnIDPosition" -> 1,
- "filterColumn" -> -1)
-
-/**
- * Default Schema for text delimited [[org.apache.mahout.math.indexeddataset.IndexedDataset]] file output with
- * one row per line.
- * The default form:
- * (rowID<tab>columnID1:score1<space>columnID2:score2...)
- */
-object DefaultIndexedDatasetWriteSchema extends Schema(
- "rowKeyDelim" -> "\t",
- "columnIdStrengthDelim" -> ":",
- "elementDelim" -> " ",
- "omitScore" -> false)
-
-/**
- * Default Schema for typical text delimited [[org.apache.mahout.math.indexeddataset.IndexedDataset]] file
- * row-wise input. This tells the reader to input text lines of the form:
- * (rowID<tab>columnID1:score1<space>columnID2:score2...)
- */
-object DefaultIndexedDatasetReadSchema extends Schema(
- "rowKeyDelim" -> "\t",
- "columnIdStrengthDelim" -> ":",
- "elementDelim" -> " ",
- "omitScore" -> false)
-
-/**
- * Default Schema for reading a text delimited [[org.apache.mahout.math.indexeddataset.IndexedDataset]] file where
- * the score of any element is ignored.
- * This tells the reader to input DRM lines of the form
- * (rowID<tab>columnID1:score1<space>columnID2:score2...); note that the score is ignored.
- * Alternatively the format can be
- * (rowID<tab>columnID1<space>columnID2 ...) where presence indicates a score of 1. This is the default
- * output format for [[IndexedDatasetWriteBooleanSchema]]
- */
-object IndexedDatasetReadBooleanSchema extends Schema(
- "rowKeyDelim" -> "\t",
- "columnIdStrengthDelim" -> ":",
- "elementDelim" -> " ",
- "omitScore" -> true)
-
-/**
- * Default Schema for typical text delimited [[org.apache.mahout.math.indexeddataset.IndexedDataset]] file output
- * where the score of an element is omitted. This tells the writer to output
- * [[org.apache.mahout.math.indexeddataset.IndexedDataset]] row of the form
- * (rowID<tab>columnID1<space>columnID2...)
- */
-object IndexedDatasetWriteBooleanSchema extends Schema(
- "rowKeyDelim" -> "\t",
- "columnIdStrengthDelim" -> ":",
- "elementDelim" -> " ",
- "omitScore" -> true)
-
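
Since a Schema is just a mutable map, an application can clone one of the defaults above and override a single parameter. A hypothetical usage sketch:

    // Read ordinary row-wise files, but with comma-separated elements.
    val csvReadSchema = new Schema(DefaultIndexedDatasetReadSchema)
    csvReadSchema("elementDelim") = ","
    val omitScore = csvReadSchema.getOrElse("omitScore", false).asInstanceOf[Boolean]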
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/test/scala/org/apache/mahout/classifier/naivebayes/NBTestBase.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/test/scala/org/apache/mahout/classifier/naivebayes/NBTestBase.scala b/math-scala/src/test/scala/org/apache/mahout/classifier/naivebayes/NBTestBase.scala
deleted file mode 100644
index c8f8a90..0000000
--- a/math-scala/src/test/scala/org/apache/mahout/classifier/naivebayes/NBTestBase.scala
+++ /dev/null
@@ -1,291 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.naivebayes
-
-import org.apache.mahout.math._
-import org.apache.mahout.math.scalabindings._
-import org.apache.mahout.test.DistributedMahoutSuite
-import org.apache.mahout.test.MahoutSuite
-import org.scalatest.{FunSuite, Matchers}
-import scala.collection.JavaConversions._
-
-trait NBTestBase extends DistributedMahoutSuite with Matchers { this:FunSuite =>
-
- val epsilon = 1E-6
-
- test("Simple Standard NB Model") {
-
- // test from simulated sparse TF-IDF data
- val inCoreTFIDF = sparse(
- (0, 0.7) ::(1, 0.1) ::(2, 0.1) ::(3, 0.3) :: Nil,
- (0, 0.4) ::(1, 0.4) ::(2, 0.1) ::(3, 0.1) :: Nil,
- (0, 0.1) ::(1, 0.0) ::(2, 0.8) ::(3, 0.1) :: Nil,
- (0, 0.1) ::(1, 0.1) ::(2, 0.1) ::(3, 0.7) :: Nil
- )
-
- val TFIDFDrm = drm.drmParallelize(m = inCoreTFIDF, numPartitions = 2)
-
- val labelIndex = new java.util.HashMap[String,Integer]()
- labelIndex.put("Cat1", 3)
- labelIndex.put("Cat2", 2)
- labelIndex.put("Cat3", 1)
- labelIndex.put("Cat4", 0)
-
- // train a Standard NB Model
- val model = NaiveBayes.train(TFIDFDrm, labelIndex, false)
-
- // validate the model- will throw an exception if model is invalid
- model.validate()
-
- // check the labelWeights
- model.labelWeight(0) - 1.2 should be < epsilon
- model.labelWeight(1) - 1.0 should be < epsilon
- model.labelWeight(2) - 1.0 should be < epsilon
- model.labelWeight(3) - 1.0 should be < epsilon
-
- // check the Feature weights
- model.featureWeight(0) - 1.3 should be < epsilon
- model.featureWeight(1) - 0.6 should be < epsilon
- model.featureWeight(2) - 1.1 should be < epsilon
- model.featureWeight(3) - 1.2 should be < epsilon
- }
-
- test("NB Aggregator") {
-
- val rowBindings = new java.util.HashMap[String,Integer]()
- rowBindings.put("/Cat1/doc_a/", 0)
- rowBindings.put("/Cat2/doc_b/", 1)
- rowBindings.put("/Cat1/doc_c/", 2)
- rowBindings.put("/Cat2/doc_d/", 3)
- rowBindings.put("/Cat1/doc_e/", 4)
-
-
- val matrixSetup = sparse(
- (0, 0.1) ::(1, 0.0) ::(2, 0.1) ::(3, 0.0) :: Nil,
- (0, 0.0) ::(1, 0.1) ::(2, 0.0) ::(3, 0.1) :: Nil,
- (0, 0.1) ::(1, 0.0) ::(2, 0.1) ::(3, 0.0) :: Nil,
- (0, 0.0) ::(1, 0.1) ::(2, 0.0) ::(3, 0.1) :: Nil,
- (0, 0.1) ::(1, 0.0) ::(2, 0.1) ::(3, 0.0) :: Nil
- )
-
-
- matrixSetup.setRowLabelBindings(rowBindings)
-
- val TFIDFDrm = drm.drmParallelizeWithRowLabels(m = matrixSetup, numPartitions = 2)
-
- val (labelIndex, aggregatedTFIDFDrm) = NaiveBayes.extractLabelsAndAggregateObservations(TFIDFDrm)
-
- labelIndex.size should be (2)
-
- val cat1=labelIndex("Cat1")
- val cat2=labelIndex("Cat2")
-
- cat1 should be (0)
- cat2 should be (1)
-
- val aggregatedTFIDFInCore = aggregatedTFIDFDrm.collect
- aggregatedTFIDFInCore.numCols should be (4)
- aggregatedTFIDFInCore.numRows should be (2)
-
- aggregatedTFIDFInCore.get(cat1, 0) - 0.3 should be < epsilon
- aggregatedTFIDFInCore.get(cat1, 1) - 0.0 should be < epsilon
- aggregatedTFIDFInCore.get(cat1, 2) - 0.3 should be < epsilon
- aggregatedTFIDFInCore.get(cat1, 3) - 0.0 should be < epsilon
- aggregatedTFIDFInCore.get(cat2, 0) - 0.0 should be < epsilon
- aggregatedTFIDFInCore.get(cat2, 1) - 0.2 should be < epsilon
- aggregatedTFIDFInCore.get(cat2, 2) - 0.0 should be < epsilon
- aggregatedTFIDFInCore.get(cat2, 3) - 0.2 should be < epsilon
-
- }
-
- test("Model DFS Serialization") {
-
- // test from simulated sparse TF-IDF data
- val inCoreTFIDF = sparse(
- (0, 0.7) ::(1, 0.1) ::(2, 0.1) ::(3, 0.3) :: Nil,
- (0, 0.4) ::(1, 0.4) ::(2, 0.1) ::(3, 0.1) :: Nil,
- (0, 0.1) ::(1, 0.0) ::(2, 0.8) ::(3, 0.1) :: Nil,
- (0, 0.1) ::(1, 0.1) ::(2, 0.1) ::(3, 0.7) :: Nil
- )
-
- val labelIndex = new java.util.HashMap[String,Integer]()
- labelIndex.put("Cat1", 0)
- labelIndex.put("Cat2", 1)
- labelIndex.put("Cat3", 2)
- labelIndex.put("Cat4", 3)
-
- val TFIDFDrm = drm.drmParallelize(m = inCoreTFIDF, numPartitions = 2)
-
- // train a Standard NB Model- no label index here
- val model = NaiveBayes.train(TFIDFDrm, labelIndex, false)
-
- // validate the model- will throw an exception if model is invalid
- model.validate()
-
- // save the model
- model.dfsWrite(TmpDir)
-
- // reload a new model which should be equal to the original
- // this will automatically trigger a validate() call
- val materializedModel= NBModel.dfsRead(TmpDir)
-
-
- // check the labelWeights
- model.labelWeight(0) - materializedModel.labelWeight(0) should be < epsilon //1.2
- model.labelWeight(1) - materializedModel.labelWeight(1) should be < epsilon //1.0
- model.labelWeight(2) - materializedModel.labelWeight(2) should be < epsilon //1.0
- model.labelWeight(3) - materializedModel.labelWeight(3) should be < epsilon //1.0
-
- // check the Feature weights
- model.featureWeight(0) - materializedModel.featureWeight(0) should be < epsilon //1.3
- model.featureWeight(1) - materializedModel.featureWeight(1) should be < epsilon //0.6
- model.featureWeight(2) - materializedModel.featureWeight(2) should be < epsilon //1.1
- model.featureWeight(3) - materializedModel.featureWeight(3) should be < epsilon //1.2
-
- // check to see if the new model is complementary
- materializedModel.isComplementary should be (model.isComplementary)
-
- // check the label indexMaps
- for(elem <- model.labelIndex){
- model.labelIndex(elem._1) == materializedModel.labelIndex(elem._1) should be (true)
- }
- }
-
- test("train and test a model") {
-
- // test from simulated sparse TF-IDF data
- val inCoreTFIDF = sparse(
- (0, 0.7) ::(1, 0.1) ::(2, 0.1) ::(3, 0.3) :: Nil,
- (0, 0.4) ::(1, 0.4) ::(2, 0.1) ::(3, 0.1) :: Nil,
- (0, 0.1) ::(1, 0.0) ::(2, 0.8) ::(3, 0.1) :: Nil,
- (0, 0.1) ::(1, 0.1) ::(2, 0.1) ::(3, 0.7) :: Nil
- )
-
- val labelIndex = new java.util.HashMap[String,Integer]()
- labelIndex.put("/Cat1/", 0)
- labelIndex.put("/Cat2/", 1)
- labelIndex.put("/Cat3/", 2)
- labelIndex.put("/Cat4/", 3)
-
- val TFIDFDrm = drm.drmParallelize(m = inCoreTFIDF, numPartitions = 2)
-
- // train a Standard NB Model- no label index here
- val model = NaiveBayes.train(TFIDFDrm, labelIndex, false)
-
- // validate the model- will throw an exception if model is invalid
- model.validate()
-
- // save the model
- model.dfsWrite(TmpDir)
-
- // reload a new model which should be equal to the original
- // this will automatically trigger a validate() call
- val materializedModel= NBModel.dfsRead(TmpDir)
-
-
- // check to see if the new model is complementary
- materializedModel.isComplementary should be (model.isComplementary)
-
- // check the label indexMaps
- for(elem <- model.labelIndex){
- model.labelIndex(elem._1) == materializedModel.labelIndex(elem._1) should be (true)
- }
-
-
- //self test on the original set
- val inCoreTFIDFWithLabels = inCoreTFIDF.clone()
- inCoreTFIDFWithLabels.setRowLabelBindings(labelIndex)
- val TFIDFDrmWithLabels = drm.drmParallelizeWithRowLabels(m = inCoreTFIDFWithLabels, numPartitions = 2)
-
- NaiveBayes.test(materializedModel,TFIDFDrmWithLabels , false)
-
- }
-
- test("train and test a model with the confusion matrix") {
-
- val rowBindings = new java.util.HashMap[String,Integer]()
- rowBindings.put("/Cat1/doc_a/", 0)
- rowBindings.put("/Cat2/doc_b/", 1)
- rowBindings.put("/Cat1/doc_c/", 2)
- rowBindings.put("/Cat2/doc_d/", 3)
- rowBindings.put("/Cat1/doc_e/", 4)
- rowBindings.put("/Cat2/doc_f/", 5)
- rowBindings.put("/Cat1/doc_g/", 6)
- rowBindings.put("/Cat2/doc_h/", 7)
- rowBindings.put("/Cat1/doc_i/", 8)
- rowBindings.put("/Cat2/doc_j/", 9)
-
- val seed = 1
-
- val matrixSetup = Matrices.uniformView(10, 50 , seed)
-
- println("TFIDF matrix")
- println(matrixSetup)
-
- matrixSetup.setRowLabelBindings(rowBindings)
-
- val TFIDFDrm = drm.drmParallelizeWithRowLabels(matrixSetup)
-
- // println("Parallelized and Collected")
- // println(TFIDFDrm.collect)
-
- val (labelIndex, aggregatedTFIDFDrm) = NaiveBayes.extractLabelsAndAggregateObservations(TFIDFDrm)
-
- println("Aggregated by key")
- println(aggregatedTFIDFDrm.collect)
- println(labelIndex)
-
-
- // train a Standard NB Model- no label index here
- val model = NaiveBayes.train(aggregatedTFIDFDrm, labelIndex, false)
-
- // validate the model- will throw an exception if model is invalid
- model.validate()
-
- // save the model
- model.dfsWrite(TmpDir)
-
- // reload a new model which should be equal to the original
- // this will automatically trigger a validate() call
- val materializedModel= NBModel.dfsRead(TmpDir)
-
- // check to see if the new model is complementary
- materializedModel.isComplementary should be (model.isComplementary)
-
- // check the label indexMaps
- for(elem <- model.labelIndex){
- model.labelIndex(elem._1) == materializedModel.labelIndex(elem._1) should be (true)
- }
-
- // val testTFIDFDrm = drm.drmParallelizeWithRowLabels(m = matrixSetup, numPartitions = 2)
-
- // self test on this model
- val result = NaiveBayes.test(materializedModel, TFIDFDrm , false)
-
- println(result)
-
- result.getConfusionMatrix.getMatrix.getQuick(0, 0) should be(5)
- result.getConfusionMatrix.getMatrix.getQuick(0, 1) should be(0)
- result.getConfusionMatrix.getMatrix.getQuick(1, 0) should be(0)
- result.getConfusionMatrix.getMatrix.getQuick(1, 1) should be(5)
-
- }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/test/scala/org/apache/mahout/classifier/stats/ClassifierStatsTestBase.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/test/scala/org/apache/mahout/classifier/stats/ClassifierStatsTestBase.scala b/math-scala/src/test/scala/org/apache/mahout/classifier/stats/ClassifierStatsTestBase.scala
deleted file mode 100644
index eafde11..0000000
--- a/math-scala/src/test/scala/org/apache/mahout/classifier/stats/ClassifierStatsTestBase.scala
+++ /dev/null
@@ -1,257 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.stats
-
-import java.lang.Double
-import java.util.Random
-import java.util.Arrays
-
-import org.apache.mahout.common.RandomUtils
-import org.apache.mahout.math.Matrix
-import org.apache.mahout.test.DistributedMahoutSuite
-import org.scalatest.{FunSuite, Matchers}
-
-
-
-trait ClassifierStatsTestBase extends DistributedMahoutSuite with Matchers { this: FunSuite =>
-
- val epsilon = 1E-6
-
- val smallEpsilon = 1.0
-
- // FullRunningAverageAndStdDev tests
- test("testFullRunningAverageAndStdDev") {
- val average: RunningAverageAndStdDev = new FullRunningAverageAndStdDev
- assert(0 == average.getCount)
- assert(true == Double.isNaN(average.getAverage))
- assert(true == Double.isNaN(average.getStandardDeviation))
- average.addDatum(6.0)
- assert(1 == average.getCount)
- assert((6.0 - average.getAverage).abs < epsilon)
- assert(true == Double.isNaN(average.getStandardDeviation))
- average.addDatum(6.0)
- assert(2 == average.getCount)
- assert((6.0 - average.getAverage).abs < epsilon)
- assert((0.0 - average.getStandardDeviation).abs < epsilon)
- average.removeDatum(6.0)
- assert(1 == average.getCount)
- assert((6.0 - average.getAverage).abs < epsilon)
- assert(true == Double.isNaN(average.getStandardDeviation))
- average.addDatum(-4.0)
- assert(2 == average.getCount)
- assert((1.0 - average.getAverage).abs < epsilon)
- assert(((5.0 * 1.4142135623730951) - average.getStandardDeviation).abs < epsilon)
- average.removeDatum(4.0)
- assert(1 == average.getCount)
- assert((2.0 + average.getAverage).abs < epsilon)
- assert(true == Double.isNaN(average.getStandardDeviation))
- }
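
(For reference, the 5.0 * 1.4142135623730951 expected above is just the sample standard deviation of the two retained data points {6.0, -4.0}: with mean 1.0, it is sqrt((5^2 + 5^2) / (2 - 1)) = sqrt(50) = 5 * sqrt(2).)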
-
- test("testBigFullRunningAverageAndStdDev") {
- val average: RunningAverageAndStdDev = new FullRunningAverageAndStdDev
- RandomUtils.useTestSeed()
- val r: Random = RandomUtils.getRandom
-
- for (i <- 0 until 100000) {
- average.addDatum(r.nextDouble() * 1000.0)
- }
-
- assert((500.0 - average.getAverage).abs < smallEpsilon)
- assert(((1000.0 / Math.sqrt(12.0)) - average.getStandardDeviation).abs < smallEpsilon)
- }
-
- test("testStddevFullRunningAverageAndStdDev") {
- val runningAverage: RunningAverageAndStdDev = new FullRunningAverageAndStdDev
- assert(0 == runningAverage.getCount)
- assert(true == Double.isNaN(runningAverage.getAverage))
- runningAverage.addDatum(1.0)
- assert(1 == runningAverage.getCount)
- assert((1.0 - runningAverage.getAverage).abs < epsilon)
- assert(true == Double.isNaN(runningAverage.getStandardDeviation))
- runningAverage.addDatum(1.0)
- assert(2 == runningAverage.getCount)
- assert((1.0 - runningAverage.getAverage).abs < epsilon)
- assert((0.0 -runningAverage.getStandardDeviation).abs < epsilon)
- runningAverage.addDatum(7.0)
- assert(3 == runningAverage.getCount)
- assert((3.0 - runningAverage.getAverage).abs < epsilon)
- assert((3.464101552963257 - runningAverage.getStandardDeviation).abs < epsilon)
- runningAverage.addDatum(5.0)
- assert(4 == runningAverage.getCount)
- assert((3.5 - runningAverage.getAverage).abs < epsilon)
- assert((3.0- runningAverage.getStandardDeviation).abs < epsilon)
- }
-
-
-
- // FullRunningAverage tests
- test("testFullRunningAverage"){
- val runningAverage: RunningAverage = new FullRunningAverage
- assert(0 == runningAverage.getCount)
- assert(true == Double.isNaN(runningAverage.getAverage))
- runningAverage.addDatum(1.0)
- assert(1 == runningAverage.getCount)
- assert((1.0 - runningAverage.getAverage).abs < epsilon)
- runningAverage.addDatum(1.0)
- assert(2 == runningAverage.getCount)
- assert((1.0 - runningAverage.getAverage).abs < epsilon)
- runningAverage.addDatum(4.0)
- assert(3 == runningAverage.getCount)
- assert((2.0 - runningAverage.getAverage).abs < epsilon)
- runningAverage.addDatum(-4.0)
- assert(4 == runningAverage.getCount)
- assert((0.5 - runningAverage.getAverage).abs < epsilon)
- runningAverage.removeDatum(-4.0)
- assert(3 == runningAverage.getCount)
- assert((2.0 - runningAverage.getAverage).abs < epsilon)
- runningAverage.removeDatum(4.0)
- assert(2 == runningAverage.getCount)
- assert((1.0 - runningAverage.getAverage).abs < epsilon)
- runningAverage.changeDatum(0.0)
- assert(2 == runningAverage.getCount)
- assert((1.0 - runningAverage.getAverage).abs < epsilon)
- runningAverage.changeDatum(2.0)
- assert(2 == runningAverage.getCount)
- assert((2.0 - runningAverage.getAverage).abs < epsilon)
- }
-
-
- test("testFullRunningAveragCopyConstructor") {
- val runningAverage: RunningAverage = new FullRunningAverage
- runningAverage.addDatum(1.0)
- runningAverage.addDatum(1.0)
- assert(2 == runningAverage.getCount)
- assert((1.0 - runningAverage.getAverage).abs < epsilon)
- val copy: RunningAverage = new FullRunningAverage(runningAverage.getCount, runningAverage.getAverage)
- assert(2 == copy.getCount)
- assert((1.0 - copy.getAverage).abs < epsilon)
- }
-
-
-
- // Inverted Running Average tests
- test("testInvertedRunningAverage") {
- val avg: RunningAverage = new FullRunningAverage
- val inverted: RunningAverage = new InvertedRunningAverage(avg)
- assert(0 == inverted.getCount)
- avg.addDatum(1.0)
- assert(1 == inverted.getCount)
- assert((1.0 + inverted.getAverage).abs < epsilon) // inverted.getAverage == -1.0
- avg.addDatum(2.0)
- assert(2 == inverted.getCount)
- assert((1.5 + inverted.getAverage).abs < epsilon) // inverted.getAverage == -1.5
- }
-
- test ("testInvertedRunningAverageAndStdDev") {
- val avg: RunningAverageAndStdDev = new FullRunningAverageAndStdDev
- val inverted: RunningAverageAndStdDev = new InvertedRunningAverageAndStdDev(avg)
- assert(0 == inverted.getCount)
- avg.addDatum(1.0)
- assert(1 == inverted.getCount)
- assert(((1.0 + inverted.getAverage).abs < epsilon)) // inverted.getAverage == -1.0
- avg.addDatum(2.0)
- assert(2 == inverted.getCount)
- assert((1.5 + inverted.getAverage).abs < epsilon) // inverted.getAverage == -1.5
- assert(((Math.sqrt(2.0) / 2.0) - inverted.getStandardDeviation).abs < epsilon)
- }
-
-
- // confusion Matrix tests
- val VALUES: Array[Array[Int]] = Array(Array(2, 3), Array(10, 20))
- val LABELS: Array[String] = Array("Label1", "Label2")
- val OTHER: Array[Int] = Array(3, 6)
- val DEFAULT_LABEL: String = "other"
-
- def fillConfusionMatrix(values: Array[Array[Int]], labels: Array[String], defaultLabel: String): ConfusionMatrix = {
- val labelList = Arrays.asList(labels(0),labels(1))
- val confusionMatrix: ConfusionMatrix = new ConfusionMatrix(labelList, defaultLabel)
- confusionMatrix.putCount("Label1", "Label1", values(0)(0))
- confusionMatrix.putCount("Label1", "Label2", values(0)(1))
- confusionMatrix.putCount("Label2", "Label1", values(1)(0))
- confusionMatrix.putCount("Label2", "Label2", values(1)(1))
- confusionMatrix.putCount("Label1", DEFAULT_LABEL, OTHER(0))
- confusionMatrix.putCount("Label2", DEFAULT_LABEL, OTHER(1))
-
- confusionMatrix
- }
-
- private def checkAccuracy(cm: ConfusionMatrix) {
- val labelstrs = cm.getLabels
- assert(3 == labelstrs.size)
- assert((25.0 - cm.getAccuracy("Label1")).abs < epsilon)
- assert((55.5555555 - cm.getAccuracy("Label2")).abs < epsilon)
- assert(true == Double.isNaN(cm.getAccuracy("other")))
- }
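
(The expected accuracies follow directly from VALUES and OTHER: row Label1 has 2 correct of 2 + 3 + 3 = 8 observations, i.e. 25.0%, and row Label2 has 20 correct of 10 + 20 + 6 = 36, i.e. 55.5555...%. The "other" row has no observations, hence NaN.)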
-
- private def checkValues(cm: ConfusionMatrix) {
- val counts: Array[Array[Int]] = cm.getConfusionMatrix
- cm.toString
- assert(counts.length == counts(0).length)
- assert(3 == counts.length)
- assert(VALUES(0)(0) == counts(0)(0))
- assert(VALUES(0)(1) == counts(0)(1))
- assert(VALUES(1)(0) == counts(1)(0))
- assert(VALUES(1)(1) == counts(1)(1))
- assert(true == Arrays.equals(new Array[Int](3), counts(2)))
- assert(OTHER(0) == counts(0)(2))
- assert(OTHER(1) == counts(1)(2))
- assert(3 == cm.getLabels.size)
- assert(true == cm.getLabels.contains(LABELS(0)))
- assert(true == cm.getLabels.contains(LABELS(1)))
- assert(true == cm.getLabels.contains(DEFAULT_LABEL))
- }
-
- test("testBuild"){
- val confusionMatrix: ConfusionMatrix = fillConfusionMatrix(VALUES, LABELS, DEFAULT_LABEL)
- checkValues(confusionMatrix)
- checkAccuracy(confusionMatrix)
- }
-
- test("GetMatrix") {
- val confusionMatrix: ConfusionMatrix = fillConfusionMatrix(VALUES, LABELS, DEFAULT_LABEL)
- val m: Matrix = confusionMatrix.getMatrix
- val rowLabels = m.getRowLabelBindings
- assert(confusionMatrix.getLabels.size == m.numCols)
- assert(true == rowLabels.keySet.contains(LABELS(0)))
- assert(true == rowLabels.keySet.contains(LABELS(1)))
- assert(true == rowLabels.keySet.contains(DEFAULT_LABEL))
- assert(2 == confusionMatrix.getCorrect(LABELS(0)))
- assert(20 == confusionMatrix.getCorrect(LABELS(1)))
- assert(0 == confusionMatrix.getCorrect(DEFAULT_LABEL))
- }
-
- /**
- * Example taken from
- * http://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_fscore_support.html
- */
- test("testPrecisionRecallAndF1ScoreAsScikitLearn") {
- val labelList = Arrays.asList("0", "1", "2")
- val confusionMatrix: ConfusionMatrix = new ConfusionMatrix(labelList, "DEFAULT")
- confusionMatrix.putCount("0", "0", 2)
- confusionMatrix.putCount("1", "0", 1)
- confusionMatrix.putCount("1", "2", 1)
- confusionMatrix.putCount("2", "1", 2)
- val delta: Double = 0.001
- assert((0.222 - confusionMatrix.getWeightedPrecision).abs < delta)
- assert((0.333 - confusionMatrix.getWeightedRecall).abs < delta)
- assert((0.266 - confusionMatrix.getWeightedF1score).abs < delta)
- }
-
-
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/test/scala/org/apache/mahout/math/algorithms/ClusteringSuiteBase.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/test/scala/org/apache/mahout/math/algorithms/ClusteringSuiteBase.scala b/math-scala/src/test/scala/org/apache/mahout/math/algorithms/ClusteringSuiteBase.scala
deleted file mode 100644
index 70fb10f..0000000
--- a/math-scala/src/test/scala/org/apache/mahout/math/algorithms/ClusteringSuiteBase.scala
+++ /dev/null
@@ -1,48 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.mahout.math.algorithms
-
-import org.apache.mahout.math.algorithms.preprocessing._
-import org.apache.mahout.math.drm.drmParallelize
-import org.apache.mahout.math.scalabindings.{dense, sparse, svec}
-import org.apache.mahout.math.scalabindings.RLikeOps._
-import org.apache.mahout.test.DistributedMahoutSuite
-import org.scalatest.{FunSuite, Matchers}
-
-trait ClusteringSuiteBase extends DistributedMahoutSuite with Matchers {
-
- this: FunSuite =>
-
- test("canopy test") {
- val drmA = drmParallelize(dense((1.0, 1.2, 1.3, 1.4), (1.1, 1.5, 2.5, 1.0), (6.0, 5.2, -5.2, 5.3), (7.0,6.0, 5.0, 5.0), (10.0, 1.0, 20.0, -10.0)))
-
- import org.apache.mahout.math.algorithms.clustering.CanopyClustering
-
- val model = new CanopyClustering().fit(drmA, 't1 -> 6.5, 't2 -> 5.5, 'distanceMeasure -> 'Chebyshev)
- val myAnswer = model.cluster(drmA).collect
-
- val correctAnswer = dense((0.0), (0.0), (1.0), (0.0), (2.0))
-
- val epsilon = 1E-6
- (myAnswer.norm - correctAnswer.norm) should be <= epsilon
- }
-}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/test/scala/org/apache/mahout/math/algorithms/PreprocessorSuiteBase.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/test/scala/org/apache/mahout/math/algorithms/PreprocessorSuiteBase.scala b/math-scala/src/test/scala/org/apache/mahout/math/algorithms/PreprocessorSuiteBase.scala
deleted file mode 100644
index ffe1d1b..0000000
--- a/math-scala/src/test/scala/org/apache/mahout/math/algorithms/PreprocessorSuiteBase.scala
+++ /dev/null
@@ -1,118 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.mahout.math.algorithms
-
-import org.apache.mahout.math.algorithms.preprocessing._
-import org.apache.mahout.math.drm.drmParallelize
-import org.apache.mahout.math.scalabindings.{dense, sparse, svec}
-import org.apache.mahout.math.scalabindings.RLikeOps._
-import org.apache.mahout.test.DistributedMahoutSuite
-import org.scalatest.{FunSuite, Matchers}
-
-trait PreprocessorSuiteBase extends DistributedMahoutSuite with Matchers {
-
- this: FunSuite =>
-
- test("asfactor test") {
- val A = drmParallelize(dense(
- (3, 2, 1, 2),
- (0, 0, 0, 0),
- (1, 1, 1, 1)), numPartitions = 2)
-
- // one-hot offsets per column: col0 -> 0..3, col1 -> 4..6, col2 -> 7..8, col3 -> 9..11
- val factorizer: AsFactorModel = new AsFactor().fit(A)
-
- val factoredA = factorizer.transform(A)
-
- println(factoredA)
- println(factorizer.factorMap)
- val correctAnswer = sparse(
- svec((3 → 1.0) :: (6 → 1.0) :: (8 → 1.0) :: (11 → 1.0) :: Nil, cardinality = 12),
- svec((0 → 1.0) :: (4 → 1.0) :: (7 → 1.0) :: ( 9 → 1.0) :: Nil, cardinality = 12),
- svec((1 → 1.0) :: (5 → 1.0) :: (8 → 1.0) :: (10 → 1.0) :: Nil, cardinality = 12)
- )
-
- val myAnswer = factoredA.collect
-
- val epsilon = 1E-6
- (myAnswer.norm - correctAnswer.norm) should be <= epsilon
-
- }
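
(The cardinality of 12 in the expected vectors follows from one-hot encoding each column of A: a column whose maximum value is m gets m + 1 indicator positions, so the four columns contribute (3 + 1) + (2 + 1) + (1 + 1) + (2 + 1) = 12, with offsets 0, 4, 7 and 9 as noted above.)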
-
- test("standard scaler test") {
- /**
- * R Prototype
- * x <- matrix( c(1,2,3,1,5,9,5,-15,-2), nrow=3)
- * scale(x, scale= apply(x, 2, sd) * sqrt(2/3))
- * # ^^ note: R uses degrees of freedom = 1 for standard deviation calculations.
- * # we don't (and neither does sklearn)
- * # multiplying by sqrt((N-1)/N) 'undoes' the degrees of freedom = 1
- */
-
- val A = drmParallelize(dense(
- (1, 1, 5),
- (2, 5, -15),
- (3, 9, -2)), numPartitions = 2)
-
- val scaler: StandardScalerModel = new StandardScaler().fit(A)
-
- val correctAnswer = dense(
- (-1.224745, -1.224745, -1.224745),
- (0.000000, 0.000000, 1.224745),
- (1.224745, 1.224745, 0.000000))
-
- val myAnswer = scaler.transform(A).collect
- println(scaler.meanVec)
- println(scaler.stdev)
-
- val epsilon = 1E-6
- (myAnswer.norm - correctAnswer.norm) should be <= epsilon
-
- }
-
- test("mean center test") {
- /**
- * R Prototype
- *
- * x <- matrix( c(1.0,2.0,3.0,1.0,5.0,9.0,-2.0,2.0,0), nrow=3)
- * centered.x <- scale(x, scale= FALSE)
- * print(centered.x)
- */
-
-
- val A = drmParallelize(dense(
- (1, 1, -2),
- (2, 5, 2),
- (3, 9, 0)), numPartitions = 2)
-
- val scaler: MeanCenterModel = new MeanCenter().fit(A)
-
- val myAnswer = scaler.transform(A).collect
-
- val correctAnswer = dense(
- (-1, -4, -2),
- (0, 0, 2),
- (1, 4, 0))
-
- val epsilon = 1E-6
- (myAnswer.norm - correctAnswer.norm) should be <= epsilon
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/test/scala/org/apache/mahout/math/algorithms/RegressionSuiteBase.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/test/scala/org/apache/mahout/math/algorithms/RegressionSuiteBase.scala b/math-scala/src/test/scala/org/apache/mahout/math/algorithms/RegressionSuiteBase.scala
deleted file mode 100644
index 8910ae9..0000000
--- a/math-scala/src/test/scala/org/apache/mahout/math/algorithms/RegressionSuiteBase.scala
+++ /dev/null
@@ -1,180 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.algorithms
-
-import org.apache.mahout.math.algorithms.regression._
-import org.apache.mahout.math.drm._
-import org.apache.mahout.math.drm.RLikeDrmOps._
-import org.apache.mahout.math.scalabindings._
-import org.apache.mahout.math.scalabindings.RLikeOps._
-import org.apache.mahout.test.DistributedMahoutSuite
-import org.scalatest.{FunSuite, Matchers}
-
-trait RegressionSuiteBase extends DistributedMahoutSuite with Matchers {
- this: FunSuite =>
-
- val epsilon = 1E-6
-
- test("ordinary least squares") {
- /*
- R Prototype:
- dataM <- matrix( c(2, 2, 10.5, 10, 29.509541,
- 1, 2, 12, 12, 18.042851,
- 1, 1, 12, 13, 22.736446,
- 2, 1, 11, 13, 32.207582,
- 1, 2, 12, 11, 21.871292,
- 2, 1, 16, 8, 36.187559,
- 6, 2, 17, 1, 50.764999,
- 3, 2, 13, 7, 40.400208,
- 3, 3, 13, 4, 45.811716), nrow=9, ncol=5, byrow=TRUE)
-
-
- X = dataM[, c(1,2,3,4)]
- y = dataM[, c(5)]
-
- model <- lm(y ~ X )
- summary(model)
-
- */
-
- val drmData = drmParallelize(dense(
- (2, 2, 10.5, 10, 29.509541), // Apple Cinnamon Cheerios
- (1, 2, 12, 12, 18.042851), // Cap'n'Crunch
- (1, 1, 12, 13, 22.736446), // Cocoa Puffs
- (2, 1, 11, 13, 32.207582), // Froot Loops
- (1, 2, 12, 11, 21.871292), // Honey Graham Ohs
- (2, 1, 16, 8, 36.187559), // Wheaties Honey Gold
- (6, 2, 17, 1, 50.764999), // Cheerios
- (3, 2, 13, 7, 40.400208), // Clusters
- (3, 3, 13, 4, 45.811716)), numPartitions = 2)
-
-
- val drmX = drmData(::, 0 until 4)
- val drmY = drmData(::, 4 until 5)
-
- val model = new OrdinaryLeastSquares[Int]().fit(drmX, drmY, 'calcCommonStatistics → false)
-
- val estimate = model.beta
- val Ranswers = dvec(-1.336265, -13.157702, -4.152654, -5.679908, 163.179329)
-
- val epsilon = 1E-6
- (estimate - Ranswers).sum should be < epsilon
-
- // TODO add test for S.E / pvalue
- }
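
For intuition, the fit above is ordinary least squares with an intercept, i.e. the normal equations beta = (X'X)^-1 X'y. A toy, self-contained sketch in the in-core DSL -- assuming its solve(A, b) helper, with purely illustrative data:

    import org.apache.mahout.math.scalabindings._
    import org.apache.mahout.math.scalabindings.RLikeOps._

    val X = dense((1.0, 2.0), (1.0, 3.0), (1.0, 5.0)) // intercept column + one feature
    val y = dvec(4.9, 7.1, 11.0)
    val beta = solve(X.t %*% X, X.t %*% y) // ~= (0.93, 2.02): intercept, slope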
-
- test("cochrane-orcutt"){
- /* R Prototype:
- library(orcutt)
-
- df = data.frame(t(data.frame(
- c(20.96, 127.3),
- c(21.40, 130.0),
- c(21.96, 132.7),
- c(21.52, 129.4),
- c(22.39, 135.0),
- c(22.76, 137.1),
- c(23.48, 141.2),
- c(23.66, 142.8),
- c(24.10, 145.5),
- c(24.01, 145.3),
- c(24.54, 148.3),
- c(24.30, 146.4),
- c(25.00, 150.2),
- c(25.64, 153.1),
- c(26.36, 157.3),
- c(26.98, 160.7),
- c(27.52, 164.2),
- c(27.78, 165.6),
- c(28.24, 168.7),
- c(28.78, 171.7))))
-
- rownames(df) <- NULL
- colnames(df) <- c("y", "x")
- my_lm = lm(y ~ x, data=df)
- coch = cochrane.orcutt(my_lm)
-
- ///////////////////////////////////////
- The R-implementation is kind of...silly.
-
- The above works- it converges at 318 iterations- the transformed DW is 1.72, yet the rho is
- .95882. After 318 iterations, this will also report a rho of .95882 (which suggests SEVERE
- autocorrelation), nothing close to 1.72.
-
- At any rate, the real prototype for this is the example from Applied Linear Statistical Models,
- 5th Edition by Kutner, Nachtsheim, Neter, and Li. They also provide some interesting notes on p. 494:
- 1) "Cochrane-Orcutt does not always work properly. A major reason is that when the error terms
- are positively autocorrelated, the estimate r in (12.22) tends to underestimate the autocorrelation
- parameter rho. When this bias is serious, it can significantly reduce the effectiveness of the
- Cochrane-Orcutt approach.
- 2. There exists an approximate relation between the Durbin Watson test statistic D in (12.14)
- and the estimated autocorrelation paramater r in (12.22):
- D ~= 2(1-r)"
-
- They also note on p492:
- "... If the process does not terminate after one or two iterations, a different procedure
- should be employed."
- This differs from the logic found elsewhere, and from the method presented in R where, in the simple
- example in the prototype, the procedure runs for 318 iterations. This is why the default
- maximum number of iterations is 3, and it should be left as such.
-
- Also, the prototype and 'correct answers' are based on the example presented in Kutner et al. on
- p492-4 (including dataset).
-
- */
-
- val alsmBlaisdellCo = drmParallelize( dense(
- (20.96, 127.3),
- (21.40, 130.0),
- (21.96, 132.7),
- (21.52, 129.4),
- (22.39, 135.0),
- (22.76, 137.1),
- (23.48, 141.2),
- (23.66, 142.8),
- (24.10, 145.5),
- (24.01, 145.3),
- (24.54, 148.3),
- (24.30, 146.4),
- (25.00, 150.2),
- (25.64, 153.1),
- (26.36, 157.3),
- (26.98, 160.7),
- (27.52, 164.2),
- (27.78, 165.6),
- (28.24, 168.7),
- (28.78, 171.7) ))
-
- val drmY = alsmBlaisdellCo(::, 0 until 1)
- val drmX = alsmBlaisdellCo(::, 1 until 2)
-
- val coModel = new CochraneOrcutt[Int]().fit(drmX, drmY, 'iterations -> 2)
- val coResiduals = drmY - coModel.predict(drmX)
-
- val correctRho = 0.631166
- (coModel.rhos(1) - correctRho) should be < epsilon
-
- val shortEpsilon = 1E-4 // book rounded off pretty short
- val correctBeta = dvec(0.17376, -1.0685)
- (coModel.betas(1) - correctBeta).sum.abs should be < shortEpsilon
-
- val correctSe = dvec(0.002957, 0.45332)
- (coModel.se - correctSe).sum.abs should be < shortEpsilon
- }
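
To make the D ~= 2(1 - r) relation quoted above concrete, here is a small self-contained sketch; the residual series is hypothetical and nothing here comes from the deleted file:

    object DurbinWatsonSketch extends App {
      val e = Array(0.5, 0.3, -0.1, -0.4, 0.2, 0.6) // illustrative residual series

      // D = sum_{t=2..n} (e_t - e_{t-1})^2 / sum_{t=1..n} e_t^2
      val d = (1 until e.length).map(t => math.pow(e(t) - e(t - 1), 2)).sum /
        e.map(x => x * x).sum

      // Kutner et al.'s approximation then gives an implied autocorrelation:
      val impliedRho = 1.0 - d / 2.0
      println(s"D = $d, implied rho = $impliedRho")
    }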
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/test/scala/org/apache/mahout/math/algorithms/RegressionTestsSuiteBase.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/test/scala/org/apache/mahout/math/algorithms/RegressionTestsSuiteBase.scala b/math-scala/src/test/scala/org/apache/mahout/math/algorithms/RegressionTestsSuiteBase.scala
deleted file mode 100644
index 57dffef..0000000
--- a/math-scala/src/test/scala/org/apache/mahout/math/algorithms/RegressionTestsSuiteBase.scala
+++ /dev/null
@@ -1,126 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.mahout.math.algorithms
-
-import org.apache.mahout.math.algorithms.regression.OrdinaryLeastSquares
-import org.apache.mahout.math.algorithms.regression.tests._
-import org.apache.mahout.math.drm.{CheckpointedDrm, drmParallelize}
-import org.apache.mahout.math.drm.RLikeDrmOps._
-import org.apache.mahout.math.scalabindings.{`::`, dense}
-import org.apache.mahout.test.DistributedMahoutSuite
-import org.scalatest.{FunSuite, Matchers}
-
-
-trait RegressionTestsSuiteBase extends DistributedMahoutSuite with Matchers {
- this: FunSuite =>
-
- val epsilon = 1E-4
-
- test("fittness tests") {
- /*
- R Prototype:
- dataM <- matrix( c(2, 2, 10.5, 10, 29.509541,
- 1, 2, 12, 12, 18.042851,
- 1, 1, 12, 13, 22.736446,
- 2, 1, 11, 13, 32.207582,
- 1, 2, 12, 11, 21.871292,
- 2, 1, 16, 8, 36.187559,
- 6, 2, 17, 1, 50.764999,
- 3, 2, 13, 7, 40.400208,
- 3, 3, 13, 4, 45.811716), nrow=9, ncol=5, byrow=TRUE)
-
-
- X = dataM[, c(1,2,3,4)]
- y = dataM[, c(5)]
-
- model <- lm(y ~ X)
- summary(model)
-
- */
-
- val drmData = drmParallelize(dense(
- (2, 2, 10.5, 10, 29.509541), // Apple Cinnamon Cheerios
- (1, 2, 12, 12, 18.042851), // Cap'n'Crunch
- (1, 1, 12, 13, 22.736446), // Cocoa Puffs
- (2, 1, 11, 13, 32.207582), // Froot Loops
- (1, 2, 12, 11, 21.871292), // Honey Graham Ohs
- (2, 1, 16, 8, 36.187559), // Wheaties Honey Gold
- (6, 2, 17, 1, 50.764999), // Cheerios
- (3, 2, 13, 7, 40.400208), // Clusters
- (3, 3, 13, 4, 45.811716)), numPartitions = 2)
-
- val drmX = drmData(::, 0 until 4)
- val drmY = drmData(::, 4 until 5)
-
- val model = new OrdinaryLeastSquares[Int]().fit(drmX, drmY)
-
- println(model.summary)
- // Answers from running similar algorithm in R
- val rR2 = 0.9425
- val rMSE = 6.457157
-
- val r2: Double = model.r2
- val mse: Double = model.mse
- (rR2 - r2) should be < epsilon
- (rMSE - mse) should be < epsilon
-
- Math.abs(model.beta.get(4) - 163.17933 ) should be < epsilon
- Math.abs(model.beta.get(0) - (-1.33627) ) should be < epsilon
- Math.abs(model.beta.get(1) - (-13.15770)) should be < epsilon
- Math.abs(model.beta.get(2) - (-4.15265) ) should be < epsilon
- Math.abs(model.beta.get(3) - (-5.679908)) should be < epsilon
-
- Math.abs(model.tScore.get(0) - (-0.49715717)) should be < epsilon
- Math.abs(model.tScore.get(1) - (-2.43932888)) should be < epsilon
- Math.abs(model.tScore.get(2) - (-2.32654000)) should be < epsilon
- Math.abs(model.tScore.get(3) - (-3.01022444)) should be < epsilon
- Math.abs(model.tScore.get(4) - 3.143183937 ) should be < epsilon
-
- model.degreesOfFreedom should equal(5)
- model.trainingExamples should equal(9)
-
- Math.abs((model.fScore - 16.38542361)) should be < 0.0000001
-
- }
-
- test("durbinWatsonTest test") {
- /**
- * R Prototype
- *
- * library(car)
- * residuals <- seq(0, 4.9, 0.1)
- * ## perform Durbin-Watson test
- * durbinWatsonTest(residuals)
- */
-
- val correctAnswer = 0.001212121
- val err1 = drmParallelize( dense((0.0 until 5.0 by 0.1).toArray) ).t
- val drmX = drmParallelize( dense((0 until 50).toArray.map( t => Math.pow(-1.0, t)) ) ).t
- val drmY = drmX + err1 + 1
- var model = new OrdinaryLeastSquares[Int]().fit(drmX, drmY)
- val syntheticResiduals = err1
- model = AutocorrelationTests.DurbinWatson(model, syntheticResiduals)
- val myAnswer: Double = model.testResults.getOrElse('durbinWatsonTestStatistic, -1.0).asInstanceOf[Double]
- (myAnswer - correctAnswer) should be < epsilon
- }
-
-
-}
-

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/test/scala/org/apache/mahout/math/backend/BackendSuite.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/test/scala/org/apache/mahout/math/backend/BackendSuite.scala b/math-scala/src/test/scala/org/apache/mahout/math/backend/BackendSuite.scala
deleted file mode 100644
index ba6e145..0000000
--- a/math-scala/src/test/scala/org/apache/mahout/math/backend/BackendSuite.scala
+++ /dev/null
@@ -1,59 +0,0 @@
-package org.apache.mahout.math.backend
-
-import org.apache.mahout.math.backend.jvm.JvmBackend
-import org.scalatest.{FunSuite, Matchers}
-
-import scala.collection.mutable
-import scala.reflect.{ClassTag, classTag}
-
-class BackendSuite extends FunSuite with Matchers {
-
- test("GenericBackend") {
-
- trait MySolverTrait1 { def myMethod1: Unit = () }
-
-
- trait MySolverTrait2
-
- class MySolverImpl1 extends MySolverTrait1 {
- }
-
- class MySolverImpl2 extends MySolverTrait2
-
- // A dummy backend supporting two solver traits, filled with 2 dummy implementations of these
- // traits, should be able to serve solvers based on their solver traits.
- val myBackend = new Backend {
-
- override def isAvailable: Boolean = true
-
- override val solverMap = new mutable.HashMap[ClassTag[_], Any]()
-
- solverMap ++= Map(
- classTag[MySolverTrait1] → new MySolverImpl1,
- classTag[MySolverTrait2] → new MySolverImpl2
- )
-
- validateMap()
- }
-
- myBackend.getSolver shouldBe None
-
- val mySolver1 = myBackend.getSolver[MySolverTrait1]
-
- // This is indeed solver1 trait type:
- mySolver1.get.myMethod1
- mySolver1.get.isInstanceOf[MySolverImpl1] shouldBe true
-
- // Validator should not allow non-subclasses in implementation.
- an [IllegalArgumentException] mustBe thrownBy {
- myBackend.solverMap(classTag[MySolverTrait2]) = 0
- myBackend.validateMap()
- }
- }
-
- test("JvmBackend") {
- // Just create JVM backend and validate.
- JvmBackend.validateMap()
- }
-
-}
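
The pattern this suite exercises -- a solver registry keyed by ClassTag -- reduces to a few self-contained lines. This is a sketch of the idea, not the Backend API itself:

    import scala.collection.mutable
    import scala.reflect.{ClassTag, classTag}

    trait Solver { def solve(): String }

    val registry = mutable.Map.empty[ClassTag[_], Any]
    registry(classTag[Solver]) = new Solver { def solve() = "ok" }

    def lookup[T: ClassTag]: Option[T] =
      registry.get(classTag[T]).map(_.asInstanceOf[T])

    assert(lookup[Solver].map(_.solve()) == Some("ok"))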

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/test/scala/org/apache/mahout/math/decompositions/DecompositionsSuite.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/test/scala/org/apache/mahout/math/decompositions/DecompositionsSuite.scala b/math-scala/src/test/scala/org/apache/mahout/math/decompositions/DecompositionsSuite.scala
deleted file mode 100644
index 8f5ec99..0000000
--- a/math-scala/src/test/scala/org/apache/mahout/math/decompositions/DecompositionsSuite.scala
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.decompositions
-
-import org.scalatest.FunSuite
-import org.apache.mahout.test.MahoutSuite
-import org.apache.mahout.common.RandomUtils
-import org.apache.mahout.math._
-import scalabindings._
-import RLikeOps._
-
-/**
- * This suite tests only in-core decompositions.
- * <P>
- *
- * We moved distributed tests into the mahout-spark module since they require concrete distributed
- * engine dependencies to run.
- * <P>
- */
-class DecompositionsSuite extends FunSuite with MahoutSuite {
-
- test("ssvd") {
-
- // Very naive, a full-rank only here.
- val a = dense(
- (1, 2, 3),
- (3, 4, 5),
- (-2, 6, 7),
- (-3, 8, 9)
- )
-
- val rank = 2
- val (u, v, s) = ssvd(a, k = rank, q = 1)
-
- val (uControl, vControl, sControl) = svd(a)
-
- printf("U:\n%s\n", u)
- printf("U-control:\n%s\n", uControl)
- printf("V:\n%s\n", v)
- printf("V-control:\n%s\n", vControl)
- printf("Sigma:\n%s\n", s)
- printf("Sigma-control:\n%s\n", sControl)
-
- (s - sControl(0 until rank)).norm(2) should be < 1E-7
-
- // Singular vectors may be equivalent down to a sign only.
- (u.norm - uControl(::, 0 until rank).norm).abs should be < 1E-7
- (v.norm - vControl(::, 0 until rank).norm).abs should be < 1E-7
- }
-
- test("spca") {
-
- import math._
-
- val rnd = RandomUtils.getRandom
-
- // Number of points
- val m = 500
- // Length of actual spectrum
- val spectrumLen = 40
-
- val spectrum = dvec((0 until spectrumLen).map(x => 300.0 * exp(-x) max 1e-3))
- printf("spectrum:%s\n", spectrum)
-
- val (u, _) = qr(new SparseRowMatrix(m, spectrumLen) :=
- ((r, c, v) => if (rnd.nextDouble() < 0.2) 0 else rnd.nextDouble() + 5.0))
-
- // PCA Rotation matrix -- should also be orthonormal.
- val (tr, _) = qr(Matrices.symmetricUniformView(spectrumLen, spectrumLen, rnd.nextInt) - 10.0)
-
- val input = (u %*%: diagv(spectrum)) %*% tr.t
-
- // Calculate just first 10 principal factors and reduce dimensionality.
- // Since we assert just validity of the s-pca, not stochastic error, we bump p parameter to
- // ensure to zero stochastic error and assert only functional correctness of the method's pca-
- // specific additions.
- val k = 10
- var (pca, _, s) = spca(a = input, k = k, p = spectrumLen, q = 1)
- printf("Svs:%s\n", s)
- // Un-normalized pca data:
- pca = pca %*%: diagv(s)
-
- // Of course, once we calculated the pca, the spectrum is going to be different since our originally
- // generated input was not centered. So here, we'd just brute-solve pca to verify
- val xi = input.colMeans()
- for (r <- 0 until input.nrow) input(r, ::) -= xi
- var (pcaControl, _, sControl) = svd(m = input)
-
- printf("Svs-control:%s\n", sControl)
- pcaControl = (pcaControl %*%: diagv(sControl))(::, 0 until k)
-
- printf("pca:\n%s\n", pca(0 until 10, 0 until 10))
- printf("pcaControl:\n%s\n", pcaControl(0 until 10, 0 until 10))
-
- (pca(0 until 10, 0 until 10).norm - pcaControl(0 until 10, 0 until 10).norm).abs should be < 1E-5
- }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/test/scala/org/apache/mahout/math/decompositions/DistributedDecompositionsSuiteBase.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/test/scala/org/apache/mahout/math/decompositions/DistributedDecompositionsSuiteBase.scala b/math-scala/src/test/scala/org/apache/mahout/math/decompositions/DistributedDecompositionsSuiteBase.scala
deleted file mode 100644
index de8228e..0000000
--- a/math-scala/src/test/scala/org/apache/mahout/math/decompositions/DistributedDecompositionsSuiteBase.scala
+++ /dev/null
@@ -1,219 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.decompositions
-
-import org.apache.mahout.test.DistributedMahoutSuite
-import org.apache.mahout.math._
-import scalabindings._
-import RLikeOps._
-import drm._
-import RLikeDrmOps._
-import org.scalatest.{FunSuite, Matchers}
-import org.apache.mahout.common.RandomUtils
-import math._
-
-/**
- * ==Common distributed code to run against each distributed engine support.==
- *
- * Each distributed engine's decompositions package should have a suite that includes this feature
- * as part of its distributed test suite.
- *
- */
-trait DistributedDecompositionsSuiteBase extends DistributedMahoutSuite with Matchers { this:FunSuite =>
-
-
- test("thin distributed qr") {
-
- val inCoreA = dense(
- (1, 2, 3, 4),
- (2, 3, 4, 5),
- (3, -4, 5, 6),
- (4, 5, 6, 7),
- (8, 6, 7, 8)
- )
-
- val drmA = drmParallelize(inCoreA, numPartitions = 2)
- val (drmQ, inCoreR) = dqrThin(drmA, checkRankDeficiency = false)
-
- // Assert optimizer still knows Q and A are identically partitioned
- drmQ.partitioningTag should equal(drmA.partitioningTag)
-
-// drmQ.rdd.partitions.size should be(A.rdd.partitions.size)
-//
-// // Should also be zippable
-// drmQ.rdd.zip(other = A.rdd)
-
- val inCoreQ = drmQ.collect
-
- printf("A=\n%s\n", inCoreA)
- printf("Q=\n%s\n", inCoreQ)
- printf("R=\n%s\n", inCoreR)
-
- val (qControl, rControl) = qr(inCoreA)
- printf("qControl=\n%s\n", qControl)
- printf("rControl=\n%s\n", rControl)
-
- // Validate with Cholesky
- val ch = chol(inCoreA.t %*% inCoreA)
- printf("A'A=\n%s\n", inCoreA.t %*% inCoreA)
- printf("L:\n%s\n", ch.getL)
-
- val rControl2 = (ch.getL cloned).t
- val qControl2 = ch.solveRight(inCoreA)
- printf("qControl2=\n%s\n", qControl2)
- printf("rControl2=\n%s\n", rControl2)
-
- // Householder approach seems to be a little bit more stable
- (rControl - inCoreR).norm should be < 1E-5
- (qControl - inCoreQ).norm should be < 1E-5
-
- // Assert identicity with in-core Cholesky-based -- this should be tighter.
- (rControl2 - inCoreR).norm should be < 1E-10
- (qControl2 - inCoreQ).norm should be < 1E-10
-
- // Assert orthogonality:
- // (a) Q[,j] dot Q[,j] == 1.0 for all j
- // (b) Q[,i] dot Q[,j] == 0.0 for all i != j
- for (col <- 0 until inCoreQ.ncol)
- ((inCoreQ(::, col) dot inCoreQ(::, col)) - 1.0).abs should be < 1e-10
- for (col1 <- 0 until inCoreQ.ncol - 1; col2 <- col1 + 1 until inCoreQ.ncol)
- (inCoreQ(::, col1) dot inCoreQ(::, col2)).abs should be < 1e-10
-
-
- }
-
- test("dssvd - the naive-est - q=0") {
- dssvdNaive(q = 0)
- }
-
- test("ddsvd - naive - q=1") {
- dssvdNaive(q = 1)
- }
-
- test("ddsvd - naive - q=2") {
- dssvdNaive(q = 2)
- }
-
-
- def dssvdNaive(q: Int) {
- val inCoreA = dense(
- (1, 2, 3, 4),
- (2, 3, 4, 5),
- (3, -4, 5, 6),
- (4, 5, 6, 7),
- (8, 6, 7, 8)
- )
- val drmA = drmParallelize(inCoreA, numPartitions = 2)
-
- val (drmU, drmV, s) = dssvd(drmA, k = 4, q = q)
- val (inCoreU, inCoreV) = (drmU.collect, drmV.collect)
-
- printf("U:\n%s\n", inCoreU)
- printf("V:\n%s\n", inCoreV)
- printf("Sigma:\n%s\n", s)
-
- (inCoreA - (inCoreU %*%: diagv(s)) %*% inCoreV.t).norm should be < 1E-5
- }
-
- test("dspca") {
-
- val rnd = RandomUtils.getRandom
-
- // Number of points
- val m = 500
- // Length of actual spectrum
- val spectrumLen = 40
-
- val spectrum = dvec((0 until spectrumLen).map(x => 300.0 * exp(-x) max 1e-3))
- printf("spectrum:%s\n", spectrum)
-
- val (u, _) = qr(new SparseRowMatrix(m, spectrumLen) :=
- ((r, c, v) => if (rnd.nextDouble() < 0.2) 0 else rnd.nextDouble() + 5.0))
-
- // PCA Rotation matrix -- should also be orthonormal.
- val (tr, _) = qr(Matrices.symmetricUniformView(spectrumLen, spectrumLen, rnd.nextInt) - 10.0)
-
- val input = (u %*%: diagv(spectrum)) %*% tr.t
- val drmInput = drmParallelize(m = input, numPartitions = 2)
-
- // Calculate just first 10 principal factors and reduce dimensionality.
- // Since we assert just validity of the s-pca, not stochastic error, we bump p parameter to
- // ensure to zero stochastic error and assert only functional correctness of the method's pca-
- // specific additions.
- val k = 10
-
- // Calculate just first 10 principal factors and reduce dimensionality.
- var (drmPCA, _, s) = dspca(drmA = drmInput, k = 10, p = spectrumLen, q = 1)
- // Un-normalized pca data:
- drmPCA = drmPCA %*% diagv(s)
-
- val pca = drmPCA.checkpoint(CacheHint.NONE).collect
-
- // Of course, once we calculated the pca, the spectrum is going to be different since our originally
- // generated input was not centered. So here, we'd just brute-solve pca to verify
- val xi = input.colMeans()
- for (r <- 0 until input.nrow) input(r, ::) -= xi
- var (pcaControl, _, sControl) = svd(m = input)
- pcaControl = (pcaControl %*%: diagv(sControl))(::, 0 until k)
-
- printf("pca:\n%s\n", pca(0 until 10, 0 until 10))
- printf("pcaControl:\n%s\n", pcaControl(0 until 10, 0 until 10))
-
- (pca(0 until 10, 0 until 10).norm - pcaControl(0 until 10, 0 until 10).norm).abs should be < 1E-5
-
- }
-
- test("dals") {
-
- val rnd = RandomUtils.getRandom
-
- // Number of points
- val m = 500
- val n = 500
-
- // Length of actual spectrum
- val spectrumLen = 40
-
- // Create singular values with decay
- val spectrum = dvec((0 until spectrumLen).map(x => 300.0 * exp(-x) max 1e-3))
- printf("spectrum:%s\n", spectrum)
-
- // Create A as an ideal input
- val inCoreA = (qr(Matrices.symmetricUniformView(m, spectrumLen, 1234))._1 %*%: diagv(spectrum)) %*%
- qr(Matrices.symmetricUniformView(n, spectrumLen, 2345))._1.t
- val drmA = drmParallelize(inCoreA, numPartitions = 2)
-
- // Decompose using ALS
- val (drmU, drmV, rmse) = dals(drmA = drmA, k = 20).toTuple
- val inCoreU = drmU.collect
- val inCoreV = drmV.collect
-
- val predict = inCoreU %*% inCoreV.t
-
- printf("Control block:\n%s\n", inCoreA(0 until 3, 0 until 3))
- printf("ALS factorized approximation block:\n%s\n", predict(0 until 3, 0 until 3))
-
- val err = (inCoreA - predict).norm
- printf("norm of residuals %f\n", err)
- printf("train iteration rmses: %s\n", rmse)
-
- err should be < 15e-2
-
- }
-
-}
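
An aside on the orthogonality assertions at the top of this suite: checking Q[,j] dot Q[,j] == 1.0 and Q[,i] dot Q[,j] == 0.0 column by column is equivalent to the single identity test Q'Q == I. A minimal in-core sketch of that equivalent form, assuming only the scalabindings used throughout this diff (dense, qr, eye, RLikeOps); the object name is ours, for illustration:

import org.apache.mahout.math._
import org.apache.mahout.math.scalabindings._
import RLikeOps._

object OrthonormalityCheck {
  def main(args: Array[String]): Unit = {
    // Q from an in-core thin QR has orthonormal columns.
    val (q, _) = qr(dense((1, 2), (3, 4), (5, 6)))
    // The column-by-column assertions collapse into one identity check: Q'Q = I.
    assert(((q.t %*% q) - eye(q.ncol)).norm < 1e-10)
  }
}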

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/test/scala/org/apache/mahout/math/drm/DrmLikeOpsSuiteBase.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/test/scala/org/apache/mahout/math/drm/DrmLikeOpsSuiteBase.scala b/math-scala/src/test/scala/org/apache/mahout/math/drm/DrmLikeOpsSuiteBase.scala
deleted file mode 100644
index 525da11..0000000
--- a/math-scala/src/test/scala/org/apache/mahout/math/drm/DrmLikeOpsSuiteBase.scala
+++ /dev/null
@@ -1,153 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.drm
-
-import org.apache.mahout.test.DistributedMahoutSuite
-import org.scalatest.{FunSuite, Matchers}
-import org.apache.mahout.math._
-import scalabindings._
-import RLikeOps._
-import RLikeDrmOps._
-
-import scala.reflect.{ClassTag,classTag}
-
-/** Common tests for DrmLike operators to be executed by all distributed engines. */
-trait DrmLikeOpsSuiteBase extends DistributedMahoutSuite with Matchers {
- this: FunSuite ⇒
-
- test("mapBlock") {
-
- val inCoreA = dense((1, 2, 3), (2, 3, 4), (3, 4, 5), (4, 5, 6))
- val A = drmParallelize(m = inCoreA, numPartitions = 2)
- val B = A.mapBlock(/* Inherit width */) {
- case (keys, block) ⇒ keys → (block += 1.0)
- }
-
- val inCoreB = B.collect
- val inCoreBControl = inCoreA + 1.0
-
- println(inCoreB)
-
- // Assert they are the same
- (inCoreB - inCoreBControl).norm should be < 1E-10
- B.keyClassTag shouldBe ClassTag.Int
-
- }
-
- test ("mapBlock implicit keying") {
-
- val inCoreA = dense((1, 2, 3), (2, 3, 4), (3, 4, 5), (4, 5, 6))
- val A = drmParallelize(m = inCoreA, numPartitions = 2)
- val B = A.mapBlock(/* Inherit width */) {
- case (keys, block) ⇒ keys.map { k ⇒ k.toString } → block
- }
-
- B.keyClassTag shouldBe classTag[String]
-
- }
-
-
- test("allReduceBlock") {
-
- val mxA = dense((1, 2, 3), (2, 3, 4), (3, 4, 5), (4, 5, 6))
- val drmA = drmParallelize(mxA, numPartitions = 2)
-
- try {
- val mxB = drmA.allreduceBlock { case (keys, block) ⇒
- block(::, 0 until 2).t %*% block(::, 2 until 3)
- }
-
- val mxControl = mxA(::, 0 until 2).t %*% mxA(::, 2 until 3)
-
- (mxB - mxControl).norm should be < 1e-10
-
- } catch {
- case e: UnsupportedOperationException ⇒ // Some engines may not support this, so ignore.
- }
-
- }
-
- test("col range") {
- val inCoreA = dense((1, 2, 3), (2, 3, 4), (3, 4, 5), (4, 5, 6))
- val A = drmParallelize(m = inCoreA, numPartitions = 2)
- val B = A(::, 1 to 2)
- val inCoreB = B.collect
- val inCoreBControl = inCoreA(::, 1 to 2)
-
- println(inCoreB)
-
- // Assert they are the same
- (inCoreB - inCoreBControl).norm should be < 1E-10
-
- }
-
- test("row range") {
-
- val inCoreA = dense((1, 2, 3), (2, 3, 4), (3, 4, 5), (4, 5, 6))
- val A = drmParallelize(m = inCoreA, numPartitions = 2)
- val B = A(1 to 2, ::)
- val inCoreB = B.collect
- val inCoreBControl = inCoreA(1 to 2, ::)
-
- println(inCoreB)
-
- // Assert they are the same
- (inCoreB - inCoreBControl).norm should be < 1E-10
-
- }
-
- test("col, row range") {
-
- val inCoreA = dense((1, 2, 3), (2, 3, 4), (3, 4, 5), (4, 5, 6))
- val A = drmParallelize(m = inCoreA, numPartitions = 2)
- val B = A(1 to 2, 1 to 2)
- val inCoreB = B.collect
- val inCoreBControl = inCoreA(1 to 2, 1 to 2)
-
- println(inCoreB)
-
- // Assert they are the same
- (inCoreB - inCoreBControl).norm should be < 1E-10
-
- }
-
- test("dsqDist(X,Y)") {
- val m = 100
- val n = 300
- val d = 7
- val mxX = Matrices.symmetricUniformView(m, d, 12345).cloned -= 5
- val mxY = Matrices.symmetricUniformView(n, d, 1234).cloned += 10
- val (drmX, drmY) = (drmParallelize(mxX, 3), drmParallelize(mxY, 4))
-
- val mxDsq = dsqDist(drmX, drmY).collect
- val mxDsqControl = new DenseMatrix(m, n) := { (r, c, _) ⇒ (mxX(r, ::) - mxY(c, ::)) ^= 2 sum }
- (mxDsq - mxDsqControl).norm should be < 1e-7
- }
-
- test("dsqDist(X)") {
- val m = 100
- val d = 7
- val mxX = Matrices.symmetricUniformView(m, d, 12345).cloned -= 5
- val drmX = drmParallelize(mxX, 3)
-
- val mxDsq = dsqDist(drmX).collect
- val mxDsqControl = sqDist(drmX)
- (mxDsq - mxDsqControl).norm should be < 1e-7
- }
-
-}
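
A note on the dsqDist control formula above: d(r, c) = sum_k (x_rk - y_ck)^2 expands to ||x_r||^2 + ||y_c||^2 - 2 (x_r dot y_c), which turns the cross term into a single matrix product, the form a distributed implementation would typically prefer. A hedged in-core sketch of that identity (an illustration, not the dsqDist implementation itself; the object name is ours):

import org.apache.mahout.math._
import org.apache.mahout.math.scalabindings._
import RLikeOps._

object SqDistIdentity {
  def main(args: Array[String]): Unit = {
    val mxX = Matrices.symmetricUniformView(10, 4, 12345).cloned
    val mxY = Matrices.symmetricUniformView(15, 4, 54321).cloned

    // Direct definition, as in the control matrix above.
    val direct = new DenseMatrix(mxX.nrow, mxY.nrow) :=
      { (r, c, _) => ((mxX(r, ::) - mxY(c, ::)) ^= 2).sum }

    // Expanded form: ||x_r||^2 + ||y_c||^2 - 2 * (x_r dot y_c).
    val sqX = (mxX * mxX).rowSums()
    val sqY = (mxY * mxY).rowSums()
    val gram = mxX %*% mxY.t
    val expanded = new DenseMatrix(mxX.nrow, mxY.nrow) :=
      { (r, c, _) => sqX(r) + sqY(c) - 2 * gram(r, c) }

    assert((direct - expanded).norm < 1e-7)
  }
}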

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/test/scala/org/apache/mahout/math/drm/DrmLikeSuiteBase.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/test/scala/org/apache/mahout/math/drm/DrmLikeSuiteBase.scala b/math-scala/src/test/scala/org/apache/mahout/math/drm/DrmLikeSuiteBase.scala
deleted file mode 100644
index 41814d8..0000000
--- a/math-scala/src/test/scala/org/apache/mahout/math/drm/DrmLikeSuiteBase.scala
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.drm
-
-import org.apache.mahout.test.DistributedMahoutSuite
-import org.scalatest.{FunSuite, Matchers}
-import org.apache.mahout.math._
-import scalabindings._
-import RLikeOps._
-import scala.reflect.ClassTag
-
-/** Common DRM tests to be run by all distributed engines. */
-trait DrmLikeSuiteBase extends DistributedMahoutSuite with Matchers {
- this: FunSuite =>
-
- test("DRM DFS i/o (local)") {
-
- val uploadPath = TmpDir + "UploadedDRM"
-
- val inCoreA = dense((1, 2, 3), (3, 4, 5))
- val drmA = drmParallelize(inCoreA)
-
- drmA.dfsWrite(path = uploadPath)
-
- println(inCoreA)
-
- // Load back from hdfs
- val drmB = drmDfsRead(path = uploadPath)
-
- // Make sure keys are correctly identified as ints
- drmB.checkpoint(CacheHint.NONE).keyClassTag shouldBe ClassTag.Int
-
- // Collect back into in-core
- val inCoreB = drmB.collect
-
- // Print out to see what it is we collected:
- println(inCoreB)
-
- (inCoreA - inCoreB).norm should be < 1e-7
- }
-
- test("DRM parallelizeEmpty") {
-
- val drmEmpty = drmParallelizeEmpty(100, 50)
-
- // collect back into in-core
- val inCoreEmpty = drmEmpty.collect
-
- inCoreEmpty.sum.abs should be < 1e-7
- drmEmpty.nrow shouldBe 100
- drmEmpty.ncol shouldBe 50
- inCoreEmpty.nrow shouldBe 100
- inCoreEmpty.ncol shouldBe 50
-
- }
-
-
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/test/scala/org/apache/mahout/math/drm/RLikeDrmOpsSuiteBase.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/test/scala/org/apache/mahout/math/drm/RLikeDrmOpsSuiteBase.scala b/math-scala/src/test/scala/org/apache/mahout/math/drm/RLikeDrmOpsSuiteBase.scala
deleted file mode 100644
index 5d6d142..0000000
--- a/math-scala/src/test/scala/org/apache/mahout/math/drm/RLikeDrmOpsSuiteBase.scala
+++ /dev/null
@@ -1,655 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.drm
-
-import org.apache.mahout.test.DistributedMahoutSuite
-import org.scalatest.{FunSuite, Matchers}
-import org.apache.mahout.math._
-import scalabindings._
-import RLikeOps._
-import RLikeDrmOps._
-import decompositions._
-import org.apache.mahout.math.drm.logical._
-import org.apache.mahout.math.drm.logical.OpAtx
-import org.apache.mahout.math.drm.logical.OpAtB
-import org.apache.mahout.math.drm.logical.OpAtA
-import org.apache.mahout.math.drm.logical.OpAewUnaryFuncFusion
-
-import scala.util.Random
-
-/** Common engine tests for distributed R-like DRM operations */
-trait RLikeDrmOpsSuiteBase extends DistributedMahoutSuite with Matchers {
- this: FunSuite =>
-
- val epsilon = 1E-5
-
- test("A.t") {
-
- val inCoreA = dense((1, 2, 3), (3, 4, 5))
-
- val A = drmParallelize(inCoreA)
-
- val inCoreAt = A.t.collect
-
- // Assert first norm of difference is less than error margin.
- (inCoreAt - inCoreA.t).norm should be < epsilon
-
- }
-
- test("C = A %*% B") {
-
- val inCoreA = dense((1, 2), (3, 4))
- val inCoreB = dense((3, 5), (4, 6))
-
- val A = drmParallelize(inCoreA, numPartitions = 2)
- val B = drmParallelize(inCoreB, numPartitions = 2)
-
- // Actual
- val inCoreCControl = inCoreA %*% inCoreB
-
- // Distributed operation
- val C = A %*% B
- val inCoreC = C.collect
- println(inCoreC)
-
- (inCoreC - inCoreCControl).norm should be < 1E-10
-
- // We also should be able to collect via implicit checkpoint
- val inCoreC2 = C.collect
- println(inCoreC2)
-
- (inCoreC2 - inCoreCControl).norm should be < 1E-10
-
- }
-
- test("C = A %*% B mapBlock {}") {
-
- val inCoreA = dense((1, 2), (3, 4))
- val inCoreB = dense((3, 5), (4, 6))
-
- val A = drmParallelize(inCoreA, numPartitions = 2).checkpoint()
- val B = drmParallelize(inCoreB, numPartitions = 2).checkpoint()
-
- // Actual
- val inCoreCControl = inCoreA %*% inCoreB
-
- A.colSums()
- B.colSums()
-
-
- val x = drmBroadcast(dvec(0, 0))
- val x2 = drmBroadcast(dvec(0, 0))
- // Distributed operation
- val C = (B.t %*% A.t).t.mapBlock() {
- case (keys, block) =>
- for (row <- 0 until block.nrow) block(row, ::) += x.value + x2
- keys -> block
- }
-
- val inCoreC = C checkpoint CacheHint.NONE collect;
- println(inCoreC)
-
- (inCoreC - inCoreCControl).norm should be < 1E-10
-
- // We also should be able to collect via implicit checkpoint
- val inCoreC2 = C.collect
- println(inCoreC2)
-
- (inCoreC2 - inCoreCControl).norm should be < 1E-10
-
- val inCoreQ = dqrThin(C)._1.collect
-
- printf("Q=\n%s\n", inCoreQ)
-
- // Assert unit-orthogonality
- ((inCoreQ(::, 0) dot inCoreQ(::, 0)) - 1.0).abs should be < 1e-10
- (inCoreQ(::, 0) dot inCoreQ(::, 1)).abs should be < 1e-10
-
- }
-
- test("C = A %*% B incompatible B keys") {
-
- val inCoreA = dense((1, 2), (3, 4))
- val inCoreB = dense((3, 5), (4, 6))
-
- val A = drmParallelize(inCoreA, numPartitions = 2)
- val B = drmParallelize(inCoreB, numPartitions = 2)
- // Re-key B into DrmLike[String] instead of [Int]
- .mapBlock()({
- case (keys, block) => keys.map(_.toString) -> block
- })
-
- val C = A %*% B
-
- intercept[IllegalArgumentException] {
- // This plan must not compile
- C.checkpoint()
- }
- }
-
- test("Spark-specific C = At %*% B , join") {
-
- val inCoreA = dense((1, 2), (3, 4), (-3, -5))
- val inCoreB = dense((3, 5), (4, 6), (0, 1))
-
- val A = drmParallelize(inCoreA, numPartitions = 2)
- val B = drmParallelize(inCoreB, numPartitions = 2)
-
- val C = A.t %*% B
-
- mahoutCtx.optimizerRewrite(C) should equal(OpAtB[Int](A, B))
-
- val inCoreC = C.collect
- val inCoreControlC = inCoreA.t %*% inCoreB
-
- (inCoreC - inCoreControlC).norm should be < 1E-10
-
- }
-
-
- test("C = At %*% B , join, String-keyed") {
-
- val inCoreA = dense((1, 2), (3, 4), (-3, -5))
- val inCoreB = dense((3, 5), (4, 6), (0, 1))
-
- val A = drmParallelize(inCoreA, numPartitions = 2)
- .mapBlock()({
- case (keys, block) => keys.map(_.toString) -> block
- })
-
- val B = drmParallelize(inCoreB, numPartitions = 2)
- .mapBlock()({
- case (keys, block) => keys.map(_.toString) -> block
- })
-
- val C = A.t %*% B
-
- mahoutCtx.optimizerRewrite(C) should equal(OpAtB[String](A, B))
-
- val inCoreC = C.collect
- val inCoreControlC = inCoreA.t %*% inCoreB
-
- (inCoreC - inCoreControlC).norm should be < 1E-10
-
- }
-
- test("C = At %*% B , zippable, String-keyed") {
-
- val inCoreA = dense((1, 2), (3, 4), (-3, -5))
-
- val A = drmParallelize(inCoreA, numPartitions = 2)
- .mapBlock()({
- case (keys, block) ⇒ keys.map(_.toString) → block
- })
-
- // Dense-A' x sparse-B used to produce error. We sparsify B here to test this as well.
- val B = (A + 1.0).mapBlock() { case (keys, block) ⇒
- keys → (new SparseRowMatrix(block.nrow, block.ncol) := block)
- }
-
- val C = A.t %*% B
-
- mahoutCtx.optimizerRewrite(C) should equal(OpAtB[String](A, B))
-
- val inCoreC = C.collect
- val inCoreControlC = inCoreA.t %*% (inCoreA + 1.0)
-
- (inCoreC - inCoreControlC).norm should be < 1E-10
-
- }
-
- test ("C = A %*% B.t") {
-
- val inCoreA = dense((1, 2), (3, 4), (-3, -5))
-
- val A = drmParallelize(inCoreA, numPartitions = 2)
-
- val B = A + 1.0
-
- val C = A %*% B.t
-
- mahoutCtx.optimizerRewrite(C) should equal(OpABt[Int](A, B))
-
- val inCoreC = C.collect
- val inCoreControlC = inCoreA %*% (inCoreA + 1.0).t
-
- (inCoreC - inCoreControlC).norm should be < 1E-10
-
- }
-
- test("C = A %*% inCoreB") {
-
- val inCoreA = dense((1, 2, 3), (3, 4, 5), (4, 5, 6), (5, 6, 7))
- val inCoreB = dense((3, 5, 7, 10), (4, 6, 9, 10), (5, 6, 7, 7))
-
- val A = drmParallelize(inCoreA, numPartitions = 2)
- val C = A %*% inCoreB
-
- val inCoreC = C.collect
- val inCoreCControl = inCoreA %*% inCoreB
-
- println(inCoreC)
- (inCoreC - inCoreCControl).norm should be < 1E-10
-
- }
-
- test("C = inCoreA %*%: B") {
-
- val inCoreA = dense((1, 2, 3), (3, 4, 5), (4, 5, 6), (5, 6, 7))
- val inCoreB = dense((3, 5, 7, 10), (4, 6, 9, 10), (5, 6, 7, 7))
-
- val B = drmParallelize(inCoreB, numPartitions = 2)
- val C = inCoreA %*%: B
-
- val inCoreC = C.collect
- val inCoreCControl = inCoreA %*% inCoreB
-
- println(inCoreC)
- (inCoreC - inCoreCControl).norm should be < 1E-10
-
- }
-
- test("C = A.t %*% A") {
- val inCoreA = dense((1, 2, 3), (3, 4, 5), (4, 5, 6), (5, 6, 7))
- val A = drmParallelize(m = inCoreA, numPartitions = 2)
-
- val AtA = A.t %*% A
-
- // Assert optimizer detects square
- mahoutCtx.optimizerRewrite(action = AtA) should equal(OpAtA(A))
-
- val inCoreAtA = AtA.collect
- val inCoreAtAControl = inCoreA.t %*% inCoreA
-
- (inCoreAtA - inCoreAtAControl).norm should be < 1E-10
- }
-
- test("C = A.t %*% A fat non-graph") {
- // Hack the max in-mem size for this test
- System.setProperty("mahout.math.AtA.maxInMemNCol", "540")
-
- val inCoreA = Matrices.uniformView(400, 550, 1234)
- val A = drmParallelize(m = inCoreA, numPartitions = 2)
-
- val AtA = A.t %*% A
-
- // Assert optimizer detects square
- mahoutCtx.optimizerRewrite(action = AtA) should equal(OpAtA(A))
-
- val inCoreAtA = AtA.collect
- val inCoreAtAControl = inCoreA.t %*% inCoreA
-
- (inCoreAtA - inCoreAtAControl).norm should be < 1E-10
- }
-
- test("C = A.t %*% A non-int key") {
- val inCoreA = dense((1, 2, 3), (3, 4, 5), (4, 5, 6), (5, 6, 7))
- val AintKeyd = drmParallelize(m = inCoreA, numPartitions = 2)
- val A = AintKeyd.mapBlock() {
- case (keys, block) => keys.map(_.toString) -> block
- }
-
- val AtA = A.t %*% A
-
- // Assert optimizer detects square
- mahoutCtx.optimizerRewrite(action = AtA) should equal(OpAtA(A))
-
- val inCoreAtA = AtA.collect
- val inCoreAtAControl = inCoreA.t %*% inCoreA
-
- (inCoreAtA - inCoreAtAControl).norm should be < 1E-10
- }
-
- test("C = A + B") {
-
- val inCoreA = dense((1, 2), (3, 4))
- val inCoreB = dense((3, 5), (4, 6))
-
- val A = drmParallelize(inCoreA, numPartitions = 2)
- val B = drmParallelize(inCoreB, numPartitions = 2)
-
- val C = A + B
- val inCoreC = C.collect
-
- // Actual
- val inCoreCControl = inCoreA + inCoreB
-
- (inCoreC - inCoreCControl).norm should be < 1E-10
- }
-
- test("C = A + B, identically partitioned") {
-
- val inCoreA = dense((1, 2, 3), (3, 4, 5), (5, 6, 7))
-
- val A = drmParallelize(inCoreA, numPartitions = 2)
-
-// printf("A.nrow=%d.\n", A.rdd.count())
-
- // Create B which would be identically partitioned to A. mapBlock() by default will do the trick.
- val B = A.mapBlock() {
- case (keys, block) =>
- val bBlock = block.like() := { (r, c, v) => util.Random.nextDouble()}
- keys -> bBlock
- }
- // Prevent non-determinism from repeated computation:
- // removing this checkpoint() will cause the same error in Spark tests
- // as we're seeing in Flink with this test, i.e. util.Random.nextDouble()
- // is called more than once (note that it is not seeded in the closure).
- .checkpoint()
-
- val inCoreB = B.collect
-
- printf("A=\n%s\n", inCoreA)
- printf("B=\n%s\n", inCoreB)
-
- val C = A + B
-
- val inCoreC = C.collect
-
- printf("C=\n%s\n", inCoreC)
-
- // Actual
- val inCoreCControl = inCoreA + inCoreB
-
- (inCoreC - inCoreCControl).norm should be < 1E-10
- }
-
-
- test("C = A + B side test 1") {
-
- val inCoreA = dense((1, 2), (3, 4))
- val inCoreB = dense((3, 5), (4, 6))
-
- val A = drmParallelize(inCoreA, numPartitions = 2)
- val B = drmParallelize(inCoreB, numPartitions = 2)
-
- val C = A + B
- val inCoreC = C.collect
-
- val inCoreD = (A + B).collect
-
- // Actual
- val inCoreCControl = inCoreA + inCoreB
-
- (inCoreC - inCoreCControl).norm should be < 1E-10
- (inCoreD - inCoreCControl).norm should be < 1E-10
- }
-
- test("C = A + B side test 2") {
-
- val inCoreA = dense((1, 2), (3, 4))
- val inCoreB = dense((3, 5), (4, 6))
-
- val A = drmParallelize(inCoreA, numPartitions = 2).checkpoint()
- val B = drmParallelize(inCoreB, numPartitions = 2)
-
- val C = A + B
- val inCoreC = C.collect
-
- val inCoreD = (A + B).collect
-
- // Actual
- val inCoreCControl = inCoreA + inCoreB
-
- (inCoreC - inCoreCControl).norm should be < 1E-10
- (inCoreD - inCoreCControl).norm should be < 1E-10
- }
-
- test("C = A + B side test 3") {
-
- val inCoreA = dense((1, 2), (3, 4))
- val inCoreB = dense((3, 5), (4, 6))
-
- val B = drmParallelize(inCoreB, numPartitions = 2)
- // val A = (drmParallelize(inCoreA, numPartitions = 2) + B).checkpoint(CacheHint.MEMORY_ONLY_SER)
- val A = (drmParallelize(inCoreA, numPartitions = 2) + B).checkpoint(CacheHint.MEMORY_ONLY)
-
- val C = A + B
- val inCoreC = C.collect
-
- val inCoreD = (A + B).collect
-
- // Actual
- val inCoreCControl = inCoreA + inCoreB * 2.0
-
- (inCoreC - inCoreCControl).norm should be < 1E-10
- (inCoreD - inCoreCControl).norm should be < 1E-10
- }
-
- test("Ax") {
- val inCoreA = dense(
- (1, 2),
- (3, 4),
- (20, 30)
- )
- val x = dvec(10, 3)
-
- val drmA = drmParallelize(inCoreA, numPartitions = 2)
-
- val ax = (drmA %*% x).collect(::, 0)
-
- ax should equal(inCoreA %*% x)
- }
-
- test("A'x") {
- val inCoreA = dense(
- (1, 2),
- (3, 4),
- (20, 30)
- )
- val x = dvec(10, 3, 4)
-
- val drmA = drmParallelize(inCoreA, numPartitions = 2)
-
- mahoutCtx.optimizerRewrite(drmA.t %*% x) should equal(OpAtx(drmA, x))
-
- val atx = (drmA.t %*% x).collect(::, 0)
-
- atx should equal(inCoreA.t %*% x)
- }
-
- test("colSums, colMeans") {
- val inCoreA = dense(
- (1, 2),
- (3, 4),
- (20, 30)
- )
- val drmA = drmParallelize(inCoreA, numPartitions = 2)
-
- drmA.colSums() should equal(inCoreA.colSums())
- drmA.colMeans() should equal(inCoreA.colMeans())
- }
-
- test("rowSums, rowMeans") {
- val inCoreA = dense(
- (1, 2),
- (3, 4),
- (20, 30)
- )
- val drmA = drmParallelize(inCoreA, numPartitions = 2)
-
- drmA.rowSums() should equal(inCoreA.rowSums())
- drmA.rowMeans() should equal(inCoreA.rowMeans())
- }
-
- test("A.diagv") {
- val inCoreA = dense(
- (1, 2, 3),
- (3, 4, 5),
- (20, 30, 7)
- )
- val drmA = drmParallelize(inCoreA, numPartitions = 2)
-
- drmA.diagv should equal(inCoreA.diagv)
- }
-
- test("numNonZeroElementsPerColumn") {
- val inCoreA = dense(
- (0, 2),
- (3, 0),
- (0, -30)
-
- )
- val drmA = drmParallelize(inCoreA, numPartitions = 2)
-
- drmA.numNonZeroElementsPerColumn() should equal(inCoreA.numNonZeroElementsPerColumn())
- }
-
- test("C = A cbind B, cogroup") {
-
- val inCoreA = dense((1, 2), (3, 4))
- val inCoreB = dense((3, 5), (4, 6))
- val controlC = dense((1, 2, 3, 5), (3, 4, 4, 6))
-
- val A = drmParallelize(inCoreA, numPartitions = 2).checkpoint()
- val B = drmParallelize(inCoreB, numPartitions = 2).checkpoint()
-
- (A.cbind(B) -: controlC).norm should be < 1e-10
-
- }
-
- test("C = A cbind B, zip") {
-
- val inCoreA = dense((1, 2), (3, 4))
- val controlC = dense((1, 2, 2, 3), (3, 4, 4, 5))
-
- val A = drmParallelize(inCoreA, numPartitions = 2).checkpoint()
-
- (A.cbind(A + 1.0) -: controlC).norm should be < 1e-10
-
- }
-
- test("B = 1 cbind A") {
- val inCoreA = dense((1, 2), (3, 4))
- val control = dense((1, 1, 2), (1, 3, 4))
-
- val drmA = drmParallelize(inCoreA, numPartitions = 2)
-
- (control - (1 cbind drmA) ).norm should be < 1e-10
- }
-
- test("B = A cbind 1") {
- val inCoreA = dense((1, 2), (3, 4))
- val control = dense((1, 2, 1), (3, 4, 1))
-
- val drmA = drmParallelize(inCoreA, numPartitions = 2)
-
- (control - (drmA cbind 1) ).norm should be < 1e-10
- }
-
- test("B = A + 1.0") {
- val inCoreA = dense((1, 2), (2, 3), (3, 4))
- val controlB = inCoreA + 1.0
-
- val drmB = drmParallelize(m = inCoreA, numPartitions = 2) + 1.0
-
- (drmB -: controlB).norm should be < 1e-10
- }
-
- test("C = A rbind B") {
-
- val inCoreA = dense((1, 2), (3, 5))
- val inCoreB = dense((7, 11), (13, 17))
- val controlC = dense((1, 2), (3, 5), (7, 11), (13, 17))
-
- val A = drmParallelize(inCoreA, numPartitions = 2).checkpoint()
- val B = drmParallelize(inCoreB, numPartitions = 2).checkpoint()
-
- (A.rbind(B) -: controlC).norm should be < 1e-10
- }
-
- test("C = A rbind B, with empty") {
-
- val inCoreA = dense((1, 2), (3, 5))
- val emptyB = drmParallelizeEmpty(nrow = 2, ncol = 2, numPartitions = 2)
- val controlC = dense((1, 2), (3, 5), (0, 0), (0, 0))
-
- val A = drmParallelize(inCoreA, numPartitions = 2).checkpoint()
-
- (A.rbind(emptyB) -: controlC).norm should be < 1e-10
- }
-
- /** Test DSL overloads of scalar operations on matrices */
- test("scalarOps") {
- val drmA = drmParallelize(m = dense(
- (1, 2, 3),
- (3, 4, 5),
- (7, 8, 9)
- ),
- numPartitions = 2)
-
- (10 * drmA - (10 *: drmA)).norm shouldBe 0
-
- }
-
- test("A * A -> sqr(A) rewrite ") {
- val mxA = dense(
- (1, 2, 3),
- (3, 4, 5),
- (7, 8, 9)
- )
-
- val mxAAControl = mxA * mxA
-
- val drmA = drmParallelize(mxA, 2)
- val drmAA = drmA * drmA
-
- val optimized = drmAA.context.engine.optimizerRewrite(drmAA)
- println(s"optimized:$optimized")
- optimized.isInstanceOf[OpAewUnaryFunc[Int]] shouldBe true
-
- (mxAAControl -= drmAA).norm should be < 1e-10
- }
-
- test("B = 1 + 2 * (A * A) ew unary function fusion") {
- val mxA = dense(
- (1, 2, 3),
- (3, 0, 5)
- )
- val controlB = mxA.cloned := { (x) => 1 + 2 * x * x}
-
- val drmA = drmParallelize(mxA, 2)
-
- // We need to use parentheses, otherwise the optimizer will see it as (2A) * (A), and that would not
- // be rewritten as 2 * sqr(A). It is not (yet) clever enough to try commutativity optimizations.
- val drmB = 1 + 2 * (drmA * drmA)
-
- val optimized = mahoutCtx.engine.optimizerRewrite(drmB)
- println(s"optimizer rewritten:$optimized")
- optimized.isInstanceOf[OpAewUnaryFuncFusion[Int]] shouldBe true
-
- (controlB - drmB).norm should be < 1e-10
-
- }
-
- test("functional apply()") {
- val mxA = sparse (
- (1 -> 3) :: (7 -> 7) :: Nil,
- (4 -> 5) :: (5 -> 8) :: Nil
- )
-
- val mxAControl = mxA cloned
- val drmA = drmParallelize(mxA)
-
- (drmA(x => x + 1).collect - (mxAControl + 1)).norm should be < 1e-7
- (drmA(x => x * 2).collect - (2 * mxAControl)).norm should be < 1e-7
-
- }
-
-
-}
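
The "identically partitioned" test above depends on a contract that is easy to miss: a mapBlock() whose closure is non-deterministic must be checkpointed, otherwise every action re-runs the closure and observes different random values. A minimal engine-agnostic sketch of the pattern, using only the DRM API appearing in this diff (the object and method names are ours):

import org.apache.mahout.math.drm._
import org.apache.mahout.math.scalabindings._
import RLikeOps._
import RLikeDrmOps._

object CheckpointPinning {
  // Pins a randomly-filled DRM so that repeated actions see one consistent result.
  // Without the trailing checkpoint(), each action could re-run the unseeded closure
  // and observe different random values.
  def pinnedRandomLike(drmA: DrmLike[Int]): DrmLike[Int] =
    drmA.mapBlock() { case (keys, block) =>
      val b = block.like() := { (r, c, v) => util.Random.nextDouble() }
      keys -> b
    }.checkpoint()
}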

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/MahoutCollectionsSuite.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/MahoutCollectionsSuite.scala b/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/MahoutCollectionsSuite.scala
deleted file mode 100644
index cf62eea..0000000
--- a/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/MahoutCollectionsSuite.scala
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.scalabindings
-
-import org.apache.mahout.math.Vector
-import org.apache.mahout.test.MahoutSuite
-import org.scalatest.FunSuite
-import org.apache.mahout.math.scalabindings.MahoutCollections._
-import org.apache.mahout.math._
-import org.apache.mahout.math.scalabindings.RLikeOps._
-
-class MahoutCollectionsSuite extends FunSuite with MahoutSuite {
- test("toArray") {
- val a = Array(1.0, 2.0, 3.0)
- val v: Vector = new org.apache.mahout.math.DenseVector(a)
-
- v.toArray.deep shouldBe a.deep
-
- }
-
- test("toMap") {
- val m = Map( (1 -> 1.0), (3 -> 3.0))
- val sv = svec(m)
-
- sv.toMap shouldBe m
- }
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/cf/SimilarityAnalysis.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/cf/SimilarityAnalysis.scala b/math-scala/src/main/scala/org/apache/mahout/math/cf/SimilarityAnalysis.scala
deleted file mode 100644
index f69bf81..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/cf/SimilarityAnalysis.scala
+++ /dev/null
@@ -1,453 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.cf
-
-import org.apache.mahout.math._
-import org.apache.mahout.math.indexeddataset.IndexedDataset
-import scalabindings._
-import RLikeOps._
-import drm._
-import RLikeDrmOps._
-import scala.collection.JavaConversions._
-import org.apache.mahout.math.stats.LogLikelihood
-import collection._
-import org.apache.mahout.math.function.{VectorFunction, Functions}
-
-import scala.util.Random
-
-
-/**
- * Based on "Ted Dunnning & Ellen Friedman: Practical Machine Learning, Innovations in Recommendation",
- * available at http://www.mapr.com/practical-machine-learning
- *
- * see also "Sebastian Schelter, Christoph Boden, Volker Markl:
- * Scalable Similarity-Based Neighborhood Methods with MapReduce
- * ACM Conference on Recommender Systems 2012"
- */
-object SimilarityAnalysis extends Serializable {
-
- /** Compares (Int,Double) pairs by the second value */
- private val orderByScore = Ordering.fromLessThan[(Int, Double)] { case ((_, score1), (_, score2)) => score1 > score2}
-
- lazy val defaultParOpts = ParOpts()
-
- /**
- * Calculates item (column-wise) similarity using the log-likelihood ratio on A'A, A'B, A'C, ...
- * and returns a list of similarity and cross-similarity matrices
- *
- * @param drmARaw Primary interaction matrix
- * @param randomSeed when kept constant, makes downsampling repeatable
- * @param maxInterestingItemsPerThing number of similar items to return per item, default: 50
- * @param maxNumInteractions max number of interactions after downsampling, default: 500
- * @param parOpts partitioning params for drm.par(...)
- * @return a list of [[org.apache.mahout.math.drm.DrmLike]] containing downsampled DRMs for cooccurrence and
- * cross-cooccurrence
- */
- def cooccurrences(
- drmARaw: DrmLike[Int],
- randomSeed: Int = 0xdeadbeef,
- maxInterestingItemsPerThing: Int = 50,
- maxNumInteractions: Int = 500,
- drmBs: Array[DrmLike[Int]] = Array(),
- parOpts: ParOpts = defaultParOpts)
- : List[DrmLike[Int]] = {
-
- implicit val distributedContext = drmARaw.context
-
- // backend partitioning defaults to 'auto', which is often better decided by the calling function
- // todo: this should ideally be different per drm
- drmARaw.par( min = parOpts.minPar, exact = parOpts.exactPar, auto = parOpts.autoPar)
-
- // Apply selective downsampling, pin resulting matrix
- val drmA = sampleDownAndBinarize(drmARaw, randomSeed, maxNumInteractions)
-
- // num users, which equals the maximum number of interactions per item
- val numUsers = drmA.nrow.toInt
-
- // Compute & broadcast the number of interactions per thing in A
- val bcastInteractionsPerItemA = drmBroadcast(drmA.numNonZeroElementsPerColumn)
-
- // Compute cooccurrence matrix A'A
- val drmAtA = drmA.t %*% drmA
-
- // Compute loglikelihood scores and sparsify the resulting matrix to get the similarity matrix
- val drmSimilarityAtA = computeSimilarities(drmAtA, numUsers, maxInterestingItemsPerThing,
- bcastInteractionsPerItemA, bcastInteractionsPerItemA, crossCooccurrence = false)
-
- var similarityMatrices = List(drmSimilarityAtA)
-
- // Now look at cross cooccurrences
- for (drmBRaw <- drmBs) {
- // backend partitioning defaults to 'auto', which is often better decided by the calling function
- // todo: this should ideally be different per drm
- drmARaw.par( min = parOpts.minPar, exact = parOpts.exactPar, auto = parOpts.autoPar)
-
- // Down-sample and pin other interaction matrix
- val drmB = sampleDownAndBinarize(drmBRaw, randomSeed, maxNumInteractions).checkpoint()
-
- // Compute & broadcast the number of interactions per thing in B
- val bcastInteractionsPerThingB = drmBroadcast(drmB.numNonZeroElementsPerColumn)
-
- // Compute cross-cooccurrence matrix A'B
- val drmAtB = drmA.t %*% drmB
-
- val drmSimilarityAtB = computeSimilarities(drmAtB, numUsers, maxInterestingItemsPerThing,
- bcastInteractionsPerItemA, bcastInteractionsPerThingB)
-
- similarityMatrices = similarityMatrices :+ drmSimilarityAtB
-
- drmB.uncache()
- }
-
- // Unpin downsampled interaction matrix
- drmA.uncache()
-
- // Return list of similarity matrices
- similarityMatrices
- }
-
- /**
- * Calculates item (column-wise) similarity using the log-likelihood ratio on A'A, A'B, A'C, ... and returns
- * a list of similarity and cross-similarity matrices. A somewhat easier-to-use method, which handles the ID
- * dictionaries correctly.
- *
- * @param indexedDatasets first in the array is the primary/A matrix; all others are treated as secondary
- * @param randomSeed use default to make repeatable, otherwise pass in system time or some randomizing seed
- * @param maxInterestingItemsPerThing max similarities per items
- * @param maxNumInteractions max number of input items per item
- * @param parOpts partitioning params for drm.par(...)
- * @return a list of [[org.apache.mahout.math.indexeddataset.IndexedDataset]] containing downsampled
- * IndexedDatasets for cooccurrence and cross-cooccurrence
- */
- def cooccurrencesIDSs(
- indexedDatasets: Array[IndexedDataset],
- randomSeed: Int = 0xdeadbeef,
- maxInterestingItemsPerThing: Int = 50,
- maxNumInteractions: Int = 500,
- parOpts: ParOpts = defaultParOpts):
- List[IndexedDataset] = {
- val drms = indexedDatasets.map(_.matrix.asInstanceOf[DrmLike[Int]])
- val primaryDrm = drms(0)
- val secondaryDrms = drms.drop(1)
- val coocMatrices = cooccurrences(primaryDrm, randomSeed, maxInterestingItemsPerThing,
- maxNumInteractions, secondaryDrms, parOpts)
- val retIDSs = coocMatrices.iterator.zipWithIndex.map {
- case( drm, i ) =>
- indexedDatasets(0).create(drm, indexedDatasets(0).columnIDs, indexedDatasets(i).columnIDs)
- }
- retIDSs.toList
- }
-
- /**
- * Calculates item (column-wise) similarity using the log-likelihood ratio on A'A, A'B, A'C, ... and returns
- * a list of similarity and cross-occurrence matrices. A somewhat easier-to-use method, which handles the ID
- * dictionaries correctly and carries info about downsampling in each model calculation.
- *
- * @param datasets first in the array is the primary/A matrix; all others are treated as secondary. Includes
- * information used to downsample the input drm as well as the output llr(A'A), llr(A'B). The information
- * is contained in each dataset in the array and applies to the model calculation of A' with
- * the dataset. Todo: ignoring absolute threshold for now.
- * @param randomSeed use default to make repeatable, otherwise pass in system time or some randomizing seed
- * @param parOpts partitioning params for drm.par(...)
- * @return a list of [[org.apache.mahout.math.indexeddataset.IndexedDataset]] containing downsampled
- * IndexedDatasets for cooccurrence and cross-cooccurrence
- */
- def crossOccurrenceDownsampled(
- datasets: List[DownsamplableCrossOccurrenceDataset],
- randomSeed: Int = 0xdeadbeef):
- List[IndexedDataset] = {
-
-
- val crossDatasets = datasets.drop(1) // drop A
- val primaryDataset = datasets.head // use A throughout
- val drmARaw = primaryDataset.iD.matrix
-
- implicit val distributedContext = primaryDataset.iD.matrix.context
-
- // backend partitioning defaults to 'auto', which is often better decided by the calling function
- val parOptsA = primaryDataset.parOpts.getOrElse(defaultParOpts)
- drmARaw.par( min = parOptsA.minPar, exact = parOptsA.exactPar, auto = parOptsA.autoPar)
-
- // Apply selective downsampling, pin resulting matrix
- val drmA = sampleDownAndBinarize(drmARaw, randomSeed, primaryDataset.maxElementsPerRow)
-
- // num users, which equals the maximum number of interactions per item
- val numUsers = drmA.nrow.toInt
-
- // Compute & broadcast the number of interactions per thing in A
- val bcastInteractionsPerItemA = drmBroadcast(drmA.numNonZeroElementsPerColumn)
-
- // Compute cooccurrence matrix A'A
- val drmAtA = drmA.t %*% drmA
-
- // Compute loglikelihood scores and sparsify the resulting matrix to get the similarity matrix
- val drmSimilarityAtA = computeSimilarities(drmAtA, numUsers, primaryDataset.maxInterestingElements,
- bcastInteractionsPerItemA, bcastInteractionsPerItemA, crossCooccurrence = false,
- minLLROpt = primaryDataset.minLLROpt)
-
- var similarityMatrices = List(drmSimilarityAtA)
-
- // Now look at cross cooccurrences
- for (dataset <- crossDatasets) {
- // backend partitioning defaults to 'auto', which is often better decided by the calling function
- val parOptsB = dataset.parOpts.getOrElse(defaultParOpts)
- dataset.iD.matrix.par(min = parOptsB.minPar, exact = parOptsB.exactPar, auto = parOptsB.autoPar)
-
- // Downsample and pin other interaction matrix
- val drmB = sampleDownAndBinarize(dataset.iD.matrix, randomSeed, dataset.maxElementsPerRow).checkpoint()
-
- // Compute & broadcast the number of interactions per thing in B
- val bcastInteractionsPerThingB = drmBroadcast(drmB.numNonZeroElementsPerColumn)
-
- // Compute cross-cooccurrence matrix A'B
- val drmAtB = drmA.t %*% drmB
-
- val drmSimilarityAtB = computeSimilarities(drmAtB, numUsers, dataset.maxInterestingElements,
- bcastInteractionsPerItemA, bcastInteractionsPerThingB, minLLROpt = dataset.minLLROpt)
-
- similarityMatrices = similarityMatrices :+ drmSimilarityAtB
-
- drmB.uncache()
- }
-
- // Unpin downsampled interaction matrix
- drmA.uncache()
-
- // Return list of datasets
- val retIDSs = similarityMatrices.iterator.zipWithIndex.map {
- case( drm, i ) =>
- datasets(0).iD.create(drm, datasets(0).iD.columnIDs, datasets(i).iD.columnIDs)
- }
- retIDSs.toList
-
- }
-
- /**
- * Calculates row-wise similarity using the log-likelihood ratio on AA' and returns a DRM of rows and similar rows
- *
- * @param drmARaw Primary interaction matrix
- * @param randomSeed when kept constant, makes downsampling repeatable
- * @param maxInterestingSimilaritiesPerRow number of similar rows to return per row, default: 50
- * @param maxNumInteractions max number of interactions after downsampling, default: 500
- * @param parOpts partitioning options used for drm.par(...)
- */
- def rowSimilarity(
- drmARaw: DrmLike[Int],
- randomSeed: Int = 0xdeadbeef,
- maxInterestingSimilaritiesPerRow: Int = 50,
- maxNumInteractions: Int = 500,
- parOpts: ParOpts = defaultParOpts): DrmLike[Int] = {
-
- implicit val distributedContext = drmARaw.context
-
- // backend partitioning defaults to 'auto', which is often better decided by the calling function
- // todo: should this ideally be different per drm?
- drmARaw.par(min = parOpts.minPar, exact = parOpts.exactPar, auto = parOpts.autoPar)
-
- // Apply selective downsampling, pin resulting matrix
- val drmA = sampleDownAndBinarize(drmARaw, randomSeed, maxNumInteractions)
-
- // num columns, which equals the maximum number of interactions per row
- val numCols = drmA.ncol
-
- // Compute & broadcast the number of interactions per row in A
- val bcastInteractionsPerItemA = drmBroadcast(drmA.numNonZeroElementsPerRow)
-
- // Compute row similarity cooccurrence matrix AA'
- val drmAAt = drmA %*% drmA.t
-
- // Compute loglikelihood scores and sparsify the resulting matrix to get the similarity matrix
- val drmSimilaritiesAAt = computeSimilarities(drmAAt, numCols, maxInterestingSimilaritiesPerRow,
- bcastInteractionsPerItemA, bcastInteractionsPerItemA, crossCooccurrence = false)
-
- drmSimilaritiesAAt
- }
-
- /**
- * Calculates row-wise similarity using the log-likelihood ratio on AA' and returns a drm of rows and similar rows.
- * Uses IndexedDatasets, which handle external ID dictionaries properly
- *
- * @param indexedDataset compare each row to every other
- * @param randomSeed use default to make repeatable, otherwise pass in system time or some randomizing seed
- * @param maxInterestingSimilaritiesPerRow max elements returned in each row
- * @param maxObservationsPerRow max number of input elements to use
- */
- def rowSimilarityIDS(indexedDataset: IndexedDataset, randomSeed: Int = 0xdeadbeef,
- maxInterestingSimilaritiesPerRow: Int = 50,
- maxObservationsPerRow: Int = 500):
- IndexedDataset = {
- val coocMatrix = rowSimilarity(indexedDataset.matrix, randomSeed, maxInterestingSimilaritiesPerRow,
- maxObservationsPerRow)
- indexedDataset.create(coocMatrix, indexedDataset.rowIDs, indexedDataset.rowIDs)
- }
-
- /** Compute the log-likelihood ratio. See http://tdunning.blogspot.de/2008/03/surprise-and-coincidence.html for details */
- def logLikelihoodRatio(numInteractionsWithA: Long, numInteractionsWithB: Long,
- numInteractionsWithAandB: Long, numInteractions: Long) = {
-
- val k11 = numInteractionsWithAandB
- val k12 = numInteractionsWithA - numInteractionsWithAandB
- val k21 = numInteractionsWithB - numInteractionsWithAandB
- val k22 = numInteractions - numInteractionsWithA - numInteractionsWithB + numInteractionsWithAandB
-
- LogLikelihood.logLikelihoodRatio(k11, k12, k21, k22)
-
- }
-
- def computeSimilarities(
- drm: DrmLike[Int],
- numUsers: Int,
- maxInterestingItemsPerThing: Int,
- bcastNumInteractionsB: BCast[Vector],
- bcastNumInteractionsA: BCast[Vector],
- crossCooccurrence: Boolean = true,
- minLLROpt: Option[Double] = None) = {
-
- //val minLLR = minLLROpt.getOrElse(0.0d) // accept all values if not specified
-
- val minLLR = minLLROpt
-
- drm.mapBlock() {
- case (keys, block) =>
-
- val llrBlock = block.like()
- val numInteractionsB: Vector = bcastNumInteractionsB
- val numInteractionsA: Vector = bcastNumInteractionsA
-
- for (index <- 0 until keys.size) {
-
- val thingB = keys(index)
-
- // PriorityQueue to select the top-k items
- val topItemsPerThing = new mutable.PriorityQueue[(Int, Double)]()(orderByScore)
-
- block(index, ::).nonZeroes().foreach { elem =>
- val thingA = elem.index
- val cooccurrences = elem.get
-
- // exclude co-occurrences of the item with itself
- if (crossCooccurrence || thingB != thingA) {
- // Compute loglikelihood ratio
- val llr = logLikelihoodRatio(numInteractionsB(thingB).toLong, numInteractionsA(thingA).toLong,
- cooccurrences.toLong, numUsers)
-
- val candidate = thingA -> llr
-
- // legacy hadoop code maps values to range (0..1) via
- // val normalizedLLR = 1.0 - (1.0 / (1.0 + llr))
- // val candidate = thingA -> normalizedLLR
-
- // Enqueue item with score, if belonging to the top-k
- if(minLLR.isEmpty || llr >= minLLR.get) { // llr threshold takes precedence over max per row
- if (topItemsPerThing.size < maxInterestingItemsPerThing) {
- topItemsPerThing.enqueue(candidate)
- } else if (orderByScore.lt(candidate, topItemsPerThing.head)) {
- topItemsPerThing.dequeue()
- topItemsPerThing.enqueue(candidate)
- }
- }
- }
- }
-
- // Add top-k interesting items to the output matrix
- topItemsPerThing.dequeueAll.foreach {
- case (otherThing, llrScore) =>
- llrBlock(index, otherThing) = llrScore
- }
- }
-
- keys -> llrBlock
- }
- }
-
- /**
- * Selectively downsample rows and items with an anomalous amount of interactions, inspired by
- * https://github.com/tdunning/in-memory-cooccurrence/blob/master/src/main/java/com/tdunning/cooc/Analyze.java
- *
- * Additionally binarizes the input matrix, as we're only interested in whether interactions happened or not.
- *
- * @param drmM matrix to downsample
- * @param seed random number generator seed; keep it constant if repeatability is necessary
- * @param maxNumInteractions maximum number of elements in a row of the returned matrix
- * @return the downsampled DRM
- */
- def sampleDownAndBinarize(drmM: DrmLike[Int], seed: Int, maxNumInteractions: Int) = {
-
- implicit val distributedContext = drmM.context
-
- // Pin raw interaction matrix
- val drmI = drmM.checkpoint()
-
- // Broadcast vector containing the number of interactions with each thing
- val bcastNumInteractions = drmBroadcast(drmI.numNonZeroElementsPerColumn)
-
- val downSampledDrmI = drmI.mapBlock() {
- case (keys, block) =>
- val numInteractions: Vector = bcastNumInteractions
-
- // Use a hash of the unique first key to seed the RNG; this makes the computation repeatable in case
- // of failures
- val random = new Random(MurmurHash.hash(keys(0), seed))
-
- val downsampledBlock = block.like()
-
- // Downsample the interaction vector of each row
- for (rowIndex <- 0 until keys.size) {
-
- val interactionsInRow = block(rowIndex, ::)
-
- val numInteractionsPerRow = interactionsInRow.getNumNonZeroElements()
-
- val perRowSampleRate = math.min(maxNumInteractions, numInteractionsPerRow) / numInteractionsPerRow
-
- interactionsInRow.nonZeroes().foreach { elem =>
- val numInteractionsWithThing = numInteractions(elem.index)
- val perThingSampleRate = math.min(maxNumInteractions, numInteractionsWithThing) / numInteractionsWithThing
-
- if (random.nextDouble() <= math.min(perRowSampleRate, perThingSampleRate)) {
- // We ignore the original interaction value and create a binary 0-1 matrix
- // as we only consider whether interactions happened or did not happen
- downsampledBlock(rowIndex, elem.index) = 1
- }
- }
- }
-
- keys -> downsampledBlock
- }
-
- // Unpin raw interaction matrix
- drmI.uncache()
-
- downSampledDrmI
- }
-}
-
-case class ParOpts( // this will contain the default `par` params except for auto = true
- minPar: Int = -1,
- exactPar: Int = -1,
- autoPar: Boolean = true)
-
-/* Used to pass in data and params for downsampling the input data as well as output A'A, A'B, etc. */
-case class DownsamplableCrossOccurrenceDataset(
- iD: IndexedDataset,
- maxElementsPerRow: Int = 500, // usually items per user in the input dataset, used to randomly downsample
- maxInterestingElements: Int = 50, // number of items/columns to keep in the A'A, A'B etc. where iD == A, B, C ...
- minLLROpt: Option[Double] = None, // absolute threshold, takes precedence over maxInterestingElements if present
- parOpts: Option[ParOpts] = None) // these can be set per dataset and are applied to each of the drms
- // in crossOccurrenceDownsampled
-
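
To make the contingency cells in logLikelihoodRatio above concrete: with 1000 interactions total, 100 involving A, 200 involving B, and 50 involving both, the cells come out to k11 = 50, k12 = 50, k21 = 150 and k22 = 750. A small self-contained sketch using the same Mahout helper (the counts and the object name are illustrative only):

import org.apache.mahout.math.stats.LogLikelihood

object LlrContingencyExample {
  def main(args: Array[String]): Unit = {
    val numInteractions = 1000L        // all interactions
    val numInteractionsWithA = 100L    // interactions involving item A
    val numInteractionsWithB = 200L    // interactions involving item B
    val numInteractionsWithAandB = 50L // interactions involving both

    val k11 = numInteractionsWithAandB                         // A and B together
    val k12 = numInteractionsWithA - numInteractionsWithAandB  // A without B
    val k21 = numInteractionsWithB - numInteractionsWithAandB  // B without A
    val k22 = numInteractions - numInteractionsWithA -
      numInteractionsWithB + numInteractionsWithAandB          // neither

    // Large values signal co-occurrence well beyond chance; computeSimilarities
    // keeps only the top-k such scores per column.
    println(LogLikelihood.logLikelihoodRatio(k11, k12, k21, k22))
  }
}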

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/decompositions/ALS.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/decompositions/ALS.scala b/math-scala/src/main/scala/org/apache/mahout/math/decompositions/ALS.scala
deleted file mode 100644
index 8ced112..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/decompositions/ALS.scala
+++ /dev/null
@@ -1,141 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.decompositions
-
-import org.apache.mahout.math._
-import drm._
-import scalabindings._
-import RLikeDrmOps._
-import RLikeOps._
-import org.apache.log4j.Logger
-import math._
-import org.apache.mahout.common.RandomUtils
-
-/** Simple ALS factorization algorithm. To solve, use the dals() method. */
-private[math] object ALS {
-
- private val log = Logger.getLogger(ALS.getClass)
-
- /**
- * ALS training result. <P>
- *
- * <code>drmU %*% drmV.t</code> is supposed to approximate the input.
- *
- * @param drmU U matrix
- * @param drmV V matrix
- * @param iterationsRMSE RMSE values after each iteration performed
- */
- class Result[K](val drmU: DrmLike[K], val drmV: DrmLike[Int], val iterationsRMSE: Iterable[Double]) {
- def toTuple = (drmU, drmV, iterationsRMSE)
- }
-
- /** Result class for in-core results */
- class InCoreResult(val inCoreU: Matrix, inCoreV: Matrix, val iterationsRMSE: Iterable[Double]) {
- def toTuple = (inCoreU, inCoreV, iterationsRMSE)
- }
-
- /**
- * Run Distributed ALS.
- * <P>
- *
- * Example:
- *
- * <pre>
- * val (u,v,errors) = als(input, k).toTuple
- * </pre>
- *
- * ALS runs until (rmse[i-1]-rmse[i])/rmse[i-1] < convergenceThreshold, or i==maxIterations,
- * whichever comes first.
- * <P>
- *
- * @param drmA The input matrix
- * @param k required rank of decomposition, i.e. number of cols in the U and V results (100 is
- * probably more than enough)
- * @param convergenceThreshold stop sooner if (rmse[i-1] - rmse[i])/rmse[i-1] is less than this
- * value. If <= 0 then RMSE is not computed and the convergence test is skipped.
- * @param lambda regularization rate
- * @param maxIterations maximum number of iterations to run regardless of convergence
- * @tparam K row key type of the input
- * @return { @link org.apache.mahout.math.drm.decompositions.ALS.Result}
- */
- def dals[K](
- drmA: DrmLike[K],
- k: Int = 50,
- lambda: Double = 0.0,
- maxIterations: Int = 10,
- convergenceThreshold: Double = 0.10
- ): Result[K] = {
-
- assert(convergenceThreshold < 1.0, "convergenceThreshold")
- assert(maxIterations >= 1, "maxIterations")
-
- // Some mapBlock() usage may require the ClassTag[K] bound to be known
- implicit val ktag = drmA.keyClassTag
-
- val drmAt = drmA.t
-
- // Initialize U and V so that they are identically distributed to A or A'
- var drmU = drmA.mapBlock(ncol = k) {
- case (keys, block) =>
- val rnd = RandomUtils.getRandom()
- val uBlock = Matrices.symmetricUniformView(block.nrow, k, rnd.nextInt()) * 0.01
- keys -> uBlock
- }
-
- var drmV: DrmLike[Int] = null
- var rmseIterations: List[Double] = Nil
-
- // ALS iterator
- var stop = false
- var i = 0
- while (!stop && i < maxIterations) {
-
- // Alternate. This is really what ALS is.
- if (drmV != null) drmV.uncache()
- drmV = (drmAt %*% drmU %*% solve(drmU.t %*% drmU -: diag(lambda, k))).checkpoint()
-
- drmU.uncache()
- drmU = (drmA %*% drmV %*% solve(drmV.t %*% drmV -: diag(lambda, k))).checkpoint()
-
- // Check if we are requested to do a convergence test; and do it if yes.
- if (convergenceThreshold > 0) {
-
- val rmse = (drmA - drmU %*% drmV.t).norm / sqrt(drmA.ncol * drmA.nrow)
-
- if (i > 0) {
- val rmsePrev = rmseIterations.last
- val convergence = (rmsePrev - rmse) / rmsePrev
-
- if (convergence < 0) {
- log.warn("Rmse increase of %f. Should not happen.".format(convergence))
- // I guess error growth can happen in ideal data case?
- stop = true
- } else if (convergence < convergenceThreshold) {
- stop = true
- }
- }
- rmseIterations :+= rmse
- }
-
- i += 1
- }
-
- new Result(drmU, drmV, rmseIterations)
- }
-
-
-}
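
For reference, the call pattern intended here matches the "dals" test earlier in this commit: factorize, collect the factors, and inspect the reconstruction error. A hedged usage sketch for an already-parallelized input (the object and method names are ours, not part of the API):

import org.apache.mahout.math.scalabindings._
import org.apache.mahout.math.drm._
import org.apache.mahout.math.decompositions._
import RLikeOps._
import RLikeDrmOps._

object DalsUsage {
  // Factorizes drmA and returns the residual norm of the rank-20 approximation,
  // mirroring the "dals" test in this commit.
  def reconstructionError(drmA: DrmLike[Int]): Double = {
    val (drmU, drmV, rmse) = dals(drmA = drmA, k = 20).toTuple
    println(s"train iteration rmses: $rmse")
    val predict = drmU.collect %*% drmV.collect.t
    (drmA.collect - predict).norm
  }
}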

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/decompositions/DQR.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/decompositions/DQR.scala b/math-scala/src/main/scala/org/apache/mahout/math/decompositions/DQR.scala
deleted file mode 100644
index 389eba0..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/decompositions/DQR.scala
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.decompositions
-
-import org.apache.mahout.logging._
-import org.apache.mahout.math.Matrix
-import org.apache.mahout.math.scalabindings._
-import RLikeOps._
-import org.apache.mahout.math.drm._
-import RLikeDrmOps._
-
-object DQR {
-
- private final implicit val log = getLog(DQR.getClass)
-
- /**
- * Distributed _thin_ QR. A'A must fit in memory, i.e. if A is m x n, then n should be pretty
- * controlled (<5000 or so). <P>
- *
- * It is recommended to checkpoint A since it does two passes over it. <P>
- *
- * It also guarantees that Q is partitioned exactly the same way (and in the same key order) as A, so
- * their RDDs should be able to zip successfully.
- */
- def dqrThin[K](drmA: DrmLike[K],
- checkRankDeficiency: Boolean = true,
- cacheHint: CacheHint.CacheHint = CacheHint.MEMORY_ONLY): (DrmLike[K], Matrix) = {
-
- // Some mapBlock() calls need it
- implicit val ktag = drmA.keyClassTag
-
- if (drmA.ncol > 5000)
- warn("A is too fat. A'A must fit in memory and easily broadcasted.")
-
- implicit val ctx = drmA.context
-
- val AtA = (drmA.t %*% drmA).checkpoint(cacheHint)
- val inCoreAtA = AtA.collect
-
- trace("A'A=\n%s\n".format(inCoreAtA))
-
- val ch = chol(inCoreAtA)
- val inCoreR = (ch.getL cloned) t
-
- trace("R=\n%s\n".format(inCoreR))
-
- if (checkRankDeficiency && !ch.isPositiveDefinite)
- throw new IllegalArgumentException("R is rank-deficient.")
-
- val bcastAtA = drmBroadcast(inCoreAtA)
-
- // Unfortunately, I don't think the Cholesky decomposition is serializable to the backend. So we
- // re-decompose A'A in the backend.
-
- // Compute Q = A*inv(L') -- we can do it blockwise.
- val Q = drmA.mapBlock() {
- case (keys, block) => keys -> chol(bcastAtA).solveRight(block)
- }
-
- Q -> inCoreR
- }
-
-}
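
The construction above rests on a simple identity: if A'A = LL' is the Cholesky decomposition, then R = L' and Q = A * inv(L') satisfy A = QR with Q'Q = I. An in-core sketch of the same construction, reusing chol and solveRight exactly as the backend closure does (the object name is ours):

import org.apache.mahout.math._
import org.apache.mahout.math.scalabindings._
import RLikeOps._

object CholeskyThinQr {
  def main(args: Array[String]): Unit = {
    val a = dense((1, 2), (3, 4), (5, 6), (7, 9))

    val ch = chol(a.t %*% a)    // A'A = L * L'
    val r = (ch.getL cloned).t  // R = L'
    val q = ch.solveRight(a)    // Q = A * inv(L')

    assert((a - q %*% r).norm < 1e-10)               // A = Q * R
    assert(((q.t %*% q) - eye(a.ncol)).norm < 1e-10) // Q'Q = I
  }
}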

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/decompositions/DSPCA.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/decompositions/DSPCA.scala b/math-scala/src/main/scala/org/apache/mahout/math/decompositions/DSPCA.scala
deleted file mode 100644
index 2c010bb..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/decompositions/DSPCA.scala
+++ /dev/null
@@ -1,162 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.decompositions
-
-import org.apache.mahout.math.{Matrices, Vector}
-import org.apache.mahout.math.scalabindings._
-import RLikeOps._
-import org.apache.mahout.math.drm._
-import RLikeDrmOps._
-import org.apache.mahout.common.RandomUtils
-
-object DSPCA {
-
- /**
- * Distributed Stochastic PCA decomposition algorithm. A logical reflow of the "SSVD-PCA options.pdf"
- * document of MAHOUT-817.
- *
- * @param drmA input matrix A
- * @param k requested SSVD rank
- * @param p oversampling parameter
- * @param q number of power iterations (hint: use either 0 or 1)
- * @return (U,V,s). Note that U and V are non-checkpointed matrices (i.e. one needs to actually use
- * them, e.g. save them to hdfs, in order to trigger their computation).
- */
- def dspca[K](drmA: DrmLike[K],
- k: Int,
- p: Int = 15,
- q: Int = 0,
- cacheHint: CacheHint.CacheHint = CacheHint.MEMORY_ONLY):
- (DrmLike[K], DrmLike[Int], Vector) = {
-
- // Some mapBlock() calls need it
- implicit val ktag = drmA.keyClassTag
-
- val drmAcp = drmA.checkpoint(cacheHint)
- implicit val ctx = drmAcp.context
-
- val m = drmAcp.nrow
- val n = drmAcp.ncol
- assert(k <= (m min n), "k cannot be greater than smaller of m, n.")
- val pfxed = safeToNonNegInt((m min n) - k min p)
-
- // Actual decomposition rank
- val r = k + pfxed
-
- // Dataset mean
- val mu = drmAcp.colMeans
-
- val mtm = mu dot mu
-
- // We represent Omega by its seed.
- val omegaSeed = RandomUtils.getRandom().nextInt()
- val omega = Matrices.symmetricUniformView(n, r, omegaSeed)
-
- // This is done up front in a single-threaded fashion for now. Even though it doesn't require any
- // memory beyond what is needed to keep xi around, it could still be parallelized to the backend
- // for significantly large n and r. TODO
- val s_o = omega.t %*% mu
-
- val bcastS_o = drmBroadcast(s_o)
- val bcastMu = drmBroadcast(mu)
-
- var drmY = drmAcp.mapBlock(ncol = r) {
- case (keys, blockA) ⇒
- val s_o:Vector = bcastS_o
- val blockY = blockA %*% Matrices.symmetricUniformView(n, r, omegaSeed)
- for (row ← 0 until blockY.nrow) blockY(row, ::) -= s_o
- keys → blockY
- }
- // Checkpoint Y
- .checkpoint(cacheHint)
-
- var drmQ = dqrThin(drmY, checkRankDeficiency = false)._1.checkpoint(cacheHint)
-
- var s_q = drmQ.colSums()
- var bcastVarS_q = drmBroadcast(s_q)
-
- // This actually should be optimized as identically partitioned map-side A'B since A and Q should
- // still be identically partitioned.
- var drmBt = (drmAcp.t %*% drmQ).checkpoint(cacheHint)
-
- var s_b = (drmBt.t %*% mu).collect(::, 0)
- var bcastVarS_b = drmBroadcast(s_b)
-
- for (i ← 0 until q) {
-
- // These closures don't seem to live well with outside-scope vars: closure
- // attributes aren't recorded correctly. So we create an additional set of vals for broadcast
- // vars to properly create read-only closure attributes in this very scope.
- val bcastS_q = bcastVarS_q
- val bcastMuInner = bcastMu
-
- // Fix Bt as B' -= xi cross s_q
- drmBt = drmBt.mapBlock() {
- case (keys, block) ⇒
- val s_q: Vector = bcastS_q
- val mu: Vector = bcastMuInner
- keys.zipWithIndex.foreach {
- case (key, idx) ⇒ block(idx, ::) -= s_q * mu(key)
- }
- keys → block
- }
-
- drmY.uncache()
- drmQ.uncache()
-
- val bCastSt_b = drmBroadcast(s_b -=: mtm * s_q)
-
- drmY = (drmAcp %*% drmBt)
- // Fix Y by subtracting st_b from each row of the AB'
- .mapBlock() {
- case (keys, block) ⇒
- val st_b: Vector = bCastSt_b
- block := { (_, c, v) ⇒ v - st_b(c) }
- keys → block
- }
- // Checkpoint Y
- .checkpoint(cacheHint)
-
- drmQ = dqrThin(drmY, checkRankDeficiency = false)._1.checkpoint(cacheHint)
-
- s_q = drmQ.colSums()
- bcastVarS_q = drmBroadcast(s_q)
-
- // This on the other hand should be inner-join-and-map A'B optimization since A and Q_i are not
- // identically partitioned anymore.
- drmBt = (drmAcp.t %*% drmQ).checkpoint(cacheHint)
-
- s_b = (drmBt.t %*% mu).collect(::, 0)
- bcastVarS_b = drmBroadcast(s_b)
- }
-
- val c = s_q cross s_b
- val inCoreBBt = (drmBt.t %*% drmBt).checkpoint(cacheHint).collect -=:
- c -=: c.t +=: mtm *=: (s_q cross s_q)
- val (inCoreUHat, d) = eigen(inCoreBBt)
- val s = d.sqrt
-
- // Since neither drmU nor drmV is actually computed until used, we no longer need the flags
- // instructing whether to compute the U and V outputs. Neat, isn't it?
- val drmU = drmQ %*% inCoreUHat
- val drmV = drmBt %*% (inCoreUHat %*% diagv(1 / s))
-
- (drmU(::, 0 until k), drmV(::, 0 until k), s(0 until k))
- }
-
-}
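
A usage sketch of the dspca entry point removed above. The 5 x 3 input, rank k = 2, and the implicit context are illustrative assumptions; in practice the context comes from a backend module such as sparkbindings:

import org.apache.mahout.math.scalabindings._
import RLikeOps._
import org.apache.mahout.math.drm._
import RLikeDrmOps._
import org.apache.mahout.math.decompositions._

def dspcaSketch(implicit ctx: DistributedContext): Unit = {
  val inCoreA = dense(
    (1.0, 2.0, 3.0), (4.0, 5.0, 6.0), (7.0, 8.0, 10.0), (2.0, 1.0, 5.0), (3.0, 7.0, 2.0))
  val drmA = drmParallelize(inCoreA, numPartitions = 2)
  val (drmU, drmV, s) = dspca(drmA, k = 2, p = 1, q = 1)
  println(drmU.collect)   // U and V are lazy; collecting (or dfsWrite) triggers computation
  println(s)
}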

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/decompositions/DSSVD.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/decompositions/DSSVD.scala b/math-scala/src/main/scala/org/apache/mahout/math/decompositions/DSSVD.scala
deleted file mode 100644
index d917d11..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/decompositions/DSSVD.scala
+++ /dev/null
@@ -1,100 +0,0 @@
-package org.apache.mahout.math.decompositions
-
-import org.apache.mahout.math.{Matrices, Matrix, Vector}
-import org.apache.mahout.math.scalabindings._
-import RLikeOps._
-import org.apache.mahout.math.drm._
-import RLikeDrmOps._
-import org.apache.mahout.common.RandomUtils
-import org.apache.mahout.logging._
-
-object DSSVD {
-
- private final implicit val log = getLog(DSSVD.getClass)
-
- /**
- * Distributed Stochastic Singular Value decomposition algorithm.
- *
- * @param drmA input matrix A
- * @param k requested SSVD rank
- * @param p oversampling parameter
- * @param q number of power iterations
- * @return (U,V,s). Note that U and V are non-checkpointed matrices (i.e. one needs to actually
- * use them, e.g. save them to HDFS, to trigger their computation).
- */
- def dssvd[K](drmA: DrmLike[K],
- k: Int,
- p: Int = 15,
- q: Int = 0,
- cacheHint: CacheHint.CacheHint = CacheHint.MEMORY_ONLY):
-
- (DrmLike[K], DrmLike[Int], Vector) = {
-
- // Some mapBlock() calls need it
- implicit val ktag = drmA.keyClassTag
-
- val drmAcp = drmA.checkpoint(cacheHint)
-
- val m = drmAcp.nrow
- val n = drmAcp.ncol
- assert(k <= (m min n), "k cannot be greater than smaller of m, n.")
- val pfxed = safeToNonNegInt((m min n) - k min p)
-
- // Actual decomposition rank
- val r = k + pfxed
-
- // We represent Omega by its seed.
- val omegaSeed = RandomUtils.getRandom().nextInt()
-
- // Compute Y = A*Omega. Instead of redistributing view, we redistribute the Omega seed only and
- // instantiate the Omega random matrix view in the backend instead. That way serialized closure
- // is much more compact.
- var drmY = drmAcp.mapBlock(ncol = r) {
- case (keys, blockA) ⇒
- val blockY = blockA %*% Matrices.symmetricUniformView(n, r, omegaSeed)
- keys → blockY
- }.checkpoint(cacheHint)
-
- var drmQ = dqrThin(drmY)._1
- // Checkpoint Q if last iteration
- if (q == 0) drmQ = drmQ.checkpoint(cacheHint)
-
- trace(s"dssvd:drmQ=${drmQ.collect}.")
-
- // This actually should be optimized as identically partitioned map-side A'B since A and Q should
- // still be identically partitioned.
- var drmBt = drmAcp.t %*% drmQ
- // Checkpoint B' if last iteration
- if (q == 0) drmBt = drmBt.checkpoint(cacheHint)
-
- trace(s"dssvd:drmB'=${drmBt.collect}.")
-
- for (i ← 0 until q) {
- drmY = drmAcp %*% drmBt
- drmQ = dqrThin(drmY.checkpoint(cacheHint))._1
- // Checkpoint Q if last iteration
- if (i == q - 1) drmQ = drmQ.checkpoint(cacheHint)
-
- // This on the other hand should be inner-join-and-map A'B optimization since A and Q_i are not
- // identically partitioned anymore.
- drmBt = drmAcp.t %*% drmQ
- // Checkpoint B' if last iteration
- if (i == q - 1) drmBt = drmBt.checkpoint(cacheHint)
- }
-
- val mxBBt:Matrix = drmBt.t %*% drmBt
-
- trace(s"dssvd: BB'=$mxBBt.")
-
- val (inCoreUHat, d) = eigen(mxBBt)
- val s = d.sqrt
-
- // Since neither drmU nor drmV is actually computed until used, we no longer need the flags
- // instructing whether to compute the U and V outputs. Neat, isn't it?
- val drmU = drmQ %*% inCoreUHat
- val drmV = drmBt %*% (inCoreUHat %*%: diagv(1 /: s))
-
- (drmU(::, 0 until k), drmV(::, 0 until k), s(0 until k))
- }
-
-}
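
The distributed flow above mirrors a small in-core skeleton: Y = A*Omega, an implicit Q from the Cholesky of Y'Y, B' = A'Q, then an eigendecomposition of the small r x r matrix BB'. A sketch under those definitions; matrix values and r are illustrative:

import org.apache.mahout.math.Matrices
import org.apache.mahout.math.scalabindings._
import RLikeOps._
import org.apache.mahout.common.RandomUtils

object SsvdSkeleton extends App {
  val a = dense((1.0, 2.0, 3.0), (4.0, 5.0, 6.0), (7.0, 8.0, 10.0), (2.0, 1.0, 5.0))
  val r = 3                                        // k + oversampling
  val omega = Matrices.symmetricUniformView(a.ncol, r, RandomUtils.getRandom.nextInt)
  val y = a %*% omega                              // Y = A * Omega
  val ch = chol(y.t %*% y)
  val q = ch.solveRight(y)                         // implicit Q of QR(Y)
  val bt = a.t %*% q                               // B' = A'Q
  val (uhat, d) = eigen(bt.t %*% bt)               // eigen of the small B B'
  val s = d.sqrt
  val u = q %*% uhat
  val v = bt %*% (uhat %*% diagv(1 / s))
  println(s)
}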

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/decompositions/SSVD.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/decompositions/SSVD.scala b/math-scala/src/main/scala/org/apache/mahout/math/decompositions/SSVD.scala
deleted file mode 100644
index fba9517..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/decompositions/SSVD.scala
+++ /dev/null
@@ -1,167 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.decompositions
-
-import scala.math._
-import org.apache.mahout.math.{Matrices, Matrix}
-import org.apache.mahout.common.RandomUtils
-import org.apache.log4j.Logger
-import org.apache.mahout.math.scalabindings._
-import RLikeOps._
-
-private[math] object SSVD {
-
- private val log = Logger.getLogger(SSVD.getClass)
-
- /**
- * In-core SSVD algorithm.
- *
- * @param a input matrix A
- * @param k requested SSVD rank
- * @param p oversampling parameter
- * @param q number of power iterations
- * @return (U,V,s)
- */
- def ssvd(a: Matrix, k: Int, p: Int = 15, q: Int = 0) = {
- val m = a.nrow
- val n = a.ncol
- if (k > min(m, n))
- throw new IllegalArgumentException(
- "k cannot be greater than smaller of m,n")
- val pfxed = min(p, min(m, n) - k)
-
- // Actual decomposition rank
- val r = k + pfxed
-
- val rnd = RandomUtils.getRandom
- val omega = Matrices.symmetricUniformView(n, r, rnd.nextInt)
-
- var y = a %*% omega
- var yty = y.t %*% y
- val at = a.t
- var ch = chol(yty)
- assert(ch.isPositiveDefinite, "Rank-deficiency detected during s-SVD")
- var bt = ch.solveRight(at %*% y)
-
- // Power iterations
- for (i ← 0 until q) {
- y = a %*% bt
- yty = y.t %*% y
- ch = chol(yty)
- bt = ch.solveRight(at %*% y)
- }
-
- val bbt = bt.t %*% bt
- val (uhat, d) = eigen(bbt)
-
- val s = d.sqrt
- val u = ch.solveRight(y) %*% uhat
- val v = bt %*% (uhat %*% diagv(1 /: s))
-
- (u(::, 0 until k), v(::, 0 until k), s(0 until k))
- }
-
- /**
- * PCA based on SSVD that runs without forming an always-dense A-(colMeans(A)) input for SVD. This
- * follows the solution outlined in MAHOUT-817. The in-core version is, for the most part, meant
- * to save some memory for sparse inputs by avoiding direct mean subtraction.<P>
- *
- * Hint: Usually one wants to use AV, which is approximately USigma, i.e. <code>u %*%: diagv(s)</code>.
- * If retaining distances and original scaled variances is not that important, the normalized PCA
- * space is just U.
- *
- * Important: data points are considered to be rows.
- *
- * @param a input matrix A
- * @param k requested SSVD rank
- * @param p oversampling parameter
- * @param q number of power iterations
- * @return (U,V,s)
- */
- def spca(a:Matrix, k: Int, p: Int = 15, q: Int = 0) = {
- val m = a.nrow
- val n = a.ncol
- if (k > min(m, n))
- throw new IllegalArgumentException(
- "k cannot be greater than smaller of m,n")
- val pfxed = min(p, min(m, n) - k)
-
- // Actual decomposition rank
- val r = k + pfxed
-
- val rnd = RandomUtils.getRandom
- val omega = Matrices.symmetricUniformView(n, r, rnd.nextInt)
-
- // Dataset mean
- val mu = a.colMeans()
- val mtm = mu dot mu
-
- if (log.isDebugEnabled) log.debug("xi=%s".format(mu))
-
- var y = a %*% omega
-
- // Fixing y
- val s_o = omega.t %*% mu
- y := ((r,c,v) ⇒ v - s_o(c))
-
- var yty = y.t %*% y
- var ch = chol(yty)
-// assert(ch.isPositiveDefinite, "Rank-deficiency detected during s-SVD")
-
- // This is implicit Q of QR(Y)
- var qm = ch.solveRight(y)
- var bt = a.t %*% qm
- var s_q = qm.colSums()
- var s_b = bt.t %*% mu
-
- // Power iterations
- for (i ← 0 until q) {
-
- // Fix bt
- bt -= mu cross s_q
-
- y = a %*% bt
-
- // Fix Y again.
- val st_b = s_b -=: mtm * s_q
- y := ((r,c,v) ⇒ v - st_b(c))
-
- yty = y.t %*% y
- ch = chol(yty)
- qm = ch.solveRight(y)
- bt = a.t %*% qm
- s_q = qm.colSums()
- s_b = bt.t %*% mu
- }
-
- val c = s_q cross s_b
-
- // BB' computation becomes
- val bbt = bt.t %*% bt -= c -= c.t += (mtm * s_q cross s_q)
-
- val (uhat, d) = eigen(bbt)
-
- val s = d.sqrt
- val u = qm %*% uhat
- val v = bt %*% (uhat %*%: diagv(1 /: s))
-
- (u(::, 0 until k), v(::, 0 until k), s(0 until k))
-
- }
-
-}
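
A hedged usage sketch of the in-core ssvd above. The 4 x 3 input and k = 2 are illustrative; the reconstruction error shrinks as k approaches rank(A):

import org.apache.mahout.math.scalabindings._
import RLikeOps._
import org.apache.mahout.math.decompositions._

object InCoreSsvdExample extends App {
  val a = dense((1.0, 2.0, 3.0), (4.0, 5.0, 6.0), (7.0, 8.0, 10.0), (2.0, 1.0, 5.0))
  val (u, v, s) = ssvd(a, k = 2, p = 1, q = 1)
  println(s"singular values: $s")
  println(s"reconstruction error: ${(a - u %*% diagv(s) %*% v.t).norm}")
}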

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/decompositions/package.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/decompositions/package.scala b/math-scala/src/main/scala/org/apache/mahout/math/decompositions/package.scala
deleted file mode 100644
index a7b829f..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/decompositions/package.scala
+++ /dev/null
@@ -1,141 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math
-
-import scala.reflect.ClassTag
-import org.apache.mahout.math.drm.DrmLike
-
-/**
- * This package holds all decomposition and factorization-like methods, all that we were able to make
- * distributed engine-independent so far, anyway.
- */
-package object decompositions {
-
- // ================ In-core decompositions ===================
-
- /**
- * In-core SSVD algorithm.
- *
- * @param a input matrix A
- * @param k requested SSVD rank
- * @param p oversampling parameter
- * @param q number of power iterations
- * @return (U,V,s)
- */
- def ssvd(a: Matrix, k: Int, p: Int = 15, q: Int = 0) = SSVD.ssvd(a, k, p, q)
-
- /**
- * PCA based on SSVD that runs without forming an always-dense A-(colMeans(A)) input for SVD. This
- * follows the solution outlined in MAHOUT-817. The in-core version is, for the most part, meant
- * to save some memory for sparse inputs by avoiding direct mean subtraction.<P>
- *
- * Hint: Usually one wants to use AV, which is approximately USigma, i.e. <code>u %*%: diagv(s)</code>.
- * If retaining distances and original scaled variances is not that important, the normalized PCA
- * space is just U.
- *
- * Important: data points are considered to be rows.
- *
- * @param a input matrix A
- * @param k requested SSVD rank
- * @param p oversampling parameter
- * @param q number of power iterations
- * @return (U,V,s)
- */
- def spca(a: Matrix, k: Int, p: Int = 15, q: Int = 0) =
- SSVD.spca(a = a, k = k, p = p, q = q)
-
- // ============== Distributed decompositions ===================
-
- /**
- * Distributed _thin_ QR. A'A must fit in memory, i.e. if A is m x n, then n should be fairly
- * small (<5000 or so). <P>
- *
- * It is recommended to checkpoint A since it does two passes over it. <P>
- *
- * It also guarantees that Q is partitioned exactly the same way (and in the same key order) as A,
- * so their RDDs should be able to zip successfully.
- */
- def dqrThin[K: ClassTag](drmA: DrmLike[K], checkRankDeficiency: Boolean = true): (DrmLike[K], Matrix) =
- DQR.dqrThin(drmA, checkRankDeficiency)
-
- /**
- * Distributed Stochastic Singular Value decomposition algorithm.
- *
- * @param drmA input matrix A
- * @param k requested SSVD rank
- * @param p oversampling parameter
- * @param q number of power iterations
- * @return (U,V,s). Note that U and V are non-checkpointed matrices (i.e. one needs to actually
- * use them, e.g. save them to HDFS, to trigger their computation).
- */
- def dssvd[K: ClassTag](drmA: DrmLike[K], k: Int, p: Int = 15, q: Int = 0):
- (DrmLike[K], DrmLike[Int], Vector) = DSSVD.dssvd(drmA, k, p, q)
-
- /**
- * Distributed Stochastic PCA decomposition algorithm. A logical reflow of the "SSVD-PCA options.pdf"
- * document attached to MAHOUT-817.
- *
- * @param drmA input matrix A
- * @param k requested SSVD rank
- * @param p oversampling parameter
- * @param q number of power iterations (hint: use either 0 or 1)
- * @return (U,V,s). Note that U and V are non-checkpointed matrices (i.e. one needs to actually
- * use them, e.g. save them to HDFS, to trigger their computation).
- */
- def dspca[K: ClassTag](drmA: DrmLike[K], k: Int, p: Int = 15, q: Int = 0):
- (DrmLike[K], DrmLike[Int], Vector) = DSPCA.dspca(drmA, k, p, q)
-
- /** Result for distributed ALS-type two-component factorization algorithms */
- type FactorizationResult[K] = ALS.Result[K]
-
- /** Result for distributed ALS-type two-component factorization algorithms, in-core matrices */
- type FactorizationResultInCore = ALS.InCoreResult
-
- /**
- * Run ALS.
- * <P>
- *
- * Example:
- *
- * <pre>
- * val (u,v,errors) = als(input, k).toTuple
- * </pre>
- *
- * ALS runs until (rmse[i-1]-rmse[i])/rmse[i-1] < convergenceThreshold, or i==maxIterations,
- * whichever comes first.
- * <P>
- *
- * @param drmA The input matrix
- * @param k required rank of decomposition (number of cols in U and V results)
- * @param convergenceThreshold stop sooner if (rmse[i-1] - rmse[i])/rmse[i-1] is less than this
- * value. If <= 0, RMSE is not computed and the convergence test is skipped.
- * @param lambda regularization rate
- * @param maxIterations maximum iterations to run regardless of convergence
- * @tparam K row key type of the input
- * @return { @link org.apache.mahout.math.drm.decompositions.ALS.Result}
- */
- def dals[K: ClassTag](
- drmA: DrmLike[K],
- k: Int = 50,
- lambda: Double = 0.0,
- maxIterations: Int = 10,
- convergenceThreshold: Double = 0.10
- ): FactorizationResult[K] =
- ALS.dals(drmA, k, lambda, maxIterations, convergenceThreshold)
-
-}
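
A sketch of how the dqrThin facade above composes with the rest of the DSL: Q is partitioned like A, and Q %*% R reproduces A up to numerics. Checkpointing A first is suggested because dqrThin makes two passes over it; names are illustrative:

import scala.reflect.ClassTag
import org.apache.mahout.math.drm._
import RLikeDrmOps._
import org.apache.mahout.math.decompositions._

def thinQr[K: ClassTag](drmA: DrmLike[K]): Unit = {
  val (drmQ, inCoreR) = dqrThin(drmA.checkpoint())
  val drmRestored = drmQ %*% inCoreR   // ~= A, since Q R = A
  println(inCoreR)
  println(drmRestored.nrow)
}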

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/drm/BCast.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/BCast.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/BCast.scala
deleted file mode 100644
index b86e286..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/BCast.scala
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.drm
-
-/** Broadcast variable abstraction */
-trait BCast[T] extends java.io.Closeable {
- def value:T
-
-}
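
Typical use of the BCast abstraction above, as seen throughout the deleted decompositions: broadcast an in-core vector once, then dereference it inside mapBlock closures via the implicit BCast-to-value conversion in the drm package object. A sketch:

import org.apache.mahout.math.Vector
import org.apache.mahout.math.scalabindings.RLikeOps._
import org.apache.mahout.math.drm._
import RLikeDrmOps._

def subtractRowVector[K](drmA: DrmLike[K], v: Vector): DrmLike[K] = {
  implicit val ktag = drmA.keyClassTag
  implicit val ctx = drmA.context
  val bcastV = drmBroadcast(v)
  drmA.mapBlock() { case (keys, block) =>
    val vLocal: Vector = bcastV                        // implicit BCast[Vector] to Vector
    for (row <- 0 until block.nrow) block(row, ::) -= vLocal
    keys -> block
  }
}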

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/drm/CacheHint.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/CacheHint.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/CacheHint.scala
deleted file mode 100644
index 3755f31..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/CacheHint.scala
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.drm
-
-object CacheHint extends Enumeration {
-
- type CacheHint = Value
-
- val NONE,
- DISK_ONLY,
- DISK_ONLY_2,
- MEMORY_ONLY,
- MEMORY_ONLY_2,
- MEMORY_ONLY_SER,
- MEMORY_ONLY_SER_2,
- MEMORY_AND_DISK,
- MEMORY_AND_DISK_2,
- MEMORY_AND_DISK_SER,
- MEMORY_AND_DISK_SER_2 = Value
-
-}
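
The hints above are engine-neutral names; on the Spark backend they presumably map onto the same-named storage levels (an assumption about the binding, not spelled out in this file). For example, a matrix reused across power iterations might be checkpointed as:

import org.apache.mahout.math.drm._

def cacheForReuse[K](drmA: DrmLike[K]): CheckpointedDrm[K] =
  drmA.checkpoint(CacheHint.MEMORY_AND_DISK_SER)   // serialized in memory, spilling to disk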

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/drm/CheckpointedDrm.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/CheckpointedDrm.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/CheckpointedDrm.scala
deleted file mode 100644
index 31f8097..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/CheckpointedDrm.scala
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.drm
-
-import org.apache.mahout.math.Matrix
-import org.apache.mahout.math.drm.CacheHint.CacheHint
-
-/**
- * Checkpointed DRM API. This is a matrix that has optimized RDD lineage behind it and can be
- * therefore collected or saved.
- *
- * @tparam K matrix key type (e.g. the keys of sequence files once persisted)
- */
-trait CheckpointedDrm[K] extends DrmLike[K] {
-
- def collect: Matrix
-
- def dfsWrite(path: String)
-
- val cacheHint: CacheHint
-
- /** If this checkpoint is already declared cached, uncache. */
- def uncache(): this.type
-
- /** changes the number of rows without touching the underlying data */
- def newRowCardinality(n: Int): CheckpointedDrm[K]
-
-}
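
A sketch of the CheckpointedDrm lifecycle implied by the trait above; the path and names are illustrative:

import org.apache.mahout.math.drm._

def materializeAndSave[K](drmA: DrmLike[K], path: String): Unit = {
  val cp = drmA.checkpoint()    // run the optimizer and materialize the lineage
  cp.dfsWrite(path)             // persist in Mahout DRM format
  val inCore = cp.collect       // or pull to the front end as an in-core Matrix
  println(inCore)
  cp.uncache()                  // release the cache when done
}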

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/drm/CheckpointedOps.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/CheckpointedOps.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/CheckpointedOps.scala
deleted file mode 100644
index 37cd981..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/CheckpointedOps.scala
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.drm
-
-import org.apache.mahout.math._
-
-import org.apache.mahout.math.scalabindings.RLikeOps._
-
-/**
- * Additional experimental operations over the CheckpointedDRM implementation. These may move up to
- * the DRMBase once they stabilize.
- *
- */
-class CheckpointedOps[K](val drm: CheckpointedDrm[K]) {
-
-
- /** Column sums. At this point this runs on a checkpoint and collects an in-core vector. */
- def colSums(): Vector = drm.context.colSums(drm)
-
- /** Column counts: counts the non-zero values per column. At this point this runs on a checkpoint and collects an in-core vector. */
- def numNonZeroElementsPerColumn(): Vector = drm.context.numNonZeroElementsPerColumn(drm)
-
- /** Column Means */
- def colMeans(): Vector = drm.context.colMeans(drm)
-
- /** Optional engine-specific all reduce tensor operation. */
- def allreduceBlock(bmf: BlockMapFunc2[K], rf: BlockReduceFunc = _ += _): Matrix =
-
- drm.context.allreduceBlock(drm, bmf, rf)
-
- /** Second norm */
- def norm():Double = drm.context.norm(drm)
-}
-
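
The summaries above delegate to the engine; a small usage sketch (checkpointing first, since these run on a checkpoint):

import org.apache.mahout.math.drm._
import RLikeDrmOps._

def columnStats[K](drmA: DrmLike[K]): Unit = {
  val cp = drmA.checkpoint()
  println(s"col sums:  ${cp.colSums()}")
  println(s"col means: ${cp.colMeans()}")
  println(s"norm:      ${cp.norm()}")
}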

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/drm/DistributedContext.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/DistributedContext.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/DistributedContext.scala
deleted file mode 100644
index e1833d8..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/DistributedContext.scala
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.drm
-
-import java.io.Closeable
-
-/** Distributed context (a.k.a. distributed session handle) */
-trait DistributedContext extends Closeable {
-
- val engine: DistributedEngine
-
-}
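
Since the trait above is Closeable, a loan-pattern helper is a natural way to scope a session. A minimal sketch; how the context is constructed is backend-specific (e.g. the sparkbindings module):

import org.apache.mahout.math.drm.DistributedContext

def withContext[T](ctx: DistributedContext)(body: DistributedContext => T): T =
  try body(ctx) finally ctx.close()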

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/drm/DistributedEngine.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/DistributedEngine.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/DistributedEngine.scala
deleted file mode 100644
index c27e8dd..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/DistributedEngine.scala
+++ /dev/null
@@ -1,268 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.drm
-
-import org.apache.mahout.math.indexeddataset._
-
-import logical._
-import org.apache.mahout.math._
-import scalabindings._
-import RLikeOps._
-import DistributedEngine._
-import org.apache.log4j.Logger
-
-import scala.reflect.ClassTag
-
-/** Abstraction of optimizer/distributed engine */
-trait DistributedEngine {
-
- /**
- * First optimization pass. Returns a physical plan that we can pass to exec(). This rewrite may
- * introduce logical constructs (including engine-specific ones) that the user DSL cannot even
- * produce per se.
- * <P>
- *
- * A particular physical engine implementation may choose to either use the default rewrites or
- * build its own rewriting rules.
- * <P>
- */
- def optimizerRewrite[K: ClassTag](action: DrmLike[K]): DrmLike[K] = pass3(pass2(pass1(action)))
-
- /** Second optimizer pass. Translate previously rewritten logical pipeline into physical engine plan. */
- def toPhysical[K: ClassTag](plan: DrmLike[K], ch: CacheHint.CacheHint): CheckpointedDrm[K]
-
- /** Engine-specific colSums implementation based on a checkpoint. */
- def colSums[K](drm: CheckpointedDrm[K]): Vector
-
- /** Optional engine-specific all reduce tensor operation. */
- def allreduceBlock[K](drm: CheckpointedDrm[K], bmf: BlockMapFunc2[K], rf: BlockReduceFunc): Matrix
-
- /** Engine-specific numNonZeroElementsPerColumn implementation based on a checkpoint. */
- def numNonZeroElementsPerColumn[K](drm: CheckpointedDrm[K]): Vector
-
- /** Engine-specific colMeans implementation based on a checkpoint. */
- def colMeans[K](drm: CheckpointedDrm[K]): Vector
-
- def norm[K](drm: CheckpointedDrm[K]): Double
-
- /** Broadcast support */
- def drmBroadcast(v: Vector)(implicit dc: DistributedContext): BCast[Vector]
-
- /** Broadcast support */
- def drmBroadcast(m: Matrix)(implicit dc: DistributedContext): BCast[Matrix]
-
- /**
- * Load DRM from hdfs (as in Mahout DRM format).
- * <P/>
- * @param path The DFS path to load from
- * @param parMin Minimum parallelism after load (equivalent to #par(min=...)).
- */
- def drmDfsRead(path: String, parMin: Int = 0)(implicit sc: DistributedContext): CheckpointedDrm[_]
-
- /** Parallelize in-core matrix as the backend engine distributed matrix, using row ordinal indices as data set keys. */
- def drmParallelizeWithRowIndices(m: Matrix, numPartitions: Int = 1)(implicit sc: DistributedContext):
- CheckpointedDrm[Int]
-
- /** Parallelize in-core matrix as the backend engine distributed matrix, using row labels as data set keys. */
- def drmParallelizeWithRowLabels(m: Matrix, numPartitions: Int = 1)(implicit sc: DistributedContext):
- CheckpointedDrm[String]
-
- /** This creates an empty DRM with specified number of partitions and cardinality. */
- def drmParallelizeEmpty(nrow: Int, ncol: Int, numPartitions: Int = 10)(implicit sc: DistributedContext):
- CheckpointedDrm[Int]
-
- /** Creates empty DRM with non-trivial height */
- def drmParallelizeEmptyLong(nrow: Long, ncol: Int, numPartitions: Int = 10)(implicit sc: DistributedContext):
- CheckpointedDrm[Long]
-
- /**
- * Convert non-int-keyed matrix to an int-keyed, computing optionally mapping from old keys
- * to row indices in the new one. The mapping, if requested, is returned as a 1-column matrix.
- */
- def drm2IntKeyed[K](drmX: DrmLike[K], computeMap: Boolean = false): (DrmLike[Int], Option[DrmLike[K]])
-
- /**
- * (Optional) Sampling operation. Consistent with Spark semantics of the same.
- * @param drmX
- * @param fraction
- * @param replacement
- * @tparam K
- * @return
- */
- def drmSampleRows[K](drmX: DrmLike[K], fraction: Double, replacement: Boolean = false): DrmLike[K]
-
- def drmSampleKRows[K](drmX: DrmLike[K], numSamples:Int, replacement:Boolean = false) : Matrix
-
- /**
- * Load IndexedDataset from text delimited format.
- * @param src comma delimited URIs to read from
- * @param schema defines format of file(s)
- */
- def indexedDatasetDFSRead(src: String,
- schema: Schema = DefaultIndexedDatasetReadSchema,
- existingRowIDs: Option[BiDictionary] = None)
- (implicit sc: DistributedContext):
- IndexedDataset
-
- /**
- * Load IndexedDataset from text delimited format, one element per line
- * @param src comma delimited URIs to read from
- * @param schema defines format of file(s)
- */
- def indexedDatasetDFSReadElements(src: String,
- schema: Schema = DefaultIndexedDatasetElementReadSchema,
- existingRowIDs: Option[BiDictionary] = None)
- (implicit sc: DistributedContext):
- IndexedDataset
-
-}
-
-object DistributedEngine {
-
- private val log = Logger.getLogger(DistributedEngine.getClass)
-
- /** These are mostly rewrites of multiplication operations */
- private def pass1[K](action: DrmLike[K]): DrmLike[K] = {
-
- action match {
-
- // A logical operator that already has a checkpoint attached with a caching policy; reuse it.
- case cpa: CheckpointAction[K] if cpa.cp.exists(_.cacheHint != CacheHint.NONE) ⇒ cpa.cp.get
-
- // self element-wise rewrite
- case OpAewB(a, b, op) if a == b => {
- op match {
- case "*" ⇒ OpAewUnaryFunc(pass1(a), (x) ⇒ x * x)
- case "/" ⇒ OpAewUnaryFunc(pass1(a), (x) ⇒ x / x)
- // Self "+" and "-" don't make a lot of sense, but we do include it for completeness.
- case "+" ⇒ OpAewUnaryFunc(pass1(a), 2.0 * _)
- case "-" ⇒ OpAewUnaryFunc(pass1(a), (_) ⇒ 0.0)
- case _ ⇒
- require(false, s"Unsupported operator $op")
- null
- }
- }
- case OpAB(OpAt(a), b) if a == b ⇒ OpAtA(pass1(a))
- case OpABAnyKey(OpAtAnyKey(a), b) if a == b ⇒ OpAtA(pass1(a))
-
- // A small rule change: Now that we have removed ClassTag at the %*% operation, it doesn't
- // match b[Int] case automatically any longer. So, we need to check and rewrite it dynamically
- // and re-run pass1 again on the obtained tree.
- case OpABAnyKey(a, b) if b.keyClassTag == ClassTag.Int ⇒ pass1(OpAB(a, b.asInstanceOf[DrmLike[Int]]))
- case OpAtAnyKey(a) if a.keyClassTag == ClassTag.Int ⇒ pass1(OpAt(a.asInstanceOf[DrmLike[Int]]))
-
- // For now, rewrite left-multiply via transpositions, i.e.
- // inCoreA %*% B = (B' %*% inCoreA')'
- case op@OpTimesLeftMatrix(a, b) ⇒
- OpAt(OpTimesRightMatrix(A = OpAt(pass1(b)), right = a.t))
-
- // Add vertical row index concatenation for rbind() on DrmLike[Int] fragments
- case op@OpRbind(a, b) if op.keyClassTag == ClassTag.Int ⇒
-
- // Make sure closure sees only local vals, not attributes. We need to do these ugly casts
- // around because compiler could not infer that K is the same as Int, based on if() above.
- val ma = safeToNonNegInt(a.nrow)
- val bAdjusted = new OpMapBlock[Int, Int](A = pass1(b.asInstanceOf[DrmLike[Int]]), bmf = {
- case (keys, block) ⇒ keys.map(_ + ma) → block
- }, identicallyPartitioned = false)
- val aAdjusted = a.asInstanceOf[DrmLike[Int]]
- OpRbind(pass1(aAdjusted), bAdjusted).asInstanceOf[DrmLike[K]]
-
- // Stop at checkpoints
- case cd: CheckpointedDrm[_] ⇒ action
-
- // For everything else we just pass-thru the operator arguments to optimizer
- case uop: AbstractUnaryOp[_, K] ⇒
- uop.A = pass1(uop.A)
- uop
-
- case bop: AbstractBinaryOp[_, _, K] ⇒
- bop.A = pass1(bop.A)
- bop.B = pass1(bop.B)
- bop
- }
- }
-
- /** This would remove stuff like A.t.t that previous step may have created */
- private def pass2[K](action: DrmLike[K]): DrmLike[K] = {
- action match {
-
- // Fusion of unary funcs into a single one, like 1 + x * x.
- // Since we repeat the pass over self after the rewrite, we don't need to descend into the
- // arguments recursively here.
- case op2@OpAewUnaryFunc(op1@OpAewUnaryFunc(a, _, _), _, _) ⇒
- pass2(OpAewUnaryFuncFusion(a, op1 :: op2 :: Nil))
-
- // Fusion one step further, like 1 + 2 * x * x. All should be rewritten as one UnaryFuncFusion.
- // Since we repeat the pass over self after the rewrite, we don't need to descend into the
- // arguments recursively here.
- case op@OpAewUnaryFuncFusion(op2@OpAewUnaryFunc(a, _, _), _) ⇒
- pass2(OpAewUnaryFuncFusion(a, op.ff :+ op2))
-
- // A.t.t => A
- case OpAt(op@OpAt(a)) ⇒ pass2(a)
-
- // Stop at checkpoints
- case cd: CheckpointedDrm[_] ⇒ action
-
- // For everything else we just pass-thru the operator arguments to optimizer
- case uop: AbstractUnaryOp[_, K] ⇒
- uop.A = pass2(uop.A)
- uop
- case bop: AbstractBinaryOp[_, _, K] ⇒
- bop.A = pass2(bop.A)
- bop.B = pass2(bop.B)
- bop
- }
- }
-
- /** Some further rewrites that are conditioned on A.t.t removal */
- private def pass3[K](action: DrmLike[K]): DrmLike[K] = {
- action match {
-
- // matrix products.
- case OpAB(a, OpAt(b)) ⇒ OpABt(pass3(a), pass3(b))
-
- // AtB cases that make sense.
- case OpAB(OpAt(a), b) if a.partitioningTag == b.partitioningTag ⇒ OpAtB(pass3(a), pass3(b))
- case OpABAnyKey(OpAtAnyKey(a), b) ⇒ OpAtB(pass3(a), pass3(b))
-
- // Need some cost to choose between the following.
-
- case OpAB(OpAt(a), b) ⇒ OpAtB(pass3(a), pass3(b))
- // case OpAB(OpAt(a), b) => OpAt(OpABt(OpAt(pass1(b)), pass1(a)))
- case OpAB(a, b) ⇒ OpABt(pass3(a), OpAt(pass3(b)))
-
- // Rewrite A'x
- case op@OpAx(op1@OpAt(a), x) ⇒ OpAtx(pass3(a), x)
-
- // Stop at checkpoints
- case cd: CheckpointedDrm[_] ⇒ action
-
- // For everything else we just pass-thru the operator arguments to optimizer
- case uop: AbstractUnaryOp[_, K] ⇒
- uop.A = pass3(uop.A)
- uop
- case bop: AbstractBinaryOp[_, _, K] ⇒
- bop.A = pass3(bop.A)
- bop.B = pass3(bop.B)
- bop
- }
- }
-
-}
\ No newline at end of file
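
To make the pass1 rewrites above concrete: the user-level expression drmA.t %*% drmA matches the OpAB(OpAt(a), b) case with a == b and collapses into the single fused OpAtA operator, so the Gramian never materializes an explicit transpose. A sketch of the triggering expression (names illustrative):

import org.apache.mahout.math.drm._
import RLikeDrmOps._

// pass1 rewrites this self-product into OpAtA(drmA); no explicit A' is materialized.
def gramian[K](drmA: DrmLike[K]): DrmLike[Int] = drmA.t %*% drmA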

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/drm/DrmDoubleScalarOps.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/DrmDoubleScalarOps.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/DrmDoubleScalarOps.scala
deleted file mode 100644
index de03776..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/DrmDoubleScalarOps.scala
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.drm
-
-import org.apache.mahout.math.drm.RLikeDrmOps._
-import org.apache.mahout.math.drm.logical.OpCbindScalar
-
-import scala.reflect.ClassTag
-
-class DrmDoubleScalarOps(val x:Double) extends AnyVal{
-
- def +[K:ClassTag](that:DrmLike[K]) = that + x
-
- def *[K:ClassTag](that:DrmLike[K]) = that * x
-
- def -[K:ClassTag](that:DrmLike[K]) = x -: that
-
- def /[K:ClassTag](that:DrmLike[K]) = x /: that
-
- def cbind[K: ClassTag](that: DrmLike[K]) = OpCbindScalar(A = that, x = x, leftBind = true)
-
-}
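
These AnyVal wrappers let scalars sit on the left of DRM expressions; a sketch (the optimizer's pass2 would then fuse the resulting element-wise ops, as in the 1 + 2 * x * x comment above):

import scala.reflect.ClassTag
import org.apache.mahout.math.drm._
import RLikeDrmOps._

def affine[K: ClassTag](drmA: DrmLike[K]): DrmLike[K] =
  1.0 + 2.0 * drmA   // scalar-on-the-left forms, delegating to +: and *: on the DRM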

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/drm/DrmLike.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/DrmLike.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/DrmLike.scala
deleted file mode 100644
index 23f5fc6..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/DrmLike.scala
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.drm
-
-import scala.reflect.ClassTag
-
-/**
- *
- * Basic DRM trait.
- *
- * Since we already call the package "sparkbindings", I will not use stem "spark" with classes in
- * this package. Spark backing is already implied.
- *
- */
-trait DrmLike[K] {
-
- protected[mahout] def partitioningTag: Long
-
- protected[mahout] def canHaveMissingRows: Boolean
-
- /**
- * Distributed context; can be implicitly converted to operations on
- * [[org.apache.mahout.math.drm.DistributedEngine]].
- */
- val context:DistributedContext
-
- /** R-like syntax for number of rows. */
- def nrow: Long
-
- /** R-like syntax for number of columns */
- def ncol: Int
-
- /**
- * Explicit extraction of key class Tag since traits don't support context bound access; but actual
- * implementation knows it
- */
- def keyClassTag: ClassTag[K]
-
- /**
- * Action operator -- does not necessarily mean a Spark action, but does mean running the BLAS
- * optimizer and writing down the Spark graph lineage since the last checkpointed DRM.
- */
- def checkpoint(cacheHint: CacheHint.CacheHint = CacheHint.MEMORY_ONLY): CheckpointedDrm[K]
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/drm/DrmLikeOps.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/DrmLikeOps.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/DrmLikeOps.scala
deleted file mode 100644
index 43b4f56..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/DrmLikeOps.scala
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.drm
-
-import scala.reflect.ClassTag
-import org.apache.mahout.math.scalabindings._
-import org.apache.mahout.math.drm.logical.{OpAewUnaryFunc, OpPar, OpMapBlock, OpRowRange}
-
-/** Common Drm ops */
-class DrmLikeOps[K](protected[drm] val drm: DrmLike[K]) {
-
- /**
- * Parallelism adjustments. <P/>
- *
- * Change only one of parameters from default value to choose new parallelism adjustment strategy.
- * <P/>
- *
- * E.g. use
- * <pre>
- * drmA.par(auto = true)
- * </pre>
- * to use automatic parallelism adjustment.
- * <P/>
- *
- * Parallelism in this API is a fairly abstract concept, and the actual value interpretation is
- * left to a particular backend strategy. However, it is usually equivalent to the number of map
- * tasks or data splits.
- * <P/>
- *
- * @param min If changed from default, ensures the product has at least that much parallelism.
- * @param exact if changed from default, ensures the pipeline product has exactly that much
- * parallelism.
- * @param auto If changed from default, engine-specific automatic parallelism adjustment strategy
- * is applied.
- */
- def par(min: Int = -1, exact: Int = -1, auto: Boolean = false) = {
- require(min > 0 || exact > 0 || auto, "Invalid argument")
- OpPar(drm, minSplits = min, exactSplits = exact)
- }
-
- /**
- * Map matrix block-wise vertically. Blocks of the new matrix can be modified original block
- * matrices, or they can be completely new matrices with a new key set. In the latter case, the
- * output matrix width must be specified with the <code>ncol</code> parameter.<P>
- *
- * New blocks must have the same height as the original geometry.<P>
- *
- * @param ncol new matrix' width (only needed if width changes).
- * @param bmf
- * @tparam R
- * @return
- */
- def mapBlock[R: ClassTag](ncol: Int = -1, identicallyPartitioned: Boolean = true)
- (bmf: BlockMapFunc[K, R]): DrmLike[R] =
- new OpMapBlock[K, R](
- A = drm,
- bmf = bmf,
- _ncol = ncol,
- identicallyPartitioned = identicallyPartitioned
- )
-
- /**
- * Slicing the DRM. Should eventually work just like in-core drm (e.g. A(0 until 5, 5 until 15)).<P>
- *
- * The all-range is denoted by '::', e.g.: A(::, 0 until 5).<P>
- *
- * Row range is currently unsupported except for the all-range. Once fully supported,
- * the input must be Int-keyed, i.e. of DrmLike[Int] type, for non-all-range specifications.
- *
- * @param rowRange Row range. This must be '::' (all-range) unless matrix rows are keyed by Int key.
- * @param colRange col range. Must be a sub-range of <code>0 until ncol</code>. '::' denotes all-range.
- */
- def apply(rowRange: Range, colRange: Range): DrmLike[K] = {
-
- import RLikeDrmOps._
- import RLikeOps._
-
- implicit val ktag = drm.keyClassTag
-
- val rowSrc: DrmLike[K] = if (rowRange != ::) {
-
- if (ClassTag.Int == ktag) {
-
- assert(rowRange.head >= 0 && rowRange.last < drm.nrow, "rows range out of range")
- val intKeyed = drm.asInstanceOf[DrmLike[Int]]
-
- new OpRowRange(A = intKeyed, rowRange = rowRange).asInstanceOf[DrmLike[K]]
-
- } else throw new IllegalArgumentException("non-all row range is only supported for Int-keyed DRMs.")
-
- } else drm
-
- if (colRange != ::) {
-
- assert(colRange.head >= 0 && colRange.last < drm.ncol, "col range out of range")
-
- // Use mapBlock operator to do in-core subranging.
- rowSrc.mapBlock(ncol = colRange.length)({
- case (keys, block) => keys -> block(::, colRange)
- })
-
- } else rowSrc
- }
-
- /**
- * Apply a function element-wise.
- *
- * @param f element-wise function
- * @param evalZeros Do we have to process zero elements? true, false, auto: if auto, we will test
- * the supplied function for `f(0) != 0`, and depending on the result, will
- * decide if we want evaluation for zero elements. WARNING: the AUTO setting
- * may not always work correctly for functions that are meant to run in a specific
- * backend context, or non-deterministic functions, such as {-1,0,1} random
- * generators.
- * @return new DRM with the element-wise function applied.
- */
- def apply(f: Double ⇒ Double, evalZeros: AutoBooleanEnum.T = AutoBooleanEnum.AUTO) = {
- val ezeros = evalZeros match {
- case AutoBooleanEnum.TRUE ⇒ true
- case AutoBooleanEnum.FALSE ⇒ false
- case AutoBooleanEnum.AUTO ⇒ f(0) != 0
- }
- new OpAewUnaryFunc[K](drm, f, ezeros)
- }
-}
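
A sketch combining the slicing and element-wise apply operators above. The all-range `::` comes from scalabindings; the sigmoid is an illustrative f with f(0) != 0, so zero elements are evaluated, and drmA is assumed to have at least two columns:

import org.apache.mahout.math.scalabindings._
import org.apache.mahout.math.drm._
import RLikeDrmOps._

def sigmoidOfFirstCols(drmA: DrmLike[Int]): DrmLike[Int] = {
  val sliced = drmA(::, 0 until 2)              // column slice via mapBlock under the hood
  sliced(x => 1.0 / (1.0 + math.exp(-x)))       // evalZeros = AUTO detects f(0) != 0
}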

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/drm/RLikeDrmOps.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/RLikeDrmOps.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/RLikeDrmOps.scala
deleted file mode 100644
index 8bea741..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/RLikeDrmOps.scala
+++ /dev/null
@@ -1,172 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.drm
-
-import scala.reflect.ClassTag
-import collection._
-import JavaConversions._
-import org.apache.mahout.math.{Vector, Matrix}
-import org.apache.mahout.math.drm.logical._
-import org.apache.mahout.math.scalabindings._
-import RLikeOps._
-
-class RLikeDrmOps[K](drm: DrmLike[K]) extends DrmLikeOps[K](drm) {
-
- import RLikeDrmOps._
- import org.apache.mahout.math.scalabindings._
-
- def +(that: DrmLike[K]): DrmLike[K] = OpAewB[K](A = this, B = that, op = "+")
-
- def -(that: DrmLike[K]): DrmLike[K] = OpAewB[K](A = this, B = that, op = "-")
-
- def *(that: DrmLike[K]): DrmLike[K] = OpAewB[K](A = this, B = that, op = "*")
-
- def /(that: DrmLike[K]): DrmLike[K] = OpAewB[K](A = this, B = that, op = "/")
-
- def +(that: Double): DrmLike[K] = OpAewUnaryFunc[K](A = this, f = _ + that, evalZeros = true)
-
- def +:(that: Double): DrmLike[K] = OpAewUnaryFunc[K](A = this, f = that + _, evalZeros = true)
-
- def -(that: Double): DrmLike[K] = OpAewUnaryFunc[K](A = this, f = _ - that, evalZeros = true)
-
- def -:(that: Double): DrmLike[K] = OpAewUnaryFunc[K](A = this, f = that - _, evalZeros = true)
-
- def *(that: Double): DrmLike[K] = OpAewUnaryFunc[K](A = this, f = _ * that)
-
- def *:(that: Double): DrmLike[K] = OpAewUnaryFunc[K](A = this, f = that * _)
-
- def ^(that: Double): DrmLike[K] = that match {
- // Special handling of x ^2 and x ^ 0.5: we want consistent handling of x ^ 2 and x * x since
- // pow(x,2) function return results different from x * x; but much of the code uses this
- // interchangeably. Not having this done will create things like NaN entries on main diagonal
- // of a distance matrix.
- case 2.0 ⇒ OpAewUnaryFunc[K](A = this, f = x ⇒ x * x)
- case 0.5 ⇒ OpAewUnaryFunc[K](A = this, f = math.sqrt _)
- case _ ⇒ OpAewUnaryFunc[K](A = this, f = math.pow(_, that))
- }
-
- def /(that: Double): DrmLike[K] = OpAewUnaryFunc[K](A = this, f = _ / that, evalZeros = that == 0.0)
-
- def /:(that: Double): DrmLike[K] = OpAewUnaryFunc[K](A = this, f = that / _, evalZeros = true)
-
- def :%*%[B](that: DrmLike[B]): DrmLike[K] = OpABAnyKey[B,K](A = this.drm, B=that)
-
- def %*%[B](that: DrmLike[B]): DrmLike[K] = this :%*% that
-
- def :%*%(that: Matrix): DrmLike[K] = OpTimesRightMatrix[K](A = this.drm, right = that)
-
- def %*%(that: Matrix): DrmLike[K] = this :%*% that
-
- def :%*%(that: Vector): DrmLike[K] = OpAx(A = this.drm, x = that)
-
- def %*%(that: Vector): DrmLike[K] = :%*%(that)
-
- def t: DrmLike[Int] = OpAtAnyKey(A = drm)
-
- def cbind(that: DrmLike[K]): DrmLike[K] = OpCbind(A = this.drm, B = that)
-
- def cbind(that: Double): DrmLike[K] = OpCbindScalar(A = this.drm, x = that, leftBind = false)
-
- def rbind(that: DrmLike[K]): DrmLike[K] = OpRbind(A = this.drm, B = that)
-
- /**
- * `rowSums` method for non-int keyed matrices.
- *
- * A slight problem here is the limitation of the in-memory representation of Colt's Matrix, which
- * can only have String row labels. Therefore, internally we call ".toString()" on each key object
- * and put it into the [[Matrix]] row label bindings, at which point keys are coerced to Strings.
- *
- * This is obviously suboptimal behavior, so future enhancements of `collect` are a TODO here.
- *
- * @return map of row keys into row sums, front-end collected.
- */
- def rowSumsMap(): Map[String, Double] = {
-
- implicit val ktag = drm.keyClassTag
-
- val m = drm.mapBlock(ncol = 1) { case (keys, block) =>
- keys -> dense(block.rowSums).t
- }.collect
- m.getRowLabelBindings.map { case (key, idx) => key -> m(idx, 0)}
- }
-}
-
-class RLikeDrmIntOps(drm: DrmLike[Int]) extends RLikeDrmOps[Int](drm) {
-
- import org.apache.mahout.math._
- import scalabindings._
- import RLikeDrmOps._
-
- override def t: DrmLike[Int] = OpAt(A = drm)
-
- def %*%:[K: ClassTag](that: DrmLike[K]): DrmLike[K] = OpAB[K](A = that, B = this.drm)
-
- def %*%:(that: Matrix): DrmLike[Int] = OpTimesLeftMatrix(left = that, A = this.drm)
-
- /** Row sums. This is of course applicable to Int-keyed distributed matrices only. */
- def rowSums(): Vector = {
- drm.mapBlock(ncol = 1) { case (keys, block) =>
- // Collect block-wise rowsums and output them as one-column matrix.
- keys -> dense(block.rowSums).t
- }
- .collect(::, 0)
- }
-
- /** Counts the non-zero elements in each row, returning a vector of the counts */
- def numNonZeroElementsPerRow(): Vector = {
- drm.mapBlock(ncol = 1) { case (keys, block) =>
- // Collect block-wise row non-zero counts and output them as a one-column matrix.
- keys -> dense(block.numNonZeroElementsPerRow).t
- }
- .collect(::, 0)
- }
-
- /** Row means */
- def rowMeans(): Vector = {
- drm.mapBlock(ncol = 1) { case (keys, block) =>
- // Collect block-wise row means and output them as one-column matrix.
- keys -> dense(block.rowMeans).t
- }
- .collect(::, 0)
- }
-
- /** Return diagonal vector */
- def diagv: Vector = {
- require(drm.ncol == drm.nrow, "Must be square to extract diagonal")
- drm.mapBlock(ncol = 1) { case (keys, block) =>
- keys -> dense(for (r <- block.view) yield r(keys(r.index))).t
- }
- .collect(::, 0)
- }
-
-}
-
-object RLikeDrmOps {
-
- implicit def double2ScalarOps(x: Double) = new DrmDoubleScalarOps(x)
-
- implicit def drmInt2RLikeOps(drm: DrmLike[Int]): RLikeDrmIntOps = new RLikeDrmIntOps(drm)
-
- implicit def drm2RLikeOps[K](drm: DrmLike[K]): RLikeDrmOps[K] = new RLikeDrmOps[K](drm)
-
- implicit def rlikeOps2Drm[K](ops: RLikeDrmOps[K]): DrmLike[K] = ops.drm
-
- implicit def ops2Drm[K](ops: DrmLikeOps[K]): DrmLike[K] = ops.drm
-
- implicit def drm2cpops[K](drm: DrmLike[K]): CheckpointedOps[K] = new CheckpointedOps(drm)
-}
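
Finally, a sketch of the R-like surface these operators add up to. Names are illustrative, and drmX and drmY are assumed to share key type and geometry for the element-wise ops:

import org.apache.mahout.math.drm._
import RLikeDrmOps._

def rLikeSketch(drmX: DrmLike[Int], drmY: DrmLike[Int]): Unit = {
  val avg = (drmX + drmY) * 0.5      // element-wise average
  val gram = avg.t %*% avg           // A'A, fused by the optimizer
  val withBias = avg cbind 1.0       // append a constant column
  println(gram.ncol)
  println(withBias.rowSums())        // Int-keyed extras: rowSums, rowMeans, diagv
}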
r***@apache.org
2018-06-27 14:51:53 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/text/doc/NumericFieldDocument.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/text/doc/NumericFieldDocument.java b/integration/src/test/java/org/apache/mahout/text/doc/NumericFieldDocument.java
deleted file mode 100644
index e06e8d6..0000000
--- a/integration/src/test/java/org/apache/mahout/text/doc/NumericFieldDocument.java
+++ /dev/null
@@ -1,54 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.mahout.text.doc;
-
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.IntField;
-import org.apache.lucene.document.StringField;
-import org.apache.lucene.document.TextField;
-
-/**
- * Document with numeric field.
- */
-@Deprecated
-public class NumericFieldDocument extends SingleFieldDocument {
-
- public static final String NUMERIC_FIELD = "numeric";
-
- private int numericField;
-
- public NumericFieldDocument(String id, String field, int numericField) {
- super(id, field);
- this.numericField = numericField;
- }
-
- @Override
- public Document asLuceneDocument() {
- Document document = new Document();
-
- document.add(new StringField(ID_FIELD, getId(), Field.Store.YES));
- document.add(new TextField(FIELD, getField(), Field.Store.YES));
- document.add(new IntField(NUMERIC_FIELD, numericField, Field.Store.YES));
-
- return document;
- }
-
- public int getNumericField() {
- return numericField;
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/text/doc/SingleFieldDocument.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/text/doc/SingleFieldDocument.java b/integration/src/test/java/org/apache/mahout/text/doc/SingleFieldDocument.java
deleted file mode 100644
index 4636a51..0000000
--- a/integration/src/test/java/org/apache/mahout/text/doc/SingleFieldDocument.java
+++ /dev/null
@@ -1,63 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.mahout.text.doc;
-
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.StringField;
-import org.apache.lucene.document.TextField;
-
-/**
- * Used for testing lucene2seq
- */
-@Deprecated
-public class SingleFieldDocument implements TestDocument {
-
- public static final String ID_FIELD = "idField";
- public static final String FIELD = "field";
-
- private String id;
- private String field;
-
- public SingleFieldDocument(String id, String field) {
- this.id = id;
- this.field = field;
- }
-
- @Override
- public String getId() {
- return id;
- }
-
- @Override
- public String getField() {
- return field;
- }
-
- @Override
- public Document asLuceneDocument() {
- Document document = new Document();
-
- Field idField = new StringField(ID_FIELD, getId(), Field.Store.YES);
- Field field = new TextField(FIELD, getField(), Field.Store.YES);
-
- document.add(idField);
- document.add(field);
-
- return document;
- }
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/text/doc/TestDocument.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/text/doc/TestDocument.java b/integration/src/test/java/org/apache/mahout/text/doc/TestDocument.java
deleted file mode 100644
index 7243c71..0000000
--- a/integration/src/test/java/org/apache/mahout/text/doc/TestDocument.java
+++ /dev/null
@@ -1,29 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.mahout.text.doc;
-
-import org.apache.lucene.document.Document;
-@Deprecated
-public interface TestDocument {
-
- String getId();
-
- String getField();
-
- Document asLuceneDocument();
-
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/text/doc/UnstoredFieldsDocument.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/text/doc/UnstoredFieldsDocument.java b/integration/src/test/java/org/apache/mahout/text/doc/UnstoredFieldsDocument.java
deleted file mode 100644
index 6eb43f6..0000000
--- a/integration/src/test/java/org/apache/mahout/text/doc/UnstoredFieldsDocument.java
+++ /dev/null
@@ -1,43 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.mahout.text.doc;
-
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.StringField;
-
-/**
- * Used for testing lucene2seq
- */
-@Deprecated
-public class UnstoredFieldsDocument extends SingleFieldDocument {
-
- public static final String UNSTORED_FIELD = "unstored";
-
- public UnstoredFieldsDocument(String id, String field) {
- super(id, field);
- }
-
- @Override
- public Document asLuceneDocument() {
- Document document = super.asLuceneDocument();
-
- document.add(new StringField(UNSTORED_FIELD, "", Field.Store.NO));
-
- return document;
- }
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/utils/Bump125Test.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/utils/Bump125Test.java b/integration/src/test/java/org/apache/mahout/utils/Bump125Test.java
deleted file mode 100644
index 65b308f..0000000
--- a/integration/src/test/java/org/apache/mahout/utils/Bump125Test.java
+++ /dev/null
@@ -1,42 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils;
-
-import com.google.common.collect.Lists;
-
-import org.apache.mahout.common.MahoutTestCase;
-import org.junit.Test;
-
-import java.util.Iterator;
-
-public class Bump125Test extends MahoutTestCase {
- @Test
- public void testIncrement() throws Exception {
- Iterator<Integer> ref = Lists.newArrayList(1, 2, 3, 4, 5, 6, 7,
- 8, 9, 10, 12, 14, 16, 18, 20, 25, 30, 35, 40, 50, 60,
- 70, 80, 100, 120, 140, 160, 180, 200, 250, 300, 350,
- 400, 500, 600, 700, 800, 1000, 1200, 1400, 1600, 1800,
- 2000, 2500, 3000, 3500, 4000, 5000, 6000, 7000)
- .iterator();
- Bump125 b = new Bump125();
- for (int i = 0; i < 50; i++) {
- long x = b.increment();
- assertEquals(ref.next().longValue(), x);
- }
- }
-}
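
Bump125, exercised above, produces the roughly logarithmic 1-2-5 style series visible in the reference list. A hedged sketch of the throttled progress-logging pattern it suits (the record loop and the log message are illustrative):

    import org.apache.mahout.utils.Bump125

    val bump = new Bump125()
    var next = bump.increment()           // 1, 2, 3, ..., 10, 12, 14, ... per the test above
    for (n <- 1 to 1000) {
      if (n == next) {
        println(s"processed $n records")  // fires at 1-2-5 spaced counts, not on every record
        next = bump.increment()
      }
    }
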
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/utils/SplitInputTest.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/utils/SplitInputTest.java b/integration/src/test/java/org/apache/mahout/utils/SplitInputTest.java
deleted file mode 100644
index 7ffa690..0000000
--- a/integration/src/test/java/org/apache/mahout/utils/SplitInputTest.java
+++ /dev/null
@@ -1,418 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils;
-
-import java.io.BufferedWriter;
-import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.io.Writer;
-import java.nio.charset.Charset;
-
-import com.google.common.io.Closeables;
-import org.apache.commons.io.Charsets;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.SequenceFile;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.util.ToolRunner;
-import org.apache.mahout.classifier.ClassifierData;
-import org.apache.mahout.common.MahoutTestCase;
-import org.apache.mahout.common.Pair;
-import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
-import org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterable;
-import org.apache.mahout.math.SequentialAccessSparseVector;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.VectorWritable;
-import org.apache.mahout.math.map.OpenObjectIntHashMap;
-import org.junit.Before;
-import org.junit.Test;
-
-public final class SplitInputTest extends MahoutTestCase {
-
- private OpenObjectIntHashMap<String> countMap;
- private Charset charset;
- private FileSystem fs;
- private Path tempInputFile;
- private Path tempTrainingDirectory;
- private Path tempTestDirectory;
- private Path tempMapRedOutputDirectory;
- private Path tempInputDirectory;
- private Path tempSequenceDirectory;
- private SplitInput si;
-
- @Override
- @Before
- public void setUp() throws Exception {
- Configuration conf = getConfiguration();
- fs = FileSystem.get(conf);
-
- super.setUp();
-
- countMap = new OpenObjectIntHashMap<>();
-
- charset = Charsets.UTF_8;
- tempSequenceDirectory = getTestTempFilePath("tmpsequence");
- tempInputFile = getTestTempFilePath("bayesinputfile");
- tempTrainingDirectory = getTestTempDirPath("bayestrain");
- tempTestDirectory = getTestTempDirPath("bayestest");
- tempMapRedOutputDirectory = new Path(getTestTempDirPath(), "mapRedOutput");
- tempInputDirectory = getTestTempDirPath("bayesinputdir");
-
- si = new SplitInput();
- si.setTrainingOutputDirectory(tempTrainingDirectory);
- si.setTestOutputDirectory(tempTestDirectory);
- si.setInputDirectory(tempInputDirectory);
- }
-
- private void writeMultipleInputFiles() throws IOException {
- Writer writer = null;
- String currentLabel = null;
- try {
- for (String[] entry : ClassifierData.DATA) {
- if (!entry[0].equals(currentLabel)) {
- currentLabel = entry[0];
- Closeables.close(writer, false);
-
- writer = new BufferedWriter(new OutputStreamWriter(fs.create(new Path(tempInputDirectory, currentLabel)),
- Charsets.UTF_8));
- }
- countMap.adjustOrPutValue(currentLabel, 1, 1);
- writer.write(currentLabel + '\t' + entry[1] + '\n');
- }
-    } finally {
- Closeables.close(writer, false);
- }
- }
-
- private void writeSingleInputFile() throws IOException {
- Writer writer = new BufferedWriter(new OutputStreamWriter(fs.create(tempInputFile), Charsets.UTF_8));
- try {
- for (String[] entry : ClassifierData.DATA) {
- writer.write(entry[0] + '\t' + entry[1] + '\n');
- }
- } finally {
- Closeables.close(writer, true);
- }
- }
-
- @Test
- public void testSplitDirectory() throws Exception {
-
- writeMultipleInputFiles();
-
- final int testSplitSize = 1;
- si.setTestSplitSize(testSplitSize);
- si.setCallback(new SplitInput.SplitCallback() {
- @Override
- public void splitComplete(Path inputFile, int lineCount, int trainCount, int testCount, int testSplitStart) {
- int trainingLines = countMap.get(inputFile.getName()) - testSplitSize;
- assertSplit(fs, inputFile, charset, testSplitSize, trainingLines, tempTrainingDirectory, tempTestDirectory);
- }
- });
-
- si.splitDirectory(tempInputDirectory);
- }
-
- @Test
- public void testSplitFile() throws Exception {
- writeSingleInputFile();
- si.setTestSplitSize(2);
- si.setCallback(new TestCallback(2, 10));
- si.splitFile(tempInputFile);
- }
-
- @Test
- public void testSplitFileLocation() throws Exception {
- writeSingleInputFile();
- si.setTestSplitSize(2);
- si.setSplitLocation(50);
- si.setCallback(new TestCallback(2, 10));
- si.splitFile(tempInputFile);
- }
-
- @Test
- public void testSplitFilePct() throws Exception {
- writeSingleInputFile();
- si.setTestSplitPct(25);
-
- si.setCallback(new TestCallback(3, 9));
- si.splitFile(tempInputFile);
- }
-
- @Test
- public void testSplitFilePctLocation() throws Exception {
- writeSingleInputFile();
- si.setTestSplitPct(25);
- si.setSplitLocation(50);
- si.setCallback(new TestCallback(3, 9));
- si.splitFile(tempInputFile);
- }
-
- @Test
- public void testSplitFileRandomSelectionSize() throws Exception {
- writeSingleInputFile();
- si.setTestRandomSelectionSize(5);
-
- si.setCallback(new TestCallback(5, 7));
- si.splitFile(tempInputFile);
- }
-
- @Test
- public void testSplitFileRandomSelectionPct() throws Exception {
- writeSingleInputFile();
- si.setTestRandomSelectionPct(25);
-
- si.setCallback(new TestCallback(3, 9));
- si.splitFile(tempInputFile);
- }
-
- /**
-   * Create a SequenceFile for testing, consisting of IntWritable
- * keys and VectorWritable values
- * @param path path for test SequenceFile
- * @param testPoints number of records in test SequenceFile
- */
- private void writeVectorSequenceFile(Path path, int testPoints) throws IOException {
- Path tempSequenceFile = new Path(path, "part-00000");
- Configuration conf = getConfiguration();
- IntWritable key = new IntWritable();
- VectorWritable value = new VectorWritable();
- try (SequenceFile.Writer writer =
- SequenceFile.createWriter(fs, conf, tempSequenceFile, IntWritable.class, VectorWritable.class)) {
- for (int i = 0; i < testPoints; i++) {
- key.set(i);
- Vector v = new SequentialAccessSparseVector(4);
- v.assign(i);
- value.set(v);
- writer.append(key, value);
- }
- }
- }
-
- /**
-   * Create a SequenceFile for testing, consisting of Text keys and Text values
- * @param path path for test SequenceFile
- * @param testPoints number of records in test SequenceFile
- */
- private void writeTextSequenceFile(Path path, int testPoints) throws IOException {
- Path tempSequenceFile = new Path(path, "part-00000");
- Configuration conf = getConfiguration();
- Text key = new Text();
- Text value = new Text();
-    try (SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, tempSequenceFile, Text.class, Text.class)) {
- for (int i = 0; i < testPoints; i++) {
- key.set(Integer.toString(i));
- value.set("Line " + i);
- writer.append(key, value);
- }
- }
- }
-
- /**
- * Display contents of a SequenceFile
- * @param sequenceFilePath path to SequenceFile
- */
- private void displaySequenceFile(Path sequenceFilePath) throws IOException {
- for (Pair<?,?> record : new SequenceFileIterable<>(sequenceFilePath, true, getConfiguration())) {
- System.out.println(record.getFirst() + "\t" + record.getSecond());
- }
- }
-
- /**
- * Determine number of records in a SequenceFile
- * @param sequenceFilePath path to SequenceFile
- * @return number of records
- */
- private int getNumberRecords(Path sequenceFilePath) throws IOException {
- int numberRecords = 0;
- for (Object value : new SequenceFileValueIterable<>(sequenceFilePath, true, getConfiguration())) {
- numberRecords++;
- }
- return numberRecords;
- }
-
- /**
- * Test map reduce version of split input with Text, Text key value
- * pairs in input
- */
- @Test
- public void testSplitInputMapReduceText() throws Exception {
- writeTextSequenceFile(tempSequenceDirectory, 1000);
- testSplitInputMapReduce(1000);
- }
-
- /** Test map reduce version of split input with Text, Text key value pairs in input called from command line */
- @Test
- public void testSplitInputMapReduceTextCli() throws Exception {
- writeTextSequenceFile(tempSequenceDirectory, 1000);
- testSplitInputMapReduceCli(1000);
- }
-
- /**
- * Test map reduce version of split input with IntWritable, Vector key value
- * pairs in input
- */
- @Test
- public void testSplitInputMapReduceVector() throws Exception {
- writeVectorSequenceFile(tempSequenceDirectory, 1000);
- testSplitInputMapReduce(1000);
- }
-
- /**
- * Test map reduce version of split input with IntWritable, Vector key value
- * pairs in input called from command line
- */
- @Test
- public void testSplitInputMapReduceVectorCli() throws Exception {
- writeVectorSequenceFile(tempSequenceDirectory, 1000);
- testSplitInputMapReduceCli(1000);
- }
-
- /**
- * Test map reduce version of split input through CLI
- */
- private void testSplitInputMapReduceCli(int numPoints) throws Exception {
- int randomSelectionPct = 25;
- int keepPct = 10;
- String[] args =
- { "--method", "mapreduce", "--input", tempSequenceDirectory.toString(),
- "--mapRedOutputDir", tempMapRedOutputDirectory.toString(),
- "--randomSelectionPct", Integer.toString(randomSelectionPct),
- "--keepPct", Integer.toString(keepPct), "-ow" };
- ToolRunner.run(getConfiguration(), new SplitInput(), args);
- validateSplitInputMapReduce(numPoints, randomSelectionPct, keepPct);
- }
-
- /**
- * Test map reduce version of split input through method call
- */
- private void testSplitInputMapReduce(int numPoints) throws Exception {
- int randomSelectionPct = 25;
- si.setTestRandomSelectionPct(randomSelectionPct);
- int keepPct = 10;
- si.setKeepPct(keepPct);
- si.setMapRedOutputDirectory(tempMapRedOutputDirectory);
- si.setUseMapRed(true);
- si.splitDirectory(getConfiguration(), tempSequenceDirectory);
-
- validateSplitInputMapReduce(numPoints, randomSelectionPct, keepPct);
- }
-
- /**
-   * Validate that the number of test records and the number of training records
-   * are consistent with keepPct and randomSelectionPct
- */
- private void validateSplitInputMapReduce(int numPoints, int randomSelectionPct, int keepPct) throws IOException {
- Path testPath = new Path(tempMapRedOutputDirectory, "test-r-00000");
- Path trainingPath = new Path(tempMapRedOutputDirectory, "training-r-00000");
- int numberTestRecords = getNumberRecords(testPath);
- int numberTrainingRecords = getNumberRecords(trainingPath);
- System.out.printf("Test data: %d records\n", numberTestRecords);
- displaySequenceFile(testPath);
- System.out.printf("Training data: %d records\n", numberTrainingRecords);
- displaySequenceFile(trainingPath);
- assertEquals((randomSelectionPct / 100.0) * (keepPct / 100.0) * numPoints,
- numberTestRecords, 2);
- assertEquals(
- (1 - randomSelectionPct / 100.0) * (keepPct / 100.0) * numPoints,
- numberTrainingRecords, 2);
- }
-
- @Test
- public void testValidate() throws Exception {
- SplitInput st = new SplitInput();
- assertValidateException(st);
-
- st.setTestSplitSize(100);
- assertValidateException(st);
-
- st.setTestOutputDirectory(tempTestDirectory);
- assertValidateException(st);
-
- st.setTrainingOutputDirectory(tempTrainingDirectory);
- st.validate();
-
- st.setTestSplitPct(50);
- assertValidateException(st);
-
- st = new SplitInput();
- st.setTestRandomSelectionPct(50);
- st.setTestOutputDirectory(tempTestDirectory);
- st.setTrainingOutputDirectory(tempTrainingDirectory);
- st.validate();
-
- st.setTestSplitPct(50);
- assertValidateException(st);
-
- st = new SplitInput();
- st.setTestRandomSelectionPct(50);
- st.setTestOutputDirectory(tempTestDirectory);
- st.setTrainingOutputDirectory(tempTrainingDirectory);
- st.validate();
-
- st.setTestSplitSize(100);
- assertValidateException(st);
- }
-
- private class TestCallback implements SplitInput.SplitCallback {
- private final int testSplitSize;
- private final int trainingLines;
-
- private TestCallback(int testSplitSize, int trainingLines) {
- this.testSplitSize = testSplitSize;
- this.trainingLines = trainingLines;
- }
-
- @Override
- public void splitComplete(Path inputFile, int lineCount, int trainCount, int testCount, int testSplitStart) {
- assertSplit(fs, tempInputFile, charset, testSplitSize, trainingLines, tempTrainingDirectory, tempTestDirectory);
- }
- }
-
- private static void assertValidateException(SplitInput st) throws IOException {
- try {
- st.validate();
- fail("Expected IllegalArgumentException");
- } catch (IllegalArgumentException iae) {
- // good
- }
- }
-
- private static void assertSplit(FileSystem fs,
- Path tempInputFile,
- Charset charset,
- int testSplitSize,
- int trainingLines,
- Path tempTrainingDirectory,
- Path tempTestDirectory) {
-
- try {
- Path testFile = new Path(tempTestDirectory, tempInputFile.getName());
- //assertTrue("test file exists", testFile.isFile());
- assertEquals("test line count", testSplitSize, SplitInput.countLines(fs, testFile, charset));
-
- Path trainingFile = new Path(tempTrainingDirectory, tempInputFile.getName());
- //assertTrue("training file exists", trainingFile.isFile());
- assertEquals("training line count", trainingLines, SplitInput.countLines(fs, trainingFile, charset));
- } catch (IOException ioe) {
- fail(ioe.toString());
- }
- }
-}
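
One behavioral note this test pins down: SplitInput.validate() insists on exactly one split strategy at a time (test split size, test split percentage, or random selection percentage). A hedged sketch of that contract (the paths are hypothetical):

    import org.apache.hadoop.fs.Path
    import org.apache.mahout.utils.SplitInput

    val si = new SplitInput()
    si.setTestOutputDirectory(new Path("/tmp/bayestest"))       // hypothetical output paths
    si.setTrainingOutputDirectory(new Path("/tmp/bayestrain"))
    si.setTestSplitSize(100)
    si.validate()   // passes: exactly one strategy configured
    si.setTestSplitPct(50)
    si.validate()   // throws IllegalArgumentException: two strategies now set
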
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/utils/email/MailProcessorTest.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/utils/email/MailProcessorTest.java b/integration/src/test/java/org/apache/mahout/utils/email/MailProcessorTest.java
deleted file mode 100644
index c519f85..0000000
--- a/integration/src/test/java/org/apache/mahout/utils/email/MailProcessorTest.java
+++ /dev/null
@@ -1,72 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.email;
-
-import java.io.File;
-import java.io.StringWriter;
-import java.net.URL;
-import java.util.regex.Pattern;
-
-import org.apache.commons.io.Charsets;
-import org.apache.mahout.common.MahoutTestCase;
-import org.junit.Test;
-
-public final class MailProcessorTest extends MahoutTestCase {
-
- @Test
- public void testLabel() throws Exception {
- StringWriter writer = new StringWriter();
- MailOptions options = new MailOptions();
- options.setSeparator(":::");
- options.setCharset(Charsets.UTF_8);
- options.setPatternsToMatch(new Pattern[]{
- MailProcessor.FROM_PREFIX, MailProcessor.SUBJECT_PREFIX, MailProcessor.TO_PREFIX});
- options.setInput(new File(System.getProperty("user.dir")));
- MailProcessor proc = new MailProcessor(options, "", writer);
- URL url = MailProcessorTest.class.getClassLoader().getResource("test.mbox");
- File file = new File(url.toURI());
- long count = proc.parseMboxLineByLine(file);
- assertEquals(7, count);
- }
-
- @Test
- public void testStripQuoted() throws Exception {
- StringWriter writer = new StringWriter();
- MailOptions options = new MailOptions();
- options.setSeparator(":::");
- options.setCharset(Charsets.UTF_8);
- options.setPatternsToMatch(new Pattern[]{
- MailProcessor.SUBJECT_PREFIX});
- options.setInput(new File(System.getProperty("user.dir")));
- options.setIncludeBody(true);
- MailProcessor proc = new MailProcessor(options, "", writer);
- URL url = MailProcessorTest.class.getClassLoader().getResource("test.mbox");
- File file = new File(url.toURI());
- long count = proc.parseMboxLineByLine(file);
- assertEquals(7, count);
- assertTrue(writer.getBuffer().toString().contains("> Cocoon Cron Block Configurable Clustering"));
- writer = new StringWriter();
- proc = new MailProcessor(options, "", writer);
- options.setStripQuotedText(true);
- count = proc.parseMboxLineByLine(file);
- assertEquals(7, count);
- assertFalse(writer.getBuffer().toString().contains("> Cocoon Cron Block Configurable Clustering"));
-
- }
-
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java b/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java
deleted file mode 100644
index 4fdbbbc..0000000
--- a/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java
+++ /dev/null
@@ -1,154 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.mahout.utils.nlp.collocations.llr;
-
-import java.io.IOException;
-import java.io.Reader;
-import java.io.StringReader;
-import java.nio.ByteBuffer;
-import java.nio.CharBuffer;
-import java.nio.charset.CharsetEncoder;
-
-import org.apache.commons.io.Charsets;
-import org.apache.hadoop.util.bloom.BloomFilter;
-import org.apache.hadoop.util.bloom.Filter;
-import org.apache.hadoop.util.bloom.Key;
-import org.apache.hadoop.util.hash.Hash;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
-import org.apache.lucene.analysis.shingle.ShingleFilter;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.mahout.common.MahoutTestCase;
-import org.junit.Test;
-
-public final class BloomTokenFilterTest extends MahoutTestCase {
-
- private static final CharsetEncoder encoder = Charsets.UTF_8.newEncoder();
-
- private static final String input = "The best of times the worst of times";
- private static final String[] allTokens = {
- "The", "best", "of", "times", "the", "worst", "of", "times"
- };
- private static final String[] expectedNonKeepTokens = { "best", "times", "the", "worst", "times" };
- private static final String[] expectedKeepTokens = { "The", "of", "of" };
- private static final String[] filterTokens = { "The", "of" };
- private static final String[] notFilterTokens = { "best", "worst", "the", "times"};
- private static final String[] shingleKeepTokens = {
- "The best", "best of times", "the worst", "worst of times", "of times"
- };
- private static final String[] expectedShingleTokens = {
- "The best", "best of times", "of times", "the worst", "worst of times", "of times"
- };
-
- /** test standalone filter without tokenfilter wrapping */
- @Test
- public void testFilter() throws IOException {
- Filter filter = getFilter(filterTokens);
- Key k = new Key();
- for (String s: filterTokens) {
- setKey(k,s);
- assertTrue("Key for string " + s + " should be filter member", filter.membershipTest(k));
- }
-
- for (String s: notFilterTokens) {
- setKey(k,s);
- assertFalse("Key for string " + s + " should not be filter member", filter.membershipTest(k));
- }
- }
-
- /** normal case, unfiltered analyzer */
- @Test
- public void testAnalyzer() throws IOException {
- Reader reader = new StringReader(input);
- Analyzer analyzer = new WhitespaceAnalyzer();
- TokenStream ts = analyzer.tokenStream(null, reader);
- ts.reset();
- validateTokens(allTokens, ts);
- ts.end();
- ts.close();
- }
-
- /** filtered analyzer */
- @Test
- public void testNonKeepdAnalyzer() throws IOException {
- Reader reader = new StringReader(input);
- Analyzer analyzer = new WhitespaceAnalyzer();
- TokenStream ts = analyzer.tokenStream(null, reader);
- ts.reset();
- TokenStream f = new BloomTokenFilter(getFilter(filterTokens), false /* toss matching tokens */, ts);
- validateTokens(expectedNonKeepTokens, f);
- ts.end();
- ts.close();
- }
-
- /** keep analyzer */
- @Test
- public void testKeepAnalyzer() throws IOException {
- Reader reader = new StringReader(input);
- Analyzer analyzer = new WhitespaceAnalyzer();
- TokenStream ts = analyzer.tokenStream(null, reader);
- ts.reset();
- TokenStream f = new BloomTokenFilter(getFilter(filterTokens), true /* keep matching tokens */, ts);
- validateTokens(expectedKeepTokens, f);
- ts.end();
- ts.close();
- }
-
- /** shingles, keep those matching whitelist */
- @Test
- public void testShingleFilteredAnalyzer() throws IOException {
- Reader reader = new StringReader(input);
- Analyzer analyzer = new WhitespaceAnalyzer();
- TokenStream ts = analyzer.tokenStream(null, reader);
- ts.reset();
- ShingleFilter sf = new ShingleFilter(ts, 3);
- TokenStream f = new BloomTokenFilter(getFilter(shingleKeepTokens), true, sf);
- validateTokens(expectedShingleTokens, f);
- ts.end();
- ts.close();
- }
-
- private static void setKey(Key k, String s) throws IOException {
- ByteBuffer buffer = encoder.encode(CharBuffer.wrap(s.toCharArray()));
- k.set(buffer.array(), 1.0);
- }
-
- private static void validateTokens(String[] expected, TokenStream ts) throws IOException {
- int pos = 0;
- while (ts.incrementToken()) {
- assertTrue("Analyzer produced too many tokens", pos <= expected.length);
- CharTermAttribute termAttr = ts.getAttribute(CharTermAttribute.class);
- assertEquals("Unexpected term", expected[pos++], termAttr.toString());
- }
- assertEquals("Analyzer produced too few terms", expected.length, pos);
- }
-
- private static Filter getFilter(String[] tokens) throws IOException {
-    Filter filter = new BloomFilter(100, 50, Hash.JENKINS_HASH);
- Key k = new Key();
- for (String s: tokens) {
- setKey(k,s);
- filter.add(k);
- }
- return filter;
- }
-
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/utils/regex/RegexMapperTest.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/utils/regex/RegexMapperTest.java b/integration/src/test/java/org/apache/mahout/utils/regex/RegexMapperTest.java
deleted file mode 100644
index 8ab643b..0000000
--- a/integration/src/test/java/org/apache/mahout/utils/regex/RegexMapperTest.java
+++ /dev/null
@@ -1,104 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.regex;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Mapper;
-import org.apache.mahout.common.DummyRecordWriter;
-import org.apache.mahout.common.MahoutTestCase;
-import org.junit.Test;
-
-import java.util.List;
-
-public final class RegexMapperTest extends MahoutTestCase {
-
- @Test
- public void testRegex() throws Exception {
- RegexMapper mapper = new RegexMapper();
- Configuration conf = getConfiguration();
- conf.set(RegexMapper.REGEX, "(?<=(\\?|&)q=).*?(?=&|$)");
- conf.set(RegexMapper.TRANSFORMER_CLASS, URLDecodeTransformer.class.getName());
- DummyRecordWriter<LongWritable, Text> mapWriter = new DummyRecordWriter<>();
- Mapper<LongWritable, Text, LongWritable, Text>.Context mapContext = DummyRecordWriter
- .build(mapper, conf, mapWriter);
-
- mapper.setup(mapContext);
- for (int i = 0; i < RegexUtilsTest.TEST_STRS.length; i++) {
- String testStr = RegexUtilsTest.TEST_STRS[i];
-
- LongWritable key = new LongWritable(i);
- mapper.map(key, new Text(testStr), mapContext);
- List<Text> value = mapWriter.getValue(key);
- if (!RegexUtilsTest.GOLD[i].isEmpty()) {
- assertEquals(1, value.size());
- assertEquals(RegexUtilsTest.GOLD[i], value.get(0).toString());
- }
- }
- }
-
- @Test
- public void testGroups() throws Exception {
- RegexMapper mapper = new RegexMapper();
- Configuration conf = getConfiguration();
- conf.set(RegexMapper.REGEX, "(\\d+)\\.(\\d+)\\.(\\d+)");
- conf.set(RegexMapper.TRANSFORMER_CLASS, URLDecodeTransformer.class.getName());
- conf.setStrings(RegexMapper.GROUP_MATCHERS, "1", "3");
- DummyRecordWriter<LongWritable, Text> mapWriter = new DummyRecordWriter<>();
- Mapper<LongWritable, Text, LongWritable, Text>.Context mapContext = DummyRecordWriter
- .build(mapper, conf, mapWriter);
-
- mapper.setup(mapContext);
- for (int i = 0; i < RegexUtilsTest.TEST_STRS.length; i++) {
- String testStr = RegexUtilsTest.TEST_STRS[i];
-
- LongWritable key = new LongWritable(i);
- mapper.map(key, new Text(testStr), mapContext);
- List<Text> value = mapWriter.getValue(key);
- assertEquals(1, value.size());
- assertEquals("127 0", value.get(0).toString());
- }
- }
-
- @Test
- public void testFPGFormatter() throws Exception {
- RegexMapper mapper = new RegexMapper();
- Configuration conf = getConfiguration();
- conf.set(RegexMapper.REGEX, "(?<=(\\?|&)q=).*?(?=&|$)");
- conf.set(RegexMapper.TRANSFORMER_CLASS, URLDecodeTransformer.class.getName());
- conf.set(RegexMapper.FORMATTER_CLASS, FPGFormatter.class.getName());
- DummyRecordWriter<LongWritable, Text> mapWriter = new DummyRecordWriter<>();
- Mapper<LongWritable, Text, LongWritable, Text>.Context mapContext = DummyRecordWriter
- .build(mapper, conf, mapWriter);
-
- mapper.setup(mapContext);
- RegexFormatter formatter = new FPGFormatter();
- for (int i = 0; i < RegexUtilsTest.TEST_STRS.length; i++) {
- String testStr = RegexUtilsTest.TEST_STRS[i];
-
- LongWritable key = new LongWritable(i);
- mapper.map(key, new Text(testStr), mapContext);
- List<Text> value = mapWriter.getValue(key);
- if (!RegexUtilsTest.GOLD[i].isEmpty()) {
- assertEquals(1, value.size());
- assertEquals(formatter.format(RegexUtilsTest.GOLD[i]), value.get(0).toString());
- }
- }
- }
-}
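
The lookaround regex these tests lean on, (?<=(\?|&)q=).*?(?=&|$), extracts the value of the q= query parameter without capturing its delimiters. A standalone hedged sketch of the same extraction outside the MapReduce plumbing (the sample request line is illustrative):

    import java.net.URLDecoder
    import java.util.regex.Pattern

    val pattern = Pattern.compile("(?<=(\\?|&)q=).*?(?=&|$)")
    val matcher = pattern.matcher("GET /solr/select?q=import+statement&start=1")
    if (matcher.find()) {
      // URLDecodeTransformer in the deleted code performs essentially this decode step
      println(URLDecoder.decode(matcher.group(), "UTF-8"))  // prints: import statement
    }
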
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/utils/regex/RegexUtilsTest.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/utils/regex/RegexUtilsTest.java b/integration/src/test/java/org/apache/mahout/utils/regex/RegexUtilsTest.java
deleted file mode 100644
index 8ae10a5..0000000
--- a/integration/src/test/java/org/apache/mahout/utils/regex/RegexUtilsTest.java
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.regex;
-
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.regex.Pattern;
-
-import org.apache.mahout.common.MahoutTestCase;
-import org.junit.Test;
-
-public final class RegexUtilsTest extends MahoutTestCase {
-
- static final String[] TEST_STRS = {
- "127.0.0.1 - - [01/10/2011:00:01:51 +0000] \"GET /solr/collection1/browse?q=foo&rows=10&wt=json&hl=true&hl.fl=body&hl.fl=content",
- "127.0.0.1 - - [01/10/2011:00:20:58 +0000] \"GET /solr/collection1/browse?q=Using+Solr+Search+RDBMS&fq=%7B%21tag%3Dsource%7D%28%28source%3Alucid+AND+lucid_facet%3A%28site%29%29%29&rows=10",
- "127.0.0.1 - - [01/10/2011:00:21:21 +0000] \"GET /solr/collection1/browse?q=language+detection&start=560&rows=10 HTTP/1.1\" 200 45071",
- "127.0.0.1 - - [01/10/2011:00:21:21 +0000] \"GET /solr/collection1/browse?q=&start=560&rows=10 HTTP/1.1\" 200 45071"
- };
- static final String[] GOLD = {"foo", "Using Solr Search RDBMS", "language detection", ""};
-
- @Test
- public void testExtract() throws Exception {
- Pattern pattern = Pattern.compile("(?<=(\\?|&)q=).*?(?=&|$)");
- String line = "127.0.0.1 - - [24/05/2010:01:19:22 +0000] \"GET /solr/select?q=import statement&start=1 HTTP/1.1\" 200 37571";
- String res = RegexUtils.extract(line, pattern, Collections.<Integer>emptyList(), " ", RegexUtils.IDENTITY_TRANSFORMER);
- assertEquals(res, "import statement", res);
-
- for (int i = 0; i < TEST_STRS.length; i++) {
- String testStr = TEST_STRS[i];
- res = RegexUtils.extract(testStr, pattern, Collections.<Integer>emptyList(), " ", new URLDecodeTransformer());
- assertEquals(GOLD[i], res);
- }
-
- pattern = Pattern.compile("((?<=(\\?|&)q=)(.*?)(?=(&|$))|(?<=((\\?|&)start=))(\\d+))");
- res = RegexUtils.extract(line, pattern, Collections.<Integer>emptyList(), " ", RegexUtils.IDENTITY_TRANSFORMER);
- assertEquals(res, "import statement 1", res);
-
- pattern = Pattern.compile("(start=1) HTTP");
- Collection<Integer> groupsToKeep = new ArrayList<>();
- groupsToKeep.add(1);
- res = RegexUtils.extract(line, pattern, groupsToKeep, " ", RegexUtils.IDENTITY_TRANSFORMER);
- assertEquals(res, "start=1", res);
- }
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/utils/vectors/RandomVectorIterable.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/utils/vectors/RandomVectorIterable.java b/integration/src/test/java/org/apache/mahout/utils/vectors/RandomVectorIterable.java
deleted file mode 100644
index 2ddce14..0000000
--- a/integration/src/test/java/org/apache/mahout/utils/vectors/RandomVectorIterable.java
+++ /dev/null
@@ -1,73 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors;
-
-import java.util.Iterator;
-import java.util.Random;
-
-import com.google.common.base.Function;
-import com.google.common.collect.Iterators;
-import org.apache.mahout.common.RandomUtils;
-import org.apache.mahout.common.iterator.CountingIterator;
-import org.apache.mahout.math.DenseVector;
-import org.apache.mahout.math.RandomAccessSparseVector;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.function.DoubleFunction;
-
-public final class RandomVectorIterable implements Iterable<Vector> {
-
- public enum VectorType {DENSE, SPARSE}
-
- private final int numItems;
- private final VectorType type;
-
- public RandomVectorIterable() {
- this(100, VectorType.SPARSE);
- }
-
- public RandomVectorIterable(int numItems) {
- this(numItems, VectorType.SPARSE);
- }
-
- public RandomVectorIterable(int numItems, VectorType type) {
- this.numItems = numItems;
- this.type = type;
- }
-
- @Override
- public Iterator<Vector> iterator() {
- return Iterators.transform(
- new CountingIterator(numItems),
- new Function<Integer, Vector>() {
- private final Random random = RandomUtils.getRandom();
- @Override
- public Vector apply(Integer dummy) {
- Vector result =
- type == VectorType.SPARSE ? new RandomAccessSparseVector(numItems) : new DenseVector(numItems);
- result.assign(new DoubleFunction() {
- @Override
- public double apply(double ignored) {
- return random.nextDouble();
- }
- });
- return result;
- }
- });
- }
-
-}
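
A hedged aside: the Guava Function and Iterators.transform plumbing above predates Java 8 lambdas; against the same Mahout types the generator shrinks to a few lines of Scala (function and parameter names here are illustrative):

    import java.util.Random
    import org.apache.mahout.common.RandomUtils
    import org.apache.mahout.math.function.DoubleFunction
    import org.apache.mahout.math.{DenseVector, RandomAccessSparseVector, Vector}

    def randomVectors(numItems: Int, sparse: Boolean): Iterator[Vector] = {
      val random: Random = RandomUtils.getRandom
      Iterator.fill(numItems) {
        val v: Vector =
          if (sparse) new RandomAccessSparseVector(numItems) else new DenseVector(numItems)
        v.assign(new DoubleFunction {
          override def apply(ignored: Double): Double = random.nextDouble()
        })
        v
      }
    }
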
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/utils/vectors/VectorHelperTest.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/utils/vectors/VectorHelperTest.java b/integration/src/test/java/org/apache/mahout/utils/vectors/VectorHelperTest.java
deleted file mode 100644
index c55fd8d..0000000
--- a/integration/src/test/java/org/apache/mahout/utils/vectors/VectorHelperTest.java
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors;
-
-import java.util.Random;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.SequenceFile;
-import org.apache.hadoop.io.Text;
-import org.apache.mahout.common.MahoutTestCase;
-import org.apache.mahout.common.RandomUtils;
-import org.apache.mahout.math.SequentialAccessSparseVector;
-import org.apache.mahout.math.Vector;
-import org.junit.Before;
-import org.junit.Test;
-
-public final class VectorHelperTest extends MahoutTestCase {
-
- private static final int NUM_DOCS = 100;
-
- private Path inputPathOne;
- private Path inputPathTwo;
-
- private Configuration conf;
-
- @Override
- @Before
- public void setUp() throws Exception {
- super.setUp();
- conf = getConfiguration();
-
- inputPathOne = getTestTempFilePath("documents/docs-one.file");
- FileSystem fs = FileSystem.get(inputPathOne.toUri(), conf);
- try (SequenceFile.Writer writer =
- new SequenceFile.Writer(fs, conf, inputPathOne, Text.class, IntWritable.class)) {
- Random rd = RandomUtils.getRandom();
- for (int i = 0; i < NUM_DOCS; i++) {
- // Make all indices higher than dictionary size
- writer.append(new Text("Document::ID::" + i), new IntWritable(NUM_DOCS + rd.nextInt(NUM_DOCS)));
- }
- }
-
- inputPathTwo = getTestTempFilePath("documents/docs-two.file");
- fs = FileSystem.get(inputPathTwo.toUri(), conf);
- try (SequenceFile.Writer writer =
- new SequenceFile.Writer(fs, conf, inputPathTwo, Text.class, IntWritable.class)) {
- Random rd = RandomUtils.getRandom();
- for (int i = 0; i < NUM_DOCS; i++) {
- // Keep indices within number of documents
- writer.append(new Text("Document::ID::" + i), new IntWritable(rd.nextInt(NUM_DOCS)));
- }
- }
- }
-
- @Test
- public void testJsonFormatting() throws Exception {
- Vector v = new SequentialAccessSparseVector(10);
- v.set(2, 3.1);
- v.set(4, 1.0);
- v.set(6, 8.1);
- v.set(7, -100);
- v.set(9, 12.2);
- String UNUSED = "UNUSED";
- String[] dictionary = {
- UNUSED, UNUSED, "two", UNUSED, "four", UNUSED, "six", "seven", UNUSED, "nine"
- };
-
- assertEquals("sorted json form incorrect: ", "{nine:12.2,six:8.1,two:3.1}",
- VectorHelper.vectorToJson(v, dictionary, 3, true));
- assertEquals("unsorted form incorrect: ", "{two:3.1,four:1.0}",
- VectorHelper.vectorToJson(v, dictionary, 2, false));
- assertEquals("sorted json form incorrect: ", "{nine:12.2,six:8.1,two:3.1,four:1.0}",
- VectorHelper.vectorToJson(v, dictionary, 4, true));
- assertEquals("sorted json form incorrect: ", "{nine:12.2,six:8.1,two:3.1,four:1.0,seven:-100.0}",
- VectorHelper.vectorToJson(v, dictionary, 5, true));
- assertEquals("sorted json form incorrect: ", "{nine:12.2,six:8.1}",
- VectorHelper.vectorToJson(v, dictionary, 2, true));
- assertEquals("unsorted form incorrect: ", "{two:3.1,four:1.0}",
- VectorHelper.vectorToJson(v, dictionary, 2, false));
- }
-
- @Test
- public void testTopEntries() throws Exception {
- Vector v = new SequentialAccessSparseVector(10);
- v.set(2, 3.1);
- v.set(4, 1.0);
- v.set(6, 8.1);
- v.set(7, -100);
- v.set(9, 12.2);
- v.set(1, 0.0);
- v.set(3, 0.0);
- v.set(8, 2.7);
-    // check if sizeOfNonZeroElementsInVector == maxEntries
- assertEquals(6, VectorHelper.topEntries(v, 6).size());
- // check if sizeOfNonZeroElementsInVector < maxEntries
- assertTrue(VectorHelper.topEntries(v, 9).size() < 9);
- // check if sizeOfNonZeroElementsInVector > maxEntries
- assertTrue(VectorHelper.topEntries(v, 5).size() < v.getNumNonZeroElements());
- }
-
- @Test
- public void testTopEntriesWhenAllZeros() throws Exception {
- Vector v = new SequentialAccessSparseVector(10);
- v.set(2, 0.0);
- v.set(4, 0.0);
- v.set(6, 0.0);
- v.set(7, 0);
- v.set(9, 0.0);
- v.set(1, 0.0);
- v.set(3, 0.0);
- v.set(8, 0.0);
- assertEquals(0, VectorHelper.topEntries(v, 6).size());
- }
-
- @Test
- public void testLoadTermDictionary() throws Exception {
- // With indices higher than dictionary size
- VectorHelper.loadTermDictionary(conf, inputPathOne.toString());
- // With dictionary size higher than indices
- VectorHelper.loadTermDictionary(conf, inputPathTwo.toString());
- }
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFTypeTest.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFTypeTest.java b/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFTypeTest.java
deleted file mode 100644
index 2ea8b89..0000000
--- a/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFTypeTest.java
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.arff;
-
-import org.apache.mahout.common.MahoutTestCase;
-import org.junit.Test;
-
-public final class ARFFTypeTest extends MahoutTestCase {
-
- @Test
- public void removeQuotes() {
- assertNull(ARFFType.removeQuotes(null));
- assertEquals("", ARFFType.removeQuotes("\"\""));
- assertEquals("", ARFFType.removeQuotes("''"));
- assertEquals("", ARFFType.removeQuotes(""));
- assertEquals("", ARFFType.removeQuotes(" "));
- assertEquals("single", ARFFType.removeQuotes("'single'"));
- assertEquals("double", ARFFType.removeQuotes("\"double\""));
- assertEquals("trim", ARFFType.removeQuotes(" trim "));
- }
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java b/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java
deleted file mode 100644
index 4c7f17a..0000000
--- a/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java
+++ /dev/null
@@ -1,289 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.arff;
-
-import java.io.IOException;
-import java.text.DateFormat;
-import java.text.SimpleDateFormat;
-import java.util.Date;
-import java.util.Iterator;
-import java.util.Locale;
-import java.util.Map;
-
-import com.google.common.io.Resources;
-import org.apache.commons.io.Charsets;
-import org.apache.mahout.common.MahoutTestCase;
-import org.apache.mahout.math.DenseVector;
-import org.apache.mahout.math.RandomAccessSparseVector;
-import org.apache.mahout.math.Vector;
-import org.junit.Test;
-
-public final class ARFFVectorIterableTest extends MahoutTestCase {
-
- @Test
- public void testValues() throws Exception {
- ARFFVectorIterable iterable = readModelFromResource("sample.arff");
-
- assertEquals("Mahout", iterable.getModel().getRelation());
- Map<String, Integer> bindings = iterable.getModel().getLabelBindings();
- assertNotNull(bindings);
- assertEquals(5, bindings.size());
- Iterator<Vector> iter = iterable.iterator();
- assertTrue(iter.hasNext());
- Vector next = iter.next();
- assertNotNull(next);
- assertTrue("Wrong instanceof", next instanceof DenseVector);
- assertEquals(1.0, next.get(0), EPSILON);
- assertEquals(2.0, next.get(1), EPSILON);
- assertTrue(iter.hasNext());
- next = iter.next();
- assertNotNull(next);
- assertTrue("Wrong instanceof", next instanceof DenseVector);
- assertEquals(2.0, next.get(0), EPSILON);
- assertEquals(3.0, next.get(1), EPSILON);
-
- assertTrue(iter.hasNext());
- next = iter.next();
- assertNotNull(next);
- assertTrue("Wrong instanceof", next instanceof RandomAccessSparseVector);
- assertEquals(5.0, next.get(0), EPSILON);
- assertEquals(23.0, next.get(1), EPSILON);
-
- assertFalse(iter.hasNext());
- }
-
- @Test
- public void testDense() throws Exception {
- Iterable<Vector> iterable = readModelFromResource("sample-dense.arff");
- Vector firstVector = iterable.iterator().next();
- assertEquals(1.0, firstVector.get(0), 0);
- assertEquals(65.0, firstVector.get(1), 0);
- assertEquals(1.0, firstVector.get(3), 0);
- assertEquals(1.0, firstVector.get(4), 0);
-
- int count = 0;
- for (Vector vector : iterable) {
- assertTrue("Vector is not dense", vector instanceof DenseVector);
- count++;
- }
- assertEquals(5, count);
- }
-
- @Test
- public void testSparse() throws Exception {
- Iterable<Vector> iterable = readModelFromResource("sample-sparse.arff");
-
- Vector firstVector = iterable.iterator().next();
- assertEquals(23.1, firstVector.get(1), 0);
- assertEquals(3.23, firstVector.get(2), 0);
- assertEquals(1.2, firstVector.get(3), 0);
-
- int count = 0;
- for (Vector vector : iterable) {
- assertTrue("Vector is not dense", vector instanceof RandomAccessSparseVector);
- count++;
- }
- assertEquals(9, count);
- }
-
- @Test
- public void testNonNumeric() throws Exception {
- MapBackedARFFModel model = new MapBackedARFFModel();
- ARFFVectorIterable iterable = getVectors("non-numeric-1.arff", model);
- int count = 0;
- for (Vector vector : iterable) {
- assertTrue("Vector is not dense", vector instanceof RandomAccessSparseVector);
- count++;
- }
-
- iterable = getVectors("non-numeric-1.arff", model);
- Iterator<Vector> iter = iterable.iterator();
- Vector firstVector = iter.next();
-
- assertEquals(1.0, firstVector.get(2), 0);
-
- assertEquals(10, count);
- Map<String, Map<String, Integer>> nominalMap = iterable.getModel().getNominalMap();
- assertNotNull(nominalMap);
- assertEquals(1, nominalMap.size());
- Map<String, Integer> noms = nominalMap.get("bar");
- assertNotNull("nominals for bar are null", noms);
- assertEquals(5, noms.size());
- Map<Integer, ARFFType> integerARFFTypeMap = model.getTypeMap();
- assertNotNull("Type map null", integerARFFTypeMap);
- assertEquals(5, integerARFFTypeMap.size());
- Map<String, Long> words = model.getWords();
- assertNotNull("words null", words);
- assertEquals(10, words.size());
- Map<Integer, DateFormat> integerDateFormatMap = model.getDateMap();
- assertNotNull("date format null", integerDateFormatMap);
- assertEquals(1, integerDateFormatMap.size());
- }
-
- @Test
- public void testDate() throws Exception {
- ARFFVectorIterable iterable = readModelFromResource("date.arff");
- Iterator<Vector> iter = iterable.iterator();
- Vector firstVector = iter.next();
-
- DateFormat format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", Locale.ENGLISH);
- Date date = format.parse("2001-07-04T12:08:56");
- long result = date.getTime();
- assertEquals(result, firstVector.get(1), 0);
-
- format = new SimpleDateFormat("yyyy.MM.dd G 'at' HH:mm:ss z", Locale.ENGLISH);
- date = format.parse("2001.07.04 AD at 12:08:56 PDT");
- result = date.getTime();
- assertEquals(result, firstVector.get(2), 0);
-
- format = new SimpleDateFormat("EEE, MMM d, ''yy", Locale.ENGLISH);
- date = format.parse("Wed, Jul 4, '01,4 0:08 PM, PDT");
- result = date.getTime();
- assertEquals(result, firstVector.get(3), 0);
-
- format = new SimpleDateFormat("K:mm a, z", Locale.ENGLISH);
- date = format.parse("0:08 PM, PDT");
- result = date.getTime();
- assertEquals(result, firstVector.get(4), 0);
-
- format = new SimpleDateFormat("yyyyy.MMMMM.dd GGG hh:mm aaa", Locale.ENGLISH);
- date = format.parse("02001.July.04 AD 12:08 PM");
- result = date.getTime();
- assertEquals(result, firstVector.get(5), 0);
-
- format = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss Z", Locale.ENGLISH);
- date = format.parse("Wed, 4 Jul 2001 12:08:56 -0700");
- result = date.getTime();
- assertEquals(result, firstVector.get(6), 0);
-
- }
-
- @Test
- public void testMultipleNoms() throws Exception {
- MapBackedARFFModel model = new MapBackedARFFModel();
- ARFFVectorIterable iterable = getVectors("non-numeric-1.arff", model);
- int count = 0;
- for (Vector vector : iterable) {
-      assertTrue("Vector is not sparse", vector instanceof RandomAccessSparseVector);
- count++;
- }
- assertEquals(10, count);
- Map<String,Map<String,Integer>> nominalMap = iterable.getModel().getNominalMap();
- assertNotNull(nominalMap);
- assertEquals(1, nominalMap.size());
- Map<String,Integer> noms = nominalMap.get("bar");
- assertNotNull("nominals for bar are null", noms);
- assertEquals(5, noms.size());
- Map<Integer,ARFFType> integerARFFTypeMap = model.getTypeMap();
- assertNotNull("Type map null", integerARFFTypeMap);
- assertEquals(5, integerARFFTypeMap.size());
- Map<String,Long> words = model.getWords();
- assertNotNull("words null", words);
- assertEquals(10, words.size());
-
- Map<Integer,DateFormat> integerDateFormatMap = model.getDateMap();
- assertNotNull("date format null", integerDateFormatMap);
- assertEquals(1, integerDateFormatMap.size());
-
-
- iterable = getVectors("non-numeric-2.arff", model);
- count = 0;
- for (Vector vector : iterable) {
-      assertTrue("Vector is not sparse", vector instanceof RandomAccessSparseVector);
- count++;
- }
- nominalMap = model.getNominalMap();
- assertNotNull(nominalMap);
- assertEquals(2, nominalMap.size());
- noms = nominalMap.get("test");
-    assertNotNull("nominals for test are null", noms);
- assertEquals(2, noms.size());
- }
-
- @Test
- public void testNumerics() throws Exception {
- String arff = "@RELATION numerics\n"
- + "@ATTRIBUTE theNumeric NUMERIC\n"
- + "@ATTRIBUTE theInteger INTEGER\n"
- + "@ATTRIBUTE theReal REAL\n"
- + "@DATA\n"
- + "1.0,2,3.0";
- ARFFModel model = new MapBackedARFFModel();
- ARFFVectorIterable iterable = new ARFFVectorIterable(arff, model);
- model = iterable.getModel();
- assertNotNull(model);
- assertEquals(3, model.getLabelSize());
- assertEquals(ARFFType.NUMERIC, model.getARFFType(0));
- assertEquals(ARFFType.INTEGER, model.getARFFType(1));
- assertEquals(ARFFType.REAL, model.getARFFType(2));
- Iterator<Vector> it = iterable.iterator();
- Vector vector = it.next();
- assertEquals(1.0, vector.get(0), EPSILON);
- assertEquals(2.0, vector.get(1), EPSILON);
- assertEquals(3.0, vector.get(2), EPSILON);
- }
-
- @Test
- public void testQuotes() throws Exception {
- // ARFF allows quotes on identifiers
- ARFFModel model = new MapBackedARFFModel();
- ARFFVectorIterable iterable = getVectors("quoted-id.arff", model);
- model = iterable.getModel();
- assertNotNull(model);
- assertEquals("quotes", model.getRelation());
-
- // check attribute labels
- assertEquals(4, model.getLabelSize());
- assertEquals(ARFFType.NUMERIC, model.getARFFType(0));
- assertEquals(ARFFType.INTEGER, model.getARFFType(1));
- assertEquals(ARFFType.REAL, model.getARFFType(2));
- assertEquals(ARFFType.NOMINAL, model.getARFFType(3));
-
- Map<String, Integer> labelBindings = model.getLabelBindings();
- assertTrue(labelBindings.keySet().contains("thenumeric"));
- assertTrue(labelBindings.keySet().contains("theinteger"));
- assertTrue(labelBindings.keySet().contains("thereal"));
- assertTrue(labelBindings.keySet().contains("thenominal"));
-
- // check nominal values
- Map<String, Integer> nominalMap = model.getNominalMap().get("thenominal");
- assertNotNull(nominalMap);
- assertEquals(3, nominalMap.size());
- assertTrue(nominalMap.keySet().contains("double-quote"));
- assertTrue(nominalMap.keySet().contains("single-quote"));
- assertTrue(nominalMap.keySet().contains("no-quote"));
-
- // check data values
- Iterator<Vector> it = iterable.iterator();
- Vector vector = it.next();
- assertEquals(nominalMap.get("no-quote"), vector.get(3), EPSILON);
- assertEquals(nominalMap.get("single-quote"), it.next().get(3), EPSILON);
- assertEquals(nominalMap.get("double-quote"), it.next().get(3), EPSILON);
- }
-
- static ARFFVectorIterable getVectors(String resourceName, ARFFModel model) throws IOException {
- String sample = Resources.toString(Resources.getResource(resourceName), Charsets.UTF_8);
- return new ARFFVectorIterable(sample, model);
- }
-
- private static ARFFVectorIterable readModelFromResource(String resourceName) throws IOException {
- ARFFModel model = new MapBackedARFFModel();
- return getVectors(resourceName, model);
- }
-
-}
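
For context, the core flow the deleted ARFFVectorIterableTest exercised can be sketched standalone. A minimal sketch, assuming the org.apache.mahout.utils.vectors.arff classes are on the classpath; the inline ARFF content is illustrative:

    import java.util.Iterator;
    import org.apache.mahout.math.Vector;
    import org.apache.mahout.utils.vectors.arff.ARFFModel;
    import org.apache.mahout.utils.vectors.arff.ARFFVectorIterable;
    import org.apache.mahout.utils.vectors.arff.MapBackedARFFModel;

    public class ArffSketch {
      public static void main(String[] args) throws Exception {
        String arff = "@RELATION demo\n"
            + "@ATTRIBUTE x NUMERIC\n"
            + "@ATTRIBUTE y REAL\n"
            + "@DATA\n"
            + "1.0,2.5";
        ARFFModel model = new MapBackedARFFModel();
        ARFFVectorIterable iterable = new ARFFVectorIterable(arff, model);
        // getModel() exposes the schema parsed from the @ATTRIBUTE header lines
        int attributes = iterable.getModel().getLabelSize();   // 2
        Iterator<Vector> it = iterable.iterator();
        Vector first = it.next();
        System.out.println(attributes + " -> " + first.get(0) + "," + first.get(1));  // 2 -> 1.0,2.5
      }
    }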

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/utils/vectors/arff/DriverTest.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/utils/vectors/arff/DriverTest.java b/integration/src/test/java/org/apache/mahout/utils/vectors/arff/DriverTest.java
deleted file mode 100644
index 7e7623e..0000000
--- a/integration/src/test/java/org/apache/mahout/utils/vectors/arff/DriverTest.java
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.mahout.utils.vectors.arff;
-
-import java.io.IOException;
-import java.io.StringWriter;
-
-import com.google.common.io.Resources;
-import org.apache.commons.io.Charsets;
-import org.apache.mahout.common.MahoutTestCase;
-import org.junit.Test;
-
-/**
- * Test case for {@link Driver}
- */
-public class DriverTest extends MahoutTestCase {
-
- @Test
- public void dictionary() throws IOException {
-
- ARFFModel model = new MapBackedARFFModel();
- ARFFVectorIterableTest.getVectors("sample-dense.arff", model);
- StringWriter writer = new StringWriter();
- Driver.writeLabelBindings(writer, model, ",");
- String expected1 = Resources.toString(Resources.getResource("expected-arff-dictionary.csv"), Charsets.UTF_8);
- String expected2 = Resources.toString(Resources.getResource("expected-arff-dictionary-2.csv"), Charsets.UTF_8);
- assertTrue(expected1.equals(writer.toString()) || expected2.equals(writer.toString()));
- }
-
-
- @Test
- public void dictionaryJSON() throws IOException {
- ARFFModel model = new MapBackedARFFModel();
- ARFFVectorIterableTest.getVectors("sample-dense.arff", model);
- StringWriter writer = new StringWriter();
- Driver.writeLabelBindingsJSON(writer, model);
- String expected1 = Resources.toString(Resources.getResource("expected-arff-schema.json"), Charsets.UTF_8);
- String expected2 = Resources.toString(Resources.getResource("expected-arff-schema-2.json"), Charsets.UTF_8);
- assertTrue(expected1.equals(writer.toString()) || expected2.equals(writer.toString()));
- }
-}
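
The two writers this test compares against golden files can be called directly; a minimal sketch that mirrors the test's own setup (resource names as in the test):

    ARFFModel model = new MapBackedARFFModel();
    ARFFVectorIterableTest.getVectors("sample-dense.arff", model);  // populate the model from a resource
    StringWriter writer = new StringWriter();
    Driver.writeLabelBindings(writer, model, ",");                  // CSV dictionary of label -> index
    StringWriter jsonWriter = new StringWriter();
    Driver.writeLabelBindingsJSON(jsonWriter, model);               // the same bindings as a JSON schema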

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModelTest.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModelTest.java b/integration/src/test/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModelTest.java
deleted file mode 100644
index 2867640..0000000
--- a/integration/src/test/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModelTest.java
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.arff;
-
-import org.apache.mahout.common.MahoutTestCase;
-import org.junit.Test;
-
-import java.util.Map;
-
-public class MapBackedARFFModelTest extends MahoutTestCase {
-
- @Test
- public void processNominal() {
- String windy = "windy";
- String breezy = "breezy";
-
- ARFFModel model = new MapBackedARFFModel();
- model.addNominal(windy, breezy, 77);
- model.addNominal(windy, "strong", 23);
- model.addNominal(windy, "nuking", 55);
- Map<String, Map<String, Integer>> nominalMap = model.getNominalMap();
-
- assertEquals(1, nominalMap.size());
- Map<String, Integer> windyValues = nominalMap.get(windy);
- assertEquals(77, windyValues.get(breezy).intValue());
- }
-
- @Test
- public void processBadNumeric() {
- ARFFModel model = new MapBackedARFFModel();
- model.addLabel("b1shkt70694difsmmmdv0ikmoh", 77);
- model.addType(77, ARFFType.REAL);
- assertTrue(Double.isNaN(model.getValue("b1shkt70694difsmmmdv0ikmoh", 77)));
- }
-
- @Test
- public void processGoodNumeric() {
- ARFFModel model = new MapBackedARFFModel();
- model.addLabel("1234", 77);
- model.addType(77, ARFFType.INTEGER);
-    assertEquals(1234, model.getValue("1234", 77), EPSILON);
- model.addLabel("131.34", 78);
- model.addType(78, ARFFType.REAL);
-    assertEquals(131.34, model.getValue("131.34", 78), EPSILON);
- }
-}
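
The two numeric cases above reduce to how getValue treats the token for a numeric attribute; a minimal sketch:

    ARFFModel model = new MapBackedARFFModel();
    model.addLabel("131.34", 0);
    model.addType(0, ARFFType.REAL);
    double ok = model.getValue("131.34", 0);         // 131.34

    model.addLabel("not-a-number", 1);
    model.addType(1, ARFFType.REAL);
    double bad = model.getValue("not-a-number", 1);  // NaN rather than an exception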

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIteratorTest.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIteratorTest.java b/integration/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIteratorTest.java
deleted file mode 100644
index e76cf70..0000000
--- a/integration/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIteratorTest.java
+++ /dev/null
@@ -1,57 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- * <p/>
- * http://www.apache.org/licenses/LICENSE-2.0
- * <p/>
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.csv;
-
-import java.io.IOException;
-import java.io.StringReader;
-import java.io.StringWriter;
-import java.util.Iterator;
-
-import org.apache.mahout.common.MahoutTestCase;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.utils.vectors.RandomVectorIterable;
-import org.apache.mahout.utils.vectors.VectorHelper;
-import org.apache.mahout.utils.vectors.io.TextualVectorWriter;
-import org.junit.Test;
-
-public class CSVVectorIteratorTest extends MahoutTestCase {
-
- @Test
- public void testCount() throws Exception {
-
- StringWriter sWriter = new StringWriter();
- try (TextualVectorWriter writer = new TextualVectorWriter(sWriter) {
- @Override
- public void write(Vector vector) throws IOException {
- String vecStr = VectorHelper.vectorToCSVString(vector, false);
- getWriter().write(vecStr);
- }
- }) {
- Iterable<Vector> iter = new RandomVectorIterable(50);
- writer.write(iter);
- }
-
- Iterator<Vector> csvIter = new CSVVectorIterator(new StringReader(sWriter.getBuffer().toString()));
- int count = 0;
- while (csvIter.hasNext()) {
- csvIter.next();
- count++;
- }
- assertEquals(50, count);
- }
-}
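
Stripped of the anonymous writer subclass, the round trip this test performs looks like the following minimal sketch, using the same helpers:

    StringWriter out = new StringWriter();
    for (Vector v : new RandomVectorIterable(3)) {
      out.write(VectorHelper.vectorToCSVString(v, false));  // one CSV row per vector
    }
    Iterator<Vector> csvIter = new CSVVectorIterator(new StringReader(out.toString()));
    while (csvIter.hasNext()) {
      System.out.println(csvIter.next());
    }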

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/utils/vectors/io/VectorWriterTest.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/utils/vectors/io/VectorWriterTest.java b/integration/src/test/java/org/apache/mahout/utils/vectors/io/VectorWriterTest.java
deleted file mode 100644
index e2f7032..0000000
--- a/integration/src/test/java/org/apache/mahout/utils/vectors/io/VectorWriterTest.java
+++ /dev/null
@@ -1,67 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.io;
-
-import java.io.StringWriter;
-import java.util.ArrayList;
-import java.util.Collection;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.io.SequenceFile;
-import org.apache.mahout.common.HadoopUtil;
-import org.apache.mahout.common.MahoutTestCase;
-import org.apache.mahout.math.DenseVector;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.VectorWritable;
-import org.apache.mahout.utils.vectors.RandomVectorIterable;
-import org.junit.Test;
-
-public final class VectorWriterTest extends MahoutTestCase {
-
- @Test
- public void testSFVW() throws Exception {
- Path path = getTestTempFilePath("sfvw");
- Configuration conf = getConfiguration();
- FileSystem fs = FileSystem.get(conf);
- SequenceFile.Writer seqWriter = new SequenceFile.Writer(fs, conf, path, LongWritable.class, VectorWritable.class);
- try (SequenceFileVectorWriter writer = new SequenceFileVectorWriter(seqWriter)) {
- writer.write(new RandomVectorIterable(50));
- }
-
- long count = HadoopUtil.countRecords(path, conf);
- assertEquals(50, count);
- }
-
- @Test
- public void testTextOutputSize() throws Exception {
- StringWriter strWriter = new StringWriter();
- try (VectorWriter writer = new TextualVectorWriter(strWriter)) {
- Collection<Vector> vectors = new ArrayList<>();
- vectors.add(new DenseVector(new double[]{0.3, 1.5, 4.5}));
- vectors.add(new DenseVector(new double[]{1.3, 1.5, 3.5}));
- writer.write(vectors);
- }
- String buffer = strWriter.toString();
- assertNotNull(buffer);
- assertFalse(buffer.isEmpty());
-
- }
-}
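
Both cases above go through the VectorWriter contract; a minimal sketch of the text-backed variant, combining the single-vector write (seen in CSVVectorIteratorTest) with the bulk write used here:

    StringWriter strWriter = new StringWriter();
    try (TextualVectorWriter writer = new TextualVectorWriter(strWriter)) {
      writer.write(new DenseVector(new double[]{0.3, 1.5, 4.5}));  // one vector
      Collection<Vector> batch = new ArrayList<>();
      batch.add(new DenseVector(new double[]{1.3, 1.5, 3.5}));
      writer.write(batch);                                         // a whole batch
    }
    String rendered = strWriter.toString();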

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfoTest.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfoTest.java b/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfoTest.java
deleted file mode 100644
index 890a14b..0000000
--- a/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfoTest.java
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.lucene;
-
-
-import java.io.IOException;
-
-import com.google.common.io.Closeables;
-
-import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.FieldType;
-import org.apache.lucene.document.StringField;
-import org.apache.lucene.index.DirectoryReader;
-import org.apache.lucene.index.IndexOptions;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.IndexWriterConfig;
-import org.apache.lucene.store.RAMDirectory;
-import org.apache.mahout.common.MahoutTestCase;
-import org.junit.Before;
-import org.junit.Test;
-
-public class CachedTermInfoTest extends MahoutTestCase {
- private RAMDirectory directory;
- private static final String[] DOCS = {
- "a a b b c c",
- "a b a b a b a b",
- "a b a",
- "a",
- "b",
- "a",
- "a"
- };
-
- private static final String[] DOCS2 = {
- "d d d d",
- "e e e e",
- "d e d e",
- "d",
- "e",
- "d",
- "e"
- };
-
- @Before
- public void before() throws IOException {
- directory = new RAMDirectory();
-
- FieldType fieldType = new FieldType();
- fieldType.setStored(false);
- fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
- fieldType.setTokenized(true);
- fieldType.setStoreTermVectors(false);
- fieldType.setStoreTermVectorPositions(false);
- fieldType.setStoreTermVectorOffsets(false);
- fieldType.freeze();
-
- directory = createTestIndex(fieldType, directory, 0);
- }
-
- @Test
- public void test() throws Exception {
- IndexReader reader = DirectoryReader.open(directory);
- CachedTermInfo cti = new CachedTermInfo(reader, "content", 0, 100);
- assertEquals(3, cti.totalTerms("content"));
- assertNotNull(cti.getTermEntry("content", "a"));
- assertNull(cti.getTermEntry("content", "e"));
- //minDf
- cti = new CachedTermInfo(reader, "content", 3, 100);
- assertEquals(2, cti.totalTerms("content"));
- assertNotNull(cti.getTermEntry("content", "a"));
- assertNull(cti.getTermEntry("content", "c"));
-    // maxDFPercent: "a" is in 6 of 7 docs; a term is excluded once its doc freq exceeds numDocs * maxDfPercent / 100, so 85% (7 * 0.85 = 5.95 < 6) suffices to exclude "a"
- cti = new CachedTermInfo(reader, "content", 0, 85);
- assertEquals(2, cti.totalTerms("content"));
- assertNotNull(cti.getTermEntry("content", "b"));
- assertNotNull(cti.getTermEntry("content", "c"));
- assertNull(cti.getTermEntry("content", "a"));
-
-
- }
-
- static RAMDirectory createTestIndex(FieldType fieldType,
- RAMDirectory directory,
- int startingId) throws IOException {
- IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(new WhitespaceAnalyzer()));
-
- try {
- for (int i = 0; i < DOCS.length; i++) {
- Document doc = new Document();
- Field id = new StringField("id", "doc_" + (i + startingId), Field.Store.YES);
- doc.add(id);
- Field text = new Field("content", DOCS[i], fieldType);
- doc.add(text);
- Field text2 = new Field("content2", DOCS2[i], fieldType);
- doc.add(text2);
- writer.addDocument(doc);
- }
- } finally {
- Closeables.close(writer, false);
- }
- return directory;
- }
-}
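
The maxDFPercent cut-off in the test is plain arithmetic; a worked sketch with the test's own numbers (the rounding inside CachedTermInfo is not visible in this diff, but the comparison holds either way):

    int numDocs = 7;       // documents in the test index
    int dfA = 6;           // "a" appears in 6 of them
    int maxDfPercent = 85;
    // a term is dropped once its doc freq exceeds maxDfPercent of the corpus:
    boolean excluded = dfA > numDocs * maxDfPercent / 100.0;  // 6 > 5.95 -> true, so "a" is dropped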

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/DriverTest.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/DriverTest.java b/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/DriverTest.java
deleted file mode 100644
index 86c8305..0000000
--- a/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/DriverTest.java
+++ /dev/null
@@ -1,136 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.lucene;
-
-import com.google.common.collect.Sets;
-import com.google.common.io.Closeables;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.SequenceFile;
-import org.apache.hadoop.io.Text;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.FieldType;
-import org.apache.lucene.index.IndexOptions;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.IndexWriterConfig;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.SimpleFSDirectory;
-import org.apache.mahout.common.MahoutTestCase;
-import org.junit.Before;
-import org.junit.Test;
-
-import java.io.File;
-import java.io.IOException;
-import java.nio.file.Paths;
-import java.util.Set;
-
-public class DriverTest extends MahoutTestCase {
-
- private File indexDir;
- private File outputDir;
- private Configuration conf;
-
- @Before
- @Override
- public void setUp() throws Exception {
- super.setUp();
- indexDir = getTestTempDir("intermediate");
- indexDir.delete();
- outputDir = getTestTempDir("output");
- outputDir.delete();
-
- conf = getConfiguration();
- }
-
- private Document asDocument(String line) {
- Document doc = new Document();
- doc.add(new TextFieldWithTermVectors("text", line));
- return doc;
- }
-
- static class TextFieldWithTermVectors extends Field {
-
- public static final FieldType TYPE = new FieldType();
-
- static {
- TYPE.setOmitNorms(true);
- TYPE.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
- TYPE.setStored(true);
- TYPE.setTokenized(true);
- TYPE.setStoreTermVectors(true);
- TYPE.freeze();
- }
-
- public TextFieldWithTermVectors(String name, String value) {
- super(name, value, TYPE);
- }
- }
-
- @Test
- public void sequenceFileDictionary() throws IOException {
-
- Directory index = new SimpleFSDirectory(Paths.get(indexDir.getAbsolutePath()));
- Analyzer analyzer = new StandardAnalyzer();
- IndexWriterConfig config = new IndexWriterConfig(analyzer);
- config.setCommitOnClose(true);
- final IndexWriter writer = new IndexWriter(index, config);
-
- try {
- writer.addDocument(asDocument("One Ring to rule them all"));
- writer.addDocument(asDocument("One Ring to find them,"));
- writer.addDocument(asDocument("One Ring to bring them all"));
- writer.addDocument(asDocument("and in the darkness bind them"));
- } finally {
- writer.close();
- }
-
- File seqDict = new File(outputDir, "dict.seq");
-
- Driver.main(new String[] {
- "--dir", indexDir.getAbsolutePath(),
- "--output", new File(outputDir, "out").getAbsolutePath(),
- "--field", "text",
- "--dictOut", new File(outputDir, "dict.txt").getAbsolutePath(),
- "--seqDictOut", seqDict.getAbsolutePath(),
- });
-
- SequenceFile.Reader reader = null;
- Set<String> indexTerms = Sets.newHashSet();
- try {
- reader = new SequenceFile.Reader(FileSystem.getLocal(conf), new Path(seqDict.getAbsolutePath()), conf);
- Text term = new Text();
- IntWritable termIndex = new IntWritable();
-
- while (reader.next(term, termIndex)) {
- indexTerms.add(term.toString());
- }
- } finally {
- Closeables.close(reader, true);
- }
-
- Set<String> expectedIndexTerms = Sets.newHashSet("all", "bind", "bring", "darkness", "find", "one", "ring", "rule");
-
- // should contain the same terms as expected
- assertEquals(expectedIndexTerms.size(), Sets.union(expectedIndexTerms, indexTerms).size());
- }
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java
deleted file mode 100644
index 6a8c659..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java
+++ /dev/null
@@ -1,99 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.lucene;
-
-import java.io.IOException;
-import java.util.Set;
-import java.util.TreeSet;
-
-import com.google.common.base.Preconditions;
-import org.apache.lucene.index.IndexReader;
-import org.apache.mahout.utils.vectors.TermInfo;
-import org.apache.mahout.vectorizer.Weight;
-
-/**
- * An {@link java.util.Iterator} over {@link org.apache.mahout.math.Vector}s that uses a Lucene index as the source
- * for creating the {@link org.apache.mahout.math.Vector}s. The field used to create the vectors currently must have
- * term vectors stored for it.
- */
-public class LuceneIterator extends AbstractLuceneIterator {
-
- protected final Set<String> idFieldSelector;
- protected final String idField;
-
- /**
- * Produce a LuceneIterable that can create the Vector plus normalize it.
- *
- * @param indexReader {@link IndexReader} to read the documents from.
- * @param idField field containing the id. May be null.
- * @param field field to use for the Vector
- * @param termInfo termInfo
- * @param weight weight
- * @param normPower the normalization value. Must be non-negative, or {@link LuceneIterable#NO_NORMALIZING}
- */
- public LuceneIterator(IndexReader indexReader, String idField, String field, TermInfo termInfo, Weight weight,
- double normPower) {
- this(indexReader, idField, field, termInfo, weight, normPower, 0.0);
- }
-
- /**
- * @param indexReader {@link IndexReader} to read the documents from.
- * @param idField field containing the id. May be null.
- * @param field field to use for the Vector
- * @param termInfo termInfo
- * @param weight weight
- * @param normPower the normalization value. Must be non-negative, or {@link LuceneIterable#NO_NORMALIZING}
- * @param maxPercentErrorDocs the maximum fraction of documents tolerated without a term freq vector. In [0,1].
- * @see #LuceneIterator(org.apache.lucene.index.IndexReader, String, String, org.apache.mahout.utils.vectors.TermInfo,
- * org.apache.mahout.vectorizer.Weight, double)
- */
- public LuceneIterator(IndexReader indexReader,
- String idField,
- String field,
- TermInfo termInfo,
- Weight weight,
- double normPower,
- double maxPercentErrorDocs) {
- super(termInfo, normPower, indexReader, weight, maxPercentErrorDocs, field);
-    // termDocs(null) is a better way of iterating over all the docs in Lucene
- Preconditions.checkArgument(normPower == LuceneIterable.NO_NORMALIZING || normPower >= 0,
- "normPower must be non-negative or -1, but normPower = " + normPower);
- Preconditions.checkArgument(maxPercentErrorDocs >= 0.0 && maxPercentErrorDocs <= 1.0,
- "Must be: 0.0 <= maxPercentErrorDocs <= 1.0");
- this.idField = idField;
- if (idField != null) {
- idFieldSelector = new TreeSet<>();
- idFieldSelector.add(idField);
- } else {
-      /* The field in the index containing the document id. If null, the Lucene internal doc id is used,
-         which is prone to error if the underlying index changes. */
- idFieldSelector = null;
- }
- }
-
- @Override
- protected String getVectorName(int documentIndex) throws IOException {
- String name;
- if (idField != null) {
- name = indexReader.document(documentIndex, idFieldSelector).get(idField);
- } else {
- name = String.valueOf(documentIndex);
- }
- return name;
- }
-}
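
A minimal sketch of wiring the iterator up, using the constructor documented above together with classes that appear elsewhere in this commit (directory stands in for any Lucene Directory, such as the RAMDirectory built in CachedTermInfoTest):

    IndexReader reader = DirectoryReader.open(directory);
    TermInfo termInfo = new CachedTermInfo(reader, "content", 1, 100);
    Weight weight = new TFIDF();
    LuceneIterator docs = new LuceneIterator(reader, "id", "content",
        termInfo, weight, LuceneIterable.NO_NORMALIZING);
    while (docs.hasNext()) {
      Vector docVector = docs.next();   // one vector per document, named by its id field
    }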

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java
deleted file mode 100644
index 5830ccc..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java
+++ /dev/null
@@ -1,64 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.lucene;
-
-import org.apache.lucene.util.BytesRef;
-import org.apache.mahout.math.RandomAccessSparseVector;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.utils.vectors.TermEntry;
-import org.apache.mahout.utils.vectors.TermInfo;
-import org.apache.mahout.vectorizer.Weight;
-
-
-/**
- * Not thread-safe
- */
-public class TFDFMapper {
-
- private Vector vector;
-
- private final Weight weight;
- private long numTerms;
- private final TermInfo termInfo;
- private String field;
- private final int numDocs;
-
- public TFDFMapper(int numDocs, Weight weight, TermInfo termInfo) {
- this.weight = weight;
- this.termInfo = termInfo;
- this.numDocs = numDocs;
- }
-
- public void setExpectations(String field, long numTerms) {
- this.field = field;
- vector = new RandomAccessSparseVector(termInfo.totalTerms(field));
- this.numTerms = numTerms;
- }
-
- public void map(BytesRef term, int frequency) {
- TermEntry entry = termInfo.getTermEntry(field, term.utf8ToString());
- if (entry != null) {
- vector.setQuick(entry.getTermIdx(), weight.calculate(frequency, entry.getDocFreq(), (int)numTerms, numDocs));
- }
- }
-
- public Vector getVector() {
- return this.vector;
- }
-
-}
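
The call protocol is implicit in the three public methods; a minimal sketch of mapping one document's term vector into a Mahout Vector (numDocs, numTermsInDoc, and termInfo are assumed to come from the surrounding index code, and the frequencies are illustrative):

    TFDFMapper mapper = new TFDFMapper(numDocs, new TFIDF(), termInfo);
    mapper.setExpectations("content", numTermsInDoc);  // allocates the sparse vector
    mapper.map(new BytesRef("fox"), 2);                // term plus its in-document frequency
    mapper.map(new BytesRef("lazy"), 1);
    Vector docVector = mapper.getVector();             // weighted entries for known terms only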

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/TermInfoClusterInOut.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/TermInfoClusterInOut.java b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/TermInfoClusterInOut.java
deleted file mode 100644
index b0311c7..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/TermInfoClusterInOut.java
+++ /dev/null
@@ -1,81 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.lucene;
-
-import org.apache.mahout.common.RandomUtils;
-
-class TermInfoClusterInOut implements Comparable<TermInfoClusterInOut> {
-
- private final String term;
- private final int inClusterDF;
- private final int outClusterDF;
- private final double logLikelihoodRatio;
-
- TermInfoClusterInOut(String term, int inClusterDF, int outClusterDF, double logLikelihoodRatio) {
- this.term = term;
- this.inClusterDF = inClusterDF;
- this.outClusterDF = outClusterDF;
- this.logLikelihoodRatio = logLikelihoodRatio;
- }
-
- @Override
- public int hashCode() {
- return term.hashCode() ^ inClusterDF ^ outClusterDF ^ RandomUtils.hashDouble(logLikelihoodRatio);
- }
-
- @Override
- public boolean equals(Object o) {
- if (!(o instanceof TermInfoClusterInOut)) {
- return false;
- }
- TermInfoClusterInOut other = (TermInfoClusterInOut) o;
- return term.equals(other.getTerm())
- && inClusterDF == other.getInClusterDF()
- && outClusterDF == other.getOutClusterDF()
- && logLikelihoodRatio == other.getLogLikelihoodRatio();
- }
-
- @Override
- public int compareTo(TermInfoClusterInOut that) {
- int res = Double.compare(that.logLikelihoodRatio, logLikelihoodRatio);
- if (res == 0) {
- res = term.compareTo(that.term);
- }
- return res;
- }
-
- public int getInClusterDiff() {
- return this.inClusterDF - this.outClusterDF;
- }
-
- String getTerm() {
- return term;
- }
-
- int getInClusterDF() {
- return inClusterDF;
- }
-
- int getOutClusterDF() {
- return outClusterDF;
- }
-
- double getLogLikelihoodRatio() {
- return logLikelihoodRatio;
- }
-}
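
The compareTo above yields a descending log-likelihood-ratio order with the term string as tie-breaker; a minimal sketch (the class and its constructor are package-private, so this assumes code in the same package):

    List<TermInfoClusterInOut> terms = new ArrayList<>();
    terms.add(new TermInfoClusterInOut("the", 9, 9, 0.1));
    terms.add(new TermInfoClusterInOut("fox", 5, 1, 12.0));
    Collections.sort(terms);
    // terms.get(0).getTerm() is "fox": the higher log-likelihood ratio sorts first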

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/MySQLJDBCInMemoryItemSimilarityTest.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/MySQLJDBCInMemoryItemSimilarityTest.java b/integration/src/test/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/MySQLJDBCInMemoryItemSimilarityTest.java
deleted file mode 100644
index 463a45f..0000000
--- a/integration/src/test/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/MySQLJDBCInMemoryItemSimilarityTest.java
+++ /dev/null
@@ -1,79 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.impl.similarity.jdbc;
-
-import org.apache.mahout.cf.taste.impl.TasteTestCase;
-import org.apache.mahout.cf.taste.similarity.ItemSimilarity;
-import org.easymock.EasyMock;
-import org.junit.Test;
-
-import javax.sql.DataSource;
-import java.sql.Connection;
-import java.sql.PreparedStatement;
-import java.sql.ResultSet;
-
-public class MySQLJDBCInMemoryItemSimilarityTest extends TasteTestCase {
-
- @Test
- public void testMemoryLoad() throws Exception {
-
- DataSource dataSource = EasyMock.createMock(DataSource.class);
- Connection connection = EasyMock.createMock(Connection.class);
- PreparedStatement statement = EasyMock.createMock(PreparedStatement.class);
- ResultSet resultSet = EasyMock.createMock(ResultSet.class);
-
- EasyMock.expect(dataSource.getConnection()).andReturn(connection);
- EasyMock.expect(connection.prepareStatement(MySQLJDBCInMemoryItemSimilarity.DEFAULT_GET_ALL_ITEMSIMILARITIES_SQL,
- ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY)).andReturn(statement);
- statement.setFetchDirection(ResultSet.FETCH_FORWARD);
- EasyMock.expect(statement.executeQuery()).andReturn(resultSet);
-
- EasyMock.expect(resultSet.next()).andReturn(true);
-
- EasyMock.expect(resultSet.getLong(1)).andReturn(1L);
- EasyMock.expect(resultSet.getLong(2)).andReturn(2L);
- EasyMock.expect(resultSet.getDouble(3)).andReturn(0.5);
- EasyMock.expect(resultSet.next()).andReturn(true);
-
- EasyMock.expect(resultSet.getLong(1)).andReturn(1L);
- EasyMock.expect(resultSet.getLong(2)).andReturn(3L);
- EasyMock.expect(resultSet.getDouble(3)).andReturn(0.4);
- EasyMock.expect(resultSet.next()).andReturn(true);
-
- EasyMock.expect(resultSet.getLong(1)).andReturn(3L);
- EasyMock.expect(resultSet.getLong(2)).andReturn(4L);
- EasyMock.expect(resultSet.getDouble(3)).andReturn(0.1);
-
- EasyMock.expect(resultSet.next()).andReturn(false);
-
- resultSet.close();
- statement.close();
- connection.close();
-
- EasyMock.replay(dataSource, connection, statement, resultSet);
-
- ItemSimilarity similarity = new MySQLJDBCInMemoryItemSimilarity(dataSource);
-
- assertEquals(0.5, similarity.itemSimilarity(1L, 2L), EPSILON);
- assertEquals(0.4, similarity.itemSimilarity(1L, 3L), EPSILON);
- assertEquals(0.1, similarity.itemSimilarity(3L, 4L), EPSILON);
- assertTrue(Double.isNaN(similarity.itemSimilarity(1L, 4L)));
-
- EasyMock.verify(dataSource, connection, statement, resultSet);
- }
-}
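
Outside the mocked test, usage reduces to handing the similarity a DataSource whose table matches DEFAULT_GET_ALL_ITEMSIMILARITIES_SQL; a minimal sketch (obtainDataSource() is a hypothetical stand-in for real connection-pool setup):

    DataSource dataSource = obtainDataSource();      // hypothetical: any pooled MySQL DataSource
    ItemSimilarity similarity = new MySQLJDBCInMemoryItemSimilarity(dataSource);  // loads all pairs up front
    double s = similarity.itemSimilarity(1L, 2L);    // NaN when no row exists for the pair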

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java b/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
deleted file mode 100644
index 01d46fc..0000000
--- a/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
+++ /dev/null
@@ -1,236 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.clustering;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.FieldType;
-import org.apache.lucene.document.StringField;
-import org.apache.lucene.index.DirectoryReader;
-import org.apache.lucene.index.IndexOptions;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.IndexWriterConfig;
-import org.apache.lucene.store.RAMDirectory;
-import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver;
-import org.apache.mahout.clustering.kmeans.KMeansDriver;
-import org.apache.mahout.clustering.kmeans.RandomSeedGenerator;
-import org.apache.mahout.common.MahoutTestCase;
-import org.apache.mahout.common.distance.DistanceMeasure;
-import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
-import org.apache.mahout.math.NamedVector;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.VectorWritable;
-import org.apache.mahout.utils.clustering.ClusterDumper;
-import org.apache.mahout.utils.vectors.TermEntry;
-import org.apache.mahout.utils.vectors.TermInfo;
-import org.apache.mahout.utils.vectors.lucene.CachedTermInfo;
-import org.apache.mahout.utils.vectors.lucene.LuceneIterable;
-import org.apache.mahout.vectorizer.TFIDF;
-import org.apache.mahout.vectorizer.Weight;
-import org.junit.Before;
-import org.junit.Test;
-
-public final class TestClusterDumper extends MahoutTestCase {
-
- private static final String[] DOCS = {
- "The quick red fox jumped over the lazy brown dogs.",
- "The quick brown fox jumped over the lazy red dogs.",
- "The quick red cat jumped over the lazy brown dogs.",
- "The quick brown cat jumped over the lazy red dogs.",
- "Mary had a little lamb whose fleece was white as snow.",
- "Mary had a little goat whose fleece was white as snow.",
- "Mary had a little lamb whose fleece was black as tar.",
- "Dick had a little goat whose fleece was white as snow.",
- "Moby Dick is a story of a whale and a man obsessed.",
- "Moby Bob is a story of a walrus and a man obsessed.",
- "Moby Dick is a story of a whale and a crazy man.",
- "The robber wore a black fleece jacket and a baseball cap.",
- "The robber wore a red fleece jacket and a baseball cap.",
- "The robber wore a white fleece jacket and a baseball cap.",
- "The English Springer Spaniel is the best of all dogs."};
-
- private List<VectorWritable> sampleData;
-
- private String[] termDictionary;
-
- @Override
- @Before
- public void setUp() throws Exception {
- super.setUp();
- Configuration conf = getConfiguration();
- FileSystem fs = FileSystem.get(conf);
- // Create test data
- getSampleData(DOCS);
- ClusteringTestUtils.writePointsToFile(sampleData, true,
- getTestTempFilePath("testdata/file1"), fs, conf);
- }
-
- private void getSampleData(String[] docs2) throws IOException {
- sampleData = new ArrayList<>();
- RAMDirectory directory = new RAMDirectory();
- try (IndexWriter writer = new IndexWriter(directory,
- new IndexWriterConfig(new StandardAnalyzer()))){
- for (int i = 0; i < docs2.length; i++) {
- Document doc = new Document();
- Field id = new StringField("id", "doc_" + i, Field.Store.YES);
- doc.add(id);
- // Store both position and offset information
- FieldType fieldType = new FieldType();
- fieldType.setStored(false);
- fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
- fieldType.setTokenized(true);
- fieldType.setStoreTermVectors(true);
- fieldType.setStoreTermVectorPositions(true);
- fieldType.setStoreTermVectorOffsets(true);
- fieldType.freeze();
- Field text = new Field("content", docs2[i], fieldType);
- doc.add(text);
- writer.addDocument(doc);
- }
- }
-
- IndexReader reader = DirectoryReader.open(directory);
-
- Weight weight = new TFIDF();
- TermInfo termInfo = new CachedTermInfo(reader, "content", 1, 100);
-
- int numTerms = 0;
- for (Iterator<TermEntry> it = termInfo.getAllEntries(); it.hasNext();) {
- it.next();
- numTerms++;
- }
- termDictionary = new String[numTerms];
- int i = 0;
- for (Iterator<TermEntry> it = termInfo.getAllEntries(); it.hasNext();) {
- String term = it.next().getTerm();
- termDictionary[i] = term;
- System.out.println(i + " " + term);
- i++;
- }
- Iterable<Vector> iterable = new LuceneIterable(reader, "id", "content",
- termInfo,weight);
-
- i = 0;
- for (Vector vector : iterable) {
- assertNotNull(vector);
- NamedVector namedVector;
- if (vector instanceof NamedVector) {
- // rename it for testing purposes
- namedVector = new NamedVector(((NamedVector) vector).getDelegate(),
- "P(" + i + ')');
-
- } else {
- namedVector = new NamedVector(vector, "P(" + i + ')');
- }
- System.out.println(AbstractCluster.formatVector(namedVector,
- termDictionary));
- sampleData.add(new VectorWritable(namedVector));
- i++;
- }
- }
-
- /**
- * Return the path to the final iteration's clusters
- */
- private static Path finalClusterPath(Configuration conf, Path output,
- int maxIterations) throws IOException {
- FileSystem fs = FileSystem.get(conf);
- for (int i = maxIterations; i >= 0; i--) {
- Path clusters = new Path(output, "clusters-" + i + "-final");
- if (fs.exists(clusters)) {
- return clusters;
- }
- }
- return null;
- }
-
- @Test
- public void testKmeans() throws Exception {
- DistanceMeasure measure = new EuclideanDistanceMeasure();
- Path input = getTestTempFilePath("input");
- Path output = getTestTempDirPath("output");
- Path initialPoints = new Path(output, Cluster.CLUSTERS_DIR + '0' + Cluster.FINAL_ITERATION_SUFFIX);
- Configuration conf = getConfiguration();
- FileSystem fs = FileSystem.get(conf);
- // Write test data to file
- ClusteringTestUtils.writePointsToFile(sampleData, input, fs, conf);
- // Select initial centroids
- RandomSeedGenerator.buildRandom(conf, input, initialPoints, 8, measure, 1L);
- // Run k-means
- Path kMeansOutput = new Path(output, "kmeans");
- KMeansDriver.run(conf, getTestTempDirPath("testdata"), initialPoints, kMeansOutput, 0.001, 10, true, 0.0, false);
- // Print out clusters
- ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf,
- output, 10), new Path(kMeansOutput, "clusteredPoints"));
- clusterDumper.printClusters(termDictionary);
- }
-
- @Test
- public void testJsonClusterDumper() throws Exception {
- DistanceMeasure measure = new EuclideanDistanceMeasure();
- Path input = getTestTempFilePath("input");
- Path output = getTestTempDirPath("output");
- Path initialPoints = new Path(output, Cluster.CLUSTERS_DIR + '0' + Cluster.FINAL_ITERATION_SUFFIX);
- Configuration conf = getConfiguration();
- FileSystem fs = FileSystem.get(conf);
- // Write test data to file
- ClusteringTestUtils.writePointsToFile(sampleData, input, fs, conf);
- // Select initial centroids
- RandomSeedGenerator.buildRandom(conf, input, initialPoints, 8, measure, 1L);
- // Run k-means
- Path kmeansOutput = new Path(output, "kmeans");
- KMeansDriver.run(conf, getTestTempDirPath("testdata"), initialPoints, kmeansOutput, 0.001, 10, true, 0.0, false);
- // Print out clusters
- ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf,
- output, 10), new Path(kmeansOutput, "clusteredPoints"));
- clusterDumper.setOutputFormat(ClusterDumper.OUTPUT_FORMAT.JSON);
- clusterDumper.printClusters(termDictionary);
- }
-
- @Test
- public void testFuzzyKmeans() throws Exception {
- DistanceMeasure measure = new EuclideanDistanceMeasure();
- Path input = getTestTempFilePath("input");
- Path output = getTestTempDirPath("output");
- Path initialPoints = new Path(output, Cluster.CLUSTERS_DIR + '0' + Cluster.FINAL_ITERATION_SUFFIX);
- Configuration conf = getConfiguration();
- FileSystem fs = FileSystem.get(conf);
- // Write test data to file
- ClusteringTestUtils.writePointsToFile(sampleData, input, fs, conf);
- // Select initial centroids
- RandomSeedGenerator.buildRandom(conf, input, initialPoints, 8, measure, 1L);
- // Run k-means
- Path kMeansOutput = new Path(output, "kmeans");
- FuzzyKMeansDriver.run(conf, getTestTempDirPath("testdata"), initialPoints, kMeansOutput, 0.001, 10, 1.1f, true,
- true, 0, true);
- // run ClusterDumper
- ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf,
- output, 10), new Path(kMeansOutput, "clusteredPoints"));
- clusterDumper.printClusters(termDictionary);
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java b/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java
deleted file mode 100644
index 8a226a0..0000000
--- a/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java
+++ /dev/null
@@ -1,321 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.clustering;
-
-import java.io.IOException;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.mahout.clustering.canopy.Canopy;
-import org.apache.mahout.clustering.canopy.CanopyDriver;
-import org.apache.mahout.clustering.evaluation.ClusterEvaluator;
-import org.apache.mahout.clustering.evaluation.RepresentativePointsDriver;
-import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver;
-import org.apache.mahout.clustering.kmeans.KMeansDriver;
-import org.apache.mahout.clustering.kmeans.TestKmeansClustering;
-import org.apache.mahout.common.HadoopUtil;
-import org.apache.mahout.common.MahoutTestCase;
-import org.apache.mahout.common.distance.DistanceMeasure;
-import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
-import org.apache.mahout.math.DenseVector;
-import org.apache.mahout.math.VectorWritable;
-import org.junit.Before;
-import org.junit.Test;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import com.google.common.collect.Lists;
-import com.google.common.collect.Maps;
-
-public final class TestClusterEvaluator extends MahoutTestCase {
-
- private static final double[][] REFERENCE = { {1, 1}, {2, 1}, {1, 2}, {2, 2}, {3, 3}, {4, 4}, {5, 4}, {4, 5}, {5, 5}};
-
- private List<VectorWritable> referenceData = Lists.newArrayList();
-
- private final List<VectorWritable> sampleData = Lists.newArrayList();
-
- private Map<Integer,List<VectorWritable>> representativePoints;
-
- private List<Cluster> clusters;
-
- private static final Logger log = LoggerFactory.getLogger(TestClusterEvaluator.class);
-
- private Configuration conf;
-
- private FileSystem fs;
-
- private Path testdata;
-
- private Path output;
-
- @Override
- @Before
- public void setUp() throws Exception {
- super.setUp();
- conf = getConfiguration();
- fs = FileSystem.get(conf);
- testdata = getTestTempDirPath("testdata");
- output = getTestTempDirPath("output");
- // Create small reference data set
- referenceData = TestKmeansClustering.getPointsWritable(REFERENCE);
- // generate larger test data set for the clustering tests to chew on
- generateSamples();
- }
-
- /**
- * Generate random samples and add them to the sampleData
- *
- * @param num
- * int number of samples to generate
- * @param mx
- * double x-value of the sample mean
- * @param my
- * double y-value of the sample mean
- * @param sd
- * double standard deviation of the samples
- */
- private void generateSamples(int num, double mx, double my, double sd) {
- log.info("Generating {} samples m=[{}, {}] sd={}", num, mx, my, sd);
- for (int i = 0; i < num; i++) {
- sampleData.add(new VectorWritable(new DenseVector(new double[] {UncommonDistributions.rNorm(mx, sd),
- UncommonDistributions.rNorm(my, sd)})));
- }
- }
-
- private void generateSamples() {
- generateSamples(500, 1, 1, 3);
- generateSamples(300, 1, 0, 0.5);
- generateSamples(300, 0, 2, 0.1);
- }
-
- private void printRepPoints(int numIterations) {
- RepresentativePointsDriver.printRepresentativePoints(output, numIterations);
- }
-
- /**
- * Initialize synthetic data using 4 clusters dC units from the origin, each having its center plus 4 representative points offset dP from the center
- *
- * @param dC
- * a double cluster center offset
- * @param dP
- * a double representative point offset
- * @param measure
- * the DistanceMeasure
- */
- private void initData(double dC, double dP, DistanceMeasure measure) {
- clusters = Lists.newArrayList();
- clusters.add(new Canopy(new DenseVector(new double[] {-dC, -dC}), 1, measure));
- clusters.add(new Canopy(new DenseVector(new double[] {-dC, dC}), 3, measure));
- clusters.add(new Canopy(new DenseVector(new double[] {dC, dC}), 5, measure));
- clusters.add(new Canopy(new DenseVector(new double[] {dC, -dC}), 7, measure));
- representativePoints = Maps.newHashMap();
- for (Cluster cluster : clusters) {
- List<VectorWritable> points = Lists.newArrayList();
- representativePoints.put(cluster.getId(), points);
- points.add(new VectorWritable(cluster.getCenter().clone()));
- points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {dP, dP}))));
- points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {dP, -dP}))));
- points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {-dP, -dP}))));
- points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {-dP, dP}))));
- }
- }
-
- @Test
- public void testRepresentativePoints() throws Exception {
- ClusteringTestUtils.writePointsToFile(referenceData, new Path(testdata, "file1"), fs, conf);
- DistanceMeasure measure = new EuclideanDistanceMeasure();
- Configuration conf = getConfiguration();
- // run using MR reference point calculation
- CanopyDriver.run(conf, testdata, output, measure, 3.1, 1.1, true, 0.0, true);
- int numIterations = 2;
- Path clustersIn = new Path(output, "clusters-0-final");
- RepresentativePointsDriver.run(conf, clustersIn, new Path(output, "clusteredPoints"), output, measure,
- numIterations, false);
- printRepPoints(numIterations);
- ClusterEvaluator evaluatorMR = new ClusterEvaluator(conf, clustersIn);
- // now run again using sequential reference point calculation
- HadoopUtil.delete(conf, output);
- CanopyDriver.run(conf, testdata, output, measure, 3.1, 1.1, true, 0.0, true);
- RepresentativePointsDriver.run(conf, clustersIn, new Path(output, "clusteredPoints"), output, measure,
- numIterations, true);
- printRepPoints(numIterations);
- ClusterEvaluator evaluatorSeq = new ClusterEvaluator(conf, clustersIn);
- // compare results
- assertEquals("InterCluster Density", evaluatorMR.interClusterDensity(), evaluatorSeq.interClusterDensity(), EPSILON);
- assertEquals("IntraCluster Density", evaluatorMR.intraClusterDensity(), evaluatorSeq.intraClusterDensity(), EPSILON);
- }
-
- @Test
- public void testCluster0() throws IOException {
- ClusteringTestUtils.writePointsToFile(referenceData, new Path(testdata, "file1"), fs, conf);
- DistanceMeasure measure = new EuclideanDistanceMeasure();
- initData(1, 0.25, measure);
- ClusterEvaluator evaluator = new ClusterEvaluator(representativePoints, clusters, measure);
- assertEquals("inter cluster density", 0.33333333333333315, evaluator.interClusterDensity(), EPSILON);
- assertEquals("intra cluster density", 0.3656854249492381, evaluator.intraClusterDensity(), EPSILON);
- }
-
- @Test
- public void testCluster1() throws IOException {
- ClusteringTestUtils.writePointsToFile(referenceData, new Path(testdata, "file1"), fs, conf);
- DistanceMeasure measure = new EuclideanDistanceMeasure();
- initData(1, 0.5, measure);
- ClusterEvaluator evaluator = new ClusterEvaluator(representativePoints, clusters, measure);
- assertEquals("inter cluster density", 0.33333333333333315, evaluator.interClusterDensity(), EPSILON);
- assertEquals("intra cluster density", 0.3656854249492381, evaluator.intraClusterDensity(), EPSILON);
- }
-
- @Test
- public void testCluster2() throws IOException {
- ClusteringTestUtils.writePointsToFile(referenceData, new Path(testdata, "file1"), fs, conf);
- DistanceMeasure measure = new EuclideanDistanceMeasure();
- initData(1, 0.75, measure);
- ClusterEvaluator evaluator = new ClusterEvaluator(representativePoints, clusters, measure);
- assertEquals("inter cluster density", 0.33333333333333315, evaluator.interClusterDensity(), EPSILON);
- assertEquals("intra cluster density", 0.3656854249492381, evaluator.intraClusterDensity(), EPSILON);
- }
-
- /**
- * Adding an empty cluster should change the inter-cluster density but not the intra-cluster density: the empty
- * cluster's intra-cluster density is NaN, and NaN values are ignored by the evaluator.
- *
- * @throws IOException
- */
- @Test
- public void testEmptyCluster() throws IOException {
- ClusteringTestUtils.writePointsToFile(referenceData, new Path(testdata, "file1"), fs, conf);
- DistanceMeasure measure = new EuclideanDistanceMeasure();
- initData(1, 0.25, measure);
- Canopy cluster = new Canopy(new DenseVector(new double[] {10, 10}), 19, measure);
- clusters.add(cluster);
- List<VectorWritable> points = Lists.newArrayList();
- representativePoints.put(cluster.getId(), points);
- ClusterEvaluator evaluator = new ClusterEvaluator(representativePoints, clusters, measure);
- assertEquals("inter cluster density", 0.371534146934532, evaluator.interClusterDensity(), EPSILON);
- assertEquals("intra cluster density", 0.3656854249492381, evaluator.intraClusterDensity(), EPSILON);
- }
-
- /**
- * Adding a single-valued cluster should change the inter-cluster density but not the intra-cluster density: that
- * cluster's intra-cluster density is NaN, and NaN values are ignored by the evaluator.
- *
- * @throws IOException
- */
- @Test
- public void testSingleValueCluster() throws IOException {
- ClusteringTestUtils.writePointsToFile(referenceData, new Path(testdata, "file1"), fs, conf);
- DistanceMeasure measure = new EuclideanDistanceMeasure();
- initData(1, 0.25, measure);
- Canopy cluster = new Canopy(new DenseVector(new double[] {0, 0}), 19, measure);
- clusters.add(cluster);
- List<VectorWritable> points = Lists.newArrayList();
- points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {1, 1}))));
- representativePoints.put(cluster.getId(), points);
- ClusterEvaluator evaluator = new ClusterEvaluator(representativePoints, clusters, measure);
- assertEquals("inter cluster density", 0.3656854249492381, evaluator.interClusterDensity(), EPSILON);
- assertEquals("intra cluster density", 0.3656854249492381, evaluator.intraClusterDensity(), EPSILON);
- }
-
- /**
- * Representative points extraction will duplicate the cluster center if the cluster has no assigned points. These
- * clusters are included in the inter-cluster density but their NaN intra-density values are ignored by the evaluator.
- *
- * @throws IOException
- */
- @Test
- public void testAllSameValueCluster() throws IOException {
- ClusteringTestUtils.writePointsToFile(referenceData, new Path(testdata, "file1"), fs, conf);
- DistanceMeasure measure = new EuclideanDistanceMeasure();
- initData(1, 0.25, measure);
- Canopy cluster = new Canopy(new DenseVector(new double[] {0, 0}), 19, measure);
- clusters.add(cluster);
- List<VectorWritable> points = Lists.newArrayList();
- points.add(new VectorWritable(cluster.getCenter()));
- points.add(new VectorWritable(cluster.getCenter()));
- points.add(new VectorWritable(cluster.getCenter()));
- representativePoints.put(cluster.getId(), points);
- ClusterEvaluator evaluator = new ClusterEvaluator(representativePoints, clusters, measure);
- assertEquals("inter cluster density", 0.3656854249492381, evaluator.interClusterDensity(), EPSILON);
- assertEquals("intra cluster density", 0.3656854249492381, evaluator.intraClusterDensity(), EPSILON);
- }
-
- @Test
- public void testCanopy() throws Exception {
- ClusteringTestUtils.writePointsToFile(sampleData, new Path(testdata, "file1"), fs, conf);
- DistanceMeasure measure = new EuclideanDistanceMeasure();
- Configuration conf = getConfiguration();
- CanopyDriver.run(conf, testdata, output, measure, 3.1, 1.1, true, 0.0, true);
- int numIterations = 10;
- Path clustersIn = new Path(output, "clusters-0-final");
- RepresentativePointsDriver.run(conf, clustersIn, new Path(output, "clusteredPoints"), output, measure,
- numIterations, true);
- //printRepPoints(numIterations);
- ClusterEvaluator evaluator = new ClusterEvaluator(conf, clustersIn);
- // now print out the Results
- System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
- System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
- }
-
- @Test
- public void testKmeans() throws Exception {
- ClusteringTestUtils.writePointsToFile(sampleData, new Path(testdata, "file1"), fs, conf);
- DistanceMeasure measure = new EuclideanDistanceMeasure();
- // now run the Canopy job to prime kMeans canopies
- Configuration conf = getConfiguration();
- CanopyDriver.run(conf, testdata, output, measure, 3.1, 1.1, false, 0.0, true);
- // now run the KMeans job
- Path kmeansOutput = new Path(output, "kmeans");
- KMeansDriver.run(testdata, new Path(output, "clusters-0-final"), kmeansOutput, 0.001, 10, true, 0.0, true);
- int numIterations = 10;
- Path clustersIn = new Path(kmeansOutput, "clusters-2");
- RepresentativePointsDriver.run(conf, clustersIn, new Path(kmeansOutput, "clusteredPoints"), kmeansOutput, measure,
- numIterations, true);
- RepresentativePointsDriver.printRepresentativePoints(kmeansOutput, numIterations);
- ClusterEvaluator evaluator = new ClusterEvaluator(conf, clustersIn);
- // now print out the Results
- System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
- System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
- }
-
- @Test
- public void testFuzzyKmeans() throws Exception {
- ClusteringTestUtils.writePointsToFile(sampleData, new Path(testdata, "file1"), fs, conf);
- DistanceMeasure measure = new EuclideanDistanceMeasure();
- // now run the Canopy job to prime kMeans canopies
- Configuration conf = getConfiguration();
- CanopyDriver.run(conf, testdata, output, measure, 3.1, 1.1, false, 0.0, true);
- Path fuzzyKMeansOutput = new Path(output, "fuzzyk");
- // now run the FuzzyKMeans job
- FuzzyKMeansDriver.run(testdata, new Path(output, "clusters-0-final"), fuzzyKMeansOutput, 0.001, 10, 2,
- true, true, 0, true);
- int numIterations = 10;
- Path clustersIn = new Path(fuzzyKMeansOutput, "clusters-4");
- RepresentativePointsDriver.run(conf, clustersIn, new Path(fuzzyKMeansOutput, "clusteredPoints"), fuzzyKMeansOutput,
- measure, numIterations, true);
- RepresentativePointsDriver.printRepresentativePoints(fuzzyKMeansOutput, numIterations);
- ClusterEvaluator evaluator = new ClusterEvaluator(conf, clustersIn);
- // now print out the Results
- System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
- System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
- }
-
-}

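The empty-, single-valued- and all-same-value-cluster tests above all lean on one property of the evaluator: per-cluster intra-cluster densities are averaged with NaN entries skipped, so degenerate clusters only affect the inter-cluster term. Below is a minimal, self-contained Java sketch of that NaN-skipping average; it is an illustration under that stated assumption, not the removed ClusterEvaluator code.

    import java.util.Arrays;

    public final class NanSkippingAverage {

      // Average per-cluster densities, ignoring NaN entries: empty and
      // single-point clusters contribute nothing to the intra-cluster term.
      static double average(double[] densities) {
        double sum = 0.0;
        int count = 0;
        for (double d : densities) {
          if (!Double.isNaN(d)) {
            sum += d;
            count++;
          }
        }
        return count == 0 ? Double.NaN : sum / count;
      }

      public static void main(String[] args) {
        // the two NaN entries stand in for an empty and a single-valued cluster
        double[] densities = {0.36, 0.37, Double.NaN, Double.NaN};
        System.out.println(Arrays.toString(densities) + " -> " + average(densities)); // ~0.365
      }
    }
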
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java b/integration/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java
deleted file mode 100644
index 597ed01..0000000
--- a/integration/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java
+++ /dev/null
@@ -1,326 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.clustering.cdbw;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.mahout.clustering.Cluster;
-import org.apache.mahout.clustering.ClusteringTestUtils;
-import org.apache.mahout.clustering.UncommonDistributions;
-import org.apache.mahout.clustering.canopy.Canopy;
-import org.apache.mahout.clustering.canopy.CanopyDriver;
-import org.apache.mahout.clustering.evaluation.RepresentativePointsDriver;
-import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver;
-import org.apache.mahout.clustering.kmeans.KMeansDriver;
-import org.apache.mahout.clustering.kmeans.TestKmeansClustering;
-import org.apache.mahout.common.MahoutTestCase;
-import org.apache.mahout.common.distance.DistanceMeasure;
-import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
-import org.apache.mahout.math.DenseVector;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.VectorWritable;
-import org.junit.Before;
-import org.junit.Test;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-public final class TestCDbwEvaluator extends MahoutTestCase {
-
- private static final double[][] REFERENCE = { {1, 1}, {2, 1}, {1, 2}, {2, 2}, {3, 3}, {4, 4}, {5, 4}, {4, 5}, {5, 5}};
-
- private static final Logger log = LoggerFactory.getLogger(TestCDbwEvaluator.class);
-
- private Map<Integer,List<VectorWritable>> representativePoints;
-
- private List<Cluster> clusters;
-
- private Configuration conf;
-
- private FileSystem fs;
-
- private final Collection<VectorWritable> sampleData = new ArrayList<>();
-
- private List<VectorWritable> referenceData = new ArrayList<>();
-
- private Path testdata;
-
- private Path output;
-
- @Override
- @Before
- public void setUp() throws Exception {
- super.setUp();
- conf = getConfiguration();
- fs = FileSystem.get(conf);
- testdata = getTestTempDirPath("testdata");
- output = getTestTempDirPath("output");
- // Create small reference data set
- referenceData = TestKmeansClustering.getPointsWritable(REFERENCE);
- // generate larger test data set for the clustering tests to chew on
- generateSamples();
- }
-
- /**
- * Initialize synthetic data using 4 clusters dC units from the origin, each with 5 representative points: the cluster center plus 4 points offset dP from it
- *
- * @param dC
- * a double cluster center offset
- * @param dP
- * a double representative point offset
- * @param measure
- * the DistanceMeasure
- */
- private void initData(double dC, double dP, DistanceMeasure measure) {
- clusters = new ArrayList<>();
- clusters.add(new Canopy(new DenseVector(new double[] {-dC, -dC}), 1, measure));
- clusters.add(new Canopy(new DenseVector(new double[] {-dC, dC}), 3, measure));
- clusters.add(new Canopy(new DenseVector(new double[] {dC, dC}), 5, measure));
- clusters.add(new Canopy(new DenseVector(new double[] {dC, -dC}), 7, measure));
- representativePoints = new HashMap<>();
- for (Cluster cluster : clusters) {
- List<VectorWritable> points = new ArrayList<>();
- representativePoints.put(cluster.getId(), points);
- points.add(new VectorWritable(cluster.getCenter().clone()));
- points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {dP, dP}))));
- points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {dP, -dP}))));
- points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {-dP, -dP}))));
- points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {-dP, dP}))));
- }
- }
-
- /**
- * Generate random samples and add them to the sampleData
- *
- * @param num
- * int number of samples to generate
- * @param mx
- * double x-value of the sample mean
- * @param my
- * double y-value of the sample mean
- * @param sd
- * double standard deviation of the samples
- */
- private void generateSamples(int num, double mx, double my, double sd) {
- log.info("Generating {} samples m=[{}, {}] sd={}", num, mx, my, sd);
- for (int i = 0; i < num; i++) {
- sampleData.add(new VectorWritable(new DenseVector(new double[] {UncommonDistributions.rNorm(mx, sd),
- UncommonDistributions.rNorm(my, sd)})));
- }
- }
-
- private void generateSamples() {
- generateSamples(500, 1, 1, 3);
- generateSamples(300, 1, 0, 0.5);
- generateSamples(300, 0, 2, 0.1);
- }
-
- @Test
- public void testCDbw0() throws IOException {
- ClusteringTestUtils.writePointsToFile(referenceData, getTestTempFilePath("testdata/file1"), fs, conf);
- DistanceMeasure measure = new EuclideanDistanceMeasure();
- initData(1, 0.25, measure);
- CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
- System.out.println("CDbw = " + evaluator.getCDbw());
- System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
- System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
- System.out.println("Separation = " + evaluator.separation());
- }
-
- @Test
- public void testCDbw1() throws IOException {
- ClusteringTestUtils.writePointsToFile(referenceData, getTestTempFilePath("testdata/file1"), fs, conf);
- DistanceMeasure measure = new EuclideanDistanceMeasure();
- initData(1, 0.5, measure);
- CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
- System.out.println("CDbw = " + evaluator.getCDbw());
- System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
- System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
- System.out.println("Separation = " + evaluator.separation());
- }
-
- @Test
- public void testCDbw2() throws IOException {
- ClusteringTestUtils.writePointsToFile(referenceData, getTestTempFilePath("testdata/file1"), fs, conf);
- DistanceMeasure measure = new EuclideanDistanceMeasure();
- initData(1, 0.75, measure);
- CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
- System.out.println("CDbw = " + evaluator.getCDbw());
- System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
- System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
- System.out.println("Separation = " + evaluator.separation());
- }
-
- @Test
- public void testEmptyCluster() throws IOException {
- ClusteringTestUtils.writePointsToFile(referenceData, getTestTempFilePath("testdata/file1"), fs, conf);
- DistanceMeasure measure = new EuclideanDistanceMeasure();
- initData(1, 0.25, measure);
- Canopy cluster = new Canopy(new DenseVector(new double[] {10, 10}), 19, measure);
- clusters.add(cluster);
- List<VectorWritable> points = new ArrayList<>();
- representativePoints.put(cluster.getId(), points);
- CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
- System.out.println("CDbw = " + evaluator.getCDbw());
- System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
- System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
- System.out.println("Separation = " + evaluator.separation());
- }
-
- @Test
- public void testSingleValueCluster() throws IOException {
- ClusteringTestUtils.writePointsToFile(referenceData, getTestTempFilePath("testdata/file1"), fs, conf);
- DistanceMeasure measure = new EuclideanDistanceMeasure();
- initData(1, 0.25, measure);
- Canopy cluster = new Canopy(new DenseVector(new double[] {0, 0}), 19, measure);
- clusters.add(cluster);
- List<VectorWritable> points = new ArrayList<>();
- points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {1, 1}))));
- representativePoints.put(cluster.getId(), points);
- CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
- System.out.println("CDbw = " + evaluator.getCDbw());
- System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
- System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
- System.out.println("Separation = " + evaluator.separation());
- }
-
- /**
- * Representative points extraction will duplicate the cluster center if the cluster has no assigned points. These
- * clusters should be ignored like empty clusters above
- *
- * @throws IOException
- */
- @Test
- public void testAllSameValueCluster() throws IOException {
- ClusteringTestUtils.writePointsToFile(referenceData, getTestTempFilePath("testdata/file1"), fs, conf);
- DistanceMeasure measure = new EuclideanDistanceMeasure();
- initData(1, 0.25, measure);
- Canopy cluster = new Canopy(new DenseVector(new double[] {0, 0}), 19, measure);
- clusters.add(cluster);
- List<VectorWritable> points = new ArrayList<>();
- points.add(new VectorWritable(cluster.getCenter()));
- points.add(new VectorWritable(cluster.getCenter()));
- points.add(new VectorWritable(cluster.getCenter()));
- representativePoints.put(cluster.getId(), points);
- CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
- System.out.println("CDbw = " + evaluator.getCDbw());
- System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
- System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
- System.out.println("Separation = " + evaluator.separation());
- }
-
- /**
- * Clustering can produce extremely tight clusters whose near-zero spread can make the standard-deviation
- * calculation fail. These clusters should still be processed correctly.
- *
- * @throws IOException
- */
- @Test
- public void testAlmostSameValueCluster() throws IOException {
- ClusteringTestUtils.writePointsToFile(referenceData, getTestTempFilePath("testdata/file1"), fs, conf);
- DistanceMeasure measure = new EuclideanDistanceMeasure();
- initData(1, 0.25, measure);
- Canopy cluster = new Canopy(new DenseVector(new double[] {0, 0}), 19, measure);
- clusters.add(cluster);
- List<VectorWritable> points = new ArrayList<>();
- Vector delta = new DenseVector(new double[] {0, Double.MIN_NORMAL});
- points.add(new VectorWritable(delta.clone()));
- points.add(new VectorWritable(delta.clone()));
- points.add(new VectorWritable(delta.clone()));
- points.add(new VectorWritable(delta.clone()));
- points.add(new VectorWritable(delta.clone()));
- representativePoints.put(cluster.getId(), points);
- CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
- System.out.println("CDbw = " + evaluator.getCDbw());
- System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
- System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
- System.out.println("Separation = " + evaluator.separation());
- }
-
- @Test
- public void testCanopy() throws Exception {
- ClusteringTestUtils.writePointsToFile(sampleData, getTestTempFilePath("testdata/file1"), fs, conf);
- DistanceMeasure measure = new EuclideanDistanceMeasure();
- CanopyDriver.run(getConfiguration(), testdata, output, measure, 3.1, 2.1, true, 0.0, true);
- int numIterations = 10;
- Path clustersIn = new Path(output, "clusters-0-final");
- RepresentativePointsDriver.run(conf, clustersIn, new Path(output, "clusteredPoints"), output, measure,
- numIterations, true);
- CDbwEvaluator evaluator = new CDbwEvaluator(conf, clustersIn);
- // printRepPoints(numIterations);
- // now print out the Results
- System.out.println("Canopy CDbw = " + evaluator.getCDbw());
- System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
- System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
- System.out.println("Separation = " + evaluator.separation());
- }
-
- @Test
- public void testKmeans() throws Exception {
- ClusteringTestUtils.writePointsToFile(sampleData, getTestTempFilePath("testdata/file1"), fs, conf);
- DistanceMeasure measure = new EuclideanDistanceMeasure();
- // now run the Canopy job to prime kMeans canopies
- CanopyDriver.run(getConfiguration(), testdata, output, measure, 3.1, 2.1, false, 0.0, true);
- // now run the KMeans job
- Path kmeansOutput = new Path(output, "kmeans");
- KMeansDriver.run(testdata, new Path(output, "clusters-0-final"), kmeansOutput, 0.001, 10, true, 0.0, true);
- int numIterations = 10;
- Path clustersIn = new Path(kmeansOutput, "clusters-10-final");
- RepresentativePointsDriver.run(conf, clustersIn, new Path(kmeansOutput, "clusteredPoints"), kmeansOutput, measure,
- numIterations, true);
- CDbwEvaluator evaluator = new CDbwEvaluator(conf, clustersIn);
- RepresentativePointsDriver.printRepresentativePoints(kmeansOutput, numIterations);
- // now print out the Results
- System.out.println("K-Means CDbw = " + evaluator.getCDbw());
- System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
- System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
- System.out.println("Separation = " + evaluator.separation());
- }
-
- @Test
- public void testFuzzyKmeans() throws Exception {
- ClusteringTestUtils.writePointsToFile(sampleData, getTestTempFilePath("testdata/file1"), fs, conf);
- DistanceMeasure measure = new EuclideanDistanceMeasure();
- // now run the Canopy job to prime kMeans canopies
- CanopyDriver.run(getConfiguration(), testdata, output, measure, 3.1, 2.1, false, 0.0, true);
- Path fuzzyKMeansOutput = new Path(output, "fuzzyk");
- // now run the FuzzyKMeans job
- FuzzyKMeansDriver.run(testdata, new Path(output, "clusters-0-final"), fuzzyKMeansOutput, 0.001, 10, 2,
- true, true, 0, true);
- int numIterations = 10;
- Path clustersIn = new Path(fuzzyKMeansOutput, "clusters-4");
- RepresentativePointsDriver.run(conf, clustersIn, new Path(fuzzyKMeansOutput, "clusteredPoints"), fuzzyKMeansOutput,
- measure, numIterations, true);
- CDbwEvaluator evaluator = new CDbwEvaluator(conf, clustersIn);
- RepresentativePointsDriver.printRepresentativePoints(fuzzyKMeansOutput, numIterations);
- // now print out the Results
- System.out.println("Fuzzy K-Means CDbw = " + evaluator.getCDbw());
- System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
- System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
- System.out.println("Separation = " + evaluator.separation());
- }
-
-}

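testAlmostSameValueCluster above probes the failure mode its Javadoc describes: representative points separated only by Double.MIN_NORMAL, whose variance underflows to zero. A hedged Java sketch of the kind of guard such density code needs follows; it illustrates the general technique, not the removed CDbwEvaluator internals.

    public final class SafeStdev {

      // Sample standard deviation with a floor, so clusters whose points differ
      // only by Double.MIN_NORMAL cannot produce a zero stdev that later
      // density terms would divide by.
      static double stdev(double[] xs) {
        double sum = 0.0;
        for (double x : xs) {
          sum += x;
        }
        double mean = sum / xs.length;
        double ss = 0.0;
        for (double x : xs) {
          ss += (x - mean) * (x - mean);
        }
        double sd = Math.sqrt(ss / (xs.length - 1));
        return Math.max(sd, Double.MIN_NORMAL); // keep the result strictly positive
      }

      public static void main(String[] args) {
        // spreads this small underflow to zero variance without the floor
        double[] tight = {0.0, Double.MIN_NORMAL, 0.0, Double.MIN_NORMAL};
        System.out.println(stdev(tight)); // tiny but non-zero
      }
    }
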
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/text/MailArchivesClusteringAnalyzerTest.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/text/MailArchivesClusteringAnalyzerTest.java b/integration/src/test/java/org/apache/mahout/text/MailArchivesClusteringAnalyzerTest.java
deleted file mode 100644
index ba73c82..0000000
--- a/integration/src/test/java/org/apache/mahout/text/MailArchivesClusteringAnalyzerTest.java
+++ /dev/null
@@ -1,66 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.text;
-
-import java.io.Reader;
-import java.io.StringReader;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.mahout.common.MahoutTestCase;
-import org.junit.Test;
-
-/**
- * Unit tests for the MailArchivesClusteringAnalyzer text analyzer.
- */
-public class MailArchivesClusteringAnalyzerTest extends MahoutTestCase {
-
- @Test
- public void testAnalysis() throws Exception {
- Analyzer analyzer = new MailArchivesClusteringAnalyzer();
-
- String text = "A test message\n"
- + "atokenthatistoolongtobeusefulforclustertextanalysis\n"
- + "Mahout is a scalable, machine-learning LIBRARY\n"
- + "we've added some additional stopwords such as html, mailto, regards\t"
- + "apache_hadoop provides the foundation for scalability\n"
- + "www.nabble.com general-***@incubator.apache.org\n"
- + "public void int protected package";
- Reader reader = new StringReader(text);
-
- // if you change the text above, then you may need to change this as well
- // order matters too
- String[] expectedTokens = {
- "test", "mahout", "scalabl", "machin", "learn", "librari", "weve", "ad",
- "stopword", "apache_hadoop","provid", "foundat", "scalabl"
- };
-
- TokenStream tokenStream = analyzer.tokenStream("test", reader);
- assertNotNull(tokenStream);
- tokenStream.reset();
- CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
- int e = 0;
- while (tokenStream.incrementToken() && e < expectedTokens.length) {
- assertEquals(expectedTokens[e++], termAtt.toString());
- }
- assertEquals(expectedTokens.length, e);
- tokenStream.end();
- tokenStream.close();
- }
-}

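The loop above follows Lucene's TokenStream consumption contract, which is strict about call order: reset() before the first incrementToken(), then end() and close() once the stream is exhausted. Here is a generic sketch of that pattern with Lucene's stock StandardAnalyzer standing in for the removed Mahout analyzer (the no-argument StandardAnalyzer constructor is an assumption that holds for recent Lucene versions):

    import java.io.IOException;
    import java.io.StringReader;
    import java.util.ArrayList;
    import java.util.List;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    public final class TokenizeExample {

      static List<String> tokens(Analyzer analyzer, String text) throws IOException {
        List<String> result = new ArrayList<>();
        try (TokenStream ts = analyzer.tokenStream("test", new StringReader(text))) {
          CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
          ts.reset();                      // must precede the first incrementToken()
          while (ts.incrementToken()) {
            result.add(termAtt.toString());
          }
          ts.end();                        // finalize state before close()
        }
        return result;
      }

      public static void main(String[] args) throws IOException {
        System.out.println(tokens(new StandardAnalyzer(), "Mahout is a scalable library"));
      }
    }
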
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromMailArchivesTest.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromMailArchivesTest.java b/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromMailArchivesTest.java
deleted file mode 100644
index ef2b8a6..0000000
--- a/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromMailArchivesTest.java
+++ /dev/null
@@ -1,240 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.mahout.text;
-
-import java.io.File;
-import java.io.FileOutputStream;
-import java.util.zip.GZIPOutputStream;
-
-import org.apache.commons.lang3.SystemUtils;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.Text;
-import org.apache.mahout.common.MahoutTestCase;
-import org.apache.mahout.common.Pair;
-import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterator;
-import org.junit.Assert;
-import org.junit.Before;
-import org.junit.Test;
-
-/**
- * Test case for the SequenceFilesFromMailArchives command-line application.
- */
-public final class SequenceFilesFromMailArchivesTest extends MahoutTestCase {
-
- private File inputDir;
-
- /**
- * Create the input and output directories needed for testing
- * the SequenceFilesFromMailArchives application.
- */
- @Override
- @Before
- public void setUp() throws Exception {
- super.setUp();
- inputDir = getTestTempDir("mail-archives-in");
-
- // write test mail messages to a gzipped file in a nested directory
- File subDir = new File(inputDir, "subdir");
- subDir.mkdir();
- File gzFile = new File(subDir, "mail-messages.gz");
- try (GZIPOutputStream gzOut = new GZIPOutputStream(new FileOutputStream(gzFile))) {
- gzOut.write(testMailMessages.getBytes("UTF-8"));
- gzOut.finish();
- }
-
- File subDir2 = new File(subDir, "subsubdir");
- subDir2.mkdir();
- File gzFile2 = new File(subDir2, "mail-messages-2.gz");
- try (GZIPOutputStream gzOut = new GZIPOutputStream(new FileOutputStream(gzFile2))) {
- gzOut.write(testMailMessages.getBytes("UTF-8"));
- gzOut.finish();
- }
- }
-
- @Test
- public void testSequential() throws Exception {
-
- File outputDir = this.getTestTempDir("mail-archives-out");
-
- String[] args = {
- "--input", inputDir.getAbsolutePath(),
- "--output", outputDir.getAbsolutePath(),
- "--charset", "UTF-8",
- "--keyPrefix", "TEST",
- "--method", "sequential",
- "--body", "--subject", "--separator", ""
- };
-
- // run the application's main method
- SequenceFilesFromMailArchives.main(args);
-
- // app should create a single SequenceFile named "chunk-0" in the output dir
- File expectedChunkFile = new File(outputDir, "chunk-0");
- String expectedChunkPath = expectedChunkFile.getAbsolutePath();
- Assert.assertTrue("Expected chunk file " + expectedChunkPath + " not found!", expectedChunkFile.isFile());
-
- Configuration conf = getConfiguration();
- SequenceFileIterator<Text, Text> iterator = new SequenceFileIterator<>(new Path(expectedChunkPath), true, conf);
- Assert.assertTrue("First key/value pair not found!", iterator.hasNext());
- Pair<Text, Text> record = iterator.next();
-
- File parentFile = new File(new File(new File("TEST"), "subdir"), "mail-messages.gz");
- Assert.assertEquals(new File(parentFile, testVars[0][0]).toString(), record.getFirst().toString());
- Assert.assertEquals(testVars[0][1] + testVars[0][2], record.getSecond().toString());
-
- Assert.assertTrue("Second key/value pair not found!", iterator.hasNext());
-
- record = iterator.next();
- Assert.assertEquals(new File(parentFile, testVars[1][0]).toString(), record.getFirst().toString());
- Assert.assertEquals(testVars[1][1] + testVars[1][2], record.getSecond().toString());
-
- Assert.assertTrue("Third key/value pair not found!", iterator.hasNext());
- record = iterator.next();
- File parentFileSubSubDir = new File(new File(new File(new File("TEST"), "subdir"), "subsubdir"), "mail-messages-2.gz");
- Assert.assertEquals(new File(parentFileSubSubDir, testVars[0][0]).toString(), record.getFirst().toString());
- Assert.assertEquals(testVars[0][1] + testVars[0][2], record.getSecond().toString());
-
- Assert.assertTrue("Second key/value pair not found!", iterator.hasNext());
- record = iterator.next();
- Assert.assertEquals(new File(parentFileSubSubDir, testVars[1][0]).toString(), record.getFirst().toString());
- Assert.assertEquals(testVars[1][1] + testVars[1][2], record.getSecond().toString());
-
- Assert.assertFalse("Only two key/value pairs expected!", iterator.hasNext());
- }
-
- @Test
- public void testMapReduce() throws Exception {
-
- Path tmpDir = getTestTempDirPath();
- Path mrOutputDir = new Path(tmpDir, "mail-archives-out-mr");
- Configuration configuration = getConfiguration();
- FileSystem fs = FileSystem.get(configuration);
-
- File expectedInputFile = new File(inputDir.toString());
-
- String[] args = {
- "-Dhadoop.tmp.dir=" + configuration.get("hadoop.tmp.dir"),
- "--input", expectedInputFile.getAbsolutePath(),
- "--output", mrOutputDir.toString(),
- "--charset", "UTF-8",
- "--keyPrefix", "TEST",
- "--method", "mapreduce",
- "--body", "--subject", "--separator", ""
- };
-
- // run the application's main method
- SequenceFilesFromMailArchives.main(args);
-
- // the MR job should create a single sequence file named "part-m-00000" in the output dir
- FileStatus[] fileStatuses = fs.listStatus(mrOutputDir.suffix("/part-m-00000"));
- assertEquals(1, fileStatuses.length); // only one
- assertEquals("part-m-00000", fileStatuses[0].getPath().getName());
- SequenceFileIterator<Text, Text> iterator =
- new SequenceFileIterator<>(mrOutputDir.suffix("/part-m-00000"), true, configuration);
-
- Assert.assertTrue("First key/value pair not found!", iterator.hasNext());
- Pair<Text, Text> record = iterator.next();
-
- File parentFileSubSubDir = new File(new File(new File(new File("TEST"), "subdir"), "subsubdir"), "mail-messages-2.gz");
-
- String actualKey = record.getFirst().toString();
- if (SystemUtils.IS_OS_WINDOWS) {
- actualKey = actualKey.replace("/", "\\");
- }
- Assert.assertEquals(new File(parentFileSubSubDir, testVars[0][0]).toString(), actualKey);
- Assert.assertEquals(testVars[0][1] + testVars[0][2], record.getSecond().toString());
- Assert.assertTrue("Second key/value pair not found!", iterator.hasNext());
-
- record = iterator.next();
- actualKey = record.getFirst().toString();
- if (SystemUtils.IS_OS_WINDOWS) {
- actualKey = actualKey.replace("/", "\\");
- }
- Assert.assertEquals(new File(parentFileSubSubDir, testVars[1][0]).toString(), actualKey);
- Assert.assertEquals(testVars[1][1] + testVars[1][2], record.getSecond().toString());
-
- // test the other gzipped file
- File parentFile = new File(new File(new File("TEST"), "subdir"), "mail-messages.gz");
- Assert.assertTrue("Third key/value pair not found!", iterator.hasNext());
- record = iterator.next();
- actualKey = record.getFirst().toString();
- if (SystemUtils.IS_OS_WINDOWS) {
- actualKey = actualKey.replace("/", "\\");
- }
- Assert.assertEquals(new File(parentFile, testVars[0][0]).toString(), actualKey);
- Assert.assertEquals(testVars[0][1] + testVars[0][2], record.getSecond().toString());
- Assert.assertTrue("Second key/value pair not found!", iterator.hasNext());
-
- record = iterator.next();
- expected = record.getFirst().toString();
- if (SystemUtils.IS_OS_WINDOWS) {
- expected = expected.replace("/", "\\");
- }
- Assert.assertEquals(new File(parentFile, testVars[1][0]).toString(), expected);
- Assert.assertEquals(testVars[1][1] + testVars[1][2], record.getSecond().toString());
- Assert.assertFalse("Only four key/value pairs expected!", iterator.hasNext());
- }
-
- // Messages extracted and made anonymous from the ASF mail archives
- private static final String[][] testVars = {
- new String[] {
- "***@example.com",
- "Ant task for JDK1.1 collections build option",
- "\nThis is just a test message\n--\nTesty McTester\n"
- },
- new String[] {
- "***@example.com",
- "Problem with build files in several directories",
- "\nHi all,\nThis is another test message.\nRegards,\nAnother Test\n"
- }
- };
-
- private static final String testMailMessages =
- "From ***@example.com Mon Jul 24 19:13:53 2000\n"
- + "Return-Path: <***@example.com>\n"
- + "Mailing-List: contact ant-user-***@jakarta.apache.org; run by ezmlm\n"
- + "Delivered-To: mailing list ant-***@jakarta.apache.org\n"
- + "Received: (qmail 49267 invoked from network); 24 Jul 2000 19:13:53 -0000\n"
- + "Message-ID: <" + testVars[0][0] + ">\n"
- + "From: \"Testy McTester\" <***@example.com>\n"
- + "To: <ant-***@jakarta.apache.org>\n"
- + "Subject: " + testVars[0][1] + '\n'
- + "Date: Mon, 24 Jul 2000 12:24:56 -0700\n"
- + "MIME-Version: 1.0\n"
- + "Content-Type: text/plain;\n"
- + " charset=\"Windows-1252\"\n"
- + "Content-Transfer-Encoding: 7bit\n"
- + "X-Spam-Rating: locus.apache.org 1.6.2 0/1000/N\n"
- + testVars[0][2] + '\n'
- + "From ***@example.com Wed Jul 26 11:32:16 2000\n"
- + "Return-Path: <***@example.com>\n"
- + "Mailing-List: contact ant-user-***@jakarta.apache.org; run by ezmlm\n"
- + "Delivered-To: mailing list ant-***@jakarta.apache.org\n"
- + "Received: (qmail 73966 invoked from network); 26 Jul 2000 11:32:16 -0000\n"
- + "User-Agent: Microsoft-Outlook-Express-Macintosh-Edition/5.02.2022\n"
- + "Date: Wed, 26 Jul 2000 13:32:08 +0200\n"
- + "Subject: " + testVars[1][1] + '\n'
- + "From: Another Test <***@example.com>\n"
- + "To: <ant-***@jakarta.apache.org>\n"
- + "Message-Id: <" + testVars[1][0] + ">\n"
- + "Mime-Version: 1.0\n"
- + "Content-Type: text/plain; charset=\"US-ASCII\"\n"
- + "Content-Transfer-Encoding: 7bit\n"
- + "X-Spam-Rating: locus.apache.org 1.6.2 0/1000/N\n"
- + testVars[1][2];
-}

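The assertions above all read chunk files back through Mahout's SequenceFileIterator of Text key/value pairs. As a minimal usage sketch (the chunk path below is a placeholder, not a path from this test):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.Text;
    import org.apache.mahout.common.Pair;
    import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterator;

    public final class DumpChunk {

      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // placeholder path: point this at a chunk produced by the tool above
        Path chunk = new Path("/tmp/mail-archives-out/chunk-0");
        try (SequenceFileIterator<Text, Text> it = new SequenceFileIterator<>(chunk, true, conf)) {
          while (it.hasNext()) {
            Pair<Text, Text> record = it.next();
            System.out.println(record.getFirst() + " => " + record.getSecond());
          }
        }
      }
    }
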
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/text/TestPathFilter.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/text/TestPathFilter.java b/integration/src/test/java/org/apache/mahout/text/TestPathFilter.java
deleted file mode 100644
index 227521a..0000000
--- a/integration/src/test/java/org/apache/mahout/text/TestPathFilter.java
+++ /dev/null
@@ -1,32 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.mahout.text;
-
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.fs.PathFilter;
-
-/**
- * Dummy Path Filter for testing the MapReduce version of
- * SequenceFilesFromDirectory
- */
-public class TestPathFilter implements PathFilter {
-
- @Override
- public boolean accept(Path path) {
- return path.getName().startsWith("t") || path.getName().startsWith("r") || path.getName().startsWith("f");
- }
-}

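A filter like this is typically handed to FileSystem.listStatus so that only matching children are returned. A short, hedged sketch of exercising such a filter outside a MapReduce job (the directory below is a placeholder):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileStatus;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.fs.PathFilter;

    public final class ListFiltered {

      public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.get(new Configuration());
        // same spirit as TestPathFilter above: accept names starting with "t"
        PathFilter filter = path -> path.getName().startsWith("t");
        for (FileStatus status : fs.listStatus(new Path("/tmp"), filter)) {
          System.out.println(status.getPath());
        }
      }
    }
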
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/text/TestSequenceFilesFromDirectory.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/text/TestSequenceFilesFromDirectory.java b/integration/src/test/java/org/apache/mahout/text/TestSequenceFilesFromDirectory.java
deleted file mode 100644
index 040c8e4..0000000
--- a/integration/src/test/java/org/apache/mahout/text/TestSequenceFilesFromDirectory.java
+++ /dev/null
@@ -1,313 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.text;
-
-import java.io.File;
-import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.util.HashMap;
-import java.util.Map;
-
-import org.apache.commons.io.Charsets;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.Text;
-import org.apache.mahout.common.HadoopUtil;
-import org.apache.mahout.common.MahoutTestCase;
-import org.apache.mahout.common.Pair;
-import org.apache.mahout.common.iterator.sequencefile.PathFilters;
-import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterator;
-import org.junit.Test;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-public final class TestSequenceFilesFromDirectory extends MahoutTestCase {
-
- private static final Logger logger = LoggerFactory.getLogger(TestSequenceFilesFromDirectory.class);
-
- private static final String[][] DATA1 = {
- {"test1", "This is the first text."},
- {"test2", "This is the second text."},
- {"test3", "This is the third text."}
- };
-
- private static final String[][] DATA2 = {
- {"recursive_test1", "This is the first text."},
- {"recursive_test2", "This is the second text."},
- {"recursive_test3", "This is the third text."}
- };
-
- @Test
- public void testSequenceFileFromDirectoryBasic() throws Exception {
- // parameters
- Configuration configuration = getConfiguration();
-
- FileSystem fs = FileSystem.get(configuration);
-
- // create
- Path tmpDir = this.getTestTempDirPath();
- Path inputDir = new Path(tmpDir, "inputDir");
- fs.mkdirs(inputDir);
-
- Path outputDir = new Path(tmpDir, "outputDir");
- Path outputDirRecursive = new Path(tmpDir, "outputDirRecursive");
-
- Path inputDirRecursive = new Path(tmpDir, "inputDirRecur");
- fs.mkdirs(inputDirRecursive);
-
- // prepare input files
- createFilesFromArrays(configuration, inputDir, DATA1);
-
- SequenceFilesFromDirectory.main(new String[]{
- "--input", inputDir.toString(),
- "--output", outputDir.toString(),
- "--chunkSize", "64",
- "--charset", Charsets.UTF_8.name(),
- "--keyPrefix", "UID",
- "--method", "sequential"});
-
- // check output chunk files
- checkChunkFiles(configuration, outputDir, DATA1, "UID");
-
- createRecursiveDirFilesFromArrays(configuration, inputDirRecursive, DATA2);
-
- FileStatus fstInputPath = fs.getFileStatus(inputDirRecursive);
- String dirs = HadoopUtil.buildDirList(fs, fstInputPath);
-
- System.out.println("\n\n ----- recursive dirs: " + dirs);
- SequenceFilesFromDirectory.main(new String[]{
- "--input", inputDirRecursive.toString(),
- "--output", outputDirRecursive.toString(),
- "--chunkSize", "64",
- "--charset", Charsets.UTF_8.name(),
- "--keyPrefix", "UID",
- "--method", "sequential"});
-
- checkRecursiveChunkFiles(configuration, outputDirRecursive, DATA2, "UID");
- }
-
- @Test
- public void testSequenceFileFromDirectoryMapReduce() throws Exception {
-
- Configuration conf = getConfiguration();
-
- FileSystem fs = FileSystem.get(conf);
-
- // create
- Path tmpDir = this.getTestTempDirPath();
- Path inputDir = new Path(tmpDir, "inputDir");
- fs.mkdirs(inputDir);
-
- Path inputDirRecur = new Path(tmpDir, "inputDirRecur");
- fs.mkdirs(inputDirRecur);
-
- Path mrOutputDir = new Path(tmpDir, "mrOutputDir");
- Path mrOutputDirRecur = new Path(tmpDir, "mrOutputDirRecur");
-
- createFilesFromArrays(conf, inputDir, DATA1);
-
- SequenceFilesFromDirectory.main(new String[]{
- "-Dhadoop.tmp.dir=" + conf.get("hadoop.tmp.dir"),
- "--input", inputDir.toString(),
- "--output", mrOutputDir.toString(),
- "--chunkSize", "64",
- "--charset", Charsets.UTF_8.name(),
- "--method", "mapreduce",
- "--keyPrefix", "UID",
- "--fileFilterClass", "org.apache.mahout.text.TestPathFilter"
- });
-
- checkMRResultFiles(conf, mrOutputDir, DATA1, "UID");
-
- createRecursiveDirFilesFromArrays(conf, inputDirRecur, DATA2);
-
- FileStatus fstInputPath = fs.getFileStatus(inputDirRecur);
- String dirs = HadoopUtil.buildDirList(fs, fstInputPath);
-
- logger.info("\n\n ---- recursive dirs: {}", dirs);
-
- SequenceFilesFromDirectory.main(new String[]{
- "-Dhadoop.tmp.dir=" + conf.get("hadoop.tmp.dir"),
- "--input", inputDirRecur.toString(),
- "--output", mrOutputDirRecur.toString(),
- "--chunkSize", "64",
- "--charset", Charsets.UTF_8.name(),
- "--method", "mapreduce",
- "--keyPrefix", "UID",
- "--fileFilterClass", "org.apache.mahout.text.TestPathFilter"
- });
-
- checkMRResultFilesRecursive(conf, mrOutputDirRecur, DATA2, "UID");
- }
-
-
- private static void createFilesFromArrays(Configuration conf, Path inputDir, String[][] data) throws IOException {
- FileSystem fs = FileSystem.get(conf);
- for (String[] aData : data) {
- try (OutputStreamWriter writer =
- new OutputStreamWriter(fs.create(new Path(inputDir, aData[0])), Charsets.UTF_8)){
- writer.write(aData[1]);
- }
- }
- }
-
- private static void createRecursiveDirFilesFromArrays(Configuration configuration, Path inputDir,
- String[][] data) throws IOException {
- FileSystem fs = FileSystem.get(configuration);
-
- logger.info("creativeRecursiveDirFilesFromArrays > based on: {}", inputDir.toString());
- Path curPath;
- String currentRecursiveDir = inputDir.toString();
-
- for (String[] aData : data) {
- currentRecursiveDir += "/" + aData[0];
- File subDir = new File(currentRecursiveDir);
- subDir.mkdir();
-
- curPath = new Path(subDir.toString(), "file.txt");
- logger.info("Created file: {}", curPath.toString());
-
- try (OutputStreamWriter writer = new OutputStreamWriter(fs.create(curPath), Charsets.UTF_8)){
- writer.write(aData[1]);
- }
- }
- }
-
- private static void checkChunkFiles(Configuration configuration,
- Path outputDir,
- String[][] data,
- String prefix) throws IOException {
- FileSystem fs = FileSystem.get(configuration);
-
- // output exists?
- FileStatus[] fileStatuses = fs.listStatus(outputDir, PathFilters.logsCRCFilter());
- assertEquals(1, fileStatuses.length); // only one
- assertEquals("chunk-0", fileStatuses[0].getPath().getName());
-
- Map<String, String> fileToData = new HashMap<>();
- for (String[] aData : data) {
- fileToData.put(prefix + Path.SEPARATOR + aData[0], aData[1]);
- }
-
- // read a chunk to check content
- try (SequenceFileIterator<Text, Text> iterator =
- new SequenceFileIterator<>(fileStatuses[0].getPath(), true, configuration)){
- while (iterator.hasNext()) {
- Pair<Text, Text> record = iterator.next();
- String retrievedData = fileToData.get(record.getFirst().toString().trim());
- assertNotNull(retrievedData);
- assertEquals(retrievedData, record.getSecond().toString().trim());
- }
- }
- }
-
- private static void checkRecursiveChunkFiles(Configuration configuration,
- Path outputDir,
- String[][] data,
- String prefix) throws IOException {
- FileSystem fs = FileSystem.get(configuration);
-
- System.out.println(" ----------- check_Recursive_ChunkFiles ------------");
-
- // output exists?
- FileStatus[] fileStatuses = fs.listStatus(outputDir, PathFilters.logsCRCFilter());
- assertEquals(1, fileStatuses.length); // only one
- assertEquals("chunk-0", fileStatuses[0].getPath().getName());
-
-
- Map<String, String> fileToData = new HashMap<>();
- String currentPath = prefix;
- for (String[] aData : data) {
- currentPath += Path.SEPARATOR + aData[0];
- fileToData.put(currentPath + Path.SEPARATOR + "file.txt", aData[1]);
- }
-
- // read a chunk to check content
- try (SequenceFileIterator<Text, Text> iterator =
- new SequenceFileIterator<>(fileStatuses[0].getPath(), true, configuration)) {
- while (iterator.hasNext()) {
- Pair<Text, Text> record = iterator.next();
- String retrievedData = fileToData.get(record.getFirst().toString().trim());
- System.out.printf("%s >> %s\n", record.getFirst().toString().trim(), record.getSecond().toString().trim());
-
- assertNotNull(retrievedData);
- assertEquals(retrievedData, record.getSecond().toString().trim());
- System.out.printf(">>> k: %s, v: %s\n", record.getFirst().toString(), record.getSecond().toString());
- }
- }
- }
-
- private static void checkMRResultFiles(Configuration conf, Path outputDir,
- String[][] data, String prefix) throws IOException {
- FileSystem fs = FileSystem.get(conf);
-
- // output exists?
- FileStatus[] fileStatuses = fs.listStatus(outputDir.suffix("/part-m-00000"), PathFilters.logsCRCFilter());
- assertEquals(1, fileStatuses.length); // only one
- assertEquals("part-m-00000", fileStatuses[0].getPath().getName());
- Map<String, String> fileToData = new HashMap<>();
- for (String[] aData : data) {
- System.out.printf("map.put: %s %s\n", prefix + Path.SEPARATOR + aData[0], aData[1]);
- fileToData.put(prefix + Path.SEPARATOR + aData[0], aData[1]);
- }
-
- // read a chunk to check content
- try (SequenceFileIterator<Text, Text> iterator = new SequenceFileIterator<>(
- fileStatuses[0].getPath(), true, conf)) {
- while (iterator.hasNext()) {
- Pair<Text, Text> record = iterator.next();
- String retrievedData = fileToData.get(record.getFirst().toString().trim());
-
- System.out.printf("MR> %s >> %s\n", record.getFirst().toString().trim(), record.getSecond().toString().trim());
- assertNotNull(retrievedData);
- assertEquals(retrievedData, record.getSecond().toString().trim());
- }
- }
- }
-
- private static void checkMRResultFilesRecursive(Configuration configuration, Path outputDir,
- String[][] data, String prefix) throws IOException {
- FileSystem fs = FileSystem.get(configuration);
-
- // output exists?
- FileStatus[] fileStatuses = fs.listStatus(outputDir.suffix("/part-m-00000"), PathFilters.logsCRCFilter());
- assertEquals(1, fileStatuses.length); // only one
- assertEquals("part-m-00000", fileStatuses[0].getPath().getName());
- Map<String, String> fileToData = new HashMap<>();
- String currentPath = prefix;
-
- for (String[] aData : data) {
- currentPath += Path.SEPARATOR + aData[0];
- fileToData.put(currentPath + Path.SEPARATOR + "file.txt", aData[1]);
- }
-
- // read a chunk to check content
- try (SequenceFileIterator<Text, Text> iterator = new SequenceFileIterator<>(
- fileStatuses[0].getPath(), true, configuration)){
- while (iterator.hasNext()) {
- Pair<Text, Text> record = iterator.next();
- System.out.printf("MR-Recur > Trying to check: %s\n", record.getFirst().toString().trim());
- String retrievedData = fileToData.get(record.getFirst().toString().trim());
- assertNotNull(retrievedData);
- assertEquals(retrievedData, record.getSecond().toString().trim());
- }
- }
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/text/doc/MultipleFieldsDocument.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/text/doc/MultipleFieldsDocument.java b/integration/src/test/java/org/apache/mahout/text/doc/MultipleFieldsDocument.java
deleted file mode 100644
index 7483b2d..0000000
--- a/integration/src/test/java/org/apache/mahout/text/doc/MultipleFieldsDocument.java
+++ /dev/null
@@ -1,58 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.mahout.text.doc;
-
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.TextField;
-
-/**
- * Used for testing lucene2seq
- */
-@Deprecated
-public class MultipleFieldsDocument extends SingleFieldDocument {
-
- public static final String FIELD1 = "field1";
- public static final String FIELD2 = "field2";
-
- private String field1;
- private String field2;
-
- public MultipleFieldsDocument(String id, String field, String field1, String field2) {
- super(id, field);
- this.field1 = field1;
- this.field2 = field2;
- }
-
- public String getField1() {
- return field1;
- }
-
- public String getField2() {
- return field2;
- }
-
- @Override
- public Document asLuceneDocument() {
- Document document = super.asLuceneDocument();
-
- document.add(new TextField(FIELD1, this.field1, Field.Store.YES));
- document.add(new TextField(FIELD2, this.field2, Field.Store.YES));
-
- return document;
- }
-}
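
A brief, hypothetical usage sketch: it compiles only against the removed classes, and it assumes the two-argument SingleFieldDocument constructor stores an id plus a primary field, as the super call above implies.

    public final class MultipleFieldsDocumentDemo {

      public static void main(String[] args) {
        MultipleFieldsDocument doc =
            new MultipleFieldsDocument("doc1", "main text", "extra one", "extra two");
        org.apache.lucene.document.Document luceneDoc = doc.asLuceneDocument();
        // FIELD1 and FIELD2 are added with Field.Store.YES, so get() returns them
        System.out.println(luceneDoc.get(MultipleFieldsDocument.FIELD1)); // extra one
        System.out.println(luceneDoc.get(MultipleFieldsDocument.FIELD2)); // extra two
      }
    }
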
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/algorithms/Model.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/Model.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/Model.scala
deleted file mode 100644
index 0fbe8ac..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/Model.scala
+++ /dev/null
@@ -1,26 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.mahout.math.algorithms
-
-trait Model extends Serializable {
-
- var summary: String = ""
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/algorithms/SupervisedFitter.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/SupervisedFitter.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/SupervisedFitter.scala
deleted file mode 100644
index bf85dee..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/SupervisedFitter.scala
+++ /dev/null
@@ -1,29 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.mahout.math.algorithms
-
-import org.apache.mahout.math.drm.DrmLike
-
-trait SupervisedFitter[K, M <: SupervisedModel[K]] extends Fitter {
-
- def fit(drmX : DrmLike[K],
- drmTarget: DrmLike[K],
- hyperparameters: (Symbol, Any)*): M
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/algorithms/SupervisedModel.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/SupervisedModel.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/SupervisedModel.scala
deleted file mode 100644
index 57c20e7..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/SupervisedModel.scala
+++ /dev/null
@@ -1,26 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.mahout.math.algorithms
-
-import scala.collection.mutable
-
-trait SupervisedModel[K] extends Model {
- var testResults: mutable.Map[Symbol, Any] = mutable.Map[Symbol, Any]()
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/algorithms/UnsupervisedFitter.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/UnsupervisedFitter.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/UnsupervisedFitter.scala
deleted file mode 100644
index 5c191d1..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/UnsupervisedFitter.scala
+++ /dev/null
@@ -1,28 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.mahout.math.algorithms
-
-import org.apache.mahout.math.drm.DrmLike
-
-trait UnsupervisedFitter extends Fitter {
-
- def fit[K](input: DrmLike[K],
- hyperparameters: (Symbol, Any)*): UnsupervisedModel
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/algorithms/UnsupervisedModel.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/UnsupervisedModel.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/UnsupervisedModel.scala
deleted file mode 100644
index f8ff341..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/UnsupervisedModel.scala
+++ /dev/null
@@ -1,24 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.mahout.math.algorithms
-
-trait UnsupervisedModel extends Model {
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/algorithms/clustering/Canopy.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/clustering/Canopy.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/clustering/Canopy.scala
deleted file mode 100644
index 8f287b0..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/clustering/Canopy.scala
+++ /dev/null
@@ -1,157 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.mahout.math.algorithms.clustering
-
-
-
-import org.apache.mahout.math.algorithms.common.distance.{DistanceMetric, DistanceMetricSelector}
-import org.apache.mahout.math._
-import org.apache.mahout.math.drm._
-import org.apache.mahout.math.drm.RLikeDrmOps._
-import org.apache.mahout.math.function.VectorFunction
-import org.apache.mahout.math.scalabindings._
-import org.apache.mahout.math.scalabindings.RLikeOps._
-import org.apache.mahout.math.{Matrix, Vector}
-
-
-class CanopyClusteringModel(canopies: Matrix, dm: Symbol) extends ClusteringModel {
-
- val canopyCenters = canopies
- val distanceMetric = dm
-
- def cluster[K](input: DrmLike[K]): DrmLike[K] = {
-
- implicit val ctx = input.context
- implicit val ktag = input.keyClassTag
-
- val bcCanopies = drmBroadcast(canopyCenters)
- val bcDM = drmBroadcast(dvec(DistanceMetricSelector.namedMetricLookup(distanceMetric)))
-
- input.mapBlock(1) {
- case (keys, block: Matrix) => {
- val outputMatrix = new DenseMatrix(block.nrow, 1)
-
- val localCanopies: Matrix = bcCanopies.value
- for (i <- 0 until block.nrow) {
- val distanceMetric = DistanceMetricSelector.select(bcDM.value.get(0))
-
-        val cluster = (0 until localCanopies.nrow).foldLeft((-1, Double.MaxValue))((l, r) => {
-          val dist = distanceMetric.distance(localCanopies(r, ::), block(i, ::))
-          if (dist < l._2) {
- (r, dist)
- }
- else {
- l
- }
- })._1
- outputMatrix(i, ::) = dvec(cluster)
- }
- keys -> outputMatrix
- }
- }
- }
-}
-
-
-class CanopyClustering extends ClusteringFitter {
-
- var t1: Double = _ // loose distance
- var t2: Double = _ // tight distance
-  var t3: Double = _ // loose distance for the merge pass (defaults to t1)
-  var t4: Double = _ // tight distance for the merge pass (defaults to t2)
- var distanceMeasure: Symbol = _
-
- def setStandardHyperparameters(hyperparameters: Map[Symbol, Any] = Map('foo -> None)): Unit = {
- t1 = hyperparameters.asInstanceOf[Map[Symbol, Double]].getOrElse('t1, 0.5)
- t2 = hyperparameters.asInstanceOf[Map[Symbol, Double]].getOrElse('t2, 0.1)
- t3 = hyperparameters.asInstanceOf[Map[Symbol, Double]].getOrElse('t3, t1)
- t4 = hyperparameters.asInstanceOf[Map[Symbol, Double]].getOrElse('t4, t2)
-
- distanceMeasure = hyperparameters.asInstanceOf[Map[Symbol, Symbol]].getOrElse('distanceMeasure, 'Cosine)
-
- }
-
- def fit[K](input: DrmLike[K],
- hyperparameters: (Symbol, Any)*): CanopyClusteringModel = {
-
- setStandardHyperparameters(hyperparameters.toMap)
- implicit val ctx = input.context
- implicit val ktag = input.keyClassTag
-
- val dmNumber = DistanceMetricSelector.namedMetricLookup(distanceMeasure)
-
- val distanceBC = drmBroadcast(dvec(t1,t2,t3,t4, dmNumber))
- val canopies = input.allreduceBlock(
- {
-
- // Assign All Points to Clusters
- case (keys, block: Matrix) => {
- val t1_local = distanceBC.value.get(0)
- val t2_local = distanceBC.value.get(1)
- val dm = distanceBC.value.get(4)
- CanopyFn.findCenters(block, DistanceMetricSelector.select(dm), t1_local, t2_local)
- }
- }, {
- // Optionally Merge Clusters that are close enough
- case (oldM: Matrix, newM: Matrix) => {
- val t3_local = distanceBC.value.get(2)
- val t4_local = distanceBC.value.get(3)
- val dm = distanceBC.value.get(4)
- CanopyFn.findCenters(oldM, DistanceMetricSelector.select(dm), t3_local, t4_local)
- }
- })
-
- val model = new CanopyClusteringModel(canopies, distanceMeasure)
- model.summary = s"""CanopyClusteringModel\n${canopies.nrow} Clusters\n${distanceMeasure} distance metric used for calculating distances\nCanopy centers stored in model.canopies where row n coresponds to canopy n"""
- model
- }
-
-
-}
-
-object CanopyFn extends Serializable {
- def findCenters(block: Matrix, distanceMeasure: DistanceMetric, t1: Double, t2: Double): Matrix = {
- var rowAssignedToCanopy = Array.fill(block.nrow) { false }
- val clusterBuf = scala.collection.mutable.ListBuffer.empty[org.apache.mahout.math.Vector]
- while (rowAssignedToCanopy.contains(false)) {
- val rowIndexOfNextUncanopiedVector = rowAssignedToCanopy.indexOf(false)
- clusterBuf += block(rowIndexOfNextUncanopiedVector, ::).cloned
- block(rowIndexOfNextUncanopiedVector, ::) = svec(Nil, cardinality = block.ncol)
- rowAssignedToCanopy(rowIndexOfNextUncanopiedVector) = true
- for (i <- 0 until block.nrow) {
- if (block(i, ::).getNumNonZeroElements > 0) { //
- distanceMeasure.distance(block(i, ::), clusterBuf.last) match {
- case d if d < t2 => {
-
- rowAssignedToCanopy(i) = true
- block(i, ::) = svec(Nil, cardinality = block.ncol)
- }
- case d if d < t1 => {
-
- rowAssignedToCanopy(i) = true
- }
- case d => {}
- }
- }
- }
- }
- dense(clusterBuf)
- }
-}
\ No newline at end of file

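For reference, a minimal usage sketch of the Canopy API removed above, assuming a distributed context is already in scope and a hypothetical feature DRM `drmData`; 't1 (loose) and 't2 (tight) are the distance thresholds defined in the fitter:

    // Sketch only: drmData is an assumed, already-parallelized DrmLike[Int]
    val model = new CanopyClustering().fit(
      drmData, 't1 -> 0.5, 't2 -> 0.1, 'distanceMeasure -> 'Cosine)
    val assignments = model.cluster(drmData) // single-column DRM of canopy indices
    println(model.summary)
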
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/algorithms/clustering/ClusteringModel.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/clustering/ClusteringModel.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/clustering/ClusteringModel.scala
deleted file mode 100644
index 8ab1170..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/clustering/ClusteringModel.scala
+++ /dev/null
@@ -1,45 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.mahout.math.algorithms.clustering
-
-import org.apache.mahout.math.algorithms.{UnsupervisedFitter, UnsupervisedModel}
-import org.apache.mahout.math.drm.DrmLike
-
-trait ClusteringModel extends UnsupervisedModel {
-
- def cluster[K](input: DrmLike[K]): DrmLike[K]
-
-}
-
-trait ClusteringFitter extends UnsupervisedFitter {
-
- def fit[K](input: DrmLike[K],
- hyperparameters: (Symbol, Any)*): ClusteringModel
-
- def fitCluster[K](input: DrmLike[K],
- hyperparameters: (Symbol, Any)*): DrmLike[K] = {
- model = this.fit(input, hyperparameters:_*)
- model.cluster(input)
-
- }
-
- // used to store the model if `fitTransform` method called
- var model: ClusteringModel = _
-}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/algorithms/common/distance/DistanceMetrics.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/common/distance/DistanceMetrics.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/common/distance/DistanceMetrics.scala
deleted file mode 100644
index 00495fd..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/common/distance/DistanceMetrics.scala
+++ /dev/null
@@ -1,48 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.mahout.math.algorithms.common.distance
-
-import org.apache.mahout.math.function.Functions
-import org.apache.mahout.math.{CardinalityException, Vector}
-
-trait DistanceMetric extends Serializable {
- def distance(v1: Vector, v2: Vector): Double
-}
-
-
-object DistanceMetricSelector extends Serializable{
-
- val namedMetricLookup = Map('Chebyshev -> 1.0, 'Cosine -> 2.0)
-
- def select(dm: Double): DistanceMetric = {
- dm match {
- case 1.0 => Chebyshev
- case 2.0 => Cosine
- }
- }
-}
-
-object Chebyshev extends DistanceMetric {
- def distance(v1: Vector, v2: Vector): Double = {
- if (v1.size != v2.size) throw new CardinalityException(v1.size, v2.size)
- v1.aggregate(v2, Functions.MAX_ABS, Functions.MINUS)
- }
-}
-
-object Cosine extends DistanceMetric {
- def distance(v1: Vector, v2: Vector): Double = 1.0 - v1.dot(v2) / (Math.sqrt(v1.getLengthSquared) * Math.sqrt(v2.getLengthSquared))
-}

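To illustrate the metric contract removed above, a small in-core sketch using `dvec` from the scala bindings (the values in the comments are worked by hand):

    import org.apache.mahout.math.scalabindings._
    import org.apache.mahout.math.algorithms.common.distance._

    val v1 = dvec(1.0, 0.0, 2.0)
    val v2 = dvec(0.0, 1.0, 2.0)
    Chebyshev.distance(v1, v2) // max componentwise |difference| = 1.0
    Cosine.distance(v1, v2)    // 1 - 4.0 / (sqrt(5) * sqrt(5)) = 0.2
    // Metrics are encoded as doubles so a selection can ride along in a broadcast vector:
    val dm = DistanceMetricSelector.select(
      DistanceMetricSelector.namedMetricLookup('Cosine))
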
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/algorithms/preprocessing/AsFactor.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/preprocessing/AsFactor.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/preprocessing/AsFactor.scala
deleted file mode 100644
index 2e2a3dd..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/preprocessing/AsFactor.scala
+++ /dev/null
@@ -1,129 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.mahout.math.algorithms.preprocessing
-
-
-
-import collection._
-import JavaConversions._
-import org.apache.mahout.math._
-import org.apache.mahout.math.drm._
-import org.apache.mahout.math.{Vector => MahoutVector}
-import org.apache.mahout.math.drm.RLikeDrmOps._
-import org.apache.mahout.math.scalabindings._
-import org.apache.mahout.math.scalabindings.RLikeOps._
-import MahoutCollections._
-
-class AsFactor extends PreprocessorFitter {
-
- def fit[K](input: DrmLike[K],
- hyperparameters: (Symbol, Any)*): AsFactorModel = {
-
- import org.apache.mahout.math.function.VectorFunction
- val factorMap = input.allreduceBlock(
- { case (keys, block: Matrix) => block },
- { case (oldM: Matrix, newM: Matrix) =>
- // someday we'll replace this with block.max: Vector
- // or better yet- block.distinct
-
- dense((oldM rbind newM).aggregateColumns( new VectorFunction {
- def apply(f: Vector): Double = f.max
- }))
- })(0, ::)
- /*
- val A = drmParallelize(dense(
- (3, 2, 1),
- (0, 0, 0),
- (1, 1, 1))
- -> (4,2,2), now 4,3,2
- */
- new AsFactorModel(factorMap.sum.toInt,
- dvec(factorMap.toArray.scanLeft(0.0)((l, r) => l + r ).take(factorMap.length))
- // factorMap
- )
- }
-
-}
-
-class AsFactorModel(cardinality: Int, factorVec: MahoutVector) extends PreprocessorModel {
-
- val factorMap: MahoutVector = factorVec
-
- def transform[K](input: DrmLike[K]): DrmLike[K] ={
-
- implicit val ctx = input.context
-
- val bcastK = drmBroadcast(dvec(cardinality))
- val bcastFactorMap = drmBroadcast(factorMap)
-
- implicit val ktag = input.keyClassTag
-
- val res = input.mapBlock(cardinality) {
- case (keys, block: Matrix) => {
- val cardinality: Int = bcastK.value.get(0).toInt
- val output = new SparseMatrix(block.nrow, cardinality)
-        // The broadcast vector maps each input column to its offset in the one-hot output
- val fm = bcastFactorMap.value
- for (n <- 0 until output.nrow){
- var m = 0
- for (e <- block(n, ::).all() ){
- output(n, fm.get(m).toInt + e.get().toInt ) = 1.0
- m += 1
- }
- }
- (keys, output)
- }
- }
- res
- }
-
- override def invTransform[K](input: DrmLike[K]): DrmLike[K] = {
- implicit val ctx = input.context
-
- val bcastK = drmBroadcast(dvec(cardinality))
- val bcastFactorMap = drmBroadcast(factorMap)
-
- implicit val ktag = input.keyClassTag
-
- val res = input.mapBlock(cardinality) {
- case (keys, block: Matrix) => {
- val k: Int = bcastK.value.get(0).toInt
- val output = new DenseMatrix(block.nrow, bcastK.value.length)
-        // Invert the broadcast offset vector into a map from offset value back to column index
- val fm = bcastFactorMap.all.toSeq.map(e => e.get -> e.index).toMap
-
- import MahoutCollections._
- val indexArray = Array(1.0) ++ bcastFactorMap.value.toArray.map(i => i.toInt)
- for (n <- 0 until output.nrow){
- val v = new DenseVector(bcastFactorMap.value.length)
- var m = 0
- for (e <- block(n, ::).asInstanceOf[RandomAccessSparseVector].iterateNonZero() ){
- v.setQuick(m, e.index - m)
- m += 1
- }
- output(n, ::) = v
- }
- (keys, output)
- }
- }
- res
- }
-
-}

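A hedged usage sketch for the one-hot encoder removed above, reusing the integer-coded example from the commented-out snippet in `fit` and assuming a distributed context in scope:

    import org.apache.mahout.math.scalabindings._
    import org.apache.mahout.math.drm._

    // Each cell holds an integer category code
    val drmA = drmParallelize(dense((3, 2, 1), (0, 0, 0), (1, 1, 1)))
    val encoder = new AsFactor()
    // Sparse one-hot rows; per-column blocks are laid out via the scanLeft offsets above
    val drmOneHot = encoder.fitTransform(drmA)
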
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/algorithms/preprocessing/MeanCenter.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/preprocessing/MeanCenter.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/preprocessing/MeanCenter.scala
deleted file mode 100644
index 258ad1b..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/preprocessing/MeanCenter.scala
+++ /dev/null
@@ -1,91 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.mahout.math.algorithms.preprocessing
-
-import collection._
-import JavaConversions._
-import org.apache.mahout.math.drm._
-import org.apache.mahout.math.drm.RLikeDrmOps._
-import org.apache.mahout.math.Matrix
-import org.apache.mahout.math.scalabindings.RLikeOps._
-import org.apache.mahout.math.{Vector => MahoutVector}
-
-
-
-class MeanCenter extends PreprocessorFitter {
-
- /**
-    * Centers each column at zero (or at centers supplied later via setCenters)
-    * @param input the DRM whose columns are to be centered
- *
- */
- def fit[K](input: DrmLike[K],
- hyperparameters: (Symbol, Any)*): MeanCenterModel = {
- new MeanCenterModel(input.colMeans()) // could add centers here
- }
-
-}
-
-/**
- * A model for mean centering each column of a data set at 0 or some number specified by the setCenters method.
- * @param means
- */
-class MeanCenterModel(means: MahoutVector) extends PreprocessorModel {
-
- var colCentersV: MahoutVector = means
-
- def setCenters(centers: MahoutVector): Unit = {
- if (means.length != centers.length){
- throw new Exception(s"Length of centers vector (${centers.length}) must equal length of means vector ((${means.length}) (e.g. the number of columns in the orignally fit input).")
- }
- colCentersV = means + centers
- }
- def transform[K](input: DrmLike[K]): DrmLike[K] = {
-
- implicit val ctx = input.context
- implicit val ktag = input.keyClassTag
-
- val bcastV = drmBroadcast(colCentersV)
-
- val output = input.mapBlock(input.ncol) {
- case (keys, block: Matrix) =>
- val copy: Matrix = block.cloned
- copy.foreach(row => row -= bcastV.value)
- (keys, copy)
- }
- output
- }
-
- def invTransform[K](input: DrmLike[K]): DrmLike[K] = {
-
- implicit val ctx = input.context
- implicit val ktag = input.keyClassTag
- val bcastV = drmBroadcast(colCentersV)
-
- val output = input.mapBlock(input.ncol) {
- case (keys, block: Matrix) =>
- val copy: Matrix = block.cloned
- copy.foreach(row => row += bcastV.value)
- (keys, copy)
- }
- output
- }
-
-}
\ No newline at end of file

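The fit/transform round trip for the centerer removed above, sketched with a hypothetical `drmX`:

    val centerer = new MeanCenter()
    val model = centerer.fit(drmX)                    // records column means
    val drmCentered = model.transform(drmX)           // each column now has mean 0
    val drmRestored = model.invTransform(drmCentered) // adds the means back
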
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/algorithms/preprocessing/PreprocessorModel.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/preprocessing/PreprocessorModel.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/preprocessing/PreprocessorModel.scala
deleted file mode 100644
index 5adb87d..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/preprocessing/PreprocessorModel.scala
+++ /dev/null
@@ -1,58 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.mahout.math.algorithms.preprocessing
-
-import org.apache.mahout.math.algorithms.{UnsupervisedFitter, UnsupervisedModel}
-import org.apache.mahout.math.drm.DrmLike
-
-trait PreprocessorModel extends UnsupervisedModel {
-
- /**
-    * A convenience method for transforming data back to its original form
- * @param input
- * @tparam K
- * @return
- */
- def invTransform[K](input: DrmLike[K]): DrmLike[K]
-
- /**
-    * Transforms the given DRM using the fitted parameters
- * @param input
-
- */
- def transform[K](input: DrmLike[K]): DrmLike[K]
-
-}
-
-trait PreprocessorFitter extends UnsupervisedFitter {
-
- def fit[K](input: DrmLike[K],
- hyperparameters: (Symbol, Any)*): PreprocessorModel
-
- def fitTransform[K](input: DrmLike[K],
- hyperparameters: (Symbol, Any)*): DrmLike[K] = {
- model = this.fit(input, hyperparameters:_*)
- model.transform(input)
-
- }
-
- // used to store the model if `fitTransform` method called
- var model: PreprocessorModel = _
-}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/algorithms/preprocessing/StandardScaler.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/preprocessing/StandardScaler.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/preprocessing/StandardScaler.scala
deleted file mode 100644
index 5863330..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/preprocessing/StandardScaler.scala
+++ /dev/null
@@ -1,108 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.mahout.math.algorithms.preprocessing
-
-import collection._
-import JavaConversions._
-
-import org.apache.mahout.math.drm._
-import org.apache.mahout.math.drm.RLikeDrmOps._
-import org.apache.mahout.math.scalabindings.RLikeOps._
-import org.apache.mahout.math.{Vector => MahoutVector, Matrix}
-
-/**
- * Scales columns to mean 0 and unit variance
- *
- * An important note: the equivalent call in R would be something like
- * ```r
- * N <- nrow(x)
- * scale(x, scale = apply(x, 2, sd) * sqrt((N - 1) / N))
- * ```
- *
- * This is because R uses degrees of freedom = 1 to calculate the standard deviation.
- * Multiplying the standard deviation by sqrt((N - 1) / N) 'undoes' this correction.
- *
- * sklearn's StandardScaler uses degrees of freedom = 0 for its calculation, so results
- * should be similar.
- */
-class StandardScaler extends PreprocessorFitter {
-
- def fit[K](input: DrmLike[K],
- hyperparameters: (Symbol, Any)*): StandardScalerModel = {
- val mNv = dcolMeanVars(input)
- new StandardScalerModel(mNv._1, mNv._2.sqrt)
- }
-
-}
-
-class StandardScalerModel(val meanVec: MahoutVector,
- val stdev: MahoutVector
- ) extends PreprocessorModel {
-
-
- def transform[K](input: DrmLike[K]): DrmLike[K] = {
- implicit val ctx = input.context
-
- // Some mapBlock() calls need it
- // implicit val ktag = input.keyClassTag
-
- val bcastMu = drmBroadcast(meanVec)
- val bcastSigma = drmBroadcast(stdev)
-
- implicit val ktag = input.keyClassTag
-
- val res = input.mapBlock(input.ncol) {
- case (keys, block: Matrix) => {
- val copy: Matrix = block.cloned
- copy.foreach(row => row := (row - bcastMu) / bcastSigma )
- (keys, copy)
- }
- }
- res
- }
-
- /**
-   * Given an output, transform it back into the original scale,
-   * e.g. a normalized column back to its original values.
- *
- * @param input
- * @tparam K
- * @return
- */
- def invTransform[K](input: DrmLike[K]): DrmLike[K] = { // [K: ClassTag]
-
- implicit val ctx = input.context
-
- // Some mapBlock() calls need it
- implicit val ktag = input.keyClassTag
-
- val bcastMu = drmBroadcast(meanVec)
- val bcastSigma = drmBroadcast(stdev)
-
- val res = input.mapBlock(input.ncol) {
- case (keys, block: Matrix) => {
- val copy: Matrix = block.cloned
- copy.foreach(row => row := (row * bcastSigma ) + bcastMu)
- (keys, copy)
- }
- }
- res
- }
-}
\ No newline at end of file

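A usage sketch mirroring the R note in the header comment, with a hypothetical `drmX`:

    val scaler = new StandardScaler()
    val model = scaler.fit(drmX)          // column means and ddof = 0 standard deviations
    val drmScaled = model.transform(drmX) // per column: (x - mean) / stdev
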
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/CochraneOrcuttModel.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/CochraneOrcuttModel.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/CochraneOrcuttModel.scala
deleted file mode 100644
index 3e5a496..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/CochraneOrcuttModel.scala
+++ /dev/null
@@ -1,151 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.mahout.math.algorithms.regression
-
-import org.apache.mahout.math.algorithms.regression.tests._
-import org.apache.mahout.math.drm.{CacheHint, DrmLike, safeToNonNegInt}
-import org.apache.mahout.math.drm.RLikeDrmOps._
-import org.apache.mahout.math.function.Functions
-import org.apache.mahout.math.scalabindings.RLikeOps._
-import org.apache.mahout.math.{Vector => MahoutVector}
-
-
-class CochraneOrcuttModel[K](regressor: LinearRegressorModel[K]) extends LinearRegressorModel[K] {
- // https://en.wikipedia.org/wiki/Cochrane%E2%80%93Orcutt_estimation
-
- var betas: Array[MahoutVector] = _
- var dws: Array[Double] = _
- var rhos: Array[Double] = _
-
- def predict(drmPredictors: DrmLike[K]): DrmLike[K] = {
- regressor.predict(drmPredictors)
- }
-
-}
-
-class CochraneOrcutt[K](hyperparameters: (Symbol, Any)*) extends LinearRegressorFitter[K] {
-
- var regressor: LinearRegressorFitter[K] = _
- var iterations: Int = _
- var cacheHint: CacheHint.CacheHint = _
-  // For larger inputs, CacheHint.MEMORY_AND_DISK2 is recommended.
-
- def setHyperparameters(hyperparameters: Map[Symbol, Any] = Map('foo -> None)): Unit = {
- setStandardHyperparameters(hyperparameters.toMap)
- regressor = hyperparameters.asInstanceOf[Map[Symbol, LinearRegressorFitter[K]]].getOrElse('regressor, new OrdinaryLeastSquares())
- regressor.calcStandardErrors = false
- regressor.calcCommonStatistics = false
- iterations = hyperparameters.asInstanceOf[Map[Symbol, Int]].getOrElse('iterations, 3)
- cacheHint = hyperparameters.asInstanceOf[Map[Symbol, CacheHint.CacheHint]].getOrElse('cacheHint, CacheHint.MEMORY_ONLY)
- }
-
- setHyperparameters(hyperparameters.toMap)
-
- def calculateRho(errorDrm: DrmLike[K]): Double ={
- val error = errorDrm.collect.viewColumn(0)
- val n = error.length - 1
- val e2: MahoutVector = error.viewPart(1, n)
- val e3: MahoutVector = error.viewPart(0, n)
- // regression through the origin lm(e2 ~e3 -1) is sum(e2 * e3) / e3^2
- e3.times(e2).sum / e3.assign(Functions.SQUARE).sum
- }
-
- def fit(drmFeatures: DrmLike[K], drmTarget: DrmLike[K], hyperparameters: (Symbol, Any)*): CochraneOrcuttModel[K] = {
-
- setHyperparameters(hyperparameters.toMap[Symbol, Any])
-
- val betas = new Array[MahoutVector](iterations)
- val models = new Array[LinearRegressorModel[K]](iterations)
- val dws = new Array[Double](iterations)
- val rhos = new Array[Double](iterations)
-
- val n = safeToNonNegInt(drmTarget.nrow)
- val Y = drmTarget(1 until n, 0 until 1).checkpoint(cacheHint)
- val Y_lag = drmTarget(0 until n - 1, 0 until 1).checkpoint(cacheHint)
- val X = drmFeatures(1 until n, 0 until drmFeatures.ncol).checkpoint(cacheHint)
- val X_lag = drmFeatures(0 until n - 1, 0 until drmFeatures.ncol).checkpoint(cacheHint)
-
- // Step 1: Normal Regression
- regressor.calcStandardErrors = true
- regressor.calcCommonStatistics = true
- models(0) = regressor.fit(drmFeatures, drmTarget)
- regressor.calcStandardErrors = false
- regressor.calcCommonStatistics = false
- betas(0) = models(0).beta
- var residuals = drmTarget - models(0).predict(drmFeatures)
-
- for (i <- 1 until iterations){
- // Step 2: Calculate Rho
- val rho_hat = calculateRho(residuals)
- rhos(i-1) = rho_hat
-
- // Step 3: Transform Variables
- val drmYprime = Y - (Y_lag * rho_hat)
- val drmXprime = X - (X_lag * rho_hat)
-
- // Step 4: Get Estimates of Transformed Equation
- if (i == iterations - 1 ){
- // get standard errors on last iteration only
- regressor.calcStandardErrors = true
- regressor.calcCommonStatistics = true
- }
- models(i) = regressor.fit(drmXprime, drmYprime)
-      // TODO: make this optional; computed only for parity with R's reported DW statistic, it carries little meaning here
- dws(i) = AutocorrelationTests.DurbinWatson( models(i),
- drmTarget - models(i).predict(drmFeatures))
- .testResults.get('durbinWatsonTestStatistic).get.asInstanceOf[Double]
-
- models(i).beta(X.ncol) = models(i).beta(X.ncol) / (1 - rho_hat) // intercept adjust
- betas(i) = models(i).beta
-
- // Step 5: Use Betas from (4) to recalculate model from (1)
- residuals = drmTarget - models(i).predict(drmFeatures)
-
-      /** Step 6: repeat Steps 2 through 5 until a stopping criterion is met.
-        * Some formulations iterate until convergence;
-        * Kutner et al. recommend 3 iterations, and if the desired results are not
-        * achieved, an alternative method should be used.
-        **/
- }
-
- var finalModel = new CochraneOrcuttModel[K](models(iterations -1))
- finalModel.betas = betas
- finalModel.dws = dws
- finalModel.rhos = rhos
- finalModel.tScore = models(iterations -1).tScore
- finalModel.pval = models(iterations -1).pval
- finalModel.beta = models(iterations -1).beta
- val se = models(iterations -1).se
- se(se.length -1) = se(se.length -1) / (1 - rhos(iterations - 2))
- finalModel.se = se
- finalModel.summary = "Original Model:\n" + models(0).summary +
- "\n\nTransformed Model:\n" +
- generateSummaryString(finalModel) +
- "\n\nfinal rho: " + finalModel.rhos(iterations - 2) +
- s"\nMSE: ${models(iterations -1 ).mse}\nR2: ${models(iterations -1 ).r2}\n"
-
- if (models(0).addIntercept == true){
- finalModel.summary = finalModel.summary.replace(s"X${X.ncol}", "(Intercept)")
- }
-
- finalModel
- }
-
-}
\ No newline at end of file

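A hedged driver for the Cochrane-Orcutt fitter removed above, with hypothetical `drmX`/`drmY` and the hyperparameters defined in `setHyperparameters`:

    val co = new CochraneOrcutt[Int]('iterations -> 3,
      'cacheHint -> CacheHint.MEMORY_AND_DISK2) // recommended above for larger inputs
    val model = co.fit(drmX, drmY)
    model.rhos // estimated serial-correlation coefficients across iterations
    println(model.summary)
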
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/LinearRegressorModel.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/LinearRegressorModel.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/LinearRegressorModel.scala
deleted file mode 100644
index 7b87a1a..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/LinearRegressorModel.scala
+++ /dev/null
@@ -1,178 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.mahout.math.algorithms.regression
-
-import org.apache.mahout.math.algorithms.regression.tests.FittnessTests
-import org.apache.mahout.math.drm._
-import org.apache.mahout.math.drm.DrmLike
-import org.apache.mahout.math.drm.RLikeDrmOps._
-import org.apache.mahout.math.scalabindings.dvec
-import org.apache.mahout.math.{Matrix, Vector => MahoutVector}
-import org.apache.mahout.math.scalabindings.RLikeOps._
-import org.apache.commons.math3.distribution._
-
-import scala.language.higherKinds
-
-trait LinearRegressorModel[K] extends RegressorModel[K] {
-
- var beta: MahoutVector = _
- var se: MahoutVector = _
- var tScore: MahoutVector = _
- var pval: MahoutVector = _
-
-
-
-}
-
-trait LinearRegressorFitter[K] extends RegressorFitter[K] {
-
- var calcStandardErrors: Boolean = _
- var calcCommonStatistics: Boolean = _
-
- def fit(drmX: DrmLike[K],
- drmTarget: DrmLike[K],
- hyperparameters: (Symbol, Any)*): LinearRegressorModel[K]
-
-
- def setStandardHyperparameters(hyperparameters: Map[Symbol, Any] = Map('foo -> None)): Unit = {
- calcCommonStatistics = hyperparameters.asInstanceOf[Map[Symbol, Boolean]].getOrElse('calcCommonStatistics, true)
- calcStandardErrors = hyperparameters.asInstanceOf[Map[Symbol, Boolean]].getOrElse('calcStandardErrors, true)
- addIntercept = hyperparameters.asInstanceOf[Map[Symbol, Boolean]].getOrElse('addIntercept, true)
- }
-
-
- def calculateStandardError[M[K] <: LinearRegressorModel[K]](X: DrmLike[K],
- drmTarget: DrmLike[K],
- drmXtXinv: Matrix,
- model: M[K]): M[K] = {
- import org.apache.mahout.math.function.Functions.SQRT
- import org.apache.mahout.math.scalabindings.MahoutCollections._
-
- val yhat = X %*% model.beta
- val residuals = drmTarget - yhat
-
- // Setting modelOut.rss
-    // Renamed from ete to rssModel; this is the residual sum of squares of the model (yhat vs. y)
- var modelOut = FittnessTests.calculateResidualSumOfSquares(model,residuals)
-
- val n = drmTarget.nrow
- val k = safeToNonNegInt(X.ncol)
- val invDegFreedomKindOf = 1.0 / (n - k)
- val varCovarMatrix = invDegFreedomKindOf * modelOut.rss * drmXtXinv
- val se = varCovarMatrix.viewDiagonal.assign(SQRT)
- val tScore = model.beta / se
- val tDist = new TDistribution(n-k)
-
- val pval = dvec(tScore.toArray.map(t => 2 * (1.0 - tDist.cumulativeProbability(Math.abs(t))) ))
-
- // ^^ TODO bug in this calculation- fix and add test
- //degreesFreedom = k
- modelOut.se = se
- modelOut.tScore = tScore
- modelOut.pval = pval
- modelOut.degreesOfFreedom = safeToNonNegInt(X.ncol)
- modelOut.trainingExamples = safeToNonNegInt(n)
-
- if (calcCommonStatistics){
- modelOut = calculateCommonStatistics(modelOut, drmTarget, residuals)
- }
-
- // Let Statistics Get Calculated prior to assigning the summary
- modelOut.summary = generateSummaryString(modelOut)
-
- modelOut
- }
-
-
- def calculateCommonStatistics[M[K] <: LinearRegressorModel[K]](model: M[K],
- drmTarget: DrmLike[K],
- residuals: DrmLike[K]): M[K] ={
- var modelOut = model
- modelOut = FittnessTests.CoefficientOfDetermination(model, drmTarget, residuals)
- modelOut = FittnessTests.MeanSquareError(model, residuals)
- modelOut = FittnessTests.FTest(model, drmTarget)
-
-
- modelOut
- }
-
- def modelPostprocessing[M[K] <: LinearRegressorModel[K]](model: M[K],
- X: DrmLike[K],
- drmTarget: DrmLike[K],
- drmXtXinv: Matrix): M[K] = {
- var modelOut = model
- if (calcStandardErrors) {
- modelOut = calculateStandardError(X, drmTarget, drmXtXinv, model )
- } else {
- modelOut.summary = "Coef.\t\tEstimate\n" +
- (0 until X.ncol).map(i => s"X${i}\t${modelOut.beta(i)}").mkString("\n")
-      if (calcCommonStatistics) { // when calcStandardErrors is true this happens inside calculateStandardError, avoiding a second residual computation
- val residuals = drmTarget - (X %*% modelOut.beta)
- // If rss is already set, then this will drop through to calculateCommonStatistics
- modelOut = FittnessTests.calculateResidualSumOfSquares(modelOut,residuals)
- modelOut = calculateCommonStatistics(modelOut, drmTarget, residuals)
- }
-
- modelOut
- }
-
- if (addIntercept) {
-      model.summary = model.summary.replace(s"X${X.ncol - 1}", "(Intercept)")
- model.addIntercept = true
- }
- model
- }
-
- def generateSummaryString[M[K] <: LinearRegressorModel[K]](model: M[K]): String = {
-
- /* Model after R implementation ...
- Call:
- lm(formula = target ~ a + b + c + d, data = df1)
-
- Residuals:
- 1 2 3 4 5 6 7 8 9
- -4.2799 0.5059 -2.2783 4.3765 -1.3455 0.7202 -1.8063 1.2889 2.8184
-
- Coefficients:
- Estimate Std. Error t value Pr(>|t|)
- (Intercept) 163.179 51.915 3.143 0.0347 *
- a -1.336 2.688 -0.497 0.6452
- b -13.158 5.394 -2.439 0.0713 .
- c -4.153 1.785 -2.327 0.0806 .
- d -5.680 1.887 -3.010 0.0395 *
- ---
- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
- */
-
- val k = model.beta.length
-
- // Using Formatted Print here to pretty print the columns
- var summaryString = "\nCoef.\t\tEstimate\t\tStd. Error\t\tt-score\t\t\tPr(Beta=0)\n" +
- (0 until k).map(i => "X%-3d\t\t%+5.5f\t\t%+5.5f\t\t%+5.5f\t\t%+5.5f".format(i,model.beta(i),model.se(i),model.tScore(i),model.pval(i))).mkString("\n")
- if(calcCommonStatistics) {
- summaryString += "\nF-statistic: " + model.fScore + " on " + (model.degreesOfFreedom - 1) + " and " +
-        (model.trainingExamples - model.degreesOfFreedom) + " DF, p-value: " + model.fpval + "\n"
- summaryString += s"\nMean Squared Error: ${model.mse}"
- summaryString += s"\nR^2: ${model.r2}"
-
- }
- summaryString
- }
-}

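The calculation in `calculateStandardError` above is the textbook sampling variance Var(beta-hat) = sigma^2 * (X'X)^-1 with sigma^2 estimated as rss / (n - k). A compact in-core restatement under purely hypothetical numbers:

    import org.apache.commons.math3.distribution.TDistribution

    val n = 9; val k = 5                // observations, coefficients
    val sigma2 = 50.0 / (n - k)         // rss / (n - k), rss hypothetical
    val xtxInvDiag = Array(0.5, 0.2)    // hypothetical diagonal of (X'X)^-1
    val beta = Array(2.0, -1.0)         // hypothetical coefficients
    val se = xtxInvDiag.map(d => math.sqrt(sigma2 * d))
    val t = beta.zip(se).map { case (b, s) => b / s }
    val tDist = new TDistribution(n - k)
    val p = t.map(ti => 2 * (1 - tDist.cumulativeProbability(math.abs(ti)))) // two-sided
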
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/OrdinaryLeastSquaresModel.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/OrdinaryLeastSquaresModel.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/OrdinaryLeastSquaresModel.scala
deleted file mode 100644
index fd9924e..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/OrdinaryLeastSquaresModel.scala
+++ /dev/null
@@ -1,71 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.mahout.math.algorithms.regression
-
-import org.apache.mahout.math.drm.RLikeDrmOps._
-import org.apache.mahout.math.drm.DrmLike
-import org.apache.mahout.math.scalabindings._
-import org.apache.mahout.math.scalabindings.RLikeOps._
-
-class OrdinaryLeastSquaresModel[K]
- extends LinearRegressorModel[K] {
- // https://en.wikipedia.org/wiki/Ordinary_least_squares
-
- def predict(drmPredictors: DrmLike[K]): DrmLike[K] = {
- var X = drmPredictors
- if (addIntercept) {
- X = X cbind 1
- }
- X %*% beta
- }
-
-}
-
-class OrdinaryLeastSquares[K] extends LinearRegressorFitter[K] {
-
-
- def fit(drmFeatures: DrmLike[K],
- drmTarget: DrmLike[K],
- hyperparameters: (Symbol, Any)*): OrdinaryLeastSquaresModel[K] = {
-
- assert(drmTarget.ncol == 1, s"drmTarget must be a single column matrix, found ${drmTarget.ncol} columns")
- var model = new OrdinaryLeastSquaresModel[K]()
- setStandardHyperparameters(hyperparameters.toMap)
-
-
- if (drmFeatures.nrow != drmTarget.nrow){
- throw new Exception(s"${drmFeatures.nrow} observations in features, ${drmTarget.nrow} observations in target, must be equal.")
- }
-
- var X = drmFeatures
-
- if (addIntercept) {
- X = X cbind 1
- }
-
- val XtX = (X.t %*% X).collect
- val drmXtXinv = solve(XtX)
- val drmXty = (X.t %*% drmTarget).collect // this fails when number of columns^2 size matrix won't fit in driver
- model.beta = (drmXtXinv %*% drmXty)(::, 0)
-
-
- this.modelPostprocessing(model, X, drmTarget, drmXtXinv)
- }
-}

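The fit above is the closed-form normal-equations solve, beta = (X'X)^-1 X'y, with the Gramian collected in-core. A hedged usage sketch with hypothetical `drmX`/`drmY`:

    val ols = new OrdinaryLeastSquares[Int]()
    val model = ols.fit(drmX, drmY) // addIntercept defaults to true; the last beta entry is the intercept
    val drmYhat = model.predict(drmX)
    println(model.summary)
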
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/RegressorModel.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/RegressorModel.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/RegressorModel.scala
deleted file mode 100644
index aa3dad4..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/RegressorModel.scala
+++ /dev/null
@@ -1,66 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.mahout.math.algorithms.regression
-
-import org.apache.mahout.math.algorithms.{SupervisedFitter, SupervisedModel}
-import org.apache.mahout.math.drm.DrmLike
-
-trait RegressorModel[K] extends SupervisedModel[K] {
-
- def predict(drmPredictors: DrmLike[K]): DrmLike[K]
-
- var addIntercept: Boolean = _
- // Common Applicable Tests- here only for convenience.
- var mse: Double = _
- var r2: Double = _
- var fpval: Double = _
- // default rss to a negative number to ensure rss gets set.
- var rss:Double = -9999.0
- var fScore: Double = _
- var degreesOfFreedom: Int = _
- var trainingExamples :Int = _
-
- /**
-   * Syntactic sugar for fetching test results. Returns the test result if it exists, otherwise None
-   * @param testSymbol - symbol of the test result to fetch, e.g. `'mse`
-   * @tparam T - the type of the test result
- * @return
- */
- def getTestResult[T](testSymbol: Symbol): Option[T] = {
-    testResults.get(testSymbol).map(_.asInstanceOf[T])
- }
-}
-
-trait RegressorFitter[K] extends SupervisedFitter[K, RegressorModel[K]] {
-
- var addIntercept: Boolean = _
-
- def fitPredict(drmX: DrmLike[K],
- drmTarget: DrmLike[K],
- hyperparameters: (Symbol, Any)* ): DrmLike[K] = {
-
- model = this.fit(drmX, drmTarget, hyperparameters: _* )
- model.predict(drmX)
- }
-
- // used to store the model if `fitTransform` method called
- var model: RegressorModel[K] = _
-
-}

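A short usage note for the accessor above, with a hypothetical fitted `model`:

    // Fetch a stored test statistic by symbol; None if the test was never run
    val maybeMse: Option[Double] = model.getTestResult[Double]('mse)
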
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/tests/AutocorrelationTests.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/tests/AutocorrelationTests.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/tests/AutocorrelationTests.scala
deleted file mode 100644
index 2b16b74..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/tests/AutocorrelationTests.scala
+++ /dev/null
@@ -1,57 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.mahout.math.algorithms.regression.tests
-
-import org.apache.mahout.math.algorithms.regression.RegressorModel
-import org.apache.mahout.math.drm._
-import org.apache.mahout.math.drm.DrmLike
-import org.apache.mahout.math.drm.RLikeDrmOps._
-import org.apache.mahout.math.function.Functions.SQUARE
-import org.apache.mahout.math.scalabindings.RLikeOps._
-import scala.language.higherKinds
-
-object AutocorrelationTests {
-
- //https://en.wikipedia.org/wiki/Durbin%E2%80%93Watson_statistic
- /*
- To test for positive autocorrelation at significance α, the test statistic d is compared to lower and upper critical values (dL,α and dU,α):
- If d < dL,α, there is statistical evidence that the error terms are positively autocorrelated.
- If d > dU,α, there is no statistical evidence that the error terms are positively autocorrelated.
- If dL,α < d < dU,α, the test is inconclusive.
-
- Rule of Thumb:
- d < 2 : positive auto-correlation
- d = 2 : no auto-correlation
- d > 2 : negative auto-correlation
- */
- def DurbinWatson[R[K] <: RegressorModel[K], K](model: R[K], residuals: DrmLike[K]): R[K] = {
-
- val n = safeToNonNegInt(residuals.nrow)
- val e: DrmLike[K] = residuals(1 until n , 0 until 1)
- val e_t_1: DrmLike[K] = residuals(0 until n - 1, 0 until 1)
- val numerator = (e - e_t_1).assign(SQUARE).colSums()
- val denominator = residuals.assign(SQUARE).colSums()
- val dw = numerator / denominator
- model.testResults += ('durbinWatsonTestStatistic → dw.get(0))
- model.summary += s"\nDurbin Watson Test Statistic: ${dw.toString}"
- model
- }
-
-}

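Tying the DRM slicing above to the usual formula, d = sum_{t=2..n} (e_t - e_{t-1})^2 / sum_{t=1..n} e_t^2; a hedged in-core check over hypothetical residuals:

    val e = Array(1.0, -0.5, 0.25, -0.125)
    val num = e.sliding(2).map { case Array(prev, cur) => math.pow(cur - prev, 2) }.sum
    val den = e.map(x => x * x).sum
    val d = num / den // ~2.22 here; the distributed version slices the residual DRM instead
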
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/tests/FittnessTests.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/tests/FittnessTests.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/tests/FittnessTests.scala
deleted file mode 100644
index c2d634b..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/tests/FittnessTests.scala
+++ /dev/null
@@ -1,133 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.mahout.math.algorithms.regression.tests
-
-
-
-
-
-import org.apache.commons.math3.distribution.FDistribution
-import org.apache.mahout.math.algorithms.regression.RegressorModel
-import org.apache.mahout.math.algorithms.preprocessing.MeanCenter
-import org.apache.mahout.math.drm.DrmLike
-import org.apache.mahout.math.function.Functions.SQUARE
-import org.apache.mahout.math.scalabindings.RLikeOps._
-import org.apache.mahout.math.drm.RLikeDrmOps._
-
-import scala.language.higherKinds
-
-object FittnessTests {
-
- // https://en.wikipedia.org/wiki/Coefficient_of_determination
- def CoefficientOfDetermination[R[K] <: RegressorModel[K], K](model: R[K],
- drmTarget: DrmLike[K],
- residuals: DrmLike[K]): R[K] = {
- val sumSquareResiduals = residuals.assign(SQUARE).sum
- val mc = new MeanCenter()
- val totalResiduals = mc.fitTransform(drmTarget)
- val sumSquareTotal = totalResiduals.assign(SQUARE).sum
- val r2 = 1 - (sumSquareResiduals / sumSquareTotal)
- model.r2 = r2
-    model.testResults += ('r2 -> r2) // TODO: add setResult and setSummary methods in case this changes in the future, and to initialize the map if none exists or update the value if it does
- //model.summary += s"\nR^2: ${r2}"
- model
- }
-
- // https://en.wikipedia.org/wiki/Mean_squared_error
- def MeanSquareError[R[K] <: RegressorModel[K], K](model: R[K], residuals: DrmLike[K]): R[K] = {
-    // TODO: the MSE denominator should arguably be (nrow - ncol); see the regression section of https://en.wikipedia.org/wiki/Mean_squared_error
- val mse = residuals.assign(SQUARE).sum / residuals.nrow
- model.mse = mse
- model.testResults += ('mse -> mse)
- //model.summary += s"\nMean Squared Error: ${mse}"
- model
- }
-
- // Since rss is needed for multiple test statistics, use this function to cache this value
- def calculateResidualSumOfSquares[R[K] <: RegressorModel[K], K](model: R[K],residuals: DrmLike[K]) : R[K] = {
-    // This check prevents model.rss from being recomputed unnecessarily;
-    // it defaults to a negative value so that the first call is guaranteed to evaluate.
- if (model.rss < 0) {
- val ete = (residuals.t %*% residuals).collect // 1x1
- model.rss = ete(0, 0)
- }
- model
- }
-
-
- // https://en.wikipedia.org/wiki/F-test
- /*
- # R Prototype
- # Cereal Dataframe
- df1 <- data.frame(
- "X0" = c(1,1,1,1,1,1,1,1,1),
- "a" = c(2,1,1,2,1,2,6,3,3),
- "b" = c( 2,2,1,1,2,1,2,2,3),
- "c" = c( 10.5,12,12, 11,12, 16,17, 13,13),
- "d" = c( 10,12,13,13,11,8, 1, 7, 4),
- "target" = c( 29.509541,18.042851,22.736446,32.207582,21.871292,36.187559,50.764999,40.400208,45.811716))
-
- # Create linear regression models adding features one by one
- lrfit0 <- lm(data=df1, formula = target ~ 1 )
- lrfit1 <- lm(data=df1, formula = target ~ a )
- lrfit2 <- lm(data=df1, formula = target ~ a + b )
- lrfit3 <- lm(data=df1, formula = target ~ a + b + c )
- lrfit4 <- lm(data=df1, formula = target ~ a + b + c + d)
-
- ######################################
- # Fscore Calculation
- ######################################
-
- # So in the anova report using lm ...
- # These are the residual sum of squares for each model
- rssint <- sum(lrfit0$residuals^2)
- rssa <- sum(lrfit1$residuals^2)
- rssb <- sum(lrfit2$residuals^2)
- rssc <- sum(lrfit3$residuals^2)
- rssd <- sum(lrfit4$residuals^2)
-
- #Ftest in overall model
- (rssint - rssd)/4 / (rssd/4) # g = 4, n - g - 1 = 4
- # Compare with R
- summary(lrfit4)
-
- */
- def FTest[R[K] <: RegressorModel[K], K](model: R[K] , drmTarget: DrmLike[K]): R[K] = {
-
- val targetMean: Double = drmTarget.colMeans().get(0)
-
- // rssint is the Residual Sum of Squares for model using only based on the intercept
- val rssint: Double = ((drmTarget - targetMean ).t %*% (drmTarget - targetMean)).zSum()
- // K-1 is model.degreesOfFreedom-1
- // N-K is model.trainingExamples - model.degreesOfFreedom
-
- val fScore = ((rssint - model.rss) / (model.degreesOfFreedom-1) / ( model.rss / (model.trainingExamples - model.degreesOfFreedom)))
- val fDist = new FDistribution(model.degreesOfFreedom-1,model.trainingExamples-model.degreesOfFreedom)
- val fpval = 1.0 - fDist.cumulativeProbability(fScore)
- model.fpval = fpval
-
- model.fScore = fScore
- model.testResults += ('fScore -> fScore)
- //model.summary += s"\nFscore : ${fScore}"
- model
- }
-
-
-}

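The F-score above follows F = ((rss_intercept - rss_model) / (k - 1)) / (rss_model / (n - k)), compared against an F(k-1, n-k) distribution. A hedged in-core check with hypothetical sums of squares:

    import org.apache.commons.math3.distribution.FDistribution

    val n = 9; val k = 5            // observations; coefficients incl. intercept
    val rssIntercept = 900.0        // RSS of the intercept-only model (hypothetical)
    val rssModel = 50.0             // RSS of the full model (hypothetical)
    val fScore = ((rssIntercept - rssModel) / (k - 1)) / (rssModel / (n - k))
    val fpval = 1.0 - new FDistribution(k - 1, n - k).cumulativeProbability(fScore)
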
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/backend/Backend.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/backend/Backend.scala b/math-scala/src/main/scala/org/apache/mahout/math/backend/Backend.scala
deleted file mode 100644
index 9dfb7f2..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/backend/Backend.scala
+++ /dev/null
@@ -1,33 +0,0 @@
-package org.apache.mahout.math.backend
-
-import org.apache.mahout.math.backend.jvm.JvmBackend
-
-import collection._
-import scala.reflect.{ClassTag, classTag}
-
-/**
- * == Overview ==
- *
- * Backend representing a collection of in-memory solvers or distributed operators.
- *
- * == Note to implementors ==
- *
- * A Backend is expected to initialize and verify its own viability lazily, either the first time
- * the class is loaded or upon the first invocation of any of its methods. After that, the value of
- * [[Backend.isAvailable]] must be cached and defined.
- *
- * A Backend is of course also a [[SolverFactory]], in the sense that it enumerates the solvers
- * made available via the backend.
- */
-trait Backend extends SolverFactory {
-
- /**
- * If the backend has loaded (lazily) and verified its availability/functionality,
- * this must return `true`.
- *
- * @return `true` if the backend is available and functional
- */
- def isAvailable: Boolean
-
-}
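
To illustrate the contract, a minimal hypothetical backend might look like the sketch below. The native-binding probe class name is made up, and the MMul delegate mirrors the wiring in JvmBackend further down:

package org.apache.mahout.math.backend  // required to override protected[backend] solverMap

import scala.collection.Map
import scala.reflect.{ClassTag, classTag}
import org.apache.mahout.math.Matrix
import org.apache.mahout.math.scalabindings.MMul
import org.apache.mahout.math.backend.incore.MMulSolver

object MyBackend extends Backend {

  // Probe once, lazily, and cache the result, as the trait requires.
  override lazy val isAvailable: Boolean =
    try { Class.forName("my.native.Bindings"); true }   // hypothetical probe
    catch { case _: Throwable => false }

  // Delegate matrix multiply to the stock JVM implementation.
  private val mmul = new MMulSolver {
    override def apply(a: Matrix, b: Matrix, r: Option[Matrix]): Matrix = MMul(a, b, r)
  }

  override protected[backend] val solverMap: Map[ClassTag[_], Any] =
    Map(classTag[MMulSolver] -> mmul)

  validateMap()
}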

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/backend/RootSolverFactory.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/backend/RootSolverFactory.scala b/math-scala/src/main/scala/org/apache/mahout/math/backend/RootSolverFactory.scala
deleted file mode 100644
index 0904ea5..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/backend/RootSolverFactory.scala
+++ /dev/null
@@ -1,84 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.mahout.math.backend
-
-import org.apache.mahout.logging._
-import org.apache.mahout.math.backend.jvm.JvmBackend
-import org.apache.mahout.math.scalabindings.{MMBinaryFunc, MMul, _}
-
-import scala.collection._
-import scala.reflect.{ClassTag, classTag}
-
-
-object RootSolverFactory extends SolverFactory {
-
- import org.apache.mahout.math.backend.incore._
-
- private implicit val logger = getLog(RootSolverFactory.getClass)
-
- private val solverTagsToScan =
- classTag[MMulSolver] ::
- classTag[MMulSparseSolver] ::
- classTag[MMulDenseSolver] ::
- Nil
-
- private val defaultBackendPriority =
- JvmBackend.getClass.getName :: Nil
-
- private def initBackends(): Unit = {
-
- }
-
- // TODO: MAHOUT-1909: Cache Modular Backend solvers after probing
- // That is, lazily initialize the map, query backends, and build resolution rules.
- override protected[backend] val solverMap = new mutable.HashMap[ClassTag[_], Any]()
-
- validateMap()
-
- // Default solver is JVM
- var clazz: MMBinaryFunc = MMul
-
- // TODO: Match on implicit Classtag
-
- def getOperator[C: ClassTag]: MMBinaryFunc = {
-
- try {
- logger.info("Creating org.apache.mahout.viennacl.opencl.GPUMMul solver")
- clazz = Class.forName("org.apache.mahout.viennacl.opencl.GPUMMul$").getField("MODULE$").get(null).asInstanceOf[MMBinaryFunc]
- logger.info("Successfully created org.apache.mahout.viennacl.opencl.GPUMMul solver")
-
- } catch {
- case x: Exception =>
- logger.info("Unable to create class GPUMMul: attempting OpenMP version")
- try {
- // Attempt to instantiate the OpenMP version, assuming we've
- // created a separate OpenMP-only module (none exists yet)
- logger.info("Creating org.apache.mahout.viennacl.openmp.OMPMMul solver")
- clazz = Class.forName("org.apache.mahout.viennacl.openmp.OMPMMul$").getField("MODULE$").get(null).asInstanceOf[MMBinaryFunc]
- logger.info("Successfully created org.apache.mahout.viennacl.openmp.OMPMMul solver")
-
- } catch {
- case xx: Exception =>
- logger.info(xx.getMessage)
- // Fall back to JVM; don't need to dynamically assign since MMul is in the same package.
- logger.info("Unable to create class OMPMMul: falling back to java version")
- clazz = MMul
- }
- }
- clazz
- }
-}
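
The reflective lookup above leans on the Scala convention that a top-level object Foo compiles to a class Foo$ exposing a static MODULE$ field. A generic sketch of that pattern (loadSolverOrElse is an illustrative helper, not part of the deleted code):

import org.apache.mahout.math.scalabindings.{MMBinaryFunc, MMul}

// Load a Scala object by its compiled class name, or fall back.
def loadSolverOrElse[T](className: String, fallback: T): T =
  try Class.forName(className).getField("MODULE$").get(null).asInstanceOf[T]
  catch { case _: Throwable => fallback }

// e.g. probe the GPU solver, falling back to the stock JVM multiply:
val mmul: MMBinaryFunc =
  loadSolverOrElse("org.apache.mahout.viennacl.opencl.GPUMMul$", MMul)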

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/backend/SolverFactory.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/backend/SolverFactory.scala b/math-scala/src/main/scala/org/apache/mahout/math/backend/SolverFactory.scala
deleted file mode 100644
index 756b971..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/backend/SolverFactory.scala
+++ /dev/null
@@ -1,55 +0,0 @@
-package org.apache.mahout.math.backend
-
-import scala.collection.{Iterable, Map}
-import scala.reflect.{ClassTag, classTag}
-
-/**
- * == Overview ==
- *
- * A solver factory is in essence a collection of lazily initialized strategy singletons, each
- * solving some (any) problem in the context of the Mahout project.
- *
- * We intend to use it _mainly_ for super-linear problems, which often involve more than one
- * argument (operand).
- *
- * The main method to probe for an available solver is [[RootSolverFactory.getSolver]].
- */
-trait SolverFactory {
- /**
- * We take an implicit context binding, the classTag, of the trait of the solver desired.
- *
- * == Note to callers ==
- *
- * Due to Scala semantics, it is usually not enough to request a solver via merely {{{
- * val s:SolverType = backend.getSolver
- * }}} but instead requires an explicit solver tag, i.e.: {{{
- * val s = backend.getSolver[SolverType]
- * }}}
- *
- *
- */
- def getSolver[S: ClassTag]: Option[S] = {
- solverMap.get(classTag[S]).flatMap {
- _ match {
- case s: S ⇒ Some(s)
- case _ ⇒ None
- }
- }
- }
-
- lazy val availableSolverTags: Iterable[ClassTag[_]] = solverMap.keySet
-
-
-
- protected[backend] val solverMap: Map[ClassTag[_], Any]
-
- protected[backend] def validateMap(): Unit = {
-
- for ((tag, instance) ← solverMap) {
- require(tag.runtimeClass.isAssignableFrom(instance.getClass),
- s"Solver implementation class `${instance.getClass.getName}` is not a subclass of solver trait `${tag}`.")
-
- }
- }
-
-}
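
As the note to callers above says, probing requires an explicit solver tag. A small usage sketch against the JVM backend (which registers an MMulSolver, per JvmBackend below):

import org.apache.mahout.math.backend.incore.MMulSolver
import org.apache.mahout.math.backend.jvm.JvmBackend
import org.apache.mahout.math.scalabindings.{MMBinaryFunc, MMul}

// The explicit type argument drives the ClassTag lookup in solverMap;
// fall back to the default JVM multiply if no solver is registered.
val mmul: MMBinaryFunc = JvmBackend.getSolver[MMulSolver].getOrElse(MMul)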

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/backend/incore/package.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/backend/incore/package.scala b/math-scala/src/main/scala/org/apache/mahout/math/backend/incore/package.scala
deleted file mode 100644
index 1bb4480..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/backend/incore/package.scala
+++ /dev/null
@@ -1,17 +0,0 @@
-package org.apache.mahout.math.backend
-
-import org.apache.mahout.math.scalabindings.{MMBinaryFunc, MMUnaryFunc}
-
-package object incore {
-
- trait MMulSolver extends MMBinaryFunc
- trait MMulDenseSolver extends MMulSolver
- trait MMulSparseSolver extends MMulSolver
- trait AAtSolver extends MMUnaryFunc
- trait AAtDenseSolver extends AAtSolver
- trait AAtSparseSolver extends AAtSolver
- trait AtASolver extends MMUnaryFunc
- trait AtADenseSolver extends AtASolver
- trait AtASparseSolver extends AtASolver
-
-}
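
Each marker trait only refines the operand shape; an A'A solver, for instance, can be expressed through any matrix multiply. A minimal sketch, mirroring the private ataSolver in JvmBackend below and assuming RLikeOps for the transpose:

import org.apache.mahout.math.Matrix
import org.apache.mahout.math.scalabindings.MMul
import org.apache.mahout.math.scalabindings.RLikeOps._
import org.apache.mahout.math.backend.incore.AtASolver

// Compute A'A by delegating to the stock JVM multiply.
val ataSolver = new AtASolver {
  override def apply(a: Matrix, r: Option[Matrix]): Matrix = MMul(a.t, a, r)
}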

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/backend/jvm/JvmBackend.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/backend/jvm/JvmBackend.scala b/math-scala/src/main/scala/org/apache/mahout/math/backend/jvm/JvmBackend.scala
deleted file mode 100644
index 6588243..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/backend/jvm/JvmBackend.scala
+++ /dev/null
@@ -1,51 +0,0 @@
-package org.apache.mahout.math.backend.jvm
-
-import org.apache.mahout.math._
-import scalabindings._
-import RLikeOps._
-import org.apache.mahout.math.backend.Backend
-import org.apache.mahout.math.scalabindings.MMul
-
-import scala.collection.Map
-import scala.reflect._
-
-object JvmBackend extends Backend {
-
- import org.apache.mahout.math.backend.incore._
-
- /**
- * If the backend has loaded (lazily) and verified its availability/functionality,
- * this must return `true`.
- *
- * @return `true` if the backend is available and functional
- */
- override def isAvailable: Boolean = true
-
- // TODO: In a future release, Refactor MMul optimizations into this object
- override protected[backend] val solverMap: Map[ClassTag[_], Any] = Map(
- classTag[MMulSolver] → MMul
- // classTag[MMulDenseSolver] → MMul,
- // classTag[MMulSparseSolver] → MMul,
- // classTag[AtASolver] → new AtASolver {
- // override def apply(a: Matrix, r: Option[Matrix]): Matrix = MMul(a.t, a, r)
- // }// ,
- // classTag[AtADenseSolver] → { (a: Matrix, r: Option[Matrix]) ⇒ MMul(a.t, a, r) },
- // classTag[AtASparseSolver] → { (a: Matrix, r: Option[Matrix]) ⇒ MMul(a.t, a, r) },
- // classTag[AAtSolver] → { (a: Matrix, r: Option[Matrix]) ⇒ MMul(a, a.t, r) },
- // classTag[AAtDenseSolver] → { (a: Matrix, r: Option[Matrix]) ⇒ MMul(a, a.t, r) },
- // classTag[AAtSparseSolver] → { (a: Matrix, r: Option[Matrix]) ⇒ MMul(a, a.t, r) }
- )
- validateMap()
-
- private val mmulSolver = new MMulSolver with MMulDenseSolver with MMulSparseSolver {
- override def apply(a: Matrix, b: Matrix, r: Option[Matrix]): Matrix = MMul(a, b, r)
- }
-
- private val ataSolver = new AtASolver with AtADenseSolver with AtASparseSolver {
- override def apply(a: Matrix, r: Option[Matrix]): Matrix = MMul(a.t, a, r)
- }
-
- private val aatSolver = new AAtSolver {
- override def apply(a: Matrix, r: Option[Matrix]): Matrix = MMul(a, a.t, r)
- }
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/classifier/naivebayes/NBModel.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/classifier/naivebayes/NBModel.scala b/math-scala/src/main/scala/org/apache/mahout/classifier/naivebayes/NBModel.scala
deleted file mode 100644
index c4afe4f..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/classifier/naivebayes/NBModel.scala
+++ /dev/null
@@ -1,215 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.mahout.classifier.naivebayes
-
-import org.apache.mahout.math._
-
-import org.apache.mahout.math.{drm, scalabindings}
-
-import scalabindings._
-import scalabindings.RLikeOps._
-import drm._
-import scala.language.asInstanceOf
-import scala.collection._
-import JavaConversions._
-
-/**
- *
- * @param weightsPerLabelAndFeature Aggregated matrix of weights of labels x features
- * @param weightsPerFeature Vector of summation of all feature weights.
- * @param weightsPerLabel Vector of summation of all label weights.
- * @param perlabelThetaNormalizer Vector of weight normalizers per label (used only for complementary models)
- * @param labelIndex HashMap of labels and their corresponding row in the weightMatrix
- * @param alphaI Laplace smoothing factor.
- * @param isComplementary Whether or not this is a complementary model.
- */
-class NBModel(val weightsPerLabelAndFeature: Matrix = null,
- val weightsPerFeature: Vector = null,
- val weightsPerLabel: Vector = null,
- val perlabelThetaNormalizer: Vector = null,
- val labelIndex: Map[String, Integer] = null,
- val alphaI: Float = 1.0f,
- val isComplementary: Boolean = false) extends java.io.Serializable {
-
-
- val numFeatures: Double = weightsPerFeature.getNumNondefaultElements
- val totalWeightSum: Double = weightsPerLabel.zSum
- val alphaVector: Vector = null
-
- validate()
-
- // todo: Maybe it is a good idea to move the dfsWrite and dfsRead out
- // todo: of the model and into a helper
-
- // TODO: weightsPerLabelAndFeature, a sparse (numFeatures x numLabels) matrix, should fit
- // TODO: in memory upfront and should not require a DRM; decide if we want this to scale out.
-
-
- /** getter for summed label weights. Used by legacy classifier */
- def labelWeight(label: Int): Double = {
- weightsPerLabel.getQuick(label)
- }
-
- /** getter for weight normalizers. Used by legacy classifier */
- def thetaNormalizer(label: Int): Double = {
- perlabelThetaNormalizer.get(label)
- }
-
- /** getter for summed feature weights. Used by legacy classifier */
- def featureWeight(feature: Int): Double = {
- weightsPerFeature.getQuick(feature)
- }
-
- /** getter for individual aggregated weights. Used by legacy classifier */
- def weight(label: Int, feature: Int): Double = {
- weightsPerLabelAndFeature.getQuick(label, feature)
- }
-
- /** getter for a single empty vector of weights */
- def createScoringVector: Vector = {
- weightsPerLabel.like
- }
-
- /** getter for the number of labels to consider */
- def numLabels: Int = {
- weightsPerLabel.size
- }
-
- /**
- * Write a trained model to the filesystem as a series of DRMs
- * @param pathToModel Directory to which the model will be written
- */
- def dfsWrite(pathToModel: String)(implicit ctx: DistributedContext): Unit = {
- //todo: write out as smaller partitions or possibly use reader and writers to
- //todo: write something other than a DRM for label Index, is Complementary, alphaI.
-
- // add a directory to put all of the DRMs in
- val fullPathToModel = pathToModel + NBModel.modelBaseDirectory
-
- drmParallelize(weightsPerLabelAndFeature).dfsWrite(fullPathToModel + "/weightsPerLabelAndFeatureDrm.drm")
- drmParallelize(sparse(weightsPerFeature)).dfsWrite(fullPathToModel + "/weightsPerFeatureDrm.drm")
- drmParallelize(sparse(weightsPerLabel)).dfsWrite(fullPathToModel + "/weightsPerLabelDrm.drm")
- drmParallelize(sparse(perlabelThetaNormalizer)).dfsWrite(fullPathToModel + "/perlabelThetaNormalizerDrm.drm")
- drmParallelize(sparse(svec((0,alphaI)::Nil))).dfsWrite(fullPathToModel + "/alphaIDrm.drm")
-
- // isComplementary is true if isComplementaryDrm(0,0) == 1, else false
- val isComplementaryDrm = sparse(0 to 1, 0 to 1)
- if(isComplementary){
- isComplementaryDrm(0,0) = 1.0
- } else {
- isComplementaryDrm(0,0) = 0.0
- }
- drmParallelize(isComplementaryDrm).dfsWrite(fullPathToModel + "/isComplementaryDrm.drm")
-
- // write the label index as a String-Keyed DRM.
- val labelIndexDummyDrm = weightsPerLabelAndFeature.like()
- labelIndexDummyDrm.setRowLabelBindings(labelIndex)
- // get a reverse map of [Integer, String] and set the value of the first column of the drm
- // to the corresponding row number for its label (the rows may not be read back in the same order)
- val revMap = labelIndex.map(x => x._2 -> x._1)
- for(i <- 0 until labelIndexDummyDrm.numRows() ){
- labelIndexDummyDrm.set(labelIndex(revMap(i)), 0, i.toDouble)
- }
-
- drmParallelizeWithRowLabels(labelIndexDummyDrm).dfsWrite(fullPathToModel + "/labelIndex.drm")
- }
-
- /** Model Validation */
- def validate() {
- assert(alphaI > 0, "alphaI has to be greater than 0!")
- assert(numFeatures > 0, "the vocab count has to be greater than 0!")
- assert(totalWeightSum > 0, "the totalWeightSum has to be greater than 0!")
- assert(weightsPerLabel != null, "the number of labels has to be defined!")
- assert(weightsPerLabel.getNumNondefaultElements > 0, "the number of labels has to be greater than 0!")
- assert(weightsPerFeature != null, "the feature sums have to be defined")
- assert(weightsPerFeature.getNumNondefaultElements > 0, "the feature sums have to be greater than 0!")
- if (isComplementary) {
- assert(perlabelThetaNormalizer != null, "the theta normalizers have to be defined")
- assert(perlabelThetaNormalizer.getNumNondefaultElements > 0, "the number of theta normalizers has to be greater than 0!")
- assert(Math.signum(perlabelThetaNormalizer.minValue) == Math.signum(perlabelThetaNormalizer.maxValue), "Theta normalizers do not all have the same sign")
- assert(perlabelThetaNormalizer.getNumNonZeroElements == perlabelThetaNormalizer.size, "Weight normalizers can not have zero value.")
- }
- assert(labelIndex.size == weightsPerLabel.getNumNondefaultElements, "label index must have entries for all labels")
- }
-}
-
-object NBModel extends java.io.Serializable {
-
- val modelBaseDirectory = "/naiveBayesModel"
-
- /**
- * Read a trained model in from the filesystem.
- * @param pathToModel directory from which to read individual model components
- * @return a valid NBModel
- */
- def dfsRead(pathToModel: String)(implicit ctx: DistributedContext): NBModel = {
- //todo: Takes forever to read; we need a more practical method of writing models. Readers/Writers?
-
- // read from a base directory for all drms
- val fullPathToModel = pathToModel + modelBaseDirectory
-
- val weightsPerFeatureDrm = drmDfsRead(fullPathToModel + "/weightsPerFeatureDrm.drm").checkpoint(CacheHint.MEMORY_ONLY)
- val weightsPerFeature = weightsPerFeatureDrm.collect(0, ::)
- weightsPerFeatureDrm.uncache()
-
- val weightsPerLabelDrm = drmDfsRead(fullPathToModel + "/weightsPerLabelDrm.drm").checkpoint(CacheHint.MEMORY_ONLY)
- val weightsPerLabel = weightsPerLabelDrm.collect(0, ::)
- weightsPerLabelDrm.uncache()
-
- val alphaIDrm = drmDfsRead(fullPathToModel + "/alphaIDrm.drm").checkpoint(CacheHint.MEMORY_ONLY)
- val alphaI: Float = alphaIDrm.collect(0, 0).toFloat
- alphaIDrm.uncache()
-
- // isComplementary is true if isComplementaryDrm(0,0) == 1, else false
- val isComplementaryDrm = drmDfsRead(fullPathToModel + "/isComplementaryDrm.drm").checkpoint(CacheHint.MEMORY_ONLY)
- val isComplementary = isComplementaryDrm.collect(0, 0).toInt == 1
- isComplementaryDrm.uncache()
-
- var perLabelThetaNormalizer = weightsPerFeature.like()
- if (isComplementary) {
- val perLabelThetaNormalizerDrm = drm.drmDfsRead(fullPathToModel + "/perlabelThetaNormalizerDrm.drm")
- .checkpoint(CacheHint.MEMORY_ONLY)
- perLabelThetaNormalizer = perLabelThetaNormalizerDrm.collect(0, ::)
- }
-
- val dummyLabelDrm = drmDfsRead(fullPathToModel + "/labelIndex.drm")
- .checkpoint(CacheHint.MEMORY_ONLY)
- val labelIndexMap: java.util.Map[String, Integer] = dummyLabelDrm.getRowLabelBindings
- dummyLabelDrm.uncache()
-
- // map the labels to the corresponding row numbers of weightsPerFeatureDrm (values in dummyLabelDrm)
- val scalaLabelIndexMap: mutable.Map[String, Integer] =
- labelIndexMap.map(x => x._1 -> dummyLabelDrm.get(labelIndexMap(x._1), 0)
- .toInt
- .asInstanceOf[Integer])
-
- val weightsPerLabelAndFeatureDrm = drmDfsRead(fullPathToModel + "/weightsPerLabelAndFeatureDrm.drm").checkpoint(CacheHint.MEMORY_ONLY)
- val weightsPerLabelAndFeature = weightsPerLabelAndFeatureDrm.collect
- weightsPerLabelAndFeatureDrm.uncache()
-
- // model validation is triggered automatically by constructor
- val model: NBModel = new NBModel(weightsPerLabelAndFeature,
- weightsPerFeature,
- weightsPerLabel,
- perLabelThetaNormalizer,
- scalaLabelIndexMap,
- alphaI,
- isComplementary)
-
- model
- }
-}
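
A hedged usage sketch of the round trip (assumes an implicit DistributedContext and a trained model in scope; the path is hypothetical):

import org.apache.mahout.classifier.naivebayes.NBModel

// Persists every component as a DRM under <path>/naiveBayesModel ...
model.dfsWrite("hdfs://namenode/models")   // hypothetical path

// ... and reads them back; the constructor re-runs validate() on the result.
val restored: NBModel = NBModel.dfsRead("hdfs://namenode/models")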

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/classifier/naivebayes/NaiveBayes.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/classifier/naivebayes/NaiveBayes.scala b/math-scala/src/main/scala/org/apache/mahout/classifier/naivebayes/NaiveBayes.scala
deleted file mode 100644
index 36fc551..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/classifier/naivebayes/NaiveBayes.scala
+++ /dev/null
@@ -1,383 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.naivebayes
-
-import org.apache.mahout.classifier.stats.{ResultAnalyzer, ClassifierResult}
-import org.apache.mahout.math._
-import scalabindings._
-import scalabindings.RLikeOps._
-import drm.RLikeDrmOps._
-import drm._
-import scala.reflect.ClassTag
-import scala.language.asInstanceOf
-import collection._
-import scala.collection.JavaConversions._
-
-/**
- * Distributed training of a Naive Bayes model. Follows the approach presented in Rennie et al.: Tackling the poor
- * assumptions of Naive Bayes Text classifiers, ICML 2003, http://people.csail.mit.edu/jrennie/papers/icml03-nb.pdf
- */
-trait NaiveBayes extends java.io.Serializable{
-
- /** default value for the Laplacian smoothing parameter */
- def defaultAlphaI = 1.0f
-
- // function to extract categories from string keys
- type CategoryParser = String => String
-
- /** Default for seqdirectory/seq2sparse: categories are stored in DRM keys as /Category/document_id */
- def seq2SparseCategoryParser: CategoryParser = x => x.split("/")(1)
-
-
- /**
- * Distributed training of a Naive Bayes model. Follows the approach presented in Rennie et al.: Tackling the poor
- * assumptions of Naive Bayes Text classifiers, ICML 2003, http://people.csail.mit.edu/jrennie/papers/icml03-nb.pdf
- *
- * @param observationsPerLabel a DrmLike[Int] matrix containing term frequency counts for each label.
- * @param trainComplementary whether or not to train a complementary Naive Bayes model
- * @param alphaI Laplace smoothing parameter
- * @return trained naive bayes model
- */
- def train(observationsPerLabel: DrmLike[Int],
- labelIndex: Map[String, Integer],
- trainComplementary: Boolean = true,
- alphaI: Float = defaultAlphaI): NBModel = {
-
- // Summation of all weights per feature
- val weightsPerFeature = observationsPerLabel.colSums
-
- // Distributed summation of all weights per label
- val weightsPerLabel = observationsPerLabel.rowSums
-
- // Collect a matrix to pass to the NaiveBayesModel
- val inCoreTFIDF = observationsPerLabel.collect
-
- // perLabelThetaNormalizer Vector is expected by NaiveBayesModel. We can pass a null value
- // or Vector of zeroes in the case of a standard NB model.
- var thetaNormalizer = weightsPerFeature.like()
-
- // Instantiate a trainer and retrieve the perLabelThetaNormalizer Vector from it in the case of
- // a complementary NB model
- if (trainComplementary) {
- val thetaTrainer = new ComplementaryNBThetaTrainer(weightsPerFeature,
- weightsPerLabel,
- alphaI)
- // local training of the theta normalization
- for (labelIndex <- 0 until inCoreTFIDF.nrow) {
- thetaTrainer.train(labelIndex, inCoreTFIDF(labelIndex, ::))
- }
- thetaNormalizer = thetaTrainer.retrievePerLabelThetaNormalizer
- }
-
- new NBModel(inCoreTFIDF,
- weightsPerFeature,
- weightsPerLabel,
- thetaNormalizer,
- labelIndex,
- alphaI,
- trainComplementary)
- }
-
- /**
- * Extract label Keys from raw TF or TF-IDF Matrix generated by seqdirectory/seq2sparse
- * and aggregate TF or TF-IDF values by their label
- * Override this method in engine specific modules to optimize
- *
- * @param stringKeyedObservations DrmLike matrix; output from seq2sparse,
- * with keys of the form e.g. /Category/document_title
- * and values of TF or TF-IDF weights per term
- * @param cParser a String => String function used to extract categories from
- * Keys of the stringKeyedObservations DRM. The default
- * CategoryParser will extract "Category" from: '/Category/document_id'
- * @return (labelIndexMap,aggregatedByLabelObservationDrm)
- * labelIndexMap is a HashMap [String, Integer] K = label row index
- * V = label
- * aggregatedByLabelObservationDrm is a DrmLike[Int] of aggregated
- * TF or TF-IDF counts per label
- */
- def extractLabelsAndAggregateObservations[K](stringKeyedObservations: DrmLike[K],
- cParser: CategoryParser = seq2SparseCategoryParser)
- (implicit ctx: DistributedContext):
- (mutable.HashMap[String, Integer], DrmLike[Int])= {
-
- stringKeyedObservations.checkpoint()
-
- val numDocs = stringKeyedObservations.nrow
- val numFeatures = stringKeyedObservations.ncol
-
- // For mapblocks that return K.
- implicit val ktag = stringKeyedObservations.keyClassTag
-
- // Extract categories from labels assigned by seq2sparse
- // Categories are Stored in Drm Keys as eg.: /Category/document_id
-
- // Get a new DRM with a single column so that we don't have to collect the
- // DRM into memory upfront.
- val strippedObservations = stringKeyedObservations.mapBlock(ncol = 1) {
- case (keys, block) =>
- val blockB = block.like(keys.size, 1)
- keys -> blockB
- }
-
- // Extract the row label bindings (the String keys) from the slim Drm
- // strip the document_id from the row keys keeping only the category.
- // Sort the bindings alphabetically into a Vector
- val labelVectorByRowIndex = strippedObservations
- .getRowLabelBindings
- .map(x => x._2 -> cParser(x._1))
- .toVector.sortWith(_._1 < _._1)
-
- //TODO: add a .toIntKeyed(...) method to DrmLike?
-
- // Copy stringKeyedObservations to an Int-keyed Drm so that we can compute the transpose
- // Copy the collected matrices up front for now until we have a distributed way of converting
- val inCoreStringKeyedObservations = stringKeyedObservations.collect
- val inCoreIntKeyedObservations = new SparseMatrix(
- stringKeyedObservations.nrow.toInt,
- stringKeyedObservations.ncol)
- for (i <- 0 until inCoreStringKeyedObservations.nrow) {
- inCoreIntKeyedObservations(i, ::) = inCoreStringKeyedObservations(i, ::)
- }
-
- val intKeyedObservations = drmParallelize(inCoreIntKeyedObservations)
-
- stringKeyedObservations.uncache()
-
- var labelIndex = 0
- val labelIndexMap = new mutable.HashMap[String, Integer]
- val encodedLabelByRowIndexVector = new DenseVector(labelVectorByRowIndex.size)
-
- // Encode Categories as an Integer (Double) so we can broadcast as a vector
- // where each element is an Int-encoded category whose index corresponds
- // to its row in the Drm
- for (i <- labelVectorByRowIndex.indices) {
- if (!labelIndexMap.contains(labelVectorByRowIndex(i)._2)) {
- encodedLabelByRowIndexVector(i) = labelIndex.toDouble
- labelIndexMap.put(labelVectorByRowIndex(i)._2, labelIndex)
- labelIndex += 1
- }
- // don't like this casting but need to use a java.lang.Integer when setting rowLabelBindings
- encodedLabelByRowIndexVector(i) = labelIndexMap
- .getOrElse(labelVectorByRowIndex(i)._2, -1)
- .asInstanceOf[Int].toDouble
- }
-
- // "Combiner": Map and aggregate by Category. Do this by broadcasting the encoded
- // category vector and mapping a transposed IntKeyed Drm out so that all categories
- // will be present on all nodes as columns and can be referenced by
- // BCastEncodedCategoryByRowVector. Iteratively sum all categories.
- val nLabels = labelIndex
-
- val bcastEncodedCategoryByRowVector = drmBroadcast(encodedLabelByRowIndexVector)
-
- val aggregatedObservationByLabelDrm = intKeyedObservations.t.mapBlock(ncol = nLabels) {
- case (keys, blockA) =>
- val blockB = blockA.like(keys.size, nLabels)
- var label : Int = 0
- for (i <- 0 until keys.size) {
- blockA(i, ::).nonZeroes().foreach { elem =>
- label = bcastEncodedCategoryByRowVector.get(elem.index).toInt
- blockB(i, label) = blockB(i, label) + blockA(i, elem.index)
- }
- }
- keys -> blockB
- }.t
-
- (labelIndexMap, aggregatedObservationByLabelDrm)
- }
-
- /**
- * Test a trained model with a labeled dataset sequentially
- * @param model a trained NBModel
- * @param testSet a labeled testing set
- * @param testComplementary test using a complementary or a standard NB classifier
- * @param cParser a String => String function used to extract categories from
- * Keys of the testing set DRM. The default
- * CategoryParser will extract "Category" from: '/Category/document_id'
- *
- * *Note*: this method brings the entire test set into memory upfront.
- * It is optimized and parallelized in SparkNaiveBayes.
- *
- * @tparam K implicitly determined Key type of test set DRM: String
- * @return a result analyzer with confusion matrix and accuracy statistics
- */
- def test[K: ClassTag](model: NBModel,
- testSet: DrmLike[K],
- testComplementary: Boolean = false,
- cParser: CategoryParser = seq2SparseCategoryParser)
- (implicit ctx: DistributedContext): ResultAnalyzer = {
-
- val labelMap = model.labelIndex
-
- val numLabels = model.numLabels
-
- testSet.checkpoint()
-
- val numTestInstances = testSet.nrow.toInt
-
- // instantiate the correct type of classifier
- val classifier =
- if (testComplementary) new ComplementaryNBClassifier(model) with Serializable
- else new StandardNBClassifier(model) with Serializable
-
- if (testComplementary) {
- assert(testComplementary == model.isComplementary,
- "Complementary Label Assignment requires Complementary Training")
- }
-
-
- // Sequentially assign labels to the test set:
- // *Note* this brings the entire test set into memory upfront:
-
- // Since we can't broadcast the model as-is, do it sequentially up front for now
- val inCoreTestSet = testSet.collect
-
- // get the labels of the test set and extract the keys
- val testSetLabelMap = testSet.getRowLabelBindings
-
- // empty Matrix in which we'll set the classification scores
- val inCoreScoredTestSet = testSet.like(numTestInstances, numLabels)
-
- testSet.uncache()
-
- for (i <- 0 until numTestInstances) {
- inCoreScoredTestSet(i, ::) := classifier.classifyFull(inCoreTestSet(i, ::))
- }
-
- // todo: reverse the labelMaps in training and through the model?
-
- // reverse the label map and extract the labels
- val reverseTestSetLabelMap = testSetLabelMap.map(x => x._2 -> cParser(x._1))
-
- val reverseLabelMap = labelMap.map(x => x._2 -> x._1)
-
- val analyzer = new ResultAnalyzer(labelMap.keys.toList.sorted, "DEFAULT")
-
- // assign labels- winner takes all
- for (i <- 0 until numTestInstances) {
- val (bestIdx, bestScore) = argmax(inCoreScoredTestSet(i, ::))
- val classifierResult = new ClassifierResult(reverseLabelMap(bestIdx), bestScore)
- analyzer.addInstance(reverseTestSetLabelMap(i), classifierResult)
- }
-
- analyzer
- }
-
- /**
- * argmax with the value as well:
- * returns a tuple of the index of the max score and the score itself.
- * @param v Vector of scores
- * @return (bestIndex, bestScore)
- */
- def argmax(v: Vector): (Int, Double) = {
- var bestIdx: Int = Integer.MIN_VALUE
- var bestScore: Double = Integer.MIN_VALUE.toDouble
- for(i <- 0 until v.size) {
- if(v(i) > bestScore){
- bestScore = v(i)
- bestIdx = i
- }
- }
- (bestIdx, bestScore)
- }
-
-}
-
-object NaiveBayes extends NaiveBayes with java.io.Serializable
-
-/**
- * Trainer for the weight normalization vector used by Transform Weight Normalized Complement
- * Naive Bayes. See: Rennie et al.: Tackling the poor assumptions of Naive Bayes Text classifiers,
- * ICML 2003, http://people.csail.mit.edu/jrennie/papers/icml03-nb.pdf Sec. 3.2.
- *
- * @param weightsPerFeature a Vector of summed TF or TF-IDF weights for each word in dictionary.
- * @param weightsPerLabel a Vector of summed TF or TF-IDF weights for each label.
- * @param alphaI Laplace smoothing factor. Default value of 1.
- */
-class ComplementaryNBThetaTrainer(private val weightsPerFeature: Vector,
- private val weightsPerLabel: Vector,
- private val alphaI: Double = 1.0) {
-
- private val perLabelThetaNormalizer: Vector = weightsPerLabel.like()
- private val totalWeightSum: Double = weightsPerLabel.zSum
- private val numFeatures: Double = weightsPerFeature.getNumNondefaultElements
-
- assert(weightsPerFeature != null, "weightsPerFeature vector can not be null")
- assert(weightsPerLabel != null, "weightsPerLabel vector can not be null")
-
- /**
- * Train the weight normalization vector for each label
- * @param label
- * @param featurePerLabelWeight
- */
- def train(label: Int, featurePerLabelWeight: Vector) {
- val currentLabelWeight = labelWeight(label)
- // sum weights for each label including those with zero word counts
- for (i <- 0 until featurePerLabelWeight.size) {
- val currentFeaturePerLabelWeight = featurePerLabelWeight(i)
- updatePerLabelThetaNormalizer(label,
- ComplementaryNBClassifier.computeWeight(featureWeight(i),
- currentFeaturePerLabelWeight,
- totalWeightSum,
- currentLabelWeight,
- alphaI,
- numFeatures)
- )
- }
- }
-
- /**
- * getter for summed TF or TF-IDF weights by label
- * @param label index of label
- * @return sum of word TF or TF-IDF weights for label
- */
- def labelWeight(label: Int): Double = {
- weightsPerLabel(label)
- }
-
- /**
- * getter for summed TF or TF-IDF weights by word.
- * @param feature index of word.
- * @return sum of TF or TF-IDF weights for word.
- */
- def featureWeight(feature: Int): Double = {
- weightsPerFeature(feature)
- }
-
- /**
- * add the magnitude of the current weight to the current
- * label's corresponding Vector element.
- * @param label index of label to update.
- * @param weight weight to add.
- */
- def updatePerLabelThetaNormalizer(label: Int, weight: Double) {
- perLabelThetaNormalizer(label) = perLabelThetaNormalizer(label) + Math.abs(weight)
- }
-
- /**
- * Getter for the weight normalizer vector as indexed by label
- * @return a copy of the weight normalizer vector.
- */
- def retrievePerLabelThetaNormalizer: Vector = {
- perLabelThetaNormalizer.cloned
- }
-
-
-
-}
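
Putting the trainer and tester together, a minimal end-to-end sketch (assumes an implicit DistributedContext plus String-keyed TF or TF-IDF DRMs from seq2sparse named tfidfDrm and testDrm; both names are illustrative):

import org.apache.mahout.classifier.naivebayes.NaiveBayes

// Aggregate term weights by category and build the label index.
val (labelIndex, aggregated) =
  NaiveBayes.extractLabelsAndAggregateObservations(tfidfDrm)

// Train a complementary model, then score the labeled test set in-core.
val model = NaiveBayes.train(aggregated, labelIndex, trainComplementary = true)
val analyzer = NaiveBayes.test(model, testDrm, testComplementary = true)

println(analyzer)  // summary, confusion matrix, and statistics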

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/classifier/stats/ClassifierStats.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/classifier/stats/ClassifierStats.scala b/math-scala/src/main/scala/org/apache/mahout/classifier/stats/ClassifierStats.scala
deleted file mode 100644
index 8f1413a..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/classifier/stats/ClassifierStats.scala
+++ /dev/null
@@ -1,467 +0,0 @@
-/*
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-
-package org.apache.mahout.classifier.stats
-
-import java.text.{DecimalFormat, NumberFormat}
-import java.util
-import org.apache.mahout.math.stats.OnlineSummarizer
-
-
-/**
- * Result of a document classification. The label and the associated score (usually a probability).
- */
-class ClassifierResult (private var label: String = null,
- private var score: Double = 0.0,
- private var logLikelihood: Double = Integer.MAX_VALUE.toDouble) {
-
- def getLogLikelihood: Double = logLikelihood
-
- def setLogLikelihood(llh: Double) {
- logLikelihood = llh
- }
-
- def getLabel: String = label
-
- def getScore: Double = score
-
- def setLabel(lbl: String) {
- label = lbl
- }
-
- def setScore(sc: Double) {
- score = sc
- }
-
- override def toString: String = {
- "ClassifierResult{" + "category='" + label + '\'' + ", score=" + score + '}'
- }
-
-}
-
-/**
- * ResultAnalyzer captures the classification statistics and displays them in a tabular manner
- * @param labelSet Set of labels to be considered in classification
- * @param defaultLabel the default label for an unknown classification
- */
-class ResultAnalyzer(private val labelSet: util.Collection[String], defaultLabel: String) {
-
- val confusionMatrix = new ConfusionMatrix(labelSet, defaultLabel)
- val summarizer = new OnlineSummarizer
-
- private var hasLL: Boolean = false
- private var correctlyClassified: Int = 0
- private var incorrectlyClassified: Int = 0
-
-
- def getConfusionMatrix: ConfusionMatrix = confusionMatrix
-
- /**
- *
- * @param correctLabel
- * The correct label
- * @param classifiedResult
- * The classified result
- * @return whether the instance was correct or not
- */
- def addInstance(correctLabel: String, classifiedResult: ClassifierResult): Boolean = {
- val result: Boolean = correctLabel == classifiedResult.getLabel
- if (result) {
- correctlyClassified += 1
- }
- else {
- incorrectlyClassified += 1
- }
- confusionMatrix.addInstance(correctLabel, classifiedResult)
- if (classifiedResult.getLogLikelihood != Integer.MAX_VALUE.toDouble) {
- summarizer.add(classifiedResult.getLogLikelihood)
- hasLL = true
- }
-
- result
- }
-
- /** Dump the resulting statistics to a string */
- override def toString: String = {
- val returnString: StringBuilder = new StringBuilder
- returnString.append('\n')
- returnString.append("=======================================================\n")
- returnString.append("Summary\n")
- returnString.append("-------------------------------------------------------\n")
- val totalClassified: Int = correctlyClassified + incorrectlyClassified
- val percentageCorrect: Double = 100.0 * correctlyClassified / totalClassified
- val percentageIncorrect: Double = 100.0 * incorrectlyClassified / totalClassified
- val decimalFormatter: NumberFormat = new DecimalFormat("0.####")
- returnString.append("Correctly Classified Instances")
- .append(": ")
- .append(Integer.toString(correctlyClassified))
- .append('\t')
- .append(decimalFormatter.format(percentageCorrect))
- .append("%\n")
- returnString.append("Incorrectly Classified Instances")
- .append(": ")
- .append(Integer.toString(incorrectlyClassified))
- .append('\t')
- .append(decimalFormatter.format(percentageIncorrect))
- .append("%\n")
- returnString.append("Total Classified Instances")
- .append(": ")
- .append(Integer.toString(totalClassified))
- .append('\n')
- returnString.append('\n')
- returnString.append(confusionMatrix)
- returnString.append("=======================================================\n")
- returnString.append("Statistics\n")
- returnString.append("-------------------------------------------------------\n")
- val normStats: RunningAverageAndStdDev = confusionMatrix.getNormalizedStats
- returnString.append("Kappa: \t")
- .append(decimalFormatter.format(confusionMatrix.getKappa))
- .append('\n')
- returnString.append("Accuracy: \t")
- .append(decimalFormatter.format(confusionMatrix.getAccuracy))
- .append("%\n")
- returnString.append("Reliability: \t")
- .append(decimalFormatter.format(normStats.getAverage * 100.00000001))
- .append("%\n")
- returnString.append("Reliability (std dev): \t")
- .append(decimalFormatter.format(normStats.getStandardDeviation))
- .append('\n')
- returnString.append("Weighted precision: \t")
- .append(decimalFormatter.format(confusionMatrix.getWeightedPrecision))
- .append('\n')
- returnString.append("Weighted recall: \t")
- .append(decimalFormatter.format(confusionMatrix.getWeightedRecall))
- .append('\n')
- returnString.append("Weighted F1 score: \t")
- .append(decimalFormatter.format(confusionMatrix.getWeightedF1score))
- .append('\n')
- if (hasLL) {
- returnString.append("Log-likelihood: \t")
- .append("mean : \t")
- .append(decimalFormatter.format(summarizer.getMean))
- .append('\n')
- returnString.append("25%-ile : \t")
- .append(decimalFormatter.format(summarizer.getQuartile(1)))
- .append('\n')
- returnString.append("75%-ile : \t")
- .append(decimalFormatter.format(summarizer.getQuartile(3)))
- .append('\n')
- }
-
- returnString.toString()
- }
-
-
-}
-
-/**
- *
- * Interface for classes that can keep track of a running average of a series of numbers. One can add to or
- * remove from the series, as well as update a datum in the series. The class does not actually keep track of
- * the series of values, just its running average, so it doesn't even matter if you remove/change a value that
- * wasn't added.
- *
- * Ported from org.apache.mahout.cf.taste.impl.common.RunningAverage.java
- */
-trait RunningAverage {
-
- /**
- * @param datum
- * new item to add to the running average
- * @throws IllegalArgumentException
- * if datum is [[Double.NaN]]
- */
- def addDatum(datum: Double)
-
- /**
- * @param datum
- * item to remove from the running average
- * @throws IllegalArgumentException
- * if datum is [[Double.NaN]]
- * @throws IllegalStateException
- * if count is 0
- */
- def removeDatum(datum: Double)
-
- /**
- * @param delta
- * amount by which to change a datum in the running average
- * @throws IllegalArgumentException
- * if delta is [[Double.NaN]]
- * @throws IllegalStateException
- * if count is 0
- */
- def changeDatum(delta: Double)
-
- def getCount: Int
-
- def getAverage: Double
-
- /**
- * @return a (possibly immutable) object whose average is the negative of this object's
- */
- def inverse: RunningAverage
-}
-
-/**
- *
- * Extends {@link RunningAverage} by adding standard deviation too.
- *
- * Ported from org.apache.mahout.cf.taste.impl.common.RunningAverageAndStdDev.java
- */
-trait RunningAverageAndStdDev extends RunningAverage {
-
- /** @return standard deviation of data */
- def getStandardDeviation: Double
-
- /**
- * @return a (possibly immutable) object whose average is the negative of this object's
- */
- def inverse: RunningAverageAndStdDev
-}
-
-
-class InvertedRunningAverage(private val delegate: RunningAverage) extends RunningAverage {
-
- override def addDatum(datum: Double) {
- throw new UnsupportedOperationException
- }
-
- override def removeDatum(datum: Double) {
- throw new UnsupportedOperationException
- }
-
- override def changeDatum(delta: Double) {
- throw new UnsupportedOperationException
- }
-
- override def getCount: Int = {
- delegate.getCount
- }
-
- override def getAverage: Double = {
- -delegate.getAverage
- }
-
- override def inverse: RunningAverage = {
- delegate
- }
-}
-
-
-/**
- *
- * A simple class that can keep track of a running average of a series of numbers. One can add to or remove
- * from the series, as well as update a datum in the series. The class does not actually keep track of the
- * series of values, just its running average, so it doesn't even matter if you remove/change a value that
- * wasn't added.
- *
- * Ported from org.apache.mahout.cf.taste.impl.common.FullRunningAverage.java
- */
-class FullRunningAverage(private var count: Int = 0,
- private var average: Double = Double.NaN ) extends RunningAverage {
-
- /**
- * @param datum
- * new item to add to the running average
- */
- override def addDatum(datum: Double) {
- count += 1
- if (count == 1) {
- average = datum
- }
- else {
- average = average * (count - 1) / count + datum / count
- }
- }
-
- /**
- * @param datum
- * item to remove from the running average
- * @throws IllegalStateException
- * if count is 0
- */
- override def removeDatum(datum: Double) {
- if (count == 0) {
- throw new IllegalStateException
- }
- count -= 1
- if (count == 0) {
- average = Double.NaN
- }
- else {
- average = average * (count + 1) / count - datum / count
- }
- }
-
- /**
- * @param delta
- * amount by which to change a datum in the running average
- * @throws IllegalStateException
- * if count is 0
- */
- override def changeDatum(delta: Double) {
- if (count == 0) {
- throw new IllegalStateException
- }
- average += delta / count
- }
-
- override def getCount: Int = {
- count
- }
-
- override def getAverage: Double = {
- average
- }
-
- override def inverse: RunningAverage = {
- new InvertedRunningAverage(this)
- }
-
- override def toString: String = {
- String.valueOf(average)
- }
-}
-
-
-/**
- *
- * Extends {@link FullRunningAverage} to add a running standard deviation computation.
- * Uses Welford's method, as described at http://www.johndcook.com/standard_deviation.html
- *
- * Ported from org.apache.mahout.cf.taste.impl.common.FullRunningAverageAndStdDev.java
- */
-class FullRunningAverageAndStdDev(private var count: Int = 0,
- private var average: Double = 0.0,
- private var mk: Double = 0.0,
- private var sk: Double = 0.0) extends FullRunningAverage with RunningAverageAndStdDev {
-
- var stdDev: Double = 0.0
-
- recomputeStdDev
-
- def getMk: Double = {
- mk
- }
-
- def getSk: Double = {
- sk
- }
-
- override def getStandardDeviation: Double = {
- stdDev
- }
-
- override def addDatum(datum: Double) {
- super.addDatum(datum)
- val count: Int = getCount
- if (count == 1) {
- mk = datum
- sk = 0.0
- }
- else {
- val oldmk: Double = mk
- val diff: Double = datum - oldmk
- mk += diff / count
- sk += diff * (datum - mk)
- }
- recomputeStdDev
- }
-
- override def removeDatum(datum: Double) {
- val oldCount: Int = getCount
- super.removeDatum(datum)
- val oldmk: Double = mk
- mk = (oldCount * oldmk - datum) / (oldCount - 1)
- sk -= (datum - mk) * (datum - oldmk)
- recomputeStdDev
- }
-
- /**
- * @throws UnsupportedOperationException
- */
- override def changeDatum(delta: Double) {
- throw new UnsupportedOperationException
- }
-
- private def recomputeStdDev {
- val count: Int = getCount
- stdDev = if (count > 1) Math.sqrt(sk / (count - 1)) else Double.NaN
- }
-
- override def inverse: RunningAverageAndStdDev = {
- new InvertedRunningAverageAndStdDev(this)
- }
-
- override def toString: String = {
- String.valueOf(getAverage) + ',' + stdDev
- }
-
-}
-
-
-/**
- *
- * @param delegate RunningAverageAndStdDev instance
- *
- * Ported from org.apache.mahout.cf.taste.impl.common.InvertedRunningAverageAndStdDev.java
- */
-class InvertedRunningAverageAndStdDev(private val delegate: RunningAverageAndStdDev) extends RunningAverageAndStdDev {
-
- /**
- * @throws UnsupportedOperationException
- */
- override def addDatum(datum: Double) {
- throw new UnsupportedOperationException
- }
-
- /**
- * @throws UnsupportedOperationException
- */
-
- override def removeDatum(datum: Double) {
- throw new UnsupportedOperationException
- }
-
- /**
- * @throws UnsupportedOperationException
- */
- override def changeDatum(delta: Double) {
- throw new UnsupportedOperationException
- }
-
- override def getCount: Int = {
- delegate.getCount
- }
-
- override def getAverage: Double = {
- -delegate.getAverage
- }
-
- override def getStandardDeviation: Double = {
- delegate.getStandardDeviation
- }
-
- override def inverse: RunningAverageAndStdDev = {
- delegate
- }
-}
-
-
-
-
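
A small usage sketch of the Welford-based summarizer above (the sample values are arbitrary):

import org.apache.mahout.classifier.stats.FullRunningAverageAndStdDev

val stats = new FullRunningAverageAndStdDev()
Seq(2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0).foreach(stats.addDatum)

println(stats.getAverage)            // 5.0
println(stats.getStandardDeviation)  // sample std dev, about 2.138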

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/classifier/stats/ConfusionMatrix.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/classifier/stats/ConfusionMatrix.scala b/math-scala/src/main/scala/org/apache/mahout/classifier/stats/ConfusionMatrix.scala
deleted file mode 100644
index d421fa1..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/classifier/stats/ConfusionMatrix.scala
+++ /dev/null
@@ -1,459 +0,0 @@
-/*
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-
-package org.apache.mahout.classifier.stats
-
-import java.util
-import org.apache.commons.math3.stat.descriptive.moment.Mean // This is brought in by mahout-math
-import org.apache.mahout.math.{DenseMatrix, Matrix}
-import scala.collection.mutable
-import scala.collection.JavaConversions._
-
-/**
- *
- * Ported from org.apache.mahout.classifier.ConfusionMatrix.java
- *
- * The ConfusionMatrix Class stores the result of Classification of a Test Dataset.
- *
- * The fact of whether there is a default is not stored. A row of zeros is the only indicator that there is no default.
- *
- * See http://en.wikipedia.org/wiki/Confusion_matrix for background
- *
- *
- * @param labels The labels to consider for classification
- * @param defaultLabel default unknown label
- */
-class ConfusionMatrix(private var labels: util.Collection[String] = null,
- private var defaultLabel: String = "unknown") {
- /**
- * Matrix Constructor
- */
-// def this(m: Matrix) {
-// this()
-// confusionMatrix = Array.ofDim[Int](m.numRows, m.numRows)
-// setMatrix(m)
-// }
-
- // val LOG: Logger = LoggerFactory.getLogger(classOf[ConfusionMatrix])
-
- var confusionMatrix = Array.ofDim[Int](labels.size + 1, labels.size + 1)
-
- val labelMap = new mutable.HashMap[String,Integer]()
-
- var samples: Int = 0
-
- var i: Integer = 0
- for (label <- labels) {
- labelMap.put(label, i)
- i += 1
- }
- labelMap.put(defaultLabel, i)
-
-
- def getConfusionMatrix: Array[Array[Int]] = confusionMatrix
-
- def getLabels = labelMap.keys.toList
-
- def numLabels: Int = labelMap.size
-
- def getAccuracy(label: String): Double = {
- val labelId: Int = labelMap(label)
- var labelTotal: Int = 0
- var correct: Int = 0
- for (i <- 0 until numLabels) {
- labelTotal += confusionMatrix(labelId)(i)
- if (i == labelId) {
- correct += confusionMatrix(labelId)(i)
- }
- }
-
- 100.0 * correct / labelTotal
- }
-
- def getAccuracy: Double = {
- var total: Int = 0
- var correct: Int = 0
- for (i <- 0 until numLabels) {
- for (j <- 0 until numLabels) {
- total += confusionMatrix(i)(j)
- if (i == j) {
- correct += confusionMatrix(i)(j)
- }
- }
- }
-
- 100.0 * correct / total
- }
-
- /** Sum of true positives and false negatives */
- private def getActualNumberOfTestExamplesForClass(label: String): Int = {
- val labelId: Int = labelMap(label)
- var sum: Int = 0
- for (i <- 0 until numLabels) {
- sum += confusionMatrix(labelId)(i)
- }
- sum
- }
-
- def getPrecision(label: String): Double = {
- val labelId: Int = labelMap(label)
- val truePositives: Int = confusionMatrix(labelId)(labelId)
- var falsePositives: Int = 0
-
- for (i <- 0 until numLabels) {
- if (i != labelId) {
- falsePositives += confusionMatrix(i)(labelId)
- }
- }
-
- if (truePositives + falsePositives == 0) {
- 0
- } else {
- truePositives.asInstanceOf[Double] / (truePositives + falsePositives)
- }
- }
-
-
- def getWeightedPrecision: Double = {
- val precisions: Array[Double] = new Array[Double](numLabels)
- val weights: Array[Double] = new Array[Double](numLabels)
- var index: Int = 0
- for (label <- labelMap.keys) {
- precisions(index) = getPrecision(label)
- weights(index) = getActualNumberOfTestExamplesForClass(label)
- index += 1
- }
- new Mean().evaluate(precisions, weights)
- }
-
- def getRecall(label: String): Double = {
- val labelId: Int = labelMap(label)
- val truePositives: Int = confusionMatrix(labelId)(labelId)
- var falseNegatives: Int = 0
- for (i <- 0 until numLabels) {
- if (i != labelId) {
- falseNegatives += confusionMatrix(labelId)(i)
- }
- }
-
- if (truePositives + falseNegatives == 0) {
- 0
- } else {
- truePositives.asInstanceOf[Double] / (truePositives + falseNegatives)
- }
- }
-
- def getWeightedRecall: Double = {
- val recalls: Array[Double] = new Array[Double](numLabels)
- val weights: Array[Double] = new Array[Double](numLabels)
- var index: Int = 0
- for (label <- labelMap.keys) {
- recalls(index) = getRecall(label)
- weights(index) = getActualNumberOfTestExamplesForClass(label)
- index += 1
- }
- new Mean().evaluate(recalls, weights)
- }
-
- def getF1score(label: String): Double = {
- val precision: Double = getPrecision(label)
- val recall: Double = getRecall(label)
- if (precision + recall == 0) {
- 0
- } else {
- 2 * precision * recall / (precision + recall)
- }
- }
-
- def getWeightedF1score: Double = {
- val f1Scores: Array[Double] = new Array[Double](numLabels)
- val weights: Array[Double] = new Array[Double](numLabels)
- var index: Int = 0
- for (label <- labelMap.keys) {
- f1Scores(index) = getF1score(label)
- weights(index) = getActualNumberOfTestExamplesForClass(label)
- index += 1
- }
- new Mean().evaluate(f1Scores, weights)
- }
-
- def getReliability: Double = {
- var count: Int = 0
- var accuracy: Double = 0
- for (label <- labelMap.keys) {
- if (!(label == defaultLabel)) {
- accuracy += getAccuracy(label)
- }
- count += 1
- }
- accuracy / count
- }
-
- /**
- * Accuracy vs. randomly classifying all samples.
- * kappa() = (totalAccuracy() - randomAccuracy()) / (1 - randomAccuracy())
- * Cohen, Jacob. 1960. A coefficient of agreement for nominal scales.
- * Educational And Psychological Measurement 20:37-46.
- *
- * Formula and variable names from:
- * http://www.yale.edu/ceo/OEFS/Accuracy.pdf
- *
- * @return double
- */
- def getKappa: Double = {
- var a: Double = 0.0
- var b: Double = 0.0
- for (i <- confusionMatrix.indices) {
- a += confusionMatrix(i)(i)
- var br: Int = 0
- for (j <- confusionMatrix.indices) {
- br += confusionMatrix(i)(j)
- }
- var bc: Int = 0
- //TODO: verify this as an iterator
- for (vec <- confusionMatrix) {
- bc += vec(i)
- }
- b += br * bc
- }
- (samples * a - b) / (samples * samples - b)
- }
-
- def getCorrect(label: String): Int = {
- val labelId: Int = labelMap(label)
- confusionMatrix(labelId)(labelId)
- }
-
- def getTotal(label: String): Int = {
- val labelId: Int = labelMap(label)
- var labelTotal: Int = 0
- for (i <- 0 until numLabels) {
- labelTotal += confusionMatrix(labelId)(i)
- }
- labelTotal
- }
-
- /**
- * Standard deviation of normalized producer accuracy
- * Not a standard score
- * @return double
- */
- def getNormalizedStats: RunningAverageAndStdDev = {
- val summer = new FullRunningAverageAndStdDev()
- for (d <- confusionMatrix.indices) {
- var total: Double = 0.0
- for (j <- confusionMatrix.indices) {
- total += confusionMatrix(d)(j)
- }
- summer.addDatum(confusionMatrix(d)(d) / (total + 0.000001))
- }
- summer
- }
-
- def addInstance(correctLabel: String, classifiedResult: ClassifierResult): Unit = {
- samples += 1
- incrementCount(correctLabel, classifiedResult.getLabel)
- }
-
- def addInstance(correctLabel: String, classifiedLabel: String): Unit = {
- samples += 1
- incrementCount(correctLabel, classifiedLabel)
- }
-
- def getCount(correctLabel: String, classifiedLabel: String): Int = {
- if (!labelMap.containsKey(correctLabel)) {
- // LOG.warn("Label {} did not appear in the training examples", correctLabel)
- return 0
- }
- assert(labelMap.containsKey(classifiedLabel), "Label not found: " + classifiedLabel)
- val correctId: Int = labelMap(correctLabel)
- val classifiedId: Int = labelMap(classifiedLabel)
- confusionMatrix(correctId)(classifiedId)
- }
-
- def putCount(correctLabel: String, classifiedLabel: String, count: Int): Unit = {
- if (!labelMap.containsKey(correctLabel)) {
- // LOG.warn("Label {} did not appear in the training examples", correctLabel)
- return
- }
- assert(labelMap.containsKey(classifiedLabel), "Label not found: " + classifiedLabel)
- val correctId: Int = labelMap(correctLabel)
- val classifiedId: Int = labelMap(classifiedLabel)
- if (confusionMatrix(correctId)(classifiedId) == 0.0 && count != 0) {
- samples += 1
- }
- confusionMatrix(correctId)(classifiedId) = count
- }
-
- def incrementCount(correctLabel: String, classifiedLabel: String, count: Int): Unit = {
- putCount(correctLabel, classifiedLabel, count + getCount(correctLabel, classifiedLabel))
- }
-
- def incrementCount(correctLabel: String, classifiedLabel: String): Unit = {
- incrementCount(correctLabel, classifiedLabel, 1)
- }
-
- def getDefaultLabel: String = {
- defaultLabel
- }
-
- def merge(b: ConfusionMatrix): ConfusionMatrix = {
- assert(labelMap.size == b.getLabels.size, "The label sizes do not match")
- for (correctLabel <- this.labelMap.keys) {
- for (classifiedLabel <- this.labelMap.keys) {
- incrementCount(correctLabel, classifiedLabel, b.getCount(correctLabel, classifiedLabel))
- }
- }
- this
- }
-
- def getMatrix: Matrix = {
- val length: Int = confusionMatrix.length
- val m: Matrix = new DenseMatrix(length, length)
-
- val labels: java.util.HashMap[String, Integer] = new java.util.HashMap()
-
- for (r <- 0 until length) {
- for (c <- 0 until length) {
- m.set(r, c, confusionMatrix(r)(c))
- }
- }
-
- for (entry <- labelMap.entrySet) {
- labels.put(entry.getKey, entry.getValue)
- }
- m.setRowLabelBindings(labels)
- m.setColumnLabelBindings(labels)
-
- m
- }
-
- def setMatrix(m: Matrix) : Unit = {
- val length: Int = confusionMatrix.length
- if (m.numRows != m.numCols) {
- throw new IllegalArgumentException("ConfusionMatrix: matrix(" + m.numRows + ',' + m.numCols + ") must be square")
- }
-
- for (r <- 0 until length) {
- for (c <- 0 until length) {
- confusionMatrix(r)(c) = Math.round(m.get(r, c)).toInt
- }
- }
-
- var labels = m.getRowLabelBindings
- if (labels == null) {
- labels = m.getColumnLabelBindings
- }
-
- if (labels != null) {
- val sorted: Array[String] = sortLabels(labels)
- verifyLabels(length, sorted)
- labelMap.clear
- for (i <- 0 until length) {
- labelMap.put(sorted(i), i)
- }
- }
- }
-
- def verifyLabels(length: Int, sorted: Array[String]): Unit = {
- assert(sorted.length == length, "One label, one row")
- for (i <- 0 until length) {
- if (sorted(i) == null) {
- assert(assertion = false, "One label, one row")
- }
- }
- }
-
- def sortLabels(labels: java.util.Map[String, Integer]): Array[String] = {
- val sorted: Array[String] = new Array[String](labels.size)
- for (entry <- labels.entrySet) {
- sorted(entry.getValue) = entry.getKey
- }
-
- sorted
- }
-
- /**
- * This is overloaded. toString() is not a formatted report you print for a manager :)
- * Assume that if there are no default assignments, the default feature was not used
- */
- override def toString: String = {
-
- val returnString: StringBuilder = new StringBuilder(200)
-
- returnString.append("=======================================================").append('\n')
- returnString.append("Confusion Matrix\n")
- returnString.append("-------------------------------------------------------").append('\n')
-
- val unclassified: Int = getTotal(defaultLabel)
-
- for (entry <- this.labelMap.entrySet) {
- if (!((entry.getKey == defaultLabel) && unclassified == 0)) {
- returnString.append(getSmallLabel(entry.getValue) + " ").append('\t')
- }
- }
-
- returnString.append("<--Classified as").append('\n')
-
- for (entry <- this.labelMap.entrySet) {
- if (!((entry.getKey == defaultLabel) && unclassified == 0)) {
- val correctLabel: String = entry.getKey
- var labelTotal: Int = 0
-
- for (classifiedLabel <- this.labelMap.keySet) {
- if (!((classifiedLabel == defaultLabel) && unclassified == 0)) {
- returnString.append(Integer.toString(getCount(correctLabel, classifiedLabel)) + " ")
- .append('\t')
- labelTotal += getCount(correctLabel, classifiedLabel)
- }
- }
- returnString.append(" | ").append(String.valueOf(labelTotal) + " ")
- .append('\t')
- .append(getSmallLabel(entry.getValue) + " ")
- .append(" = ")
- .append(correctLabel)
- .append('\n')
- }
- }
-
- if (unclassified > 0) {
- returnString.append("Default Category: ")
- .append(defaultLabel)
- .append(": ")
- .append(unclassified)
- .append('\n')
- }
- returnString.append('\n')
-
- returnString.toString()
- }
-
-
- def getSmallLabel(i: Int): String = {
- var value: Int = i
- val returnString: StringBuilder = new StringBuilder
- do {
- val n: Int = value % 26
- returnString.insert(0, ('a' + n).asInstanceOf[Char])
- value /= 26
- } while (value > 0)
-
- returnString.toString()
- }
-
-
-}
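For reference, the kappa computation in getKappa above reduces to a few lines of plain Scala. This is a minimal sketch; the 2x2 counts are made-up illustration data, not from any Mahout test:

// rows = actual label, cols = classified label (illustrative counts)
val cm = Array(Array(45, 5), Array(10, 40))
val n  = cm.map(_.sum).sum                    // total samples = 100
val a  = cm.indices.map(i => cm(i)(i)).sum    // observed agreement = 85
val b  = cm.indices.map { i =>
  val rowTotal = cm(i).sum
  val colTotal = cm.indices.map(j => cm(j)(i)).sum
  rowTotal * colTotal                         // chance-agreement term per class
}.sum
val kappa = (n.toDouble * a - b) / (n.toDouble * n - b)
// (100*85 - 5000) / (10000 - 5000) = 0.70: agreement well above chance.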

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/common/io/GenericMatrixKryoSerializer.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/common/io/GenericMatrixKryoSerializer.scala b/math-scala/src/main/scala/org/apache/mahout/common/io/GenericMatrixKryoSerializer.scala
deleted file mode 100644
index 534d37c..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/common/io/GenericMatrixKryoSerializer.scala
+++ /dev/null
@@ -1,188 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.common.io
-
-import com.esotericsoftware.kryo.io.{Input, Output}
-import com.esotericsoftware.kryo.{Kryo, Serializer}
-import org.apache.log4j.Logger
-import org.apache.mahout.logging._
-import org.apache.mahout.math._
-import org.apache.mahout.math.flavor.TraversingStructureEnum
-import org.apache.mahout.math.scalabindings.RLikeOps._
-import org.apache.mahout.math.scalabindings._
-
-import scala.collection.JavaConversions._
-
-object GenericMatrixKryoSerializer {
-
- private implicit final val log = Logger.getLogger(classOf[GenericMatrixKryoSerializer])
-
-}
-
-/** Serializes Sparse or Dense in-core generic matrix (row-wise or column-wise backed) */
-class GenericMatrixKryoSerializer extends Serializer[Matrix] {
-
- import GenericMatrixKryoSerializer._
-
- override def write(kryo: Kryo, output: Output, mx: Matrix): Unit = {
-
- debug(s"Writing mx of type ${mx.getClass.getName}")
-
- val structure = mx.getFlavor.getStructure
-
- // Write structure bit
- output.writeInt(structure.ordinal(), true)
-
- // Write geometry
- output.writeInt(mx.nrow, true)
- output.writeInt(mx.ncol, true)
-
- // Write in most efficient traversal order (using backing vectors perhaps)
- structure match {
- case TraversingStructureEnum.COLWISE => writeRowWise(kryo, output, mx.t)
- case TraversingStructureEnum.SPARSECOLWISE => writeSparseRowWise(kryo, output, mx.t)
- case TraversingStructureEnum.SPARSEROWWISE => writeSparseRowWise(kryo, output, mx)
- case TraversingStructureEnum.VECTORBACKED => writeVectorBacked(kryo, output, mx)
- case _ => writeRowWise(kryo, output, mx)
- }
-
- }
-
- private def writeVectorBacked(kryo: Kryo, output: Output, mx: Matrix) {
-
- require(mx != null)
-
- // At this point we handle just a few vector-backed classes individually. TODO: create an
- // API to obtain vector-backed matrix data.
- kryo.writeClass(output, mx.getClass)
- mx match {
- case mxD: DiagonalMatrix => kryo.writeObject(output, mxD.diagv)
- case mxS: DenseSymmetricMatrix => kryo.writeObject(output, dvec(mxS.getData))
- case mxT: UpperTriangular => kryo.writeObject(output, dvec(mxT.getData))
- case _ => throw new IllegalArgumentException(s"Unsupported matrix type:${mx.getClass.getName}")
- }
- }
-
- private def readVectorBacked(kryo: Kryo, input: Input, nrow: Int, ncol: Int) = {
-
- // We require vector-backed matrices to have vector-parameterized constructor to construct.
- val clazz = kryo.readClass(input).getType
-
- debug(s"Deserializing vector-backed mx of type ${clazz.getName}.")
-
- clazz.getConstructor(classOf[Vector]).newInstance(kryo.readObject(input, classOf[Vector])).asInstanceOf[Matrix]
- }
-
- private def writeRowWise(kryo: Kryo, output: Output, mx: Matrix): Unit = {
- for (row <- mx) kryo.writeObject(output, row)
- }
-
- private def readRows(kryo: Kryo, input: Input, nrow: Int) = {
- Array.tabulate(nrow) { _ => kryo.readObject(input, classOf[Vector])}
- }
-
- private def readSparseRows(kryo: Kryo, input: Input) = {
-
- // Number of slices
- val nslices = input.readInt(true)
-
- Array.tabulate(nslices) { _ =>
- input.readInt(true) -> kryo.readObject(input, classOf[Vector])
- }
- }
-
- private def writeSparseRowWise(kryo: Kryo, output: Output, mx: Matrix): Unit = {
-
- val nslices = mx.numSlices()
-
- output.writeInt(nslices, true)
-
- var actualNSlices = 0
- for (row <- mx.iterateNonEmpty()) {
- output.writeInt(row.index(), true)
- kryo.writeObject(output, row.vector())
- actualNSlices += 1
- }
-
- require(nslices == actualNSlices, "Number of slices reported by Matrix.numSlices() was different from actual " +
- "slice iterator size.")
- }
-
- override def read(kryo: Kryo, input: Input, mxClass: Class[Matrix]): Matrix = {
-
- // Read structure hint
- val structure = TraversingStructureEnum.values()(input.readInt(true))
-
- // Read geometry
- val nrow = input.readInt(true)
- val ncol = input.readInt(true)
-
- debug(s"read matrix geometry: $nrow x $ncol.")
-
- structure match {
-
- // Sparse or dense column wise
- case TraversingStructureEnum.COLWISE =>
- val cols = readRows(kryo, input, ncol)
-
- if (!cols.isEmpty && cols.head.isDense)
- dense(cols).t
- else {
- debug("Deserializing as SparseRowMatrix.t (COLWISE).")
- new SparseRowMatrix(ncol, nrow, cols, true, false).t
- }
-
- // transposed SparseMatrix case
- case TraversingStructureEnum.SPARSECOLWISE =>
- val cols = readSparseRows(kryo, input)
- val javamap = new java.util.HashMap[Integer, Vector]((cols.size << 1) + 1)
- cols.foreach { case (idx, vec) => javamap.put(idx, vec)}
-
- debug("Deserializing as SparseMatrix.t (SPARSECOLWISE).")
- new SparseMatrix(ncol, nrow, javamap, true).t
-
- // Sparse Row-wise -- this will be created as a SparseMatrix.
- case TraversingStructureEnum.SPARSEROWWISE =>
- val rows = readSparseRows(kryo, input)
- val javamap = new java.util.HashMap[Integer, Vector]((rows.size << 1) + 1)
- rows.foreach { case (idx, vec) => javamap.put(idx, vec)}
-
- debug("Deserializing as SparseMatrix (SPARSEROWWISE).")
- new SparseMatrix(nrow, ncol, javamap, true)
- case TraversingStructureEnum.VECTORBACKED =>
-
- debug("Deserializing vector-backed...")
- readVectorBacked(kryo, input, nrow, ncol)
-
- // By default, read row-wise.
- case _ =>
- val cols = readRows(kryo, input, nrow)
- // this still copies a lot of stuff...
- if (!cols.isEmpty && cols.head.isDense) {
-
- debug("Deserializing as DenseMatrix.")
- dense(cols)
- } else {
-
- debug("Deserializing as SparseRowMatrix(default).")
- new SparseRowMatrix(nrow, ncol, cols, true, false)
- }
- }
-
- }
-}
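A round-trip sketch for the serializer above, assuming Kryo-2.x-era defaults (class registration not required) and an illustrative buffer size. Matrix rows are written as Vector objects, so the vector serializer from the next file must be registered as well:

import com.esotericsoftware.kryo.Kryo
import com.esotericsoftware.kryo.io.{Input, Output}
import org.apache.mahout.common.io.{GenericMatrixKryoSerializer, VectorKryoSerializer}
import org.apache.mahout.math.{DenseMatrix, Matrix, Vector}

val kryo = new Kryo()
// Rows are serialized as Vectors, so both default serializers are needed.
kryo.addDefaultSerializer(classOf[Vector], new VectorKryoSerializer())
kryo.addDefaultSerializer(classOf[Matrix], new GenericMatrixKryoSerializer())

val mx: Matrix = new DenseMatrix(2, 3)
mx.set(0, 1, 3.5)

val out = new Output(1 << 12, -1)        // small, growable buffer
kryo.writeClassAndObject(out, mx)        // structure hint + geometry + rows, per write() above
out.close()

val mx2 = kryo.readClassAndObject(new Input(out.toBytes)).asInstanceOf[Matrix]
assert(mx2.rowSize == 2 && mx2.columnSize == 3 && mx2.get(0, 1) == 3.5)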

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/common/io/VectorKryoSerializer.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/common/io/VectorKryoSerializer.scala b/math-scala/src/main/scala/org/apache/mahout/common/io/VectorKryoSerializer.scala
deleted file mode 100644
index 3cc537c..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/common/io/VectorKryoSerializer.scala
+++ /dev/null
@@ -1,248 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.common.io
-
-import com.esotericsoftware.kryo.io.{Input, Output}
-import com.esotericsoftware.kryo.{Kryo, Serializer}
-import org.apache.mahout.logging._
-import org.apache.mahout.math._
-import org.apache.mahout.math.scalabindings.RLikeOps._
-
-import scala.collection.JavaConversions._
-
-
-object VectorKryoSerializer {
-
- final val FLAG_DENSE: Int = 0x01
- final val FLAG_SEQUENTIAL: Int = 0x02
- final val FLAG_NAMED: Int = 0x04
- final val FLAG_LAX_PRECISION: Int = 0x08
-
- private final implicit val log = getLog(classOf[VectorKryoSerializer])
-
-}
-
-class VectorKryoSerializer(val laxPrecision: Boolean = false) extends Serializer[Vector] {
-
- import VectorKryoSerializer._
-
- override def write(kryo: Kryo, output: Output, vector: Vector): Unit = {
-
- require(vector != null)
-
- trace(s"Serializing vector of ${vector.getClass.getName} class.")
-
- // Write length
- val len = vector.length
- output.writeInt(len, true)
-
- // Interrogate vec properties
- val dense = vector.isDense
- val sequential = vector.isSequentialAccess
- val named = vector.isInstanceOf[NamedVector]
-
- var flag = 0
-
- if (dense) {
- flag |= FLAG_DENSE
- } else if (sequential) {
- flag |= FLAG_SEQUENTIAL
- }
-
- if (vector.isInstanceOf[NamedVector]) {
- flag |= FLAG_NAMED
- }
-
- if (laxPrecision) flag |= FLAG_LAX_PRECISION
-
- // Write flags
- output.writeByte(flag)
-
- // Write name if needed
- if (named) output.writeString(vector.asInstanceOf[NamedVector].getName)
-
- dense match {
-
- // Dense vector.
- case true =>
-
- laxPrecision match {
- case true =>
- for (i <- 0 until vector.length) output.writeFloat(vector(i).toFloat)
- case _ =>
- for (i <- 0 until vector.length) output.writeDouble(vector(i))
- }
- case _ =>
-
- // getNumNonZeroElements must check every element to confirm it is indeed non-zero, while
- // iterateNonZeros() does not, so the two can disagree. To be safe, we filter out zero
- // values ourselves and terminate the stream explicitly.
- val iter = vector.nonZeroes.toIterator.filter(_.get() != 0.0)
-
- sequential match {
-
- // Delta encoding
- case true =>
-
- var idx = 0
- laxPrecision match {
- case true =>
- while (iter.hasNext) {
- val el = iter.next()
- output.writeFloat(el.get().toFloat)
- output.writeInt(el.index() - idx, true)
- idx = el.index
- }
- // Terminate delta encoding.
- output.writeFloat(0.0.toFloat)
- case _ =>
- while (iter.hasNext) {
- val el = iter.next()
- output.writeDouble(el.get())
- output.writeInt(el.index() - idx, true)
- idx = el.index
- }
- // Terminate delta encoding.
- output.writeDouble(0.0)
- }
-
- // Random access.
- case _ =>
-
- laxPrecision match {
-
- case true =>
- iter.foreach { el =>
- output.writeFloat(el.get().toFloat)
- output.writeInt(el.index(), true)
- }
- // Terminate random access with 0.0 value.
- output.writeFloat(0.0.toFloat)
- case _ =>
- iter.foreach { el =>
- output.writeDouble(el.get())
- output.writeInt(el.index(), true)
- }
- // Terminate random access with 0.0 value.
- output.writeDouble(0.0)
- }
-
- }
-
- }
- }
-
- override def read(kryo: Kryo, input: Input, vecClass: Class[Vector]): Vector = {
-
- val len = input.readInt(true)
- val flags = input.readByte().toInt
- val name = if ((flags & FLAG_NAMED) != 0) Some(input.readString()) else None
-
- val vec: Vector = flags match {
-
- // Dense
- case _: Int if (flags & FLAG_DENSE) != 0 =>
-
- trace(s"Deserializing dense vector.")
-
- if ((flags & FLAG_LAX_PRECISION) != 0) {
- new DenseVector(len) := { _ => input.readFloat()}
- } else {
- new DenseVector(len) := { _ => input.readDouble()}
- }
-
- // Sparse case.
- case _ =>
-
- flags match {
-
- // Sequential.
- case _: Int if (flags & FLAG_SEQUENTIAL) != 0 =>
-
- trace("Deserializing as sequential sparse vector.")
-
- val v = new SequentialAccessSparseVector(len)
- var idx = 0
- var stop = false
-
- if ((flags & FLAG_LAX_PRECISION) != 0) {
-
- while (!stop) {
- val value = input.readFloat()
- if (value == 0.0) {
- stop = true
- } else {
- idx += input.readInt(true)
- v(idx) = value
- }
- }
- } else {
- while (!stop) {
- val value = input.readDouble()
- if (value == 0.0) {
- stop = true
- } else {
- idx += input.readInt(true)
- v(idx) = value
- }
- }
- }
- v
-
- // Random access
- case _ =>
-
- trace("Deserializing as random access vector.")
-
- // Read pairs until we see 0.0 value. Prone to corruption attacks obviously.
- val v = new RandomAccessSparseVector(len)
- var stop = false
- if ((flags & FLAG_LAX_PRECISION) != 0) {
- while (! stop ) {
- val value = input.readFloat()
- if ( value == 0.0 ) {
- stop = true
- } else {
- val idx = input.readInt(true)
- v(idx) = value
- }
- }
- } else {
- while (! stop ) {
- val value = input.readDouble()
- if (value == 0.0) {
- stop = true
- } else {
- val idx = input.readInt(true)
- v(idx) = value
- }
- }
- }
- v
- }
- }
-
- name.map{name =>
-
- trace(s"Recovering named vector's name $name.")
-
- new NamedVector(vec, name)
- }
- .getOrElse(vec)
- }
-}
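A matching sketch for the vector serializer: the flag byte (FLAG_DENSE / FLAG_SEQUENTIAL / FLAG_NAMED / FLAG_LAX_PRECISION) selects the decode path, and sparse vectors carry only their non-zero entries. Vector sizes and values below are illustrative:

import com.esotericsoftware.kryo.Kryo
import com.esotericsoftware.kryo.io.{Input, Output}
import org.apache.mahout.common.io.VectorKryoSerializer
import org.apache.mahout.math.{DenseVector, SequentialAccessSparseVector, Vector}

val kryo = new Kryo()
kryo.addDefaultSerializer(classOf[Vector], new VectorKryoSerializer(laxPrecision = false))

val dv: Vector = new DenseVector(Array(1.0, 0.0, 3.0))    // takes the FLAG_DENSE path
val sv: Vector = new SequentialAccessSparseVector(1000)   // takes the FLAG_SEQUENTIAL path
sv.setQuick(7, 2.5)                                       // only non-zeros are written

val out = new Output(1 << 12, -1)
kryo.writeClassAndObject(out, dv)
kryo.writeClassAndObject(out, sv)
out.close()

val in = new Input(out.toBytes)
val dv2 = kryo.readClassAndObject(in).asInstanceOf[Vector]
val sv2 = kryo.readClassAndObject(in).asInstanceOf[Vector]
assert(dv2.get(2) == 3.0 && sv2.get(7) == 2.5)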

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/drivers/MahoutDriver.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/drivers/MahoutDriver.scala b/math-scala/src/main/scala/org/apache/mahout/drivers/MahoutDriver.scala
deleted file mode 100644
index 32515f1..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/drivers/MahoutDriver.scala
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.drivers
-
-import org.apache.mahout.math.drm.DistributedContext
-
-/** Extended by a platform specific version of this class to create a Mahout CLI driver. */
-abstract class MahoutDriver {
-
- implicit protected var mc: DistributedContext = _
- implicit protected var parser: MahoutOptionParser = _
-
- var _useExistingContext: Boolean = false // used in the test suite to reuse one context per suite
-
- /** Must be overridden to set up the DistributedContext mc */
- protected def start() : Unit
-
- /** Override (optionally) for special cleanup */
- protected def stop(): Unit = {
- if (!_useExistingContext) mc.close
- }
-
- /** This is where you do the work: call start() first, then call stop() before exiting */
- protected def process(): Unit
-
- /** Parse command line and call process */
- def main(args: Array[String]): Unit
-
-}
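A hypothetical minimal subclass showing the intended lifecycle contract (start, then process, then stop). There is no real engine behind this sketch, so start() only marks the context as externally managed; an engine-specific driver would create the DistributedContext there instead:

import org.apache.mahout.drivers.MahoutDriver

object NoopDriver extends MahoutDriver {

  override protected def start(): Unit = {
    // A Spark (or other engine) subclass would assign mc here.
    _useExistingContext = true   // keeps stop() from closing a context we never made
  }

  override protected def process(): Unit = {
    start()
    // ... the actual work against mc would go here ...
    stop()
  }

  override def main(args: Array[String]): Unit = process()
}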

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/drivers/MahoutOptionParser.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/drivers/MahoutOptionParser.scala b/math-scala/src/main/scala/org/apache/mahout/drivers/MahoutOptionParser.scala
deleted file mode 100644
index d3723a2..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/drivers/MahoutOptionParser.scala
+++ /dev/null
@@ -1,220 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.mahout.drivers
-
-import scopt.OptionParser
-
-import scala.collection.immutable
-
-/**
- * Defines oft-repeated options and their parsing. Provides the option groups and parsing helper methods to
- * keep both standardized.
- * @param programName Name displayed in the help message; the name by which the driver is invoked.
- * @note Options are engine neutral by convention. See the engine-specific extending class for how
- *       to add Spark or other engine options.
- */
-class MahoutOptionParser(programName: String) extends OptionParser[Map[String, Any]](programName: String) {
-
- // build options from some standard CLI param groups
- // Note: always put the driver-specific options last so they can override any previous options!
- var opts = Map.empty[String, Any]
-
- override def showUsageOnError = true
-
- def parseIOOptions(numInputs: Int = 1) = {
- opts = opts ++ MahoutOptionParser.FileIOOptions
- note("Input, output options")
- opt[String]('i', "input") required() action { (x, options) =>
- options + ("input" -> x)
- } text ("Input path, may be a filename, directory name, or comma delimited list of HDFS supported URIs" +
- " (required)")
-
- if (numInputs == 2) {
- opt[String]("input2") abbr ("i2") action { (x, options) =>
- options + ("input2" -> x)
- } text ("Secondary input path for cross-similarity calculation, same restrictions as \"--input\" " +
- "(optional). Default: empty.")
- }
-
- opt[String]('o', "output") required() action { (x, options) =>
- if (x.endsWith("/")) {
- options + ("output" -> x)
- } else {
- options + ("output" -> (x + "/"))
- }
- } text ("Path for output directory, any HDFS supported URI (required)")
-
- }
-
- def parseGenericOptions() = {
- opts = opts ++ MahoutOptionParser.GenericOptions
- opt[Int]("randomSeed") abbr ("rs") action { (x, options) =>
- options + ("randomSeed" -> x)
- } validate { x =>
- if (x > 0) success else failure("Option --randomSeed must be > 0")
- }
-
- //output both input IndexedDatasets
- opt[Unit]("writeAllDatasets") hidden() action { (_, options) =>
- options + ("writeAllDatasets" -> true)
- }//Hidden option, though a user might want this.
- }
-
- def parseElementInputSchemaOptions() = {
- //Input text file schema--not driver specific but input data specific, elements input,
- // not rows of IndexedDatasets
- opts = opts ++ MahoutOptionParser.TextDelimitedElementsOptions
- note("\nInput text file schema options:")
- opt[String]("inDelim") abbr ("id") text ("Input delimiter character (optional). Default: \"[ ,\\t]\"") action {
- (x, options) =>
- options + ("inDelim" -> x)
- }
-
- opt[String]("filter1") abbr ("f1") action { (x, options) =>
- options + ("filter1" -> x)
- } text ("String (or regex) whose presence indicates a datum for the primary item set (optional). " +
- "Default: no filter, all data is used")
-
- opt[String]("filter2") abbr ("f2") action { (x, options) =>
- options + ("filter2" -> x)
- } text ("String (or regex) whose presence indicates a datum for the secondary item set (optional). " +
- "If not present no secondary dataset is collected")
-
- opt[Int]("rowIDColumn") abbr ("rc") action { (x, options) =>
- options + ("rowIDColumn" -> x)
- } text ("Column number (0 based Int) containing the row ID string (optional). Default: 0") validate {
- x =>
- if (x >= 0) success else failure("Option --rowIDColNum must be >= 0")
- }
-
- opt[Int]("itemIDColumn") abbr ("ic") action { (x, options) =>
- options + ("itemIDColumn" -> x)
- } text ("Column number (0 based Int) containing the item ID string (optional). Default: 1") validate {
- x =>
- if (x >= 0) success else failure("Option --itemIDColNum must be >= 0")
- }
-
- opt[Int]("filterColumn") abbr ("fc") action { (x, options) =>
- options + ("filterColumn" -> x)
- } text ("Column number (0 based Int) containing the filter string (optional). Default: -1 for no " +
- "filter") validate { x =>
- if (x >= -1) success else failure("Option --filterColumn must be >= -1")
- }
-
- note("\nUsing all defaults the input is expected of the form: \"userID<tab>itemId\" or" +
- " \"userID<tab>itemID<tab>any-text...\" and all rows will be used")
-
- //check for column consistency
- checkConfig { options: Map[String, Any] =>
- if (options("filterColumn").asInstanceOf[Int] == options("itemIDColumn").asInstanceOf[Int]
- || options("filterColumn").asInstanceOf[Int] == options("rowIDColumn").asInstanceOf[Int]
- || options("rowIDColumn").asInstanceOf[Int] == options("itemIDColumn").asInstanceOf[Int])
- failure("The row, item, and filter positions must be unique.") else success
- }
-
- //check for filter consistency
- checkConfig { options: Map[String, Any] =>
- if (options("filter1").asInstanceOf[String] != null.asInstanceOf[String]
- && options("filter2").asInstanceOf[String] != null.asInstanceOf[String]
- && options("filter1").asInstanceOf[String] == options("filter2").asInstanceOf[String])
- failure ("If using filters they must be unique.") else success
- }
-
- }
-
- def parseFileDiscoveryOptions() = {
- //File finding strategy--not driver specific
- opts = opts ++ MahoutOptionParser.FileDiscoveryOptions
- note("\nFile discovery options:")
- opt[Unit]('r', "recursive") action { (_, options) =>
- options + ("recursive" -> true)
- } text ("Searched the -i path recursively for files that match --filenamePattern (optional), Default: false")
-
- opt[String]("filenamePattern") abbr ("fp") action { (x, options) =>
- options + ("filenamePattern" -> x)
- } text ("Regex to match in determining input files (optional). Default: filename in the --input option " +
- "or \"^part-.*\" if --input is a directory")
-
- }
-
- def parseIndexedDatasetFormatOptions(notice: String = "\nOutput text file schema options:") = {
- opts = opts ++ MahoutOptionParser.TextDelimitedIndexedDatasetOptions
- note(notice)
- opt[String]("rowKeyDelim") abbr ("rd") action { (x, options) =>
- options + ("rowKeyDelim" -> x)
- } text ("Separates the rowID key from the vector values list (optional). Default: \"\\t\"")
-
- opt[String]("columnIdStrengthDelim") abbr ("cd") action { (x, options) =>
- options + ("columnIdStrengthDelim" -> x)
- } text ("Separates column IDs from their values in the vector values list (optional). Default: \":\"")
-
- opt[String]("elementDelim") abbr ("td") action { (x, options) =>
- options + ("elementDelim" -> x)
- } text ("Separates vector element values in the values list (optional). Default: \" \"")
-
- opt[Unit]("omitStrength") abbr ("os") action { (_, options) =>
- options + ("omitStrength" -> true)
- } text ("Do not write the strength to the output files (optional), Default: false.")
- note("This option is used to output indexable data for creating a search engine recommender.")
-
- note("\nDefault delimiters will produce output of the form: " +
- "\"itemID1<tab>itemID2:value2<space>itemID10:value10...\"")
- }
-
-}
-
-/**
- * Companion object defines default option groups for reference in any driver that needs them.
- * @note not all options are platform neutral so other platforms can add default options here if desired
- */
-object MahoutOptionParser {
-
- // set up the various default option groups
- final val GenericOptions = immutable.HashMap[String, Any](
- "randomSeed" -> System.currentTimeMillis().toInt,
- "writeAllDatasets" -> false)
-
- final val SparkOptions = immutable.HashMap[String, Any](
- "master" -> "local",
- "sparkExecutorMem" -> "",
- "appName" -> "Generic Spark App, Change this.")
-
- final val FileIOOptions = immutable.HashMap[String, Any](
- "input" -> null.asInstanceOf[String],
- "input2" -> null.asInstanceOf[String],
- "output" -> null.asInstanceOf[String])
-
- final val FileDiscoveryOptions = immutable.HashMap[String, Any](
- "recursive" -> false,
- "filenamePattern" -> "^part-.*")
-
- final val TextDelimitedElementsOptions = immutable.HashMap[String, Any](
- "rowIDColumn" -> 0,
- "itemIDColumn" -> 1,
- "filterColumn" -> -1,
- "filter1" -> null.asInstanceOf[String],
- "filter2" -> null.asInstanceOf[String],
- "inDelim" -> "[,\t ]")
-
- final val TextDelimitedIndexedDatasetOptions = immutable.HashMap[String, Any](
- "rowKeyDelim" -> "\t",
- "columnIdStrengthDelim" -> ":",
- "elementDelim" -> " ",
- "omitStrength" -> false)
-}
-
-
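A sketch of the intended wiring in a driver's main(); the program name, object name, and the println body are placeholders standing in for real work:

import org.apache.mahout.drivers.MahoutOptionParser

object MyAppCli {
  def main(args: Array[String]): Unit = {
    val parser = new MahoutOptionParser(programName = "myapp") {
      parseIOOptions()        // adds --input/--output and seeds opts with FileIOOptions
      parseGenericOptions()   // adds --randomSeed and the hidden --writeAllDatasets
    }
    parser.parse(args, parser.opts) match {
      case Some(opts) => println(s"reading ${opts("input")}, writing ${opts("output")}")
      case None       => sys.exit(1)   // scopt has already printed usage on error
    }
  }
}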

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/logging/package.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/logging/package.scala b/math-scala/src/main/scala/org/apache/mahout/logging/package.scala
deleted file mode 100644
index 15aa909..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/logging/package.scala
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout
-
-import org.apache.log4j.{Level, Priority, Logger}
-
-package object logging {
-
- /** Compute `expr` if debug is on, only */
- def debugDo[T](expr: => T)(implicit log: Logger): Option[T] = {
- if (log.isDebugEnabled) Some(expr)
- else None
- }
-
- /** Compute `expr` if trace is on, only */
- def traceDo[T](expr: => T)(implicit log: Logger): Option[T] = {
- if (log.isTraceEnabled) Some(expr) else None
- }
-
- /** Shorter, and lazy, versions of logging methods. Just declare log implicit. */
- def debug(msg: => AnyRef)(implicit log: Logger) { if (log.isDebugEnabled) log.debug(msg) }
-
- def debug(msg: => AnyRef, t: Throwable)(implicit log: Logger) { if (log.isDebugEnabled()) log.debug(msg, t) }
-
- /** Shorter, and lazy, versions of logging methods. Just declare log implicit. */
- def trace(msg: => AnyRef)(implicit log: Logger) { if (log.isTraceEnabled) log.trace(msg) }
-
- def trace(msg: => AnyRef, t: Throwable)(implicit log: Logger) { if (log.isTraceEnabled()) log.trace(msg, t) }
-
- def info(msg: => AnyRef)(implicit log: Logger) { if (log.isInfoEnabled) log.info(msg)}
-
- def info(msg: => AnyRef, t:Throwable)(implicit log: Logger) { if (log.isInfoEnabled) log.info(msg,t)}
-
- def warn(msg: => AnyRef)(implicit log: Logger) { if (log.isEnabledFor(Level.WARN)) log.warn(msg) }
-
- def warn(msg: => AnyRef, t: Throwable)(implicit log: Logger) { if (log.isEnabledFor(Level.WARN)) log.warn(msg, t) }
-
- def error(msg: => AnyRef)(implicit log: Logger) { if (log.isEnabledFor(Level.ERROR)) log.error(msg) }
-
- def error(msg: => AnyRef, t: Throwable)(implicit log: Logger) { if (log.isEnabledFor(Level.ERROR)) log.error(msg, t) }
-
- def fatal(msg: => AnyRef)(implicit log: Logger) { if (log.isEnabledFor(Level.FATAL)) log.fatal(msg) }
-
- def fatal(msg: => AnyRef, t: Throwable)(implicit log: Logger) { if (log.isEnabledFor(Level.FATAL)) log.fatal(msg, t) }
-
- def getLog(name: String): Logger = Logger.getLogger(name)
-
- def getLog(clazz: Class[_]): Logger = Logger.getLogger(clazz)
-
- def mahoutLog :Logger = getLog("org.apache.mahout")
-
- def setLogLevel(l:Level)(implicit log:Logger) = {
- log.setLevel(l)
- }
-
- def setAdditivity(a:Boolean)(implicit log:Logger) = log.setAdditivity(a)
-
-}
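A usage sketch: declare one implicit Logger and the by-name message arguments are only evaluated when the level is enabled. The logger name is illustrative:

import org.apache.log4j.{Level, Logger}
import org.apache.mahout.logging._

implicit val log: Logger = getLog("org.apache.mahout.example")   // illustrative name

setLogLevel(Level.INFO)

// By-name parameter: the string (and the sum inside it) is never built at INFO level.
debug(s"expensive diagnostic: ${(1 to 1000000).sum}")
info("this one is emitted")

// debugDo evaluates its block only when debug is enabled, returning Option[T].
val maybeTiming = debugDo { System.nanoTime() }                  // None at INFO level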

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/math/algorithms/Fitter.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/Fitter.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/Fitter.scala
deleted file mode 100644
index 244cefc..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/Fitter.scala
+++ /dev/null
@@ -1,27 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.mahout.math.algorithms
-
-trait Fitter {
-
- // All models must have a fit method, though signatures vary.
- // Leaving this as a placeholder in case we decide there are some things all Models must have in common.
-
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/email/MailProcessor.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/email/MailProcessor.java b/integration/src/main/java/org/apache/mahout/utils/email/MailProcessor.java
deleted file mode 100644
index 7db836f..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/email/MailProcessor.java
+++ /dev/null
@@ -1,183 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.email;
-
-import org.apache.mahout.common.iterator.FileLineIterable;
-import org.apache.mahout.utils.io.ChunkedWriter;
-import org.apache.mahout.utils.io.ChunkedWrapper;
-import org.apache.mahout.utils.io.IOWriterWrapper;
-import org.apache.mahout.utils.io.WrappedWriter;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.io.Writer;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-/**
- * Converts an mbox mail archive into a group of Hadoop Sequence Files of roughly equal size. The archive may
- * optionally be gzipped or zipped. @see org.apache.mahout.text.SequenceFilesFromMailArchives
- */
-public class MailProcessor {
-
- private static final Pattern MESSAGE_START = Pattern.compile("^From \\S+@\\S.*\\d{4}$", Pattern.CASE_INSENSITIVE);
- private static final Pattern MESSAGE_ID_PREFIX = Pattern.compile("^message-id: <(.*)>$", Pattern.CASE_INSENSITIVE);
- // regular expressions used to parse individual messages
- public static final Pattern SUBJECT_PREFIX = Pattern.compile("^subject: (.*)$", Pattern.CASE_INSENSITIVE);
- //we need to have at least one character
- public static final Pattern FROM_PREFIX = Pattern.compile("^from: (\\S.*)$", Pattern.CASE_INSENSITIVE);
- public static final Pattern REFS_PREFIX = Pattern.compile("^references: (.*)$", Pattern.CASE_INSENSITIVE);
- public static final Pattern TO_PREFIX = Pattern.compile("^to: (.*)$", Pattern.CASE_INSENSITIVE);
-
- private final String prefix;
- private final MailOptions options;
- private final WrappedWriter writer;
-
- private static final Logger log = LoggerFactory.getLogger(MailProcessor.class);
-
- /**
- * Creates a {@code MailProcessor} that does not write to sequence files, but to a single text file.
- * This constructor is for debugging and testing purposes.
- */
- public MailProcessor(MailOptions options, String prefix, Writer writer) {
- this.writer = new IOWriterWrapper(writer);
- this.options = options;
- this.prefix = prefix;
- }
-
- /**
- * This is the main constructor of {@code MailProcessor}.
- */
- public MailProcessor(MailOptions options, String prefix, ChunkedWriter writer) {
- this.writer = new ChunkedWrapper(writer);
- this.options = options;
- this.prefix = prefix;
- }
-
- /**
- * Parses one complete mail archive, writing output to the {@code writer} constructor parameter.
- * @param mboxFile mail archive to parse
- * @return number of parsed mails
- * @throws IOException
- */
- public long parseMboxLineByLine(File mboxFile) throws IOException {
- long messageCount = 0;
- try {
- StringBuilder contents = new StringBuilder();
- // tmps used during mail message parsing
- StringBuilder body = new StringBuilder();
- Matcher messageIdMatcher = MESSAGE_ID_PREFIX.matcher("");
- Matcher messageBoundaryMatcher = MESSAGE_START.matcher("");
- String[] patternResults = new String[options.getPatternsToMatch().length];
- Matcher[] matchers = new Matcher[options.getPatternsToMatch().length];
- for (int i = 0; i < matchers.length; i++) {
- matchers[i] = options.getPatternsToMatch()[i].matcher("");
- }
-
- String messageId = null;
- boolean inBody = false;
- Pattern quotedTextPattern = options.getQuotedTextPattern();
- for (String nextLine : new FileLineIterable(mboxFile, options.getCharset(), false)) {
- if (options.isStripQuotedText() && quotedTextPattern.matcher(nextLine).find()) {
- continue;
- }
- for (int i = 0; i < matchers.length; i++) {
- Matcher matcher = matchers[i];
- matcher.reset(nextLine);
- if (matcher.matches()) {
- patternResults[i] = matcher.group(1);
- }
- }
-
- // only start appending body content after we've seen a message ID
- if (messageId != null) {
- // first, see if we hit the end of the message
- messageBoundaryMatcher.reset(nextLine);
- if (messageBoundaryMatcher.matches()) {
- // done parsing this message ... write it out
- String key = generateKey(mboxFile, prefix, messageId);
- //if this ordering changes, then also change FromEmailToDictionaryMapper
- writeContent(options.getSeparator(), contents, body, patternResults);
- writer.write(key, contents.toString());
- contents.setLength(0); // reset the buffer
- body.setLength(0);
-
- messageId = null;
- inBody = false;
- } else {
- if (inBody && options.isIncludeBody()) {
- if (!nextLine.isEmpty()) {
- body.append(nextLine).append(options.getBodySeparator());
- }
- } else {
- // first empty line we see after reading the message Id
- // indicates that we are in the body ...
- inBody = nextLine.isEmpty();
- }
- }
- } else {
- if (nextLine.length() > 14) {
- messageIdMatcher.reset(nextLine);
- if (messageIdMatcher.matches()) {
- messageId = messageIdMatcher.group(1);
- ++messageCount;
- }
- }
- }
- }
- // write the last message in the file if available
- if (messageId != null) {
- String key = generateKey(mboxFile, prefix, messageId);
- writeContent(options.getSeparator(), contents, body, patternResults);
- writer.write(key, contents.toString());
- contents.setLength(0); // reset the buffer
- }
- } catch (FileNotFoundException e) {
- // Skip file.
- log.warn("Unable to process non-existing file", e);
- }
- // TODO: report exceptions and continue;
- return messageCount;
- }
-
- protected static String generateKey(File mboxFile, String prefix, String messageId) {
- return prefix + File.separator + mboxFile.getName() + File.separator + messageId;
- }
-
- public String getPrefix() {
- return prefix;
- }
-
- public MailOptions getOptions() {
- return options;
- }
-
- private static void writeContent(String separator, StringBuilder contents, CharSequence body, String[] matches) {
- for (String match : matches) {
- if (match != null) {
- contents.append(match).append(separator);
- } else {
- contents.append(separator);
- }
- }
- contents.append('\n').append(body);
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/io/ChunkedWrapper.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/io/ChunkedWrapper.java b/integration/src/main/java/org/apache/mahout/utils/io/ChunkedWrapper.java
deleted file mode 100644
index 473e86a..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/io/ChunkedWrapper.java
+++ /dev/null
@@ -1,42 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.io;
-
-import java.io.IOException;
-
-/**
- * {@link ChunkedWriter} based implementation of the {@link WrappedWriter} interface.
- */
-public class ChunkedWrapper implements WrappedWriter {
-
- private final ChunkedWriter writer;
-
- public ChunkedWrapper(ChunkedWriter writer) {
- this.writer = writer;
- }
-
- @Override
- public void write(String key, String value) throws IOException {
- writer.write(key, value);
- }
-
- @Override
- public void close() throws IOException {
- writer.close();
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/io/ChunkedWriter.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/io/ChunkedWriter.java b/integration/src/main/java/org/apache/mahout/utils/io/ChunkedWriter.java
deleted file mode 100644
index 66cf15f..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/io/ChunkedWriter.java
+++ /dev/null
@@ -1,86 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.mahout.utils.io;
-
-import com.google.common.io.Closeables;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.SequenceFile;
-import org.apache.hadoop.io.Text;
-
-import java.io.Closeable;
-import java.io.IOException;
-
-/**
- * Writes data split into multiple Hadoop sequence files of approximately equal size. The data must consist
- * of key-value pairs, both of String type. All sequence files are created in the same
- * directory and named "chunk-0", "chunk-1", etc.
- */
-public final class ChunkedWriter implements Closeable {
-
- private final int maxChunkSizeInBytes;
- private final Path output;
- private SequenceFile.Writer writer;
- private int currentChunkID;
- private int currentChunkSize;
- private final FileSystem fs;
- private final Configuration conf;
-
- /**
- * @param conf needed by Hadoop to know what filesystem implementation to use.
- * @param chunkSizeInMB approximate size of each file, in Megabytes.
- * @param output directory where the sequence files will be created.
- * @throws IOException
- */
- public ChunkedWriter(Configuration conf, int chunkSizeInMB, Path output) throws IOException {
- this.output = output;
- this.conf = conf;
- if (chunkSizeInMB > 1984) {
- chunkSizeInMB = 1984;
- }
- maxChunkSizeInBytes = chunkSizeInMB * 1024 * 1024;
- fs = FileSystem.get(output.toUri(), conf);
- currentChunkID = 0;
- writer = new SequenceFile.Writer(fs, conf, getPath(currentChunkID), Text.class, Text.class);
- }
-
- private Path getPath(int chunkID) {
- return new Path(output, "chunk-" + chunkID);
- }
-
- /** Writes a new key-value pair, creating a new sequence file if necessary.*/
- public void write(String key, String value) throws IOException {
- if (currentChunkSize > maxChunkSizeInBytes) {
- Closeables.close(writer, false);
- currentChunkID++;
- writer = new SequenceFile.Writer(fs, conf, getPath(currentChunkID), Text.class, Text.class);
- currentChunkSize = 0;
- }
-
- Text keyT = new Text(key);
- Text valueT = new Text(value);
- currentChunkSize += keyT.getBytes().length + valueT.getBytes().length; // approximate; ignores record overhead
- writer.append(keyT, valueT);
- }
-
- @Override
- public void close() throws IOException {
- Closeables.close(writer, false);
- }
-}
-
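A usage sketch grounded in the constructor and write() shown above; the path, chunk size, and record count are arbitrary. Pairs are appended to chunk-0, chunk-1, ... with a new file started once the size cap is passed:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.mahout.utils.io.ChunkedWriter

val writer = new ChunkedWriter(new Configuration(), 64, new Path("/tmp/chunks"))
try {
  for (i <- 0 until 100000) {
    writer.write(s"doc-$i", s"body of document $i\n")
  }
} finally {
  writer.close()   // closes the current chunk's SequenceFile.Writer
}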

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/io/IOWriterWrapper.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/io/IOWriterWrapper.java b/integration/src/main/java/org/apache/mahout/utils/io/IOWriterWrapper.java
deleted file mode 100644
index b7c3d42..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/io/IOWriterWrapper.java
+++ /dev/null
@@ -1,45 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.io;
-
-import java.io.IOException;
-import java.io.Writer;
-/**
- * Implementation of the {@link WrappedWriter} interface based on {@link java.io.Writer}.
- */
-public class IOWriterWrapper implements WrappedWriter {
-
- private final Writer writer;
-
- public IOWriterWrapper(Writer writer) {
- this.writer = writer;
- }
-
- /** Writes a new key and value, separating them with one space. The value must end with a
- * new line or some other delimiter, as it is not automatically added by this method
- */
- @Override
- public void write(String key, String value) throws IOException {
- writer.write(key + ' ' + value);
- }
-
- @Override
- public void close() throws IOException {
- writer.close();
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/io/WrappedWriter.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/io/WrappedWriter.java b/integration/src/main/java/org/apache/mahout/utils/io/WrappedWriter.java
deleted file mode 100644
index b9900e9..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/io/WrappedWriter.java
+++ /dev/null
@@ -1,31 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.io;
-
-import java.io.Closeable;
-import java.io.IOException;
-
-/**
- * Convenience interface for wrapping either a java.io.Writer or a SequenceFile.Writer with some basic functionality.
- */
-public interface WrappedWriter extends Closeable {
-
- /** Writes a new key-value pair.*/
- void write(String key, String value) throws IOException;
-
-}
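The point of the abstraction, as a sketch: the same producer code can target a flat text file (debugging) or sequence-file chunks (production) by swapping the WrappedWriter implementation. File names here are illustrative:

import java.io.FileWriter
import org.apache.mahout.utils.io.{IOWriterWrapper, WrappedWriter}

def dump(pairs: Iterator[(String, String)], out: WrappedWriter): Unit = {
  try pairs.foreach { case (k, v) => out.write(k, v) }
  finally out.close()
}

// Debug path: one flat text file. IOWriterWrapper adds no newline, so the
// value must carry its own terminator.
dump(Iterator("id1" -> "hello\n", "id2" -> "world\n"),
     new IOWriterWrapper(new FileWriter("/tmp/mail.txt")))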

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilter.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilter.java b/integration/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilter.java
deleted file mode 100644
index 964c8cc..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilter.java
+++ /dev/null
@@ -1,78 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.mahout.utils.nlp.collocations.llr;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.nio.CharBuffer;
-import java.nio.charset.CharsetEncoder;
-import java.nio.charset.CodingErrorAction;
-
-import org.apache.commons.io.Charsets;
-import org.apache.hadoop.util.bloom.Filter;
-import org.apache.hadoop.util.bloom.Key;
-import org.apache.lucene.analysis.TokenFilter;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-
-/**
- * Emits tokens based on bloom filter membership.
- */
-public final class BloomTokenFilter extends TokenFilter {
-
- private final Filter filter;
- private final CharTermAttribute termAtt;
- private final CharsetEncoder encoder;
- private final Key key;
- private final boolean keepMembers;
-
- /**
- * @param filter tokens will be checked for membership in this bloom filter
- * @param in the tokenstream to read.
- * @param keepMembers keep members of the bloom filter? If true, works like
- * a whitelist: members found in the filter are kept and all others are
- * dropped. If false, works like a stoplist: members found in the
- * filter are dropped and all others are kept.
- */
- public BloomTokenFilter(Filter filter, boolean keepMembers, TokenStream in) {
- super(in);
- this.filter = filter;
- this.keepMembers = keepMembers;
- this.key = new Key();
- this.termAtt = addAttribute(CharTermAttribute.class);
- this.encoder = Charsets.UTF_8.newEncoder().
- onMalformedInput(CodingErrorAction.REPORT).
- onUnmappableCharacter(CodingErrorAction.REPORT);
- }
-
- @Override
- public boolean incrementToken() throws IOException {
- while (input.incrementToken()) {
- ByteBuffer bytes = encoder.encode(CharBuffer.wrap(termAtt.buffer(), 0, termAtt.length()));
- key.set(bytes.array(), 1.0f);
- boolean member = filter.membershipTest(key);
- if ((keepMembers && member) || (!keepMembers && !member)) {
- return true;
- }
- }
- return false;
- }
-
-}
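A whitelist-mode sketch; the Hadoop BloomFilter parameters (vector size, hash count, hash type) are illustrative rather than tuned, and the Lucene imports assume the 5.x package layout in use around this era:

import org.apache.hadoop.util.bloom.{BloomFilter, Key}
import org.apache.hadoop.util.hash.Hash
import org.apache.lucene.analysis.core.WhitespaceAnalyzer
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute
import org.apache.mahout.utils.nlp.collocations.llr.BloomTokenFilter

val bloom = new BloomFilter(100000, 5, Hash.MURMUR_HASH)
Seq("mahout", "hadoop").foreach(w => bloom.add(new Key(w.getBytes("UTF-8"))))

val analyzer = new WhitespaceAnalyzer()
val ts = new BloomTokenFilter(bloom, true,
  analyzer.tokenStream("text", "apache mahout on hadoop"))
val term = ts.addAttribute(classOf[CharTermAttribute])
ts.reset()
while (ts.incrementToken()) println(term.toString)  // keepMembers=true: prints mahout, hadoop
ts.end(); ts.close()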

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java b/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java
deleted file mode 100644
index 4585a0a..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.regex;
-
-import java.io.IOException;
-import java.io.StringReader;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.mahout.common.lucene.TokenStreamIterator;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-public class AnalyzerTransformer implements RegexTransformer {
-
- private Analyzer analyzer;
- private String fieldName = "text";
-
- private static final Logger log = LoggerFactory.getLogger(AnalyzerTransformer.class);
-
- public AnalyzerTransformer() {
- this(new StandardAnalyzer(), "text");
- }
-
- public AnalyzerTransformer(Analyzer analyzer) {
- this(analyzer, "text");
- }
-
- public AnalyzerTransformer(Analyzer analyzer, String fieldName) {
- this.analyzer = analyzer;
- this.fieldName = fieldName;
- }
-
- @Override
- public String transformMatch(String match) {
- StringBuilder result = new StringBuilder();
- try (TokenStream ts = analyzer.tokenStream(fieldName, new StringReader(match))) {
- ts.addAttribute(CharTermAttribute.class);
- ts.reset();
- TokenStreamIterator iter = new TokenStreamIterator(ts);
- while (iter.hasNext()) {
- result.append(iter.next()).append(' ');
- }
- ts.end();
- } catch (IOException e) {
- throw new IllegalStateException(e);
- }
- return result.toString();
- }
-
- public Analyzer getAnalyzer() {
- return analyzer;
- }
-
- public void setAnalyzer(Analyzer analyzer) {
- this.analyzer = analyzer;
- }
-}
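
For reference, a hypothetical usage sketch of the class deleted above. It
assumes Lucene and this class on the classpath; the exact tokens depend on
the Lucene version bundled with Mahout:

import org.apache.mahout.utils.regex.AnalyzerTransformer;
import org.apache.mahout.utils.regex.RegexTransformer;

public final class AnalyzerTransformerDemo {
  public static void main(String[] args) {
    // Defaults to StandardAnalyzer on field "text", per the constructor above.
    RegexTransformer t = new AnalyzerTransformer();
    // StandardAnalyzer lower-cases and strips punctuation, so this should
    // print "hello world " (transformMatch appends a space after each token).
    System.out.println(t.transformMatch("Hello, World!"));
  }
}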

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/regex/ChainTransformer.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/regex/ChainTransformer.java b/integration/src/main/java/org/apache/mahout/utils/regex/ChainTransformer.java
deleted file mode 100644
index d3e8e06..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/regex/ChainTransformer.java
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.regex;
-
-import com.google.common.collect.Lists;
-
-import java.util.List;
-
-/**
- * Chain together several {@link org.apache.mahout.utils.regex.RegexTransformer} and apply them to the match
- * in succession
- */
-public class ChainTransformer implements RegexTransformer {
-
- private List<RegexTransformer> chain = Lists.newArrayList();
-
- public ChainTransformer() {
- }
-
- public ChainTransformer(List<RegexTransformer> chain) {
- this.chain = chain;
- }
-
- @Override
- public String transformMatch(String match) {
- String result = match;
- for (RegexTransformer transformer : chain) {
- result = transformer.transformMatch(result);
- }
- return result;
- }
-
- public List<RegexTransformer> getChain() {
- return chain;
- }
-
- public void setChain(List<RegexTransformer> chain) {
- this.chain = chain;
- }
-}
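
A hypothetical chain combining two transformers from this commit; order
matters, since each transformer sees the previous one's output:

import java.util.Arrays;

import org.apache.mahout.utils.regex.AnalyzerTransformer;
import org.apache.mahout.utils.regex.ChainTransformer;
import org.apache.mahout.utils.regex.RegexTransformer;
import org.apache.mahout.utils.regex.URLDecodeTransformer;

public final class ChainTransformerDemo {
  public static void main(String[] args) {
    // URL-decode first, then re-tokenize the decoded text with the analyzer.
    ChainTransformer chain = new ChainTransformer(Arrays.<RegexTransformer>asList(
        new URLDecodeTransformer(), new AnalyzerTransformer()));
    System.out.println(chain.transformMatch("Hello%20World"));  // "hello world "
  }
}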

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/regex/FPGFormatter.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/regex/FPGFormatter.java b/integration/src/main/java/org/apache/mahout/utils/regex/FPGFormatter.java
deleted file mode 100644
index a0f296d..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/regex/FPGFormatter.java
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.regex;
-
-import java.util.regex.Pattern;
-
-/**
- * Prefixes the output with a tab and collapses each run of non-word
- * characters to a single '|'
- */
-public class FPGFormatter implements RegexFormatter {
-
- private static final Pattern WHITESPACE = Pattern.compile("\\W+");
-
- @Override
- public String format(String toFormat) {
- return '\t' + WHITESPACE.matcher(toFormat).replaceAll("|");
- }
-
-}
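
A quick illustration of the formatter's behavior as implemented (note the
pattern matches runs of non-word characters, not only whitespace):

import org.apache.mahout.utils.regex.FPGFormatter;

public final class FPGFormatterDemo {
  public static void main(String[] args) {
    // Prints a tab followed by "milk|bread|eggs": the runs ", " and " "
    // each collapse to a single '|'.
    System.out.println(new FPGFormatter().format("milk, bread eggs"));
  }
}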

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/regex/IdentityFormatter.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/regex/IdentityFormatter.java b/integration/src/main/java/org/apache/mahout/utils/regex/IdentityFormatter.java
deleted file mode 100644
index 5c1177c..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/regex/IdentityFormatter.java
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.regex;
-
-public class IdentityFormatter implements RegexFormatter {
-
- @Override
- public String format(String toFormat) {
- return toFormat;
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/regex/IdentityTransformer.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/regex/IdentityTransformer.java b/integration/src/main/java/org/apache/mahout/utils/regex/IdentityTransformer.java
deleted file mode 100644
index aea695d..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/regex/IdentityTransformer.java
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.regex;
-
-/**
- * No-op
- */
-public final class IdentityTransformer implements RegexTransformer {
-
- @Override
- public String transformMatch(String match) {
- return match;
- }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/regex/RegexConverterDriver.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/regex/RegexConverterDriver.java b/integration/src/main/java/org/apache/mahout/utils/regex/RegexConverterDriver.java
deleted file mode 100644
index 53be239..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/regex/RegexConverterDriver.java
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.regex;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
-import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
-import org.apache.hadoop.util.ToolRunner;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.mahout.common.AbstractJob;
-import org.apache.mahout.common.HadoopUtil;
-import org.apache.mahout.common.commandline.DefaultOptionCreator;
-
-/**
- * Experimental
- */
-public class RegexConverterDriver extends AbstractJob {
-
- @Override
- public int run(String[] args) throws Exception {
- addInputOption();
- addOutputOption();
- addOption(DefaultOptionCreator.overwriteOption().create());
- addOption("regex", "regex",
- "The regular expression to use", true);
- addOption("groupsToKeep", "g",
- "The number of the capturing groups to keep", false);
- addOption("transformerClass", "t",
- "The optional class specifying the Regex Transformer", false);
- addOption("formatterClass", "t",
- "The optional class specifying the Regex Formatter", false);
- addOption(DefaultOptionCreator.analyzerOption().create());
-
- if (parseArguments(args) == null) {
- return -1;
- }
-
- Configuration conf = getConf();
- //TODO: How to deal with command line escaping?
- conf.set(RegexMapper.REGEX, getOption("regex"));
- String gtk = getOption("groupsToKeep");
- if (gtk != null) {
- conf.set(RegexMapper.GROUP_MATCHERS, gtk);
- }
- String trans = getOption("transformerClass");
- if (trans != null) {
- if ("url".equalsIgnoreCase(trans)) {
- trans = URLDecodeTransformer.class.getName();
- }
- conf.set(RegexMapper.TRANSFORMER_CLASS, trans);
- }
- String formatter = getOption("formatterClass");
- if (formatter != null) {
- if ("fpg".equalsIgnoreCase(formatter)) {
- formatter = FPGFormatter.class.getName();
- }
- conf.set(RegexMapper.FORMATTER_CLASS, formatter);
- }
- Path input = getInputPath();
- Path output = getOutputPath();
- if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
- HadoopUtil.delete(getConf(), output);
- }
- Class<? extends Analyzer> analyzerClass = getAnalyzerClassFromOption();
- if (analyzerClass != null) {
- conf.set(RegexMapper.ANALYZER_NAME, analyzerClass.getName());
- }
- Job job = prepareJob(input, output,
- TextInputFormat.class,
- RegexMapper.class,
- LongWritable.class,
- Text.class,
- TextOutputFormat.class);
- boolean succeeded = job.waitForCompletion(true);
- return succeeded ? 0 : -1;
- }
-
- public static void main(String[] args) throws Exception {
- ToolRunner.run(new RegexConverterDriver(), args);
- }
-
-}
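
A hypothetical invocation of the driver deleted above (paths and the regex
are placeholders); it extracts the first capturing group from each line and
URL-decodes it via the "url" shorthand handled in run():

import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.utils.regex.RegexConverterDriver;

public final class RegexConverterDemo {
  public static void main(String[] args) throws Exception {
    ToolRunner.run(new RegexConverterDriver(), new String[] {
        "--input", "/tmp/regex-in",      // placeholder input path
        "--output", "/tmp/regex-out",    // placeholder output path
        "--regex", "q=([^&]*)",          // illustrative pattern
        "--groupsToKeep", "1",
        "--transformerClass", "url",     // expands to URLDecodeTransformer
        "--overwrite"
    });
  }
}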

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/regex/RegexFormatter.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/regex/RegexFormatter.java b/integration/src/main/java/org/apache/mahout/utils/regex/RegexFormatter.java
deleted file mode 100644
index 8ef837b..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/regex/RegexFormatter.java
+++ /dev/null
@@ -1,24 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.regex;
-
-public interface RegexFormatter {
-
- String format(String toFormat);
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/regex/RegexMapper.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/regex/RegexMapper.java b/integration/src/main/java/org/apache/mahout/utils/regex/RegexMapper.java
deleted file mode 100644
index 04cacaa..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/regex/RegexMapper.java
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.regex;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.regex.Pattern;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Mapper;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.mahout.common.ClassUtils;
-
-public class RegexMapper extends Mapper<LongWritable, Text, LongWritable, Text> {
-
- public static final String REGEX = "regex";
- public static final String GROUP_MATCHERS = "regex.groups";
- public static final String TRANSFORMER_CLASS = "transformer.class";
- public static final String FORMATTER_CLASS = "formatter.class";
-
- private Pattern regex;
- private List<Integer> groupsToKeep;
- private RegexTransformer transformer = RegexUtils.IDENTITY_TRANSFORMER;
- private RegexFormatter formatter = RegexUtils.IDENTITY_FORMATTER;
- public static final String ANALYZER_NAME = "analyzerName";
-
-
- @Override
- protected void setup(Context context) throws IOException, InterruptedException {
- groupsToKeep = new ArrayList<>();
- Configuration config = context.getConfiguration();
- String regexStr = config.get(REGEX);
- regex = Pattern.compile(regexStr);
- String[] groups = config.getStrings(GROUP_MATCHERS);
- if (groups != null) {
- for (String group : groups) {
- groupsToKeep.add(Integer.parseInt(group));
- }
- }
-
- transformer = ClassUtils.instantiateAs(config.get(TRANSFORMER_CLASS, IdentityTransformer.class.getName()),
- RegexTransformer.class);
- String analyzerName = config.get(ANALYZER_NAME);
- if (analyzerName != null && transformer instanceof AnalyzerTransformer) {
- Analyzer analyzer = ClassUtils.instantiateAs(analyzerName, Analyzer.class);
- ((AnalyzerTransformer)transformer).setAnalyzer(analyzer);
- }
-
- formatter = ClassUtils.instantiateAs(config.get(FORMATTER_CLASS, IdentityFormatter.class.getName()),
- RegexFormatter.class);
- }
-
-
- @Override
- protected void map(LongWritable key, Text text, Context context) throws IOException, InterruptedException {
- String result = RegexUtils.extract(text.toString(), regex, groupsToKeep, " ", transformer);
- if (!result.isEmpty()) {
- String format = formatter.format(result);
- context.write(key, new Text(format));
- }
- }
-}
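
A sketch of the Configuration keys setup() reads; everything except the key
names is an illustrative placeholder:

import org.apache.hadoop.conf.Configuration;
import org.apache.mahout.utils.regex.FPGFormatter;
import org.apache.mahout.utils.regex.RegexMapper;
import org.apache.mahout.utils.regex.URLDecodeTransformer;

public final class RegexMapperConfigSketch {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    conf.set(RegexMapper.REGEX, "(\\d+)");      // required: pattern to match
    conf.set(RegexMapper.GROUP_MATCHERS, "1");  // optional: comma-separated group numbers
    conf.set(RegexMapper.TRANSFORMER_CLASS, URLDecodeTransformer.class.getName());
    conf.set(RegexMapper.FORMATTER_CLASS, FPGFormatter.class.getName());
    System.out.println(conf.get(RegexMapper.REGEX));
  }
}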

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/regex/RegexTransformer.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/regex/RegexTransformer.java b/integration/src/main/java/org/apache/mahout/utils/regex/RegexTransformer.java
deleted file mode 100644
index adbc98f..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/regex/RegexTransformer.java
+++ /dev/null
@@ -1,27 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.regex;
-
-/**
- * Transforms the match of a regular expression.
- */
-public interface RegexTransformer {
-
- String transformMatch(String match);
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/regex/RegexUtils.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/regex/RegexUtils.java b/integration/src/main/java/org/apache/mahout/utils/regex/RegexUtils.java
deleted file mode 100644
index 5e32b99..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/regex/RegexUtils.java
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.regex;
-
-import java.util.Collection;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-public final class RegexUtils {
-
- public static final RegexTransformer IDENTITY_TRANSFORMER = new IdentityTransformer();
- public static final RegexFormatter IDENTITY_FORMATTER = new IdentityFormatter();
-
- private RegexUtils() {
- }
-
- public static String extract(CharSequence line, Pattern pattern, Collection<Integer> groupsToKeep,
- String separator, RegexTransformer transformer) {
- StringBuilder bldr = new StringBuilder();
- extract(line, bldr, pattern, groupsToKeep, separator, transformer);
- return bldr.toString();
- }
-
- public static void extract(CharSequence line, StringBuilder outputBuffer,
- Pattern pattern, Collection<Integer> groupsToKeep, String separator,
- RegexTransformer transformer) {
- if (transformer == null) {
- transformer = IDENTITY_TRANSFORMER;
- }
- Matcher matcher = pattern.matcher(line);
- String match;
- if (groupsToKeep.isEmpty()) {
- while (matcher.find()) {
- match = matcher.group();
- if (match != null) {
- outputBuffer.append(transformer.transformMatch(match)).append(separator);
- }
- }
- } else {
- while (matcher.find()) {
- for (Integer groupNum : groupsToKeep) {
- match = matcher.group(groupNum);
- if (match != null) {
- outputBuffer.append(transformer.transformMatch(match)).append(separator);
- }
- }
- }
- }
- //trim off the last separator, which is always there
- if (outputBuffer.length() > 0) {
- outputBuffer.setLength(outputBuffer.length() - separator.length());
- }
- }
-}
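
A self-contained example of RegexUtils.extract() keeping one capturing group
per match:

import java.util.Collections;
import java.util.regex.Pattern;

import org.apache.mahout.utils.regex.RegexUtils;

public final class RegexUtilsDemo {
  public static void main(String[] args) {
    Pattern p = Pattern.compile("(\\w+)=(\\w+)");
    // Keep group 2 of every match, joined by a single space; extract()
    // trims the trailing separator itself.
    String values = RegexUtils.extract("a=1&b=2&c=3", p,
        Collections.singletonList(2), " ", RegexUtils.IDENTITY_TRANSFORMER);
    System.out.println(values);  // prints "1 2 3"
  }
}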

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/regex/URLDecodeTransformer.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/regex/URLDecodeTransformer.java b/integration/src/main/java/org/apache/mahout/utils/regex/URLDecodeTransformer.java
deleted file mode 100644
index 3eb7fc0..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/regex/URLDecodeTransformer.java
+++ /dev/null
@@ -1,43 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.regex;
-
-import java.io.UnsupportedEncodingException;
-import java.net.URLDecoder;
-
-public final class URLDecodeTransformer implements RegexTransformer {
-
- private final String enc;
-
- public URLDecodeTransformer() {
- enc = "UTF-8";
- }
-
- public URLDecodeTransformer(String encoding) {
- this.enc = encoding;
- }
-
- @Override
- public String transformMatch(String match) {
- try {
- return URLDecoder.decode(match, enc);
- } catch (UnsupportedEncodingException e) {
- throw new IllegalStateException(e);
- }
- }
-}
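
Its behavior in one line: percent-decoding with the configured charset
(UTF-8 by default):

import org.apache.mahout.utils.regex.URLDecodeTransformer;

public final class URLDecodeDemo {
  public static void main(String[] args) {
    // Prints "café au lait": %C3%A9 is UTF-8 for 'é', %20 is a space.
    System.out.println(new URLDecodeTransformer().transformMatch("caf%C3%A9%20au%20lait"));
  }
}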

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/vectors/RowIdJob.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/RowIdJob.java b/integration/src/main/java/org/apache/mahout/utils/vectors/RowIdJob.java
deleted file mode 100644
index 13d61b8..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/RowIdJob.java
+++ /dev/null
@@ -1,99 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- * <p/>
- * http://www.apache.org/licenses/LICENSE-2.0
- * <p/>
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors;
-
-import java.util.List;
-import java.util.Map;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.SequenceFile;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.util.ToolRunner;
-import org.apache.mahout.common.AbstractJob;
-import org.apache.mahout.common.Pair;
-import org.apache.mahout.common.iterator.sequencefile.PathFilters;
-import org.apache.mahout.common.iterator.sequencefile.PathType;
-import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
-import org.apache.mahout.math.VectorWritable;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * Converts a vector representation of documents into a {@code document x terms} matrix.
- * The input data is in {@code SequenceFile<Text,VectorWritable>} format (as generated by
- * {@link org.apache.mahout.vectorizer.SparseVectorsFromSequenceFiles SparseVectorsFromSequenceFiles}
- * or by {@link org.apache.mahout.vectorizer.EncodedVectorsFromSequenceFiles EncodedVectorsFromSequenceFiles})
- * and generates the following two files as output:
- * <ul><li>A file called "matrix" of format {@code SequenceFile<IntWritable,VectorWritable>}.</li>
- * <li>A file called "docIndex" of format {@code SequenceFile<IntWritable,Text>}.</li></ul>
- * The input file can be regenerated by joining the two output files on the generated int key.
- * In other words, {@code RowIdJob} replaces the document text ids with integers.
- * The original document text ids can still be retrieved from the "docIndex".
- */
-public class RowIdJob extends AbstractJob {
- private static final Logger log = LoggerFactory.getLogger(RowIdJob.class);
-
- @Override
- public int run(String[] args) throws Exception {
-
- addInputOption();
- addOutputOption();
-
- Map<String, List<String>> parsedArgs = parseArguments(args);
- if (parsedArgs == null) {
- return -1;
- }
-
- Configuration conf = getConf();
- FileSystem fs = FileSystem.get(conf);
-
- Path outputPath = getOutputPath();
- Path indexPath = new Path(outputPath, "docIndex");
- Path matrixPath = new Path(outputPath, "matrix");
-
- try (SequenceFile.Writer indexWriter = SequenceFile.createWriter(fs, conf, indexPath,
- IntWritable.class, Text.class);
- SequenceFile.Writer matrixWriter = SequenceFile.createWriter(fs, conf, matrixPath, IntWritable.class,
- VectorWritable.class)) {
- IntWritable docId = new IntWritable();
- int i = 0;
- int numCols = 0;
- for (Pair<Text, VectorWritable> record
- : new SequenceFileDirIterable<Text, VectorWritable>(getInputPath(), PathType.LIST, PathFilters.logsCRCFilter(),
- null, true, conf)) {
- VectorWritable value = record.getSecond();
- docId.set(i);
- indexWriter.append(docId, record.getFirst());
- matrixWriter.append(docId, value);
- i++;
- numCols = value.get().size();
- }
-
- log.info("Wrote out matrix with {} rows and {} columns to {}", i, numCols, matrixPath);
- return 0;
- }
- }
-
- public static void main(String[] args) throws Exception {
- ToolRunner.run(new RowIdJob(), args);
- }
-
-}
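
A hypothetical read-back of the "docIndex" output described in the javadoc
above, using the same sequence-file iterator API that appears elsewhere in
this commit; the path is a placeholder:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;

public final class DocIndexReader {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // Each record maps the generated int row id back to the original text id.
    for (Pair<IntWritable, Text> record
        : new SequenceFileIterable<IntWritable, Text>(
            new Path("/tmp/rowid-out/docIndex"), true, conf)) {
      System.out.println(record.getFirst().get() + " -> " + record.getSecond());
    }
  }
}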

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/vectors/TermEntry.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/TermEntry.java b/integration/src/main/java/org/apache/mahout/utils/vectors/TermEntry.java
deleted file mode 100644
index d74803f..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/TermEntry.java
+++ /dev/null
@@ -1,46 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors;
-
-/**
- * Each entry in a {@link TermInfo} dictionary. Contains information about a term.
- */
-public class TermEntry {
-
- private final String term;
- private final int termIdx;
- private final int docFreq;
-
- public TermEntry(String term, int termIdx, int docFreq) {
- this.term = term;
- this.termIdx = termIdx;
- this.docFreq = docFreq;
- }
-
- public String getTerm() {
- return term;
- }
-
- public int getTermIdx() {
- return termIdx;
- }
-
- public int getDocFreq() {
- return docFreq;
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/vectors/TermInfo.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/TermInfo.java b/integration/src/main/java/org/apache/mahout/utils/vectors/TermInfo.java
deleted file mode 100644
index 4fb36a3..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/TermInfo.java
+++ /dev/null
@@ -1,33 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors;
-
-import java.util.Iterator;
-
-/**
- * Contains the term dictionary information associated with a vectorized collection of text documents
- *
- */
-public interface TermInfo {
-
- int totalTerms(String field);
-
- TermEntry getTermEntry(String field, String term);
-
- Iterator<TermEntry> getAllEntries();
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java b/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
deleted file mode 100644
index e1c3fbc..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
+++ /dev/null
@@ -1,266 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- * <p/>
- * http://www.apache.org/licenses/LICENSE-2.0
- * <p/>
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors;
-
-import com.google.common.collect.Sets;
-import com.google.common.io.Closeables;
-import com.google.common.io.Files;
-import org.apache.commons.io.Charsets;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.FileUtil;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.util.ToolRunner;
-import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable;
-import org.apache.mahout.common.AbstractJob;
-import org.apache.mahout.common.Pair;
-import org.apache.mahout.common.iterator.sequencefile.PathFilters;
-import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
-import org.apache.mahout.math.NamedVector;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.VectorWritable;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.File;
-import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.io.Writer;
-import java.util.Iterator;
-import java.util.Set;
-
-/**
- * Can read in a {@link org.apache.hadoop.io.SequenceFile} of {@link Vector}s and dump
- * out the results using {@link Vector#asFormatString()} to either the console or to a
- * file.
- */
-public final class VectorDumper extends AbstractJob {
-
- private static final Logger log = LoggerFactory.getLogger(VectorDumper.class);
-
- private VectorDumper() {
- }
-
- @Override
- public int run(String[] args) throws Exception {
- /**
- Option seqOpt = obuilder.withLongName("seqFile").withRequired(false).withArgument(
- abuilder.withName("seqFile").withMinimum(1).withMaximum(1).create()).withDescription(
- "The Sequence File containing the Vectors").withShortName("s").create();
- Option dirOpt = obuilder.withLongName("seqDirectory").withRequired(false).withArgument(
- abuilder.withName("seqDirectory").withMinimum(1).withMaximum(1).create())
- .withDescription("The directory containing Sequence File of Vectors")
- .withShortName("d").create();
- */
- addInputOption();
- addOutputOption();
- addOption("useKey", "u", "If the Key is a vector than dump that instead");
- addOption("printKey", "p", "Print out the key as well, delimited by tab (or the value if useKey is true");
- addOption("dictionary", "d", "The dictionary file.", false);
- addOption("dictionaryType", "dt", "The dictionary file type (text|seqfile)", false);
- addOption("csv", "c", "Output the Vector as CSV. Otherwise it substitutes in the terms for vector cell entries");
- addOption("namesAsComments", "n", "If using CSV output, optionally add a comment line for each NamedVector "
- + "(if the vector is one) printing out the name");
- addOption("nameOnly", "N", "Use the name as the value for each NamedVector (skip other vectors)");
- addOption("sortVectors", "sort", "Sort output key/value pairs of the vector entries in abs magnitude "
- + "descending order");
- addOption("quiet", "q", "Print only file contents");
- addOption("sizeOnly", "sz", "Dump only the size of the vector");
- addOption("numItems", "ni", "Output at most <n> vecors", false);
- addOption("vectorSize", "vs", "Truncate vectors to <vs> length when dumping (most useful when in"
- + " conjunction with -sort", false);
- addOption(buildOption("filter", "fi", "Only dump out those vectors whose name matches the filter."
- + " Multiple items may be specified by repeating the argument.", true, 1, Integer.MAX_VALUE, false, null));
-
- if (parseArguments(args, false, true) == null) {
- return -1;
- }
-
- Path[] pathArr;
- Configuration conf = new Configuration();
- FileSystem fs = FileSystem.get(conf);
- Path input = getInputPath();
- FileStatus fileStatus = fs.getFileStatus(input);
- if (fileStatus.isDir()) {
- pathArr = FileUtil.stat2Paths(fs.listStatus(input, PathFilters.logsCRCFilter()));
- } else {
- FileStatus[] inputPaths = fs.globStatus(input);
- pathArr = new Path[inputPaths.length];
- int i = 0;
- for (FileStatus fstatus : inputPaths) {
- pathArr[i++] = fstatus.getPath();
- }
- }
-
-
- String dictionaryType = getOption("dictionaryType", "text");
-
- boolean sortVectors = hasOption("sortVectors");
- boolean quiet = hasOption("quiet");
- if (!quiet) {
- log.info("Sort? {}", sortVectors);
- }
-
- String[] dictionary = null;
- if (hasOption("dictionary")) {
- String dictFile = getOption("dictionary");
- switch (dictionaryType) {
- case "text":
- dictionary = VectorHelper.loadTermDictionary(new File(dictFile));
- break;
- case "sequencefile":
- dictionary = VectorHelper.loadTermDictionary(conf, dictFile);
- break;
- default:
- //TODO: support Lucene's FST as a dictionary type
- throw new IOException("Invalid dictionary type: " + dictionaryType);
- }
- }
-
- Set<String> filters;
- if (hasOption("filter")) {
- filters = Sets.newHashSet(getOptions("filter"));
- } else {
- filters = null;
- }
-
- boolean useCSV = hasOption("csv");
-
- boolean sizeOnly = hasOption("sizeOnly");
- boolean nameOnly = hasOption("nameOnly");
- boolean namesAsComments = hasOption("namesAsComments");
- boolean transposeKeyValue = hasOption("useKey");
- Writer writer;
- boolean shouldClose;
- File output = getOutputFile();
- if (output != null) {
- shouldClose = true;
- log.info("Output file: {}", output);
- Files.createParentDirs(output);
- writer = Files.newWriter(output, Charsets.UTF_8);
- } else {
- shouldClose = false;
- writer = new OutputStreamWriter(System.out, Charsets.UTF_8);
- }
- try {
- boolean printKey = hasOption("printKey");
- if (useCSV && dictionary != null) {
- writer.write("#");
- for (int j = 0; j < dictionary.length; j++) {
- writer.write(dictionary[j]);
- if (j < dictionary.length - 1) {
- writer.write(',');
- }
- }
- writer.write('\n');
- }
- Long numItems = null;
- if (hasOption("numItems")) {
- numItems = Long.parseLong(getOption("numItems"));
- if (!quiet) {
- writer.append("#Max Items to dump: ").append(String.valueOf(numItems)).append('\n');
- }
- }
- int maxIndexesPerVector = hasOption("vectorSize")
- ? Integer.parseInt(getOption("vectorSize"))
- : Integer.MAX_VALUE;
- long itemCount = 0;
- int fileCount = 0;
- for (Path path : pathArr) {
- if (numItems != null && numItems <= itemCount) {
- break;
- }
- if (!quiet) {
- log.info("Processing file '{}' ({}/{})", path, ++fileCount, pathArr.length);
- }
- SequenceFileIterable<Writable, Writable> iterable = new SequenceFileIterable<>(path, true, conf);
- Iterator<Pair<Writable, Writable>> iterator = iterable.iterator();
- long i = 0;
- while (iterator.hasNext() && (numItems == null || itemCount < numItems)) {
- Pair<Writable, Writable> record = iterator.next();
- Writable keyWritable = record.getFirst();
- Writable valueWritable = record.getSecond();
- if (printKey) {
- Writable notTheVectorWritable = transposeKeyValue ? valueWritable : keyWritable;
- writer.write(notTheVectorWritable.toString());
- writer.write('\t');
- }
- Vector vector;
- try {
- vector = ((VectorWritable)
- (transposeKeyValue ? keyWritable : valueWritable)).get();
- } catch (ClassCastException e) {
- if ((transposeKeyValue ? keyWritable : valueWritable)
- instanceof WeightedPropertyVectorWritable) {
- vector =
- ((WeightedPropertyVectorWritable)
- (transposeKeyValue ? keyWritable : valueWritable)).getVector();
- } else {
- throw e;
- }
- }
- if (filters == null
- || !(vector instanceof NamedVector)
- || filters.contains(((NamedVector) vector).getName())) {
- if (sizeOnly) {
- if (vector instanceof NamedVector) {
- writer.write(((NamedVector) vector).getName());
- writer.write(":");
- } else {
- writer.write(String.valueOf(i++));
- writer.write(":");
- }
- writer.write(String.valueOf(vector.size()));
- writer.write('\n');
- } else if (nameOnly) {
- if (vector instanceof NamedVector) {
- writer.write(((NamedVector) vector).getName());
- writer.write('\n');
- }
- } else {
- String fmtStr;
- if (useCSV) {
- fmtStr = VectorHelper.vectorToCSVString(vector, namesAsComments);
- } else {
- fmtStr = VectorHelper.vectorToJson(vector, dictionary, maxIndexesPerVector,
- sortVectors);
- }
- writer.write(fmtStr);
- writer.write('\n');
- }
- itemCount++;
- }
- }
- }
- writer.flush();
- } finally {
- if (shouldClose) {
- Closeables.close(writer, false);
- }
- }
-
- return 0;
- }
-
- public static void main(String[] args) throws Exception {
- ToolRunner.run(new Configuration(), new VectorDumper(), args);
- }
-
-}
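
A hypothetical invocation of the dumper (its constructor is private, so the
sketch goes through main(); all paths are placeholders):

public final class VectorDumperDemo {
  public static void main(String[] args) throws Exception {
    org.apache.mahout.utils.vectors.VectorDumper.main(new String[] {
        "--input", "/tmp/vectors/part-r-00000",    // placeholder
        "--output", "/tmp/vectors.txt",            // placeholder
        "--dictionary", "/tmp/dictionary.file-0",  // placeholder
        "--dictionaryType", "sequencefile",
        "--printKey"
    });
  }
}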

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java b/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
deleted file mode 100644
index 66c3fb6..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
+++ /dev/null
@@ -1,256 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors;
-
-import com.google.common.base.Function;
-import com.google.common.collect.Collections2;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.lucene.util.PriorityQueue;
-import org.apache.mahout.common.Pair;
-import org.apache.mahout.common.iterator.FileLineIterator;
-import org.apache.mahout.common.iterator.sequencefile.PathType;
-import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
-import org.apache.mahout.math.NamedVector;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.Vector.Element;
-import org.apache.mahout.math.map.OpenObjectIntHashMap;
-
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.Iterator;
-import java.util.List;
-import java.util.regex.Pattern;
-
-/** Static utility methods related to vectors. */
-public final class VectorHelper {
-
- private static final Pattern TAB_PATTERN = Pattern.compile("\t");
-
-
- private VectorHelper() {
- }
-
- public static String vectorToCSVString(Vector vector, boolean namesAsComments) throws IOException {
- Appendable bldr = new StringBuilder(2048);
- vectorToCSVString(vector, namesAsComments, bldr);
- return bldr.toString();
- }
-
- public static String buildJson(Iterable<Pair<String, Double>> iterable) {
- return buildJson(iterable, new StringBuilder(2048));
- }
-
- public static String buildJson(Iterable<Pair<String, Double>> iterable, StringBuilder bldr) {
- bldr.append('{');
- for (Pair<String, Double> p : iterable) {
- bldr.append(p.getFirst());
- bldr.append(':');
- bldr.append(p.getSecond());
- bldr.append(',');
- }
- if (bldr.length() > 1) {
- bldr.setCharAt(bldr.length() - 1, '}');
- }
- return bldr.toString();
- }
-
- public static List<Pair<Integer, Double>> topEntries(Vector vector, int maxEntries) {
-
- // Get the size of nonZero elements in the input vector
- int sizeOfNonZeroElementsInVector = vector.getNumNonZeroElements();
-
- // If the sizeOfNonZeroElementsInVector < maxEntries then set maxEntries = sizeOfNonZeroElementsInVector
- // otherwise the call to queue.pop() returns a Pair(null, null) and the subsequent call
- // to pair.getFirst() throws a NullPointerException
- if (sizeOfNonZeroElementsInVector < maxEntries) {
- maxEntries = sizeOfNonZeroElementsInVector;
- }
-
- PriorityQueue<Pair<Integer, Double>> queue = new TDoublePQ<>(-1, maxEntries);
- for (Element e : vector.nonZeroes()) {
- queue.insertWithOverflow(Pair.of(e.index(), e.get()));
- }
- List<Pair<Integer, Double>> entries = new ArrayList<>();
- Pair<Integer, Double> pair;
- while ((pair = queue.pop()) != null) {
- if (pair.getFirst() > -1) {
- entries.add(pair);
- }
- }
- Collections.sort(entries, new Comparator<Pair<Integer, Double>>() {
- @Override
- public int compare(Pair<Integer, Double> a, Pair<Integer, Double> b) {
- return b.getSecond().compareTo(a.getSecond());
- }
- });
- return entries;
- }
-
- public static List<Pair<Integer, Double>> firstEntries(Vector vector, int maxEntries) {
- List<Pair<Integer, Double>> entries = new ArrayList<>();
- Iterator<Vector.Element> it = vector.nonZeroes().iterator();
- int i = 0;
- while (it.hasNext() && i++ < maxEntries) {
- Vector.Element e = it.next();
- entries.add(Pair.of(e.index(), e.get()));
- }
- return entries;
- }
-
- public static List<Pair<String, Double>> toWeightedTerms(Collection<Pair<Integer, Double>> entries,
- final String[] dictionary) {
- if (dictionary != null) {
- return new ArrayList<>(Collections2.transform(entries,
- new Function<Pair<Integer, Double>, Pair<String, Double>>() {
- @Override
- public Pair<String, Double> apply(Pair<Integer, Double> p) {
- return Pair.of(dictionary[p.getFirst()], p.getSecond());
- }
- }));
- } else {
- return new ArrayList<>(Collections2.transform(entries,
- new Function<Pair<Integer, Double>, Pair<String, Double>>() {
- @Override
- public Pair<String, Double> apply(Pair<Integer, Double> p) {
- return Pair.of(Integer.toString(p.getFirst()), p.getSecond());
- }
- }));
- }
- }
-
- public static String vectorToJson(Vector vector, String[] dictionary, int maxEntries, boolean sort) {
- return buildJson(toWeightedTerms(sort
- ? topEntries(vector, maxEntries)
- : firstEntries(vector, maxEntries), dictionary));
- }
-
- public static void vectorToCSVString(Vector vector,
- boolean namesAsComments,
- Appendable bldr) throws IOException {
- if (namesAsComments && vector instanceof NamedVector) {
- bldr.append('#').append(((NamedVector) vector).getName()).append('\n');
- }
- Iterator<Vector.Element> iter = vector.all().iterator();
- boolean first = true;
- while (iter.hasNext()) {
- if (first) {
- first = false;
- } else {
- bldr.append(',');
- }
- Vector.Element elt = iter.next();
- bldr.append(String.valueOf(elt.get()));
- }
- bldr.append('\n');
- }
-
- /**
- * Read in a dictionary file. Format is:
- * <p/>
- * <pre>
- * term DocFreq Index
- * </pre>
- */
- public static String[] loadTermDictionary(File dictFile) throws IOException {
- try (InputStream in = new FileInputStream(dictFile)) {
- return loadTermDictionary(in);
- }
- }
-
- /**
- * Read a dictionary in {@link org.apache.hadoop.io.SequenceFile} generated by
- * {@link org.apache.mahout.vectorizer.DictionaryVectorizer}
- *
- * @param filePattern <PATH TO DICTIONARY>/dictionary.file-*
- */
- public static String[] loadTermDictionary(Configuration conf, String filePattern) {
- OpenObjectIntHashMap<String> dict = new OpenObjectIntHashMap<>();
- int maxIndexValue = 0;
- for (Pair<Text, IntWritable> record
- : new SequenceFileDirIterable<Text, IntWritable>(new Path(filePattern), PathType.GLOB, null, null, true,
- conf)) {
- dict.put(record.getFirst().toString(), record.getSecond().get());
- if (record.getSecond().get() > maxIndexValue) {
- maxIndexValue = record.getSecond().get();
- }
- }
- // Set dictionary size to greater of (maxIndexValue + 1, dict.size())
- int maxDictionarySize = maxIndexValue + 1 > dict.size() ? maxIndexValue + 1 : dict.size();
- String[] dictionary = new String[maxDictionarySize];
- for (String feature : dict.keys()) {
- dictionary[dict.get(feature)] = feature;
- }
- return dictionary;
- }
-
- /**
- * Read in a dictionary file. Format is: First line is the number of entries
- * <p/>
- * <pre>
- * term DocFreq Index
- * </pre>
- */
- private static String[] loadTermDictionary(InputStream is) throws IOException {
- FileLineIterator it = new FileLineIterator(is);
-
- int numEntries = Integer.parseInt(it.next());
- String[] result = new String[numEntries];
-
- while (it.hasNext()) {
- String line = it.next();
- if (line.startsWith("#")) {
- continue;
- }
- String[] tokens = TAB_PATTERN.split(line);
- if (tokens.length < 3) {
- continue;
- }
- int index = Integer.parseInt(tokens[2]); // tokens[1] is the doc freq
- result[index] = tokens[0];
- }
- return result;
- }
-
- private static final class TDoublePQ<T> extends PriorityQueue<Pair<T, Double>> {
- private final T sentinel;
-
- private TDoublePQ(T sentinel, int size) {
- super(size);
- this.sentinel = sentinel;
- }
-
- @Override
- protected boolean lessThan(Pair<T, Double> a, Pair<T, Double> b) {
- return a.getSecond().compareTo(b.getSecond()) < 0;
- }
-
- @Override
- protected Pair<T, Double> getSentinelObject() {
- return Pair.of(sentinel, Double.NEGATIVE_INFINITY);
- }
- }
-}
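
A small demonstration of topEntries()/vectorToJson() on a dense vector with
a toy dictionary:

import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.utils.vectors.VectorHelper;

public final class VectorHelperDemo {
  public static void main(String[] args) {
    Vector v = new DenseVector(new double[] {0.0, 3.5, 0.0, 1.2, 2.7});
    String[] dictionary = {"zero", "one", "two", "three", "four"};
    // Top two entries, rendered as JSON against the dictionary; expected
    // output: {one:3.5,four:2.7}
    System.out.println(VectorHelper.vectorToJson(v, dictionary, 2, true));
  }
}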

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFIterator.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFIterator.java b/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFIterator.java
deleted file mode 100644
index f2632a4..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFIterator.java
+++ /dev/null
@@ -1,144 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.arff;
-
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import com.google.common.collect.AbstractIterator;
-import com.google.common.io.Closeables;
-import org.apache.mahout.math.DenseVector;
-import org.apache.mahout.math.RandomAccessSparseVector;
-import org.apache.mahout.math.Vector;
-
-final class ARFFIterator extends AbstractIterator<Vector> {
-
- // Matches any value that contains no '{', i.e. a dense (non-sparse) token.
- // Commas inside quoted strings are handled by splitCSV(): for example,
- // "Arizona" , "0:08 PM, PDT" , 110 splits with "0:08 PM, PDT" kept as one string.
- private static final Pattern WORDS_WITHOUT_SPARSE = Pattern.compile("([\\w[^{]])*");
- private static final Pattern DATA_PATTERN = Pattern.compile("^\\"+ARFFModel.ARFF_SPARSE+"(.*)\\"+ARFFModel.ARFF_SPARSE_END+"$");
-
- private final BufferedReader reader;
- private final ARFFModel model;
-
- ARFFIterator(BufferedReader reader, ARFFModel model) {
- this.reader = reader;
- this.model = model;
- }
-
- @Override
- protected Vector computeNext() {
- String line;
- try {
- while ((line = reader.readLine()) != null) {
- line = line.trim();
- if (!line.isEmpty() && !line.startsWith(ARFFModel.ARFF_COMMENT)) {
- break;
- }
- }
- } catch (IOException ioe) {
- throw new IllegalStateException(ioe);
- }
- if (line == null) {
- try {
- Closeables.close(reader, true);
- } catch (IOException e) {
- throw new IllegalStateException(e);
- }
- return endOfData();
- }
- Vector result;
- Matcher contents = DATA_PATTERN.matcher(line);
- if (contents.find()) {
- line = contents.group(1);
- String[] splits = splitCSV(line);
- result = new RandomAccessSparseVector(model.getLabelSize());
- for (String split : splits) {
- int idIndex = split.indexOf(' ');
- int idx = Integer.parseInt(split.substring(0, idIndex).trim());
- String data = split.substring(idIndex).trim();
- if (!"?".equals(data)) {
- result.setQuick(idx, model.getValue(data, idx));
- }
- }
- } else {
- result = new DenseVector(model.getLabelSize());
- String[] splits = splitCSV(line);
- for (int i = 0; i < splits.length; i++) {
- String split = splits[i];
- split = split.trim();
- if (WORDS_WITHOUT_SPARSE.matcher(split).matches() && !"?".equals(split)) {
- result.setQuick(i, model.getValue(split, i));
- }
- }
- }
- return result;
- }
-
- /**
- * Splits a string on commas, ignoring commas inside quoted sections and
- * escaped quotes. Both double and single quotes are accepted, since the
- * ARFF format does not define quoting precisely.
- * @param line the line to split
- * @return the tokens, trimmed of surrounding whitespace
- */
- public static String[] splitCSV(String line) {
- StringBuilder sb = new StringBuilder(128);
- List<String> tokens = new ArrayList<>();
- char escapeChar = '\0';
- for (int i = 0; i < line.length(); i++) {
- char c = line.charAt(i);
- if (c == '\\') {
- i++;
- sb.append(line.charAt(i));
- }
- else if (c == '"' || c == '\'') {
- // token is closed
- if (c == escapeChar) {
- escapeChar = '\0';
- }
- else if (escapeChar == '\0') {
- escapeChar = c;
- }
- sb.append(c);
- }
- else if (c == ',') {
- if (escapeChar == '\0') {
- tokens.add(sb.toString().trim());
- sb.setLength(0); // start work on next token
- }
- else {
- sb.append(c);
- }
- }
- else {
- sb.append(c);
- }
- }
- if (sb.length() > 0) {
- tokens.add(sb.toString().trim());
- }
-
- return tokens.toArray(new String[tokens.size()]);
- }
-
-}
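
splitCSV() is public but the class is package-private, so a sketch exercising
it would live in the same package; note that quoted commas survive the split:

package org.apache.mahout.utils.vectors.arff;

public final class SplitCsvDemo {
  public static void main(String[] args) {
    // Prints three tokens: "Arizona" / "0:08 PM, PDT" / 110 (quotes retained,
    // surrounding whitespace trimmed); the quoted comma does not split.
    for (String token : ARFFIterator.splitCSV("\"Arizona\", \"0:08 PM, PDT\", 110")) {
      System.out.println(token);
    }
  }
}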

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFModel.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFModel.java b/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFModel.java
deleted file mode 100644
index fc86997..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFModel.java
+++ /dev/null
@@ -1,76 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.arff;
-
-import java.text.DateFormat;
-import java.util.Map;
-
-/**
- * An interface for representing an ARFF model. Implementations can choose how
- * to store the model: simple approaches suffice for smaller files, while
- * larger files may require a more efficient representation.
- */
-public interface ARFFModel {
- String ARFF_SPARSE = "{"; //indicates the vector is sparse
- String ARFF_SPARSE_END = "}";
- String ARFF_COMMENT = "%";
- String ATTRIBUTE = "@attribute";
- String DATA = "@data";
- String RELATION = "@relation";
-
-
- String getRelation();
-
- void setRelation(String relation);
-
- /**
- * The vector attributes (labels in Mahout speak)
- * @return the map
- */
- Map<String, Integer> getLabelBindings();
-
- Integer getNominalValue(String label, String nominal);
-
- void addNominal(String label, String nominal, int idx);
-
- DateFormat getDateFormat(Integer idx);
-
- void addDateFormat(Integer idx, DateFormat format);
-
- Integer getLabelIndex(String label);
-
- void addLabel(String label, Integer idx);
-
- ARFFType getARFFType(Integer idx);
-
- void addType(Integer idx, ARFFType type);
-
- /**
- * The count of the number of words seen
- * @return the count
- */
- long getWordCount();
-
- double getValue(String data, int idx);
-
- Map<String, Map<String, Integer>> getNominalMap();
-
- int getLabelSize();
-
- Map<String, Long> getWords();
-}
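
A header parser typically drives this interface as follows (a minimal sketch,
assuming MapBackedARFFModel, the map-based implementation in this package, and a
no-arg constructor):

    ARFFModel model = new MapBackedARFFModel();
    model.setRelation("iris");
    model.addLabel("sepallength", 0);     // bind attribute name to column 0
    model.addType(0, ARFFType.NUMERIC);
    double v = model.getValue("5.1", 0);  // numeric attribute: parses to 5.1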

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFType.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFType.java b/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFType.java
deleted file mode 100644
index 9ba7c31..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFType.java
+++ /dev/null
@@ -1,62 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.arff;
-
-public enum ARFFType {
-
- NUMERIC("numeric"),
- INTEGER("integer"),
- REAL("real"),
- NOMINAL("{"),
- DATE("date"),
- STRING("string");
-
- private final String indicator;
-
- ARFFType(String indicator) {
- this.indicator = indicator;
- }
-
- public String getIndicator() {
- return indicator;
- }
-
- public String getLabel(String line) {
- int idx = line.lastIndexOf(indicator);
- return removeQuotes(line.substring(ARFFModel.ATTRIBUTE.length(), idx));
- }
-
- /**
- * Removes the surrounding quotes and any leading/trailing whitespace from a
- * single- or double-quoted string.
- * @param str the string to strip quotes from
- * @return the string without surrounding quotes
- */
- public static String removeQuotes(String str) {
- String cleaned = str;
- if (cleaned != null) {
- cleaned = cleaned.trim();
- boolean isQuoted = cleaned.length() > 1
- && (cleaned.startsWith("\"") && cleaned.endsWith("\"")
- || cleaned.startsWith("'") && cleaned.endsWith("'"));
- if (isQuoted) {
- cleaned = cleaned.substring(1, cleaned.length() - 1);
- }
- }
- return cleaned;
- }
-}
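
The quoting rules above are symmetric: only a matching pair of leading and
trailing quotes (after trimming) is stripped, for example:

    ARFFType.removeQuotes("  'petal width' ");  // -> "petal width"
    ARFFType.removeQuotes("\"a\"");             // -> "a"
    ARFFType.removeQuotes("'mismatched\"");     // -> "'mismatched\"" (trimmed only)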
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/pom.xml
----------------------------------------------------------------------
diff --git a/integration/pom.xml b/integration/pom.xml
deleted file mode 100644
index 5a873a6..0000000
--- a/integration/pom.xml
+++ /dev/null
@@ -1,198 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
-
- <modelVersion>4.0.0</modelVersion>
-
- <parent>
- <groupId>org.apache.mahout</groupId>
- <artifactId>mahout</artifactId>
- <version>0.13.1-SNAPSHOT</version>
- <relativePath>../pom.xml</relativePath>
- </parent>
-
- <artifactId>mahout-integration</artifactId>
- <name>Mahout Integration</name>
- <description>Optional components of Mahout which generally support interaction with third party systems,
- formats, APIs, etc.</description>
-
- <packaging>jar</packaging>
-
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-remote-resources-plugin</artifactId>
- <configuration>
- <appendedResourcesDirectory>../src/main/appended-resources</appendedResourcesDirectory>
- <resourceBundles>
- <resourceBundle>org.apache:apache-jar-resource-bundle:1.4</resourceBundle>
- </resourceBundles>
- <supplementalModels>
- <supplementalModel>supplemental-models.xml</supplementalModel>
- </supplementalModels>
- </configuration>
- </plugin>
-
- <plugin>
- <artifactId>maven-javadoc-plugin</artifactId>
- </plugin>
-
- <plugin>
- <artifactId>maven-source-plugin</artifactId>
- </plugin>
-
- </plugins>
-
- </build>
-
- <dependencies>
-
- <!-- own modules -->
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>mahout-hdfs</artifactId>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>mahout-mr</artifactId>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>mahout-hdfs</artifactId>
- <type>test-jar</type>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>mahout-mr</artifactId>
- <type>test-jar</type>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>mahout-math</artifactId>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>mahout-math</artifactId>
- <type>test-jar</type>
- <scope>test</scope>
- </dependency>
-
- <!-- 3rd party -->
-
- <dependency>
- <groupId>commons-dbcp</groupId>
- <artifactId>commons-dbcp</artifactId>
- <optional>true</optional>
- </dependency>
-
- <dependency>
- <groupId>commons-pool</groupId>
- <artifactId>commons-pool</artifactId>
- <optional>true</optional>
- </dependency>
-
- <dependency>
- <groupId>commons-io</groupId>
- <artifactId>commons-io</artifactId>
- </dependency>
-
- <dependency>
- <groupId>com.google.guava</groupId>
- <artifactId>guava</artifactId>
- </dependency>
-
- <dependency>
- <groupId>org.apache.solr</groupId>
- <artifactId>solr-commons-csv</artifactId>
- <version>3.5.0</version>
- </dependency>
-
- <dependency>
- <groupId>org.apache.lucene</groupId>
- <artifactId>lucene-benchmark</artifactId>
- <optional>true</optional>
- </dependency>
- <dependency>
- <groupId>org.apache.lucene</groupId>
- <artifactId>lucene-analyzers-common</artifactId>
- <optional>true</optional>
- </dependency>
-
- <dependency>
- <groupId>org.mongodb</groupId>
- <artifactId>mongo-java-driver</artifactId>
- <version>2.11.2</version>
- <optional>true</optional>
- </dependency>
-
- <dependency>
- <groupId>org.mongodb</groupId>
- <artifactId>bson</artifactId>
- <version>2.11.2</version>
- <optional>true</optional>
- </dependency>
-
- <dependency>
- <groupId>org.apache.hbase</groupId>
- <artifactId>hbase-client</artifactId>
- </dependency>
-
- <dependency>
- <groupId>org.hectorclient</groupId>
- <artifactId>hector-core</artifactId>
- <version>1.1-4</version>
- <optional>true</optional>
- </dependency>
-
- <dependency>
- <groupId>org.slf4j</groupId>
- <artifactId>slf4j-api</artifactId>
- </dependency>
-
- <dependency>
- <groupId>org.slf4j</groupId>
- <artifactId>slf4j-jcl</artifactId>
- <scope>test</scope>
- </dependency>
-
- <dependency>
- <groupId>junit</groupId>
- <artifactId>junit</artifactId>
- <scope>test</scope>
- </dependency>
-
- <dependency>
- <groupId>com.carrotsearch.randomizedtesting</groupId>
- <artifactId>randomizedtesting-runner</artifactId>
- <scope>test</scope>
- </dependency>
-
- <dependency>
- <groupId>org.easymock</groupId>
- <artifactId>easymock</artifactId>
- <scope>test</scope>
- </dependency>
-
- </dependencies>
-
-</project>

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/benchmark/BenchmarkRunner.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/benchmark/BenchmarkRunner.java b/integration/src/main/java/org/apache/mahout/benchmark/BenchmarkRunner.java
deleted file mode 100644
index 549cf2c..0000000
--- a/integration/src/main/java/org/apache/mahout/benchmark/BenchmarkRunner.java
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.benchmark;
-
-import java.util.Random;
-import java.util.concurrent.TimeUnit;
-
-import org.apache.mahout.common.RandomUtils;
-import org.apache.mahout.common.TimingStatistics;
-import org.apache.mahout.math.Vector;
-
-import com.google.common.base.Function;
-
-public final class BenchmarkRunner {
- private static final int BUCKET_SIZE = 10000;
- private static final Random R = RandomUtils.getRandom();
- private final long maxTimeUsec;
- private final long leadTimeUsec;
-
- public BenchmarkRunner(long leadTimeMs, long maxTimeMs) {
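- // Note: despite the "Usec" suffix, these fields actually hold nanoseconds.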
- maxTimeUsec = TimeUnit.MILLISECONDS.toNanos(maxTimeMs);
- leadTimeUsec = TimeUnit.MILLISECONDS.toNanos(leadTimeMs);
- }
-
- public abstract static class BenchmarkFn implements Function<Integer, Boolean> {
- protected int randIndex() {
- return BenchmarkRunner.randIndex();
- }
-
- protected boolean randBool() {
- return BenchmarkRunner.randBool();
- }
-
- /**
- * Adds a random data dependency so that the JVM does not eliminate the benchmarked work as dead code.
- */
- protected boolean depends(Vector v) {
- return randIndex() < v.getNumNondefaultElements();
- }
- }
-
- public abstract static class BenchmarkFnD implements Function<Integer, Double> {
- protected int randIndex() {
- return BenchmarkRunner.randIndex();
- }
-
- protected boolean randBool() {
- return BenchmarkRunner.randBool();
- }
-
- /**
- * Adds a random data dependency so that the JVM does not eliminate the benchmarked work as dead code.
- */
- protected boolean depends(Vector v) {
- return randIndex() < v.getNumNondefaultElements();
- }
- }
-
- private static int randIndex() {
- return R.nextInt(BUCKET_SIZE);
- }
-
- private static boolean randBool() {
- return R.nextBoolean();
- }
-
- public TimingStatistics benchmark(BenchmarkFn function) {
- TimingStatistics stats = new TimingStatistics();
- boolean result = false;
- while (true) {
- int i = R.nextInt(BUCKET_SIZE);
- TimingStatistics.Call call = stats.newCall(leadTimeUsec);
- result = result ^ function.apply(i);
- if (call.end(maxTimeUsec)) {
- break;
- }
- }
- return stats;
- }
-
- public TimingStatistics benchmarkD(BenchmarkFnD function) {
- TimingStatistics stats = new TimingStatistics();
- double result = 0;
- while (true) {
- int i = R.nextInt(BUCKET_SIZE);
- TimingStatistics.Call call = stats.newCall(leadTimeUsec);
- result += function.apply(i);
- if (call.end(maxTimeUsec)) {
- break;
- }
- }
- // Print the result so that HotSpot cannot eliminate the computation as dead code.
- System.err.println("Result = " + result);
- return stats;
- }
-}
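
A minimal sketch of driving the runner (vectorPool here is a hypothetical array of
pre-built Mahout Vectors; the lead/max times match those used by VectorBenchmarks):

    BenchmarkRunner runner = new BenchmarkRunner(15000, 5000); // 15 s warm-up, 5 s cap
    TimingStatistics stats = runner.benchmark(new BenchmarkRunner.BenchmarkFn() {
      @Override
      public Boolean apply(Integer i) {
        Vector v = vectorPool[i % vectorPool.length].plus(1.0); // the work under test
        return depends(v); // data dependency keeps the JIT from dropping the work
      }
    });
    System.out.println(stats);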

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/benchmark/CloneBenchmark.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/benchmark/CloneBenchmark.java b/integration/src/main/java/org/apache/mahout/benchmark/CloneBenchmark.java
deleted file mode 100644
index 5e6ab4d..0000000
--- a/integration/src/main/java/org/apache/mahout/benchmark/CloneBenchmark.java
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.benchmark;
-
-import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_VECTOR;
-import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_SPARSE_VECTOR;
-import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_SPARSE_VECTOR;
-
-import org.apache.mahout.benchmark.BenchmarkRunner.BenchmarkFn;
-
-public class CloneBenchmark {
- public static final String CLONE = "Clone";
- private final VectorBenchmarks mark;
-
- public CloneBenchmark(VectorBenchmarks mark) {
- this.mark = mark;
- }
-
- public void benchmark() {
- mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
- @Override
- public Boolean apply(Integer i) {
- mark.vectors[0][mark.vIndex(i)] = mark.vectors[0][mark.vIndex(i)].clone();
-
- return depends(mark.vectors[0][mark.vIndex(i)]);
- }
- }), CLONE, DENSE_VECTOR);
-
- mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
- @Override
- public Boolean apply(Integer i) {
- mark.vectors[1][mark.vIndex(i)] = mark.vectors[1][mark.vIndex(i)].clone();
-
- return depends(mark.vectors[1][mark.vIndex(i)]);
- }
- }), CLONE, RAND_SPARSE_VECTOR);
-
- mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
- @Override
- public Boolean apply(Integer i) {
- mark.vectors[2][mark.vIndex(i)] = mark.vectors[2][mark.vIndex(i)].clone();
-
- return depends(mark.vectors[2][mark.vIndex(i)]);
- }
- }), CLONE, SEQ_SPARSE_VECTOR);
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/benchmark/ClosestCentroidBenchmark.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/benchmark/ClosestCentroidBenchmark.java b/integration/src/main/java/org/apache/mahout/benchmark/ClosestCentroidBenchmark.java
deleted file mode 100644
index b1c2ded..0000000
--- a/integration/src/main/java/org/apache/mahout/benchmark/ClosestCentroidBenchmark.java
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.benchmark;
-
-import java.io.IOException;
-import java.util.Random;
-
-import org.apache.mahout.common.RandomUtils;
-import org.apache.mahout.common.TimingStatistics;
-import org.apache.mahout.common.distance.DistanceMeasure;
-import org.apache.mahout.math.SparseMatrix;
-import org.apache.mahout.math.Vector;
-
-public class ClosestCentroidBenchmark {
- private final VectorBenchmarks mark;
-
- public ClosestCentroidBenchmark(VectorBenchmarks mark) {
- this.mark = mark;
- }
-
- public void benchmark(DistanceMeasure measure) throws IOException {
- SparseMatrix clusterDistances = new SparseMatrix(mark.numClusters, mark.numClusters);
- for (int i = 0; i < mark.numClusters; i++) {
- for (int j = 0; j < mark.numClusters; j++) {
- double distance = Double.POSITIVE_INFINITY;
- if (i != j) {
- distance = measure.distance(mark.clusters[i], mark.clusters[j]);
- }
- clusterDistances.setQuick(i, j, distance);
- }
- }
-
- long distanceCalculations = 0;
- TimingStatistics stats = new TimingStatistics();
- for (int l = 0; l < mark.loop; l++) {
- TimingStatistics.Call call = stats.newCall(mark.leadTimeUsec);
- for (int i = 0; i < mark.numVectors; i++) {
- Vector vector = mark.vectors[1][mark.vIndex(i)];
- double minDistance = Double.MAX_VALUE;
- for (int k = 0; k < mark.numClusters; k++) {
- double distance = measure.distance(vector, mark.clusters[k]);
- distanceCalculations++;
- if (distance < minDistance) {
- minDistance = distance;
- }
- }
- }
- if (call.end(mark.maxTimeUsec)) {
- break;
- }
- }
- mark.printStats(stats, measure.getClass().getName(), "Closest C w/o Elkan's trick", "distanceCalculations = "
- + distanceCalculations);
-
- distanceCalculations = 0;
- stats = new TimingStatistics();
- Random rand = RandomUtils.getRandom();
- for (int l = 0; l < mark.loop; l++) {
- TimingStatistics.Call call = stats.newCall(mark.leadTimeUsec);
- for (int i = 0; i < mark.numVectors; i++) {
- Vector vector = mark.vectors[1][mark.vIndex(i)];
- int closestCentroid = rand.nextInt(mark.numClusters);
- double dist = measure.distance(vector, mark.clusters[closestCentroid]);
- distanceCalculations++;
- for (int k = 0; k < mark.numClusters; k++) {
- if (closestCentroid != k) {
- double centroidDist = clusterDistances.getQuick(k, closestCentroid);
- if (centroidDist < 2 * dist) {
- double newDist = measure.distance(vector, mark.clusters[k]);
- distanceCalculations++;
- if (newDist < dist) {
- dist = newDist;
- closestCentroid = k;
- }
- }
- }
- }
- }
- if (call.end(mark.maxTimeUsec)) {
- break;
- }
- }
- mark.printStats(stats, measure.getClass().getName(), "Closest C w/ Elkan's trick", "distanceCalculations = "
- + distanceCalculations);
- }
-}
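
The pruning in the second loop is the basic Elkan bound from the triangle
inequality: for a point x whose current closest centroid is c, any centroid k with
d(c, k) >= 2 * d(x, c) satisfies d(x, k) >= d(c, k) - d(x, c) >= d(x, c), so k
cannot be closer and its distance is never computed. That is exactly the guard
above: measure.distance(vector, clusters[k]) is only called when
clusterDistances.getQuick(k, closestCentroid) < 2 * dist.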

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/benchmark/DistanceBenchmark.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/benchmark/DistanceBenchmark.java b/integration/src/main/java/org/apache/mahout/benchmark/DistanceBenchmark.java
deleted file mode 100644
index 25d0ad7..0000000
--- a/integration/src/main/java/org/apache/mahout/benchmark/DistanceBenchmark.java
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.benchmark;
-
-import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_FN_RAND;
-import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_FN_SEQ;
-import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_VECTOR;
-import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_FN_DENSE;
-import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_FN_SEQ;
-import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_SPARSE_VECTOR;
-import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_FN_DENSE;
-import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_FN_RAND;
-import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_SPARSE_VECTOR;
-
-import org.apache.mahout.benchmark.BenchmarkRunner.BenchmarkFnD;
-import org.apache.mahout.common.distance.DistanceMeasure;
-
-public class DistanceBenchmark {
- private final VectorBenchmarks mark;
-
- public DistanceBenchmark(VectorBenchmarks mark) {
- this.mark = mark;
- }
-
- public void benchmark(final DistanceMeasure measure) {
- mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
- @Override
- public Double apply(Integer i) {
- return measure.distance(mark.vectors[0][mark.vIndex(i)], mark.vectors[0][mark.vIndex(randIndex())]);
- }
- }), measure.getClass().getName(), DENSE_VECTOR);
-
- mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
- @Override
- public Double apply(Integer i) {
- return measure.distance(mark.vectors[1][mark.vIndex(i)], mark.vectors[1][mark.vIndex(randIndex())]);
- }
- }), measure.getClass().getName(), RAND_SPARSE_VECTOR);
-
- mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
- @Override
- public Double apply(Integer i) {
- return measure.distance(mark.vectors[2][mark.vIndex(i)], mark.vectors[2][mark.vIndex(randIndex())]);
- }
- }), measure.getClass().getName(), SEQ_SPARSE_VECTOR);
-
- mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
- @Override
- public Double apply(Integer i) {
- return measure.distance(mark.vectors[0][mark.vIndex(i)], mark.vectors[1][mark.vIndex(randIndex())]);
- }
- }), measure.getClass().getName(), DENSE_FN_RAND);
-
- mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
- @Override
- public Double apply(Integer i) {
- return measure.distance(mark.vectors[0][mark.vIndex(i)], mark.vectors[2][mark.vIndex(randIndex())]);
- }
- }), measure.getClass().getName(), DENSE_FN_SEQ);
-
- mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
- @Override
- public Double apply(Integer i) {
- return measure.distance(mark.vectors[1][mark.vIndex(i)], mark.vectors[0][mark.vIndex(randIndex())]);
- }
- }), measure.getClass().getName(), RAND_FN_DENSE);
-
- mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
- @Override
- public Double apply(Integer i) {
- return measure.distance(mark.vectors[1][mark.vIndex(i)], mark.vectors[2][mark.vIndex(randIndex())]);
- }
- }), measure.getClass().getName(), RAND_FN_SEQ);
-
- mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
- @Override
- public Double apply(Integer i) {
- return measure.distance(mark.vectors[2][mark.vIndex(i)], mark.vectors[0][mark.vIndex(randIndex())]);
- }
- }), measure.getClass().getName(), SEQ_FN_DENSE);
-
- mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
- @Override
- public Double apply(Integer i) {
- return measure.distance(mark.vectors[2][mark.vIndex(i)], mark.vectors[1][mark.vIndex(randIndex())]);
- }
- }), measure.getClass().getName(), SEQ_FN_RAND);
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/benchmark/DotBenchmark.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/benchmark/DotBenchmark.java b/integration/src/main/java/org/apache/mahout/benchmark/DotBenchmark.java
deleted file mode 100644
index fc7f911..0000000
--- a/integration/src/main/java/org/apache/mahout/benchmark/DotBenchmark.java
+++ /dev/null
@@ -1,191 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.benchmark;
-
-import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_FN_RAND;
-import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_FN_SEQ;
-import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_VECTOR;
-import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_FN_DENSE;
-import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_FN_SEQ;
-import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_SPARSE_VECTOR;
-import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_FN_DENSE;
-import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_FN_RAND;
-import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_SPARSE_VECTOR;
-
-import org.apache.mahout.benchmark.BenchmarkRunner.BenchmarkFn;
-import org.apache.mahout.benchmark.BenchmarkRunner.BenchmarkFnD;
-
-public class DotBenchmark {
- private static final String DOT_PRODUCT = "DotProduct";
- private static final String NORM1 = "Norm1";
- private static final String NORM2 = "Norm2";
- private static final String LOG_NORMALIZE = "LogNormalize";
- private final VectorBenchmarks mark;
-
- public DotBenchmark(VectorBenchmarks mark) {
- this.mark = mark;
- }
-
- public void benchmark() {
- benchmarkDot();
- benchmarkNorm1();
- benchmarkNorm2();
- benchmarkLogNormalize();
- }
-
- private void benchmarkLogNormalize() {
- mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
- @Override
- public Boolean apply(Integer i) {
- return depends(mark.vectors[0][mark.vIndex(i)].logNormalize());
- }
- }), LOG_NORMALIZE, DENSE_VECTOR);
-
- mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
- @Override
- public Boolean apply(Integer i) {
- return depends(mark.vectors[1][mark.vIndex(i)].logNormalize());
- }
- }), LOG_NORMALIZE, RAND_SPARSE_VECTOR);
-
- mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
- @Override
- public Boolean apply(Integer i) {
- return depends(mark.vectors[2][mark.vIndex(i)].logNormalize());
- }
- }), LOG_NORMALIZE, SEQ_SPARSE_VECTOR);
- }
-
- private void benchmarkNorm1() {
- mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
- @Override
- public Double apply(Integer i) {
- return mark.vectors[0][mark.vIndex(i)].norm(1);
- }
- }), NORM1, DENSE_VECTOR);
-
- mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
- @Override
- public Double apply(Integer i) {
- return mark.vectors[1][mark.vIndex(i)].norm(1);
- }
- }), NORM1, RAND_SPARSE_VECTOR);
-
- mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
- @Override
- public Double apply(Integer i) {
- return mark.vectors[2][mark.vIndex(i)].norm(1);
- }
- }), NORM1, SEQ_SPARSE_VECTOR);
- }
-
- private void benchmarkNorm2() {
- mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
- @Override
- public Double apply(Integer i) {
- return mark.vectors[0][mark.vIndex(i)].norm(2);
- }
- }), NORM2, DENSE_VECTOR);
-
- mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
- @Override
- public Double apply(Integer i) {
- return mark.vectors[1][mark.vIndex(i)].norm(2);
- }
- }), NORM2, RAND_SPARSE_VECTOR);
-
- mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
- @Override
- public Double apply(Integer i) {
- return mark.vectors[2][mark.vIndex(i)].norm(2);
- }
- }), NORM2, SEQ_SPARSE_VECTOR);
- }
-
- private void benchmarkDot() {
- mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
- @Override
- public Double apply(Integer i) {
- return mark.vectors[0][mark.vIndex(i)].dot(mark.vectors[0][mark.vIndex(randIndex())]);
- }
- }), DOT_PRODUCT, DENSE_VECTOR);
-
- mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
- @Override
- public Double apply(Integer i) {
- return mark.vectors[1][mark.vIndex(i)].dot(mark.vectors[1][mark.vIndex(randIndex())]);
- }
- }), DOT_PRODUCT, RAND_SPARSE_VECTOR);
-
- mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
- @Override
- public Double apply(Integer i) {
- return mark.vectors[2][mark.vIndex(i)].dot(mark.vectors[2][mark.vIndex(randIndex())]);
- }
- }), DOT_PRODUCT, SEQ_SPARSE_VECTOR);
-
- mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
- @Override
- public Double apply(Integer i) {
- return mark.vectors[0][mark.vIndex(i)].dot(mark.vectors[1][mark.vIndex(randIndex())]);
- }
- }), DOT_PRODUCT, DENSE_FN_RAND);
-
- mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
- @Override
- public Double apply(Integer i) {
- return mark.vectors[0][mark.vIndex(i)].dot(mark.vectors[2][mark.vIndex(randIndex())]);
- }
- }), DOT_PRODUCT, DENSE_FN_SEQ);
-
- mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
- @Override
- public Double apply(Integer i) {
- return mark.vectors[1][mark.vIndex(i)].dot(mark.vectors[0][mark.vIndex(randIndex())]);
- }
- }), DOT_PRODUCT, RAND_FN_DENSE);
-
- mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
- @Override
- public Double apply(Integer i) {
- return mark.vectors[1][mark.vIndex(i)].dot(mark.vectors[2][mark.vIndex(randIndex())]);
- }
- }), DOT_PRODUCT, RAND_FN_SEQ);
-
- mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
- @Override
- public Double apply(Integer i) {
- return mark.vectors[2][mark.vIndex(i)].dot(mark.vectors[0][mark.vIndex(randIndex())]);
- }
- }), DOT_PRODUCT, SEQ_FN_DENSE);
-
- mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
- @Override
- public Double apply(Integer i) {
- return mark.vectors[2][mark.vIndex(i)].dot(mark.vectors[1][mark.vIndex(randIndex())]);
- }
- }), DOT_PRODUCT, SEQ_FN_RAND);
- }
-
- public static void main(String[] args) {
- VectorBenchmarks mark = new VectorBenchmarks(1000000, 100, 1000, 10, 1);
- mark.createData();
- new DotBenchmark(mark).benchmarkNorm2();
- System.out.println(mark);
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/benchmark/MinusBenchmark.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/benchmark/MinusBenchmark.java b/integration/src/main/java/org/apache/mahout/benchmark/MinusBenchmark.java
deleted file mode 100644
index 82fb693..0000000
--- a/integration/src/main/java/org/apache/mahout/benchmark/MinusBenchmark.java
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.benchmark;
-
-import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_FN_RAND;
-import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_FN_SEQ;
-import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_VECTOR;
-import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_FN_DENSE;
-import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_FN_SEQ;
-import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_SPARSE_VECTOR;
-import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_FN_DENSE;
-import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_FN_RAND;
-import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_SPARSE_VECTOR;
-
-import org.apache.mahout.benchmark.BenchmarkRunner.BenchmarkFn;
-import org.apache.mahout.math.Vector;
-
-public class MinusBenchmark {
-
- private static final String MINUS = "Minus";
- private final VectorBenchmarks mark;
-
- public MinusBenchmark(VectorBenchmarks mark) {
- this.mark = mark;
- }
-
- public void benchmark() {
- mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
- @Override
- public Boolean apply(Integer i) {
- Vector v = mark.vectors[0][mark.vIndex(i)].minus(mark.vectors[0][mark.vIndex(randIndex())]);
- return depends(v);
- }
- }), MINUS, DENSE_VECTOR);
-
- mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
- @Override
- public Boolean apply(Integer i) {
- Vector v = mark.vectors[1][mark.vIndex(i)].minus(mark.vectors[1][mark.vIndex(randIndex())]);
- return depends(v);
- }
- }), MINUS, RAND_SPARSE_VECTOR);
-
- mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
- @Override
- public Boolean apply(Integer i) {
- Vector v = mark.vectors[2][mark.vIndex(i)].minus(mark.vectors[2][mark.vIndex(randIndex())]);
- return depends(v);
- }
- }), MINUS, SEQ_SPARSE_VECTOR);
-
- mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
- @Override
- public Boolean apply(Integer i) {
- Vector v = mark.vectors[0][mark.vIndex(i)].minus(mark.vectors[1][mark.vIndex(randIndex())]);
- return depends(v);
- }
- }), MINUS, DENSE_FN_RAND);
-
- mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
- @Override
- public Boolean apply(Integer i) {
- Vector v = mark.vectors[0][mark.vIndex(i)].minus(mark.vectors[2][mark.vIndex(randIndex())]);
- return depends(v);
- }
- }), MINUS, DENSE_FN_SEQ);
-
- mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
- @Override
- public Boolean apply(Integer i) {
- Vector v = mark.vectors[1][mark.vIndex(i)].minus(mark.vectors[0][mark.vIndex(randIndex())]);
- return depends(v);
- }
- }), MINUS, RAND_FN_DENSE);
-
- mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
- @Override
- public Boolean apply(Integer i) {
- Vector v = mark.vectors[1][mark.vIndex(i)].minus(mark.vectors[2][mark.vIndex(randIndex())]);
- return depends(v);
- }
- }), MINUS, RAND_FN_SEQ);
-
- mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
- @Override
- public Boolean apply(Integer i) {
- Vector v = mark.vectors[2][mark.vIndex(i)].minus(mark.vectors[0][mark.vIndex(randIndex())]);
- return depends(v);
- }
- }), MINUS, SEQ_FN_DENSE);
-
- mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
- @Override
- public Boolean apply(Integer i) {
- Vector v = mark.vectors[2][mark.vIndex(i)].minus(mark.vectors[1][mark.vIndex(randIndex())]);
- return depends(v);
- }
- }), MINUS, SEQ_FN_RAND);
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/benchmark/PlusBenchmark.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/benchmark/PlusBenchmark.java b/integration/src/main/java/org/apache/mahout/benchmark/PlusBenchmark.java
deleted file mode 100644
index bd76e94..0000000
--- a/integration/src/main/java/org/apache/mahout/benchmark/PlusBenchmark.java
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.benchmark;
-
-import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_FN_RAND;
-import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_FN_SEQ;
-import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_VECTOR;
-import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_FN_DENSE;
-import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_FN_SEQ;
-import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_SPARSE_VECTOR;
-import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_FN_DENSE;
-import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_FN_RAND;
-import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_SPARSE_VECTOR;
-
-import org.apache.mahout.benchmark.BenchmarkRunner.BenchmarkFn;
-import org.apache.mahout.math.Vector;
-
-public class PlusBenchmark {
-
- private static final String PLUS = "Plus";
- private final VectorBenchmarks mark;
-
- public PlusBenchmark(VectorBenchmarks mark) {
- this.mark = mark;
- }
-
- public void benchmark() {
- mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
- @Override
- public Boolean apply(Integer i) {
- Vector v = mark.vectors[0][mark.vIndex(i)].plus(mark.vectors[0][mark.vIndex(randIndex())]);
- return depends(v);
- }
- }), PLUS, DENSE_VECTOR);
-
- mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
- @Override
- public Boolean apply(Integer i) {
- Vector v = mark.vectors[1][mark.vIndex(i)].plus(mark.vectors[1][mark.vIndex(randIndex())]);
- return depends(v);
- }
- }), PLUS, RAND_SPARSE_VECTOR);
-
- mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
- @Override
- public Boolean apply(Integer i) {
- Vector v = mark.vectors[2][mark.vIndex(i)].plus(mark.vectors[2][mark.vIndex(randIndex())]);
- return depends(v);
- }
- }), PLUS, SEQ_SPARSE_VECTOR);
-
- mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
- @Override
- public Boolean apply(Integer i) {
- Vector v = mark.vectors[0][mark.vIndex(i)].plus(mark.vectors[1][mark.vIndex(randIndex())]);
- return depends(v);
- }
- }), PLUS, DENSE_FN_RAND);
-
- mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
- @Override
- public Boolean apply(Integer i) {
- Vector v = mark.vectors[0][mark.vIndex(i)].plus(mark.vectors[2][mark.vIndex(randIndex())]);
- return depends(v);
- }
- }), PLUS, DENSE_FN_SEQ);
-
- mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
- @Override
- public Boolean apply(Integer i) {
- Vector v = mark.vectors[1][mark.vIndex(i)].plus(mark.vectors[0][mark.vIndex(randIndex())]);
- return depends(v);
- }
- }), PLUS, RAND_FN_DENSE);
-
- mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
- @Override
- public Boolean apply(Integer i) {
- Vector v = mark.vectors[1][mark.vIndex(i)].plus(mark.vectors[2][mark.vIndex(randIndex())]);
- return depends(v);
- }
- }), PLUS, RAND_FN_SEQ);
-
- mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
- @Override
- public Boolean apply(Integer i) {
- Vector v = mark.vectors[2][mark.vIndex(i)].plus(mark.vectors[0][mark.vIndex(randIndex())]);
- return depends(v);
- }
- }), PLUS, SEQ_FN_DENSE);
-
- mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
- @Override
- public Boolean apply(Integer i) {
- Vector v = mark.vectors[2][mark.vIndex(i)].plus(mark.vectors[1][mark.vIndex(randIndex())]);
- return depends(v);
- }
- }), PLUS, SEQ_FN_RAND);
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/benchmark/SerializationBenchmark.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/benchmark/SerializationBenchmark.java b/integration/src/main/java/org/apache/mahout/benchmark/SerializationBenchmark.java
deleted file mode 100644
index cd403c2..0000000
--- a/integration/src/main/java/org/apache/mahout/benchmark/SerializationBenchmark.java
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.benchmark;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.SequenceFile;
-import org.apache.hadoop.io.Writable;
-import org.apache.mahout.common.TimingStatistics;
-import org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterator;
-import org.apache.mahout.math.VectorWritable;
-
-import java.io.IOException;
-
-import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_VECTOR;
-import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_SPARSE_VECTOR;
-import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_SPARSE_VECTOR;
-
-public class SerializationBenchmark {
- public static final String SERIALIZE = "Serialize";
- public static final String DESERIALIZE = "Deserialize";
- private final VectorBenchmarks mark;
-
- public SerializationBenchmark(VectorBenchmarks mark) {
- this.mark = mark;
- }
-
- public void benchmark() throws IOException {
- serializeBenchmark();
- deserializeBenchmark();
- }
-
- public void serializeBenchmark() throws IOException {
- Configuration conf = new Configuration();
- FileSystem fs = FileSystem.get(conf);
-
- Writable one = new IntWritable(0);
- VectorWritable vec = new VectorWritable();
- TimingStatistics stats = new TimingStatistics();
-
- try (SequenceFile.Writer writer =
- new SequenceFile.Writer(fs, conf, new Path("/tmp/dense-vector"),
- IntWritable.class, VectorWritable.class)){
- for (int i = 0; i < mark.loop; i++) {
- TimingStatistics.Call call = stats.newCall(mark.leadTimeUsec);
- vec.set(mark.vectors[0][mark.vIndex(i)]);
- writer.append(one, vec);
- if (call.end(mark.maxTimeUsec)) {
- break;
- }
- }
- }
- mark.printStats(stats, SERIALIZE, DENSE_VECTOR);
-
- stats = new TimingStatistics();
- try (SequenceFile.Writer writer =
- new SequenceFile.Writer(fs, conf,
- new Path("/tmp/randsparse-vector"), IntWritable.class, VectorWritable.class)){
- for (int i = 0; i < mark.loop; i++) {
- TimingStatistics.Call call = stats.newCall(mark.leadTimeUsec);
- vec.set(mark.vectors[1][mark.vIndex(i)]);
- writer.append(one, vec);
- if (call.end(mark.maxTimeUsec)) {
- break;
- }
- }
- }
- mark.printStats(stats, SERIALIZE, RAND_SPARSE_VECTOR);
-
- stats = new TimingStatistics();
- try (SequenceFile.Writer writer =
- new SequenceFile.Writer(fs, conf,
- new Path("/tmp/seqsparse-vector"), IntWritable.class, VectorWritable.class)) {
- for (int i = 0; i < mark.loop; i++) {
- TimingStatistics.Call call = stats.newCall(mark.leadTimeUsec);
- vec.set(mark.vectors[2][mark.vIndex(i)]);
- writer.append(one, vec);
- if (call.end(mark.maxTimeUsec)) {
- break;
- }
- }
- }
- mark.printStats(stats, SERIALIZE, SEQ_SPARSE_VECTOR);
-
- }
-
- public void deserializeBenchmark() throws IOException {
- doDeserializeBenchmark(DENSE_VECTOR, "/tmp/dense-vector");
- doDeserializeBenchmark(RAND_SPARSE_VECTOR, "/tmp/randsparse-vector");
- doDeserializeBenchmark(SEQ_SPARSE_VECTOR, "/tmp/seqsparse-vector");
- }
-
- private void doDeserializeBenchmark(String name, String pathString) throws IOException {
- TimingStatistics stats = new TimingStatistics();
- TimingStatistics.Call call = stats.newCall(mark.leadTimeUsec);
- SequenceFileValueIterator<Writable> iterator = new SequenceFileValueIterator<>(new Path(pathString), true,
- new Configuration());
- while (iterator.hasNext()) {
- iterator.next();
- call.end();
- call = stats.newCall(mark.leadTimeUsec);
- }
- iterator.close();
- mark.printStats(stats, DESERIALIZE, name);
- }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/benchmark/TimesBenchmark.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/benchmark/TimesBenchmark.java b/integration/src/main/java/org/apache/mahout/benchmark/TimesBenchmark.java
deleted file mode 100644
index bf81228..0000000
--- a/integration/src/main/java/org/apache/mahout/benchmark/TimesBenchmark.java
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.benchmark;
-
-import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_FN_RAND;
-import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_FN_SEQ;
-import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_VECTOR;
-import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_FN_DENSE;
-import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_FN_SEQ;
-import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_SPARSE_VECTOR;
-import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_FN_DENSE;
-import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_FN_RAND;
-import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_SPARSE_VECTOR;
-
-import org.apache.mahout.benchmark.BenchmarkRunner.BenchmarkFn;
-import org.apache.mahout.math.Vector;
-
-public class TimesBenchmark {
-
- private static final String TIMES = "Times";
- private final VectorBenchmarks mark;
-
- public TimesBenchmark(VectorBenchmarks mark) {
- this.mark = mark;
- }
-
- public void benchmark() {
- mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
- @Override
- public Boolean apply(Integer i) {
- Vector v = mark.vectors[0][mark.vIndex(i)].times(mark.vectors[0][mark.vIndex(randIndex())]);
- return depends(v);
- }
- }), TIMES, DENSE_VECTOR);
-
- mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
- @Override
- public Boolean apply(Integer i) {
- Vector v = mark.vectors[1][mark.vIndex(i)].times(mark.vectors[1][mark.vIndex(randIndex())]);
- return depends(v);
- }
- }), TIMES, RAND_SPARSE_VECTOR);
-
- mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
- @Override
- public Boolean apply(Integer i) {
- Vector v = mark.vectors[2][mark.vIndex(i)].times(mark.vectors[2][mark.vIndex(randIndex())]);
- return depends(v);
- }
- }), TIMES, SEQ_SPARSE_VECTOR);
-
- mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
- @Override
- public Boolean apply(Integer i) {
- Vector v = mark.vectors[0][mark.vIndex(i)].times(mark.vectors[1][mark.vIndex(randIndex())]);
- return depends(v);
- }
- }), TIMES, DENSE_FN_RAND);
-
- mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
- @Override
- public Boolean apply(Integer i) {
- Vector v = mark.vectors[0][mark.vIndex(i)].times(mark.vectors[2][mark.vIndex(randIndex())]);
- return depends(v);
- }
- }), TIMES, DENSE_FN_SEQ);
-
- mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
- @Override
- public Boolean apply(Integer i) {
- Vector v = mark.vectors[1][mark.vIndex(i)].times(mark.vectors[0][mark.vIndex(randIndex())]);
- return depends(v);
- }
- }), TIMES, RAND_FN_DENSE);
-
- mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
- @Override
- public Boolean apply(Integer i) {
- Vector v = mark.vectors[1][mark.vIndex(i)].times(mark.vectors[2][mark.vIndex(randIndex())]);
- return depends(v);
- }
- }), TIMES, RAND_FN_SEQ);
-
- mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
- @Override
- public Boolean apply(Integer i) {
- Vector v = mark.vectors[2][mark.vIndex(i)].times(mark.vectors[0][mark.vIndex(randIndex())]);
- return depends(v);
- }
- }), TIMES, SEQ_FN_DENSE);
-
- mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
- @Override
- public Boolean apply(Integer i) {
- Vector v = mark.vectors[2][mark.vIndex(i)].times(mark.vectors[1][mark.vIndex(randIndex())]);
- return depends(v);
- }
- }), TIMES, SEQ_FN_RAND);
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/benchmark/VectorBenchmarks.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/benchmark/VectorBenchmarks.java b/integration/src/main/java/org/apache/mahout/benchmark/VectorBenchmarks.java
deleted file mode 100644
index a076322..0000000
--- a/integration/src/main/java/org/apache/mahout/benchmark/VectorBenchmarks.java
+++ /dev/null
@@ -1,497 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.benchmark;
-
-import org.apache.commons.cli2.CommandLine;
-import org.apache.commons.cli2.Group;
-import org.apache.commons.cli2.Option;
-import org.apache.commons.cli2.OptionException;
-import org.apache.commons.cli2.builder.ArgumentBuilder;
-import org.apache.commons.cli2.builder.DefaultOptionBuilder;
-import org.apache.commons.cli2.builder.GroupBuilder;
-import org.apache.commons.cli2.commandline.Parser;
-import org.apache.commons.lang3.StringUtils;
-import org.apache.mahout.benchmark.BenchmarkRunner.BenchmarkFn;
-import org.apache.mahout.common.CommandLineUtil;
-import org.apache.mahout.common.RandomUtils;
-import org.apache.mahout.common.TimingStatistics;
-import org.apache.mahout.common.commandline.DefaultOptionCreator;
-import org.apache.mahout.common.distance.ChebyshevDistanceMeasure;
-import org.apache.mahout.common.distance.CosineDistanceMeasure;
-import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
-import org.apache.mahout.common.distance.ManhattanDistanceMeasure;
-import org.apache.mahout.common.distance.MinkowskiDistanceMeasure;
-import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
-import org.apache.mahout.common.distance.TanimotoDistanceMeasure;
-import org.apache.mahout.math.DenseVector;
-import org.apache.mahout.math.RandomAccessSparseVector;
-import org.apache.mahout.math.SequentialAccessSparseVector;
-import org.apache.mahout.math.Vector;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.IOException;
-import java.text.DecimalFormat;
-import java.util.ArrayList;
-import java.util.BitSet;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.Map.Entry;
-import java.util.Random;
-import java.util.concurrent.TimeUnit;
-import java.util.regex.Pattern;
-
-public class VectorBenchmarks {
- private static final int MAX_TIME_MS = 5000;
- private static final int LEAD_TIME_MS = 15000;
- public static final String CLUSTERS = "Clusters";
- public static final String CREATE_INCREMENTALLY = "Create (incrementally)";
- public static final String CREATE_COPY = "Create (copy)";
-
- public static final String DENSE_FN_SEQ = "Dense.fn(Seq)";
- public static final String RAND_FN_DENSE = "Rand.fn(Dense)";
- public static final String SEQ_FN_RAND = "Seq.fn(Rand)";
- public static final String RAND_FN_SEQ = "Rand.fn(Seq)";
- public static final String SEQ_FN_DENSE = "Seq.fn(Dense)";
- public static final String DENSE_FN_RAND = "Dense.fn(Rand)";
- public static final String SEQ_SPARSE_VECTOR = "SeqSparseVector";
- public static final String RAND_SPARSE_VECTOR = "RandSparseVector";
- public static final String DENSE_VECTOR = "DenseVector";
-
- private static final Logger log = LoggerFactory.getLogger(VectorBenchmarks.class);
- private static final Pattern TAB_NEWLINE_PATTERN = Pattern.compile("[\n\t]");
- private static final String[] EMPTY = new String[0];
- private static final DecimalFormat DF = new DecimalFormat("#.##");
-
- /* package private */
- final Vector[][] vectors;
- final Vector[] clusters;
- final int cardinality;
- final int numNonZeros;
- final int numVectors;
- final int numClusters;
- final int loop = Integer.MAX_VALUE;
- final int opsPerUnit;
- final long maxTimeUsec;
- final long leadTimeUsec;
-
- private final List<Vector> randomVectors = new ArrayList<>();
- private final List<int[]> randomVectorIndices = new ArrayList<>();
- private final List<double[]> randomVectorValues = new ArrayList<>();
- private final Map<String, Integer> implType = new HashMap<>();
- private final Map<String, List<String[]>> statsMap = new HashMap<>();
- private final BenchmarkRunner runner;
- private final Random r = RandomUtils.getRandom();
-
- public VectorBenchmarks(int cardinality, int numNonZeros, int numVectors, int numClusters,
- int opsPerUnit) {
- runner = new BenchmarkRunner(LEAD_TIME_MS, MAX_TIME_MS);
- maxTimeUsec = TimeUnit.MILLISECONDS.toNanos(MAX_TIME_MS);
- leadTimeUsec = TimeUnit.MILLISECONDS.toNanos(LEAD_TIME_MS);
-
- this.cardinality = cardinality;
- this.numNonZeros = numNonZeros;
- this.numVectors = numVectors;
- this.numClusters = numClusters;
- this.opsPerUnit = opsPerUnit;
-
- setUpVectors(cardinality, numNonZeros, numVectors);
-
- vectors = new Vector[3][numVectors];
- clusters = new Vector[numClusters];
- }
-
- private void setUpVectors(int cardinality, int numNonZeros, int numVectors) {
- for (int i = 0; i < numVectors; i++) {
- Vector v = new SequentialAccessSparseVector(cardinality, numNonZeros); // sparsity!
- BitSet featureSpace = new BitSet(cardinality);
- int[] indexes = new int[numNonZeros];
- double[] values = new double[numNonZeros];
- int j = 0;
- while (j < numNonZeros) {
- double value = r.nextGaussian();
- int index = r.nextInt(cardinality);
- if (!featureSpace.get(index) && value != 0) {
- featureSpace.set(index);
- indexes[j] = index;
- values[j++] = value;
- v.set(index, value);
- }
- }
- randomVectorIndices.add(indexes);
- randomVectorValues.add(values);
- randomVectors.add(v);
- }
- }
-
- void printStats(TimingStatistics stats, String benchmarkName, String implName, String content) {
- printStats(stats, benchmarkName, implName, content, 1);
- }
-
- void printStats(TimingStatistics stats, String benchmarkName, String implName) {
- printStats(stats, benchmarkName, implName, "", 1);
- }
-
- private void printStats(TimingStatistics stats, String benchmarkName, String implName,
- String content, int multiplier) {
- float speed = multiplier * stats.getNCalls() * (numNonZeros * 1000.0f * 12 / stats.getSumTime());
- float opsPerSec = stats.getNCalls() * 1000000000.0f / stats.getSumTime();
- log.info("{} {} \n{} {} \nOps = {} Units/sec\nIOps = {} MBytes/sec", benchmarkName,
- implName, content, stats.toString(), DF.format(opsPerSec), DF.format(speed));
-
- if (!implType.containsKey(implName)) {
- implType.put(implName, implType.size());
- }
- int implId = implType.get(implName);
- if (!statsMap.containsKey(benchmarkName)) {
- statsMap.put(benchmarkName, new ArrayList<String[]>());
- }
- List<String[]> implStats = statsMap.get(benchmarkName);
- while (implStats.size() < implId + 1) {
- implStats.add(EMPTY);
- }
- implStats.set(
- implId,
- TAB_NEWLINE_PATTERN.split(stats + "\tSpeed = " + DF.format(opsPerSec) + " /sec\tRate = "
- + DF.format(speed) + " MB/s"));
- }
-
- public void createData() {
- for (int i = 0; i < Math.max(numVectors, numClusters); ++i) {
- vectors[0][vIndex(i)] = new DenseVector(randomVectors.get(vIndex(i)));
- vectors[1][vIndex(i)] = new RandomAccessSparseVector(randomVectors.get(vIndex(i)));
- vectors[2][vIndex(i)] = new SequentialAccessSparseVector(randomVectors.get(vIndex(i)));
- if (numClusters > 0) {
- clusters[cIndex(i)] = new RandomAccessSparseVector(randomVectors.get(vIndex(i)));
- }
- }
- }
-
- public void createBenchmark() {
- printStats(runner.benchmark(new BenchmarkFn() {
- @Override
- public Boolean apply(Integer i) {
- vectors[0][vIndex(i)] = new DenseVector(randomVectors.get(vIndex(i)));
- return depends(vectors[0][vIndex(i)]);
- }
- }), CREATE_COPY, DENSE_VECTOR);
-
- printStats(runner.benchmark(new BenchmarkFn() {
- @Override
- public Boolean apply(Integer i) {
- vectors[1][vIndex(i)] = new RandomAccessSparseVector(randomVectors.get(vIndex(i)));
- return depends(vectors[1][vIndex(i)]);
- }
- }), CREATE_COPY, RAND_SPARSE_VECTOR);
-
- printStats(runner.benchmark(new BenchmarkFn() {
- @Override
- public Boolean apply(Integer i) {
- vectors[2][vIndex(i)] = new SequentialAccessSparseVector(randomVectors.get(vIndex(i)));
- return depends(vectors[2][vIndex(i)]);
- }
- }), CREATE_COPY, SEQ_SPARSE_VECTOR);
-
- if (numClusters > 0) {
- printStats(runner.benchmark(new BenchmarkFn() {
- @Override
- public Boolean apply(Integer i) {
- clusters[cIndex(i)] = new RandomAccessSparseVector(randomVectors.get(vIndex(i)));
- return depends(clusters[cIndex(i)]);
- }
- }), CREATE_COPY, CLUSTERS);
- }
- }
-
- private boolean buildVectorIncrementally(TimingStatistics stats, int randomIndex, Vector v, boolean useSetQuick) {
- int[] indexes = randomVectorIndices.get(randomIndex);
- double[] values = randomVectorValues.get(randomIndex);
- List<Integer> randomOrder = new ArrayList<>();
- for (int i = 0; i < indexes.length; i++) {
- randomOrder.add(i);
- }
- Collections.shuffle(randomOrder);
- int[] permutation = new int[randomOrder.size()];
- for (int i = 0; i < randomOrder.size(); i++) {
- permutation[i] = randomOrder.get(i);
- }
-
- TimingStatistics.Call call = stats.newCall(leadTimeUsec);
- if (useSetQuick) {
- for (int i : permutation) {
- v.setQuick(indexes[i], values[i]);
- }
- } else {
- for (int i : permutation) {
- v.set(indexes[i], values[i]);
- }
- }
- return call.end(maxTimeUsec);
- }
-
- public void incrementalCreateBenchmark() {
- TimingStatistics stats = new TimingStatistics();
- for (int i = 0; i < loop; i++) {
- vectors[0][vIndex(i)] = new DenseVector(cardinality);
- if (buildVectorIncrementally(stats, vIndex(i), vectors[0][vIndex(i)], false)) {
- break;
- }
- }
- printStats(stats, CREATE_INCREMENTALLY, DENSE_VECTOR);
-
- stats = new TimingStatistics();
- for (int i = 0; i < loop; i++) {
- vectors[1][vIndex(i)] = new RandomAccessSparseVector(cardinality);
- if (buildVectorIncrementally(stats, vIndex(i), vectors[1][vIndex(i)], false)) {
- break;
- }
- }
- printStats(stats, CREATE_INCREMENTALLY, RAND_SPARSE_VECTOR);
-
- stats = new TimingStatistics();
- for (int i = 0; i < loop; i++) {
- vectors[2][vIndex(i)] = new SequentialAccessSparseVector(cardinality);
- if (buildVectorIncrementally(stats, vIndex(i), vectors[2][vIndex(i)], false)) {
- break;
- }
- }
- printStats(stats, CREATE_INCREMENTALLY, SEQ_SPARSE_VECTOR);
-
- if (numClusters > 0) {
- stats = new TimingStatistics();
- for (int i = 0; i < loop; i++) {
- clusters[cIndex(i)] = new RandomAccessSparseVector(cardinality);
- if (buildVectorIncrementally(stats, vIndex(i), clusters[cIndex(i)], false)) {
- break;
- }
- }
- printStats(stats, CREATE_INCREMENTALLY, CLUSTERS);
- }
- }
-
- public int vIndex(int i) {
- return i % numVectors;
- }
-
- public int cIndex(int i) {
- return i % numClusters;
- }
-
- public static void main(String[] args) throws IOException {
- DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
- ArgumentBuilder abuilder = new ArgumentBuilder();
- GroupBuilder gbuilder = new GroupBuilder();
-
- Option vectorSizeOpt = obuilder
- .withLongName("vectorSize")
- .withRequired(false)
- .withArgument(abuilder.withName("vs").withDefault(1000000).create())
- .withDescription("Cardinality of the vector. Default: 1000000").withShortName("vs").create();
- Option numNonZeroOpt = obuilder
- .withLongName("numNonZero")
- .withRequired(false)
- .withArgument(abuilder.withName("nz").withDefault(1000).create())
- .withDescription("Size of the vector. Default: 1000").withShortName("nz").create();
- Option numVectorsOpt = obuilder
- .withLongName("numVectors")
- .withRequired(false)
- .withArgument(abuilder.withName("nv").withDefault(25).create())
- .withDescription("Number of Vectors to create. Default: 25").withShortName("nv").create();
- Option numClustersOpt = obuilder
- .withLongName("numClusters")
- .withRequired(false)
- .withArgument(abuilder.withName("nc").withDefault(0).create())
- .withDescription("Number of clusters to create. Set to non zero to run cluster benchmark. Default: 0")
- .withShortName("nc").create();
- Option numOpsOpt = obuilder
- .withLongName("numOps")
- .withRequired(false)
- .withArgument(abuilder.withName("numOps").withDefault(10).create())
- .withDescription(
- "Number of operations to do per timer. "
- + "E.g In distance measure, the distance is calculated numOps times"
- + " and the total time is measured. Default: 10").withShortName("no").create();
-
- Option helpOpt = DefaultOptionCreator.helpOption();
-
- Group group = gbuilder.withName("Options").withOption(vectorSizeOpt).withOption(numNonZeroOpt)
- .withOption(numVectorsOpt).withOption(numOpsOpt).withOption(numClustersOpt).withOption(helpOpt).create();
-
- try {
- Parser parser = new Parser();
- parser.setGroup(group);
- CommandLine cmdLine = parser.parse(args);
-
- if (cmdLine.hasOption(helpOpt)) {
- CommandLineUtil.printHelpWithGenericOptions(group);
- return;
- }
-
- int cardinality = 1000000;
- if (cmdLine.hasOption(vectorSizeOpt)) {
- cardinality = Integer.parseInt((String) cmdLine.getValue(vectorSizeOpt));
-
- }
-
- int numClusters = 0;
- if (cmdLine.hasOption(numClustersOpt)) {
- numClusters = Integer.parseInt((String) cmdLine.getValue(numClustersOpt));
- }
-
- int numNonZero = 1000;
- if (cmdLine.hasOption(numNonZeroOpt)) {
- numNonZero = Integer.parseInt((String) cmdLine.getValue(numNonZeroOpt));
- }
-
- int numVectors = 25;
- if (cmdLine.hasOption(numVectorsOpt)) {
- numVectors = Integer.parseInt((String) cmdLine.getValue(numVectorsOpt));
-
- }
-
- int numOps = 10;
- if (cmdLine.hasOption(numOpsOpt)) {
- numOps = Integer.parseInt((String) cmdLine.getValue(numOpsOpt));
-
- }
- VectorBenchmarks mark = new VectorBenchmarks(cardinality, numNonZero, numVectors, numClusters, numOps);
- runBenchmark(mark);
-
- // log.info("\n{}", mark);
- log.info("\n{}", mark.asCsvString());
- } catch (OptionException e) {
- CommandLineUtil.printHelp(group);
- }
- }
-
- private static void runBenchmark(VectorBenchmarks mark) throws IOException {
- // Required to set up data.
- mark.createData();
-
- mark.createBenchmark();
- if (mark.cardinality < 200000) {
- // Skipped for large cardinalities: incremental creation is too slow.
- mark.incrementalCreateBenchmark();
- }
-
- new CloneBenchmark(mark).benchmark();
- new DotBenchmark(mark).benchmark();
- new PlusBenchmark(mark).benchmark();
- new MinusBenchmark(mark).benchmark();
- new TimesBenchmark(mark).benchmark();
- new SerializationBenchmark(mark).benchmark();
-
- DistanceBenchmark distanceBenchmark = new DistanceBenchmark(mark);
- distanceBenchmark.benchmark(new CosineDistanceMeasure());
- distanceBenchmark.benchmark(new SquaredEuclideanDistanceMeasure());
- distanceBenchmark.benchmark(new EuclideanDistanceMeasure());
- distanceBenchmark.benchmark(new ManhattanDistanceMeasure());
- distanceBenchmark.benchmark(new TanimotoDistanceMeasure());
- distanceBenchmark.benchmark(new ChebyshevDistanceMeasure());
- distanceBenchmark.benchmark(new MinkowskiDistanceMeasure());
-
- if (mark.numClusters > 0) {
- ClosestCentroidBenchmark centroidBenchmark = new ClosestCentroidBenchmark(mark);
- centroidBenchmark.benchmark(new CosineDistanceMeasure());
- centroidBenchmark.benchmark(new SquaredEuclideanDistanceMeasure());
- centroidBenchmark.benchmark(new EuclideanDistanceMeasure());
- centroidBenchmark.benchmark(new ManhattanDistanceMeasure());
- centroidBenchmark.benchmark(new TanimotoDistanceMeasure());
- centroidBenchmark.benchmark(new ChebyshevDistanceMeasure());
- centroidBenchmark.benchmark(new MinkowskiDistanceMeasure());
- }
- }
-
- private String asCsvString() {
- List<String> keys = new ArrayList<>(statsMap.keySet());
- Collections.sort(keys);
- Map<Integer,String> implMap = new HashMap<>();
- for (Entry<String,Integer> e : implType.entrySet()) {
- implMap.put(e.getValue(), e.getKey());
- }
-
- StringBuilder sb = new StringBuilder(1000);
- for (String benchmarkName : keys) {
- int i = 0;
- for (String[] stats : statsMap.get(benchmarkName)) {
- if (stats.length < 8) {
- continue;
- }
- sb.append(benchmarkName).append(',');
- sb.append(implMap.get(i++)).append(',');
- sb.append(stats[7].trim().split("=|/")[1].trim());
- sb.append('\n');
- }
- }
- sb.append('\n');
- return sb.toString();
- }
-
- @Override
- public String toString() {
- int pad = 24;
- StringBuilder sb = new StringBuilder(1000);
- sb.append(StringUtils.rightPad("BenchMarks", pad));
- for (int i = 0; i < implType.size(); i++) {
- for (Entry<String,Integer> e : implType.entrySet()) {
- if (e.getValue() == i) {
- sb.append(StringUtils.rightPad(e.getKey(), pad).substring(0, pad));
- break;
- }
- }
- }
- sb.append('\n');
- List<String> keys = new ArrayList<>(statsMap.keySet());
- Collections.sort(keys);
- for (String benchmarkName : keys) {
- List<String[]> implTokenizedStats = statsMap.get(benchmarkName);
- int maxStats = 0;
- for (String[] stat : implTokenizedStats) {
- maxStats = Math.max(maxStats, stat.length);
- }
-
- for (int i = 0; i < maxStats; i++) {
- boolean printedName = false;
- for (String[] stats : implTokenizedStats) {
- if (i == 0 && !printedName) {
- sb.append(StringUtils.rightPad(benchmarkName, pad));
- printedName = true;
- } else if (!printedName) {
- printedName = true;
- sb.append(StringUtils.rightPad("", pad));
- }
- if (stats.length > i) {
- sb.append(StringUtils.rightPad(stats[i], pad));
- } else {
- sb.append(StringUtils.rightPad("", pad));
- }
-
- }
- sb.append('\n');
- }
- sb.append('\n');
- }
- return sb.toString();
- }
-
- public BenchmarkRunner getRunner() {
- return runner;
- }
-}
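
A minimal driver sketch (hypothetical, not part of this commit) showing how the
removed benchmark class was driven; it mirrors the runBenchmark(...) sequence
above, and the import is an assumption about the class's package:

    import org.apache.mahout.benchmark.VectorBenchmarks; // assumed package

    public class VectorBenchmarksDriver {
      public static void main(String[] args) {
        VectorBenchmarks mark = new VectorBenchmarks(
            100000, // cardinality (below 200000, so incremental creation is not skipped)
            1000,   // non-zero entries per vector
            25,     // number of vectors
            0,      // clusters; 0 skips the cluster benchmarks
            10);    // operations per timed unit
        mark.createData();                 // must run first: populates vectors[][]
        mark.createBenchmark();            // times copy-construction per implementation
        mark.incrementalCreateBenchmark(); // times element-by-element set() in shuffled order
        System.out.println(mark);          // padded benchmark-by-implementation table
      }
    }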

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/cassandra/CassandraDataModel.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/cassandra/CassandraDataModel.java b/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/cassandra/CassandraDataModel.java
deleted file mode 100644
index b220993..0000000
--- a/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/cassandra/CassandraDataModel.java
+++ /dev/null
@@ -1,465 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.impl.model.cassandra;
-
-import com.google.common.base.Preconditions;
-import me.prettyprint.cassandra.model.HColumnImpl;
-import me.prettyprint.cassandra.serializers.BytesArraySerializer;
-import me.prettyprint.cassandra.serializers.FloatSerializer;
-import me.prettyprint.cassandra.serializers.LongSerializer;
-import me.prettyprint.cassandra.service.OperationType;
-import me.prettyprint.hector.api.Cluster;
-import me.prettyprint.hector.api.ConsistencyLevelPolicy;
-import me.prettyprint.hector.api.HConsistencyLevel;
-import me.prettyprint.hector.api.Keyspace;
-import me.prettyprint.hector.api.beans.ColumnSlice;
-import me.prettyprint.hector.api.beans.HColumn;
-import me.prettyprint.hector.api.factory.HFactory;
-import me.prettyprint.hector.api.mutation.Mutator;
-import me.prettyprint.hector.api.query.ColumnQuery;
-import me.prettyprint.hector.api.query.CountQuery;
-import me.prettyprint.hector.api.query.SliceQuery;
-import org.apache.mahout.cf.taste.common.NoSuchItemException;
-import org.apache.mahout.cf.taste.common.NoSuchUserException;
-import org.apache.mahout.cf.taste.common.Refreshable;
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.impl.common.Cache;
-import org.apache.mahout.cf.taste.impl.common.FastIDSet;
-import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
-import org.apache.mahout.cf.taste.impl.common.Retriever;
-import org.apache.mahout.cf.taste.impl.model.GenericItemPreferenceArray;
-import org.apache.mahout.cf.taste.impl.model.GenericUserPreferenceArray;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.apache.mahout.cf.taste.model.PreferenceArray;
-
-import java.io.Closeable;
-import java.util.Collection;
-import java.util.List;
-import java.util.concurrent.atomic.AtomicReference;
-
-/**
- * <p>A {@link DataModel} based on a Cassandra keyspace. By default it uses keyspace "recommender" but this
- * can be configured. Create the keyspace before using this class; this can be done on the Cassandra command
- * line with a command like {@code create keyspace recommender;}.</p>
- *
- * <p>Within the keyspace, this model uses four column families:</p>
- *
- * <p>First, it uses a column family called "users". This is keyed by the user ID as an 8-byte long.
- * It contains a column for every preference the user expresses. The column name is the item ID, again
- * an 8-byte long, and the value is the preference, represented as an IEEE 32-bit floating point value.</p>
- *
- * <p>It uses an analogous column family called "items" for the same data, but keyed by item ID rather
- * than user ID. In this column family, column names are user IDs instead.</p>
- *
- * <p>It uses a column family called "userIDs" as well, with an identical schema. It has one row under key
- * 0, which contains a column for every user ID in the model. The columns have no values.</p>
- *
- * <p>Finally it also uses an analogous column family "itemIDs" containing item IDs.</p>
- *
- * <p>Each of these four column families needs to be created ahead of time. Again the
- * Cassandra CLI can be used to do so, with commands like {@code create column family users;}.</p>
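- *
- * <p>A minimal creation sketch, collecting the commands mentioned above (options such
- * as comparators are omitted):</p>
- *
- * <pre>
- * create keyspace recommender;
- * use recommender;
- * create column family users;
- * create column family items;
- * create column family userIDs;
- * create column family itemIDs;
- * </pre>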
- *
- * <p>Note that this class maintains a long-lived Cassandra client which will run until terminated. You
- * must {@link #close()} this implementation when done or the JVM will not terminate.</p>
- *
- * <p>This implementation still relies heavily on reading data into memory and caching,
- * as it remains too data-intensive to be effective even against Cassandra. It will take some time to
- * "warm up" as the first few requests will block loading user and item data into caches. This is still going
- * to send a great deal of query traffic to Cassandra. It would be advisable to employ caching wrapper
- * classes in your implementation, like {@link org.apache.mahout.cf.taste.impl.recommender.CachingRecommender}
- * or {@link org.apache.mahout.cf.taste.impl.similarity.CachingItemSimilarity}.</p>
- */
-public final class CassandraDataModel implements DataModel, Closeable {
-
- /** Default Cassandra host. Default: localhost */
- private static final String DEFAULT_HOST = "localhost";
-
- /** Default Cassandra port. Default: 9160 */
- private static final int DEFAULT_PORT = 9160;
-
- /** Default Cassandra keyspace. Default: recommender */
- private static final String DEFAULT_KEYSPACE = "recommender";
-
- static final String USERS_CF = "users";
- static final String ITEMS_CF = "items";
- static final String USER_IDS_CF = "userIDs";
- static final String ITEM_IDS_CF = "itemIDs";
- private static final long ID_ROW_KEY = 0L;
- private static final byte[] EMPTY = new byte[0];
-
- private final Cluster cluster;
- private final Keyspace keyspace;
- private final Cache<Long,PreferenceArray> userCache;
- private final Cache<Long,PreferenceArray> itemCache;
- private final Cache<Long,FastIDSet> itemIDsFromUserCache;
- private final Cache<Long,FastIDSet> userIDsFromItemCache;
- private final AtomicReference<Integer> userCountCache;
- private final AtomicReference<Integer> itemCountCache;
-
- /**
- * Uses the standard Cassandra host and port (localhost:9160), and keyspace name ("recommender").
- */
- public CassandraDataModel() {
- this(DEFAULT_HOST, DEFAULT_PORT, DEFAULT_KEYSPACE);
- }
-
- /**
- * @param host Cassandra server host name
- * @param port Cassandra server port
- * @param keyspaceName name of Cassandra keyspace to use
- */
- public CassandraDataModel(String host, int port, String keyspaceName) {
-
- Preconditions.checkNotNull(host);
- Preconditions.checkArgument(port > 0, "port must be greater than 0!");
- Preconditions.checkNotNull(keyspaceName);
-
- cluster = HFactory.getOrCreateCluster(CassandraDataModel.class.getSimpleName(), host + ':' + port);
- keyspace = HFactory.createKeyspace(keyspaceName, cluster);
- keyspace.setConsistencyLevelPolicy(new OneConsistencyLevelPolicy());
-
- userCache = new Cache<>(new UserPrefArrayRetriever(), 1 << 20);
- itemCache = new Cache<>(new ItemPrefArrayRetriever(), 1 << 20);
- itemIDsFromUserCache = new Cache<>(new ItemIDsFromUserRetriever(), 1 << 20);
- userIDsFromItemCache = new Cache<>(new UserIDsFromItemRetriever(), 1 << 20);
- userCountCache = new AtomicReference<>(null);
- itemCountCache = new AtomicReference<>(null);
- }
-
- @Override
- public LongPrimitiveIterator getUserIDs() {
- SliceQuery<Long,Long,?> query = buildNoValueSliceQuery(USER_IDS_CF);
- query.setKey(ID_ROW_KEY);
- FastIDSet userIDs = new FastIDSet();
- for (HColumn<Long,?> userIDColumn : query.execute().get().getColumns()) {
- userIDs.add(userIDColumn.getName());
- }
- return userIDs.iterator();
- }
-
- @Override
- public PreferenceArray getPreferencesFromUser(long userID) throws TasteException {
- return userCache.get(userID);
- }
-
- @Override
- public FastIDSet getItemIDsFromUser(long userID) throws TasteException {
- return itemIDsFromUserCache.get(userID);
- }
-
- @Override
- public LongPrimitiveIterator getItemIDs() {
- SliceQuery<Long,Long,?> query = buildNoValueSliceQuery(ITEM_IDS_CF);
- query.setKey(ID_ROW_KEY);
- FastIDSet itemIDs = new FastIDSet();
- for (HColumn<Long,?> itemIDColumn : query.execute().get().getColumns()) {
- itemIDs.add(itemIDColumn.getName());
- }
- return itemIDs.iterator();
- }
-
- @Override
- public PreferenceArray getPreferencesForItem(long itemID) throws TasteException {
- return itemCache.get(itemID);
- }
-
- @Override
- public Float getPreferenceValue(long userID, long itemID) {
- ColumnQuery<Long,Long,Float> query =
- HFactory.createColumnQuery(keyspace, LongSerializer.get(), LongSerializer.get(), FloatSerializer.get());
- query.setColumnFamily(USERS_CF);
- query.setKey(userID);
- query.setName(itemID);
- HColumn<Long,Float> column = query.execute().get();
- return column == null ? null : column.getValue();
- }
-
- @Override
- public Long getPreferenceTime(long userID, long itemID) {
- ColumnQuery<Long,Long,?> query =
- HFactory.createColumnQuery(keyspace, LongSerializer.get(), LongSerializer.get(), BytesArraySerializer.get());
- query.setColumnFamily(USERS_CF);
- query.setKey(userID);
- query.setName(itemID);
- HColumn<Long,?> result = query.execute().get();
- return result == null ? null : result.getClock();
- }
-
- @Override
- public int getNumItems() {
- Integer itemCount = itemCountCache.get();
- if (itemCount == null) {
- CountQuery<Long,Long> countQuery =
- HFactory.createCountQuery(keyspace, LongSerializer.get(), LongSerializer.get());
- countQuery.setKey(ID_ROW_KEY);
- countQuery.setColumnFamily(ITEM_IDS_CF);
- countQuery.setRange(null, null, Integer.MAX_VALUE);
- itemCount = countQuery.execute().get();
- itemCountCache.set(itemCount);
- }
- return itemCount;
- }
-
- @Override
- public int getNumUsers() {
- Integer userCount = userCountCache.get();
- if (userCount == null) {
- CountQuery<Long,Long> countQuery =
- HFactory.createCountQuery(keyspace, LongSerializer.get(), LongSerializer.get());
- countQuery.setKey(ID_ROW_KEY);
- countQuery.setColumnFamily(USER_IDS_CF);
- countQuery.setRange(null, null, Integer.MAX_VALUE);
- userCount = countQuery.execute().get();
- userCountCache.set(userCount);
- }
- return userCount;
- }
-
- @Override
- public int getNumUsersWithPreferenceFor(long itemID) throws TasteException {
- /*
- CountQuery<Long,Long> query = HFactory.createCountQuery(keyspace, LongSerializer.get(), LongSerializer.get());
- query.setColumnFamily(ITEMS_CF);
- query.setKey(itemID);
- query.setRange(null, null, Integer.MAX_VALUE);
- return query.execute().get();
- */
- return userIDsFromItemCache.get(itemID).size();
- }
-
- @Override
- public int getNumUsersWithPreferenceFor(long itemID1, long itemID2) throws TasteException {
- FastIDSet userIDs1 = userIDsFromItemCache.get(itemID1);
- FastIDSet userIDs2 = userIDsFromItemCache.get(itemID2);
- return userIDs1.size() < userIDs2.size()
- ? userIDs2.intersectionSize(userIDs1)
- : userIDs1.intersectionSize(userIDs2);
- }
-
- @Override
- public void setPreference(long userID, long itemID, float value) {
-
- if (Float.isNaN(value)) {
- value = 1.0f;
- }
-
- long now = System.currentTimeMillis();
-
- Mutator<Long> mutator = HFactory.createMutator(keyspace, LongSerializer.get());
-
- HColumn<Long,Float> itemForUsers = new HColumnImpl<>(LongSerializer.get(), FloatSerializer.get());
- itemForUsers.setName(itemID);
- itemForUsers.setClock(now);
- itemForUsers.setValue(value);
- mutator.addInsertion(userID, USERS_CF, itemForUsers);
-
- HColumn<Long,Float> userForItems = new HColumnImpl<>(LongSerializer.get(), FloatSerializer.get());
- userForItems.setName(userID);
- userForItems.setClock(now);
- userForItems.setValue(value);
- mutator.addInsertion(itemID, ITEMS_CF, userForItems);
-
- HColumn<Long,byte[]> userIDs = new HColumnImpl<>(LongSerializer.get(), BytesArraySerializer.get());
- userIDs.setName(userID);
- userIDs.setClock(now);
- userIDs.setValue(EMPTY);
- mutator.addInsertion(ID_ROW_KEY, USER_IDS_CF, userIDs);
-
- HColumn<Long,byte[]> itemIDs = new HColumnImpl<>(LongSerializer.get(), BytesArraySerializer.get());
- itemIDs.setName(itemID);
- itemIDs.setClock(now);
- itemIDs.setValue(EMPTY);
- mutator.addInsertion(ID_ROW_KEY, ITEM_IDS_CF, itemIDs);
-
- mutator.execute();
- }
-
- @Override
- public void removePreference(long userID, long itemID) {
- Mutator<Long> mutator = HFactory.createMutator(keyspace, LongSerializer.get());
- mutator.addDeletion(userID, USERS_CF, itemID, LongSerializer.get());
- mutator.addDeletion(itemID, ITEMS_CF, userID, LongSerializer.get());
- mutator.execute();
- // Not deleting from userIDs, itemIDs though
- }
-
- /**
- * @return true
- */
- @Override
- public boolean hasPreferenceValues() {
- return true;
- }
-
- /**
- * @return Float#NaN
- */
- @Override
- public float getMaxPreference() {
- return Float.NaN;
- }
-
- /**
- * @return Float#NaN
- */
- @Override
- public float getMinPreference() {
- return Float.NaN;
- }
-
- @Override
- public void refresh(Collection<Refreshable> alreadyRefreshed) {
- userCache.clear();
- itemCache.clear();
- userIDsFromItemCache.clear();
- itemIDsFromUserCache.clear();
- userCountCache.set(null);
- itemCountCache.set(null);
- }
-
- @Override
- public String toString() {
- return "CassandraDataModel[" + keyspace + ']';
- }
-
- @Override
- public void close() {
- HFactory.shutdownCluster(cluster);
- }
-
-
- private SliceQuery<Long,Long,byte[]> buildNoValueSliceQuery(String cf) {
- SliceQuery<Long,Long,byte[]> query =
- HFactory.createSliceQuery(keyspace, LongSerializer.get(), LongSerializer.get(), BytesArraySerializer.get());
- query.setColumnFamily(cf);
- query.setRange(null, null, false, Integer.MAX_VALUE);
- return query;
- }
-
- private SliceQuery<Long,Long,Float> buildValueSliceQuery(String cf) {
- SliceQuery<Long,Long,Float> query =
- HFactory.createSliceQuery(keyspace, LongSerializer.get(), LongSerializer.get(), FloatSerializer.get());
- query.setColumnFamily(cf);
- query.setRange(null, null, false, Integer.MAX_VALUE);
- return query;
- }
-
-
- private static final class OneConsistencyLevelPolicy implements ConsistencyLevelPolicy {
- @Override
- public HConsistencyLevel get(OperationType op) {
- return HConsistencyLevel.ONE;
- }
-
- @Override
- public HConsistencyLevel get(OperationType op, String cfName) {
- return HConsistencyLevel.ONE;
- }
- }
-
- private final class UserPrefArrayRetriever implements Retriever<Long, PreferenceArray> {
- @Override
- public PreferenceArray get(Long userID) throws TasteException {
- SliceQuery<Long,Long,Float> query = buildValueSliceQuery(USERS_CF);
- query.setKey(userID);
-
- ColumnSlice<Long,Float> result = query.execute().get();
- if (result == null) {
- throw new NoSuchUserException(userID);
- }
- List<HColumn<Long,Float>> itemIDColumns = result.getColumns();
- if (itemIDColumns.isEmpty()) {
- throw new NoSuchUserException(userID);
- }
- int size = itemIDColumns.size();
- PreferenceArray prefs = new GenericUserPreferenceArray(size);
- prefs.setUserID(0, userID);
- for (int i = 0; i < size; i++) {
- HColumn<Long,Float> itemIDColumn = itemIDColumns.get(i);
- prefs.setItemID(i, itemIDColumn.getName());
- prefs.setValue(i, itemIDColumn.getValue());
- }
- return prefs;
- }
- }
-
- private final class ItemPrefArrayRetriever implements Retriever<Long, PreferenceArray> {
- @Override
- public PreferenceArray get(Long itemID) throws TasteException {
- SliceQuery<Long,Long,Float> query = buildValueSliceQuery(ITEMS_CF);
- query.setKey(itemID);
- ColumnSlice<Long,Float> result = query.execute().get();
- if (result == null) {
- throw new NoSuchItemException(itemID);
- }
- List<HColumn<Long,Float>> userIDColumns = result.getColumns();
- if (userIDColumns.isEmpty()) {
- throw new NoSuchItemException(itemID);
- }
- int size = userIDColumns.size();
- PreferenceArray prefs = new GenericItemPreferenceArray(size);
- prefs.setItemID(0, itemID);
- for (int i = 0; i < size; i++) {
- HColumn<Long,Float> userIDColumn = userIDColumns.get(i);
- prefs.setUserID(i, userIDColumn.getName());
- prefs.setValue(i, userIDColumn.getValue());
- }
- return prefs;
- }
- }
-
- private final class UserIDsFromItemRetriever implements Retriever<Long, FastIDSet> {
- @Override
- public FastIDSet get(Long itemID) throws TasteException {
- SliceQuery<Long,Long,byte[]> query = buildNoValueSliceQuery(ITEMS_CF);
- query.setKey(itemID);
- ColumnSlice<Long,byte[]> result = query.execute().get();
- if (result == null) {
- throw new NoSuchItemException(itemID);
- }
- List<HColumn<Long,byte[]>> columns = result.getColumns();
- FastIDSet userIDs = new FastIDSet(columns.size());
- for (HColumn<Long,?> userIDColumn : columns) {
- userIDs.add(userIDColumn.getName());
- }
- return userIDs;
- }
- }
-
- private final class ItemIDsFromUserRetriever implements Retriever<Long, FastIDSet> {
- @Override
- public FastIDSet get(Long userID) throws TasteException {
- SliceQuery<Long,Long,byte[]> query = buildNoValueSliceQuery(USERS_CF);
- query.setKey(userID);
- FastIDSet itemIDs = new FastIDSet();
- ColumnSlice<Long,byte[]> result = query.execute().get();
- if (result == null) {
- throw new NoSuchUserException(userID);
- }
- List<HColumn<Long,byte[]>> columns = result.getColumns();
- if (columns.isEmpty()) {
- throw new NoSuchUserException(userID);
- }
- for (HColumn<Long,?> itemIDColumn : columns) {
- itemIDs.add(itemIDColumn.getName());
- }
- return itemIDs;
- }
- }
-
-}
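
A hypothetical usage sketch for the removed class (it assumes a reachable Cassandra
node with the keyspace and column families described in the Javadoc already created;
the IDs and the rating value are made up for illustration):

    import org.apache.mahout.cf.taste.common.TasteException;
    import org.apache.mahout.cf.taste.impl.model.cassandra.CassandraDataModel;
    import org.apache.mahout.cf.taste.model.PreferenceArray;

    public class CassandraDataModelSketch {
      public static void main(String[] args) throws TasteException {
        CassandraDataModel model = new CassandraDataModel("localhost", 9160, "recommender");
        try {
          model.setPreference(1L, 100L, 4.5f); // writes users, items, userIDs and itemIDs
          Float value = model.getPreferenceValue(1L, 100L);
          PreferenceArray prefs = model.getPreferencesFromUser(1L); // cached after first read
          System.out.println(value + " / " + prefs.length());
        } finally {
          model.close(); // shuts down the long-lived Hector cluster client
        }
      }
    }
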
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java b/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java
deleted file mode 100644
index 8d92551..0000000
--- a/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java
+++ /dev/null
@@ -1,195 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.lucene;
-
-import java.io.IOException;
-import java.util.Iterator;
-
-import com.google.common.collect.Iterables;
-import com.google.common.collect.Iterators;
-
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.FieldType;
-import org.apache.lucene.document.StringField;
-import org.apache.lucene.index.DirectoryReader;
-import org.apache.lucene.index.IndexOptions;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.IndexWriterConfig;
-import org.apache.lucene.store.RAMDirectory;
-import org.apache.mahout.common.MahoutTestCase;
-import org.apache.mahout.math.NamedVector;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.utils.vectors.TermInfo;
-import org.apache.mahout.vectorizer.TFIDF;
-import org.apache.mahout.vectorizer.Weight;
-import org.junit.Before;
-import org.junit.Test;
-
-public final class LuceneIterableTest extends MahoutTestCase {
-
- private static final String [] DOCS = {
- "The quick red fox jumped over the lazy brown dogs.",
- "Mary had a little lamb whose fleece was white as snow.",
- "Moby Dick is a story of a whale and a man obsessed.",
- "The robber wore a black fleece jacket and a baseball cap.",
- "The English Springer Spaniel is the best of all dogs."
- };
-
- private RAMDirectory directory;
-
- private final FieldType TYPE_NO_TERM_VECTORS = new FieldType();
-
- private final FieldType TYPE_TERM_VECTORS = new FieldType();
-
- @Before
- public void before() throws IOException {
-
- TYPE_NO_TERM_VECTORS.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
- TYPE_NO_TERM_VECTORS.setTokenized(true);
- TYPE_NO_TERM_VECTORS.setStoreTermVectors(false);
- TYPE_NO_TERM_VECTORS.setStoreTermVectorPositions(false);
- TYPE_NO_TERM_VECTORS.setStoreTermVectorOffsets(false);
- TYPE_NO_TERM_VECTORS.freeze();
-
- TYPE_TERM_VECTORS.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
- TYPE_TERM_VECTORS.setTokenized(true);
- TYPE_TERM_VECTORS.setStored(true);
- TYPE_TERM_VECTORS.setStoreTermVectors(true);
- TYPE_TERM_VECTORS.setStoreTermVectorPositions(true);
- TYPE_TERM_VECTORS.setStoreTermVectorOffsets(true);
- TYPE_TERM_VECTORS.freeze();
-
- directory = createTestIndex(TYPE_TERM_VECTORS);
- }
-
- @Test
- public void testIterable() throws Exception {
- IndexReader reader = DirectoryReader.open(directory);
- Weight weight = new TFIDF();
- TermInfo termInfo = new CachedTermInfo(reader, "content", 1, 100);
- LuceneIterable iterable = new LuceneIterable(reader, "id", "content", termInfo, weight);
-
- //TODO: do something more meaningful here
- for (Vector vector : iterable) {
- assertNotNull(vector);
- assertTrue("vector is not an instanceof " + NamedVector.class, vector instanceof NamedVector);
- assertTrue("vector Size: " + vector.size() + " is not greater than: " + 0, vector.size() > 0);
- assertTrue(((NamedVector) vector).getName().startsWith("doc_"));
- }
-
- iterable = new LuceneIterable(reader, "id", "content", termInfo, weight, 3);
-
- //TODO: do something more meaningful here
- for (Vector vector : iterable) {
- assertNotNull(vector);
- assertTrue("vector is not an instanceof " + NamedVector.class, vector instanceof NamedVector);
- assertTrue("vector Size: " + vector.size() + " is not greater than: " + 0, vector.size() > 0);
- assertTrue(((NamedVector) vector).getName().startsWith("doc_"));
- }
-
- }
-
- @Test(expected = IllegalStateException.class)
- public void testIterableNoTermVectors() throws IOException {
- RAMDirectory directory = createTestIndex(TYPE_NO_TERM_VECTORS);
- IndexReader reader = DirectoryReader.open(directory);
-
- Weight weight = new TFIDF();
- TermInfo termInfo = new CachedTermInfo(reader, "content", 1, 100);
- LuceneIterable iterable = new LuceneIterable(reader, "id", "content", termInfo, weight);
-
- Iterator<Vector> iterator = iterable.iterator();
- Iterators.advance(iterator, 1);
- }
-
- @Test
- public void testIterableSomeNoiseTermVectors() throws IOException {
- //get noise vectors
- RAMDirectory directory = createTestIndex(TYPE_TERM_VECTORS, new RAMDirectory(), 0);
- //get real vectors
- createTestIndex(TYPE_NO_TERM_VECTORS, directory, 5);
- IndexReader reader = DirectoryReader.open(directory);
-
- Weight weight = new TFIDF();
- TermInfo termInfo = new CachedTermInfo(reader, "content", 1, 100);
-
- boolean exceptionThrown;
- //0 percent tolerance
- LuceneIterable iterable = new LuceneIterable(reader, "id", "content", termInfo, weight);
- try {
- Iterables.skip(iterable, Iterables.size(iterable));
- exceptionThrown = false;
- }
- catch (IllegalStateException ise) {
- exceptionThrown = true;
- }
- assertTrue(exceptionThrown);
-
- //100 percent tolerance
- iterable = new LuceneIterable(reader, "id", "content", termInfo, weight, -1, 1.0);
- try {
- Iterables.skip(iterable, Iterables.size(iterable));
- exceptionThrown = false;
- }
- catch (IllegalStateException ise) {
- exceptionThrown = true;
- }
- assertFalse(exceptionThrown);
-
- //50 percent tolerance
- iterable = new LuceneIterable(reader, "id", "content", termInfo, weight, -1, 0.5);
- Iterator<Vector> iterator = iterable.iterator();
- Iterators.advance(iterator, 5);
-
- try {
- Iterators.advance(iterator, Iterators.size(iterator));
- exceptionThrown = false;
- }
- catch (IllegalStateException ise) {
- exceptionThrown = true;
- }
- assertTrue(exceptionThrown);
- }
-
- static RAMDirectory createTestIndex(FieldType fieldType) throws IOException {
- return createTestIndex(fieldType, new RAMDirectory(), 0);
- }
-
- static RAMDirectory createTestIndex(FieldType fieldType,
- RAMDirectory directory,
- int startingId) throws IOException {
-
- try (IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(new StandardAnalyzer()))) {
- for (int i = 0; i < DOCS.length; i++) {
- Document doc = new Document();
- Field id = new StringField("id", "doc_" + (i + startingId), Field.Store.YES);
- doc.add(id);
- //Store both position and offset information
- Field text = new Field("content", DOCS[i], fieldType);
- doc.add(text);
- Field text2 = new Field("content2", DOCS[i], fieldType);
- doc.add(text2);
- writer.addDocument(doc);
- }
- }
- return directory;
- }
-}
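
For reference, a fragment sketching the constructor variant this test exercises,
with reader, termInfo and weight set up as in testIterable() above; the argument
meanings are inferred from the 0/100/50 percent tolerance cases in the test:

    LuceneIterable iterable = new LuceneIterable(reader, "id", "content", termInfo, weight,
        -1,   // document limit (the test passes 3 and -1; -1 presumably means no limit)
        0.5); // fraction of documents allowed to lack term vectors before failing
    for (Vector vector : iterable) {
      // each vector is a NamedVector named after the document's "id" field
    }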

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/resources/date.arff
----------------------------------------------------------------------
diff --git a/integration/src/test/resources/date.arff b/integration/src/test/resources/date.arff
deleted file mode 100644
index 9daeb52..0000000
--- a/integration/src/test/resources/date.arff
+++ /dev/null
@@ -1,18 +0,0 @@
- % Comments
- %
- % Comments go here %
- @RELATION MahoutDateTest
-
- @ATTRIBUTE junk NUMERIC
- @ATTRIBUTE date1 date
- @ATTRIBUTE date2 date "yyyy.MM.dd G 'at' HH:mm:ss z"
- @ATTRIBUTE date3 date "EEE, MMM d, ''yy"
- @ATTRIBUTE date4 date "K:mm a, z"
- @ATTRIBUTE date5 date "yyyyy.MMMMM.dd GGG hh:mm aaa"
- @ATTRIBUTE date6 date "EEE, d MMM yyyy HH:mm:ss Z"
-
-
-
- @DATA
- {0 1,1 "2001-07-04T12:08:56",2 "2001.07.04 AD at 12:08:56 PDT",3 "Wed, Jul 4, '01,4 0:08 PM, PDT",4 "0:08 PM, PDT", 5 "02001.July.04 AD 12:08 PM" ,6 "Wed, 4 Jul 2001 12:08:56 -0700" }
- {0 2,1 "2001-08-04T12:09:56",2 "2011.07.04 AD at 12:08:56 PDT",3 "Mon, Jul 4, '11,4 0:08 PM, PDT",4 "0:08 PM, PDT", 5 "02001.July.14 AD 12:08 PM" ,6 "Mon, 4 Jul 2011 12:08:56 -0700" }
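
The quoted format strings above are java.text.SimpleDateFormat patterns, as ARFF
date attributes conventionally use; a standalone sketch parsing the date2 value
from the first data row:

    import java.text.SimpleDateFormat;
    import java.util.Date;
    import java.util.Locale;

    public class ArffDateSketch {
      public static void main(String[] args) throws java.text.ParseException {
        SimpleDateFormat fmt =
            new SimpleDateFormat("yyyy.MM.dd G 'at' HH:mm:ss z", Locale.US);
        Date parsed = fmt.parse("2001.07.04 AD at 12:08:56 PDT");
        System.out.println(parsed.getTime()); // epoch milliseconds
      }
    }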

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/resources/expected-arff-dictionary-2.csv
----------------------------------------------------------------------
diff --git a/integration/src/test/resources/expected-arff-dictionary-2.csv b/integration/src/test/resources/expected-arff-dictionary-2.csv
deleted file mode 100644
index acb1c43..0000000
--- a/integration/src/test/resources/expected-arff-dictionary-2.csv
+++ /dev/null
@@ -1,22 +0,0 @@
-Label bindings for Relation golf
-temperature,1
-humidity,2
-outlook,0
-class,4
-windy,3
-
-Values for nominal attributes
-3
-outlook
-3
-rain,3
-overcast,2
-sunny,1
-class
-2
-play,2
-dont_play,1
-windy
-2
-false,1
-true,2

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/resources/expected-arff-dictionary.csv
----------------------------------------------------------------------
diff --git a/integration/src/test/resources/expected-arff-dictionary.csv b/integration/src/test/resources/expected-arff-dictionary.csv
deleted file mode 100644
index f2dac13..0000000
--- a/integration/src/test/resources/expected-arff-dictionary.csv
+++ /dev/null
@@ -1,22 +0,0 @@
-Label bindings for Relation golf
-humidity,2
-windy,3
-outlook,0
-class,4
-temperature,1
-
-Values for nominal attributes
-3
-windy
-2
-true,2
-false,1
-outlook
-3
-sunny,1
-overcast,2
-rain,3
-class
-2
-play,2
-dont_play,1

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/resources/expected-arff-schema-2.json
----------------------------------------------------------------------
diff --git a/integration/src/test/resources/expected-arff-schema-2.json b/integration/src/test/resources/expected-arff-schema-2.json
deleted file mode 100644
index b73f55c..0000000
--- a/integration/src/test/resources/expected-arff-schema-2.json
+++ /dev/null
@@ -1 +0,0 @@
-[{"values":["rain","overcast","sunny"],"label":"false","attribute":"outlook","type":"categorical"},{"label":"false","attribute":"temperature","type":"numerical"},{"label":"false","attribute":"humidity","type":"numerical"},{"values":["false","true"],"label":"false","attribute":"windy","type":"categorical"},{"values":["play","dont_play"],"label":"true","attribute":"class","type":"categorical"}]
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/resources/expected-arff-schema.json
----------------------------------------------------------------------
diff --git a/integration/src/test/resources/expected-arff-schema.json b/integration/src/test/resources/expected-arff-schema.json
deleted file mode 100644
index 36e0c89..0000000
--- a/integration/src/test/resources/expected-arff-schema.json
+++ /dev/null
@@ -1 +0,0 @@
-[{"values":["sunny","overcast","rain"],"attribute":"outlook","label":"false","type":"categorical"},{"attribute":"temperature","label":"false","type":"numerical"},{"attribute":"humidity","label":"false","type":"numerical"},{"values":["true","false"],"attribute":"windy","label":"false","type":"categorical"},{"values":["play","dont_play"],"attribute":"class","label":"true","type":"categorical"}]
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/resources/non-numeric-1.arff
----------------------------------------------------------------------
diff --git a/integration/src/test/resources/non-numeric-1.arff b/integration/src/test/resources/non-numeric-1.arff
deleted file mode 100644
index bf0c746..0000000
--- a/integration/src/test/resources/non-numeric-1.arff
+++ /dev/null
@@ -1,24 +0,0 @@
- % Comments
- %
- % Comments go here %
- @RELATION Mahout
-
- @ATTRIBUTE junk NUMERIC
- @ATTRIBUTE foo NUMERIC
- @ATTRIBUTE bar {c,d,'xy, numeric','marc o\'polo', e}
- @ATTRIBUTE hockey string
- @ATTRIBUTE football date "yyyy-MM-dd"
-
-
-
- @DATA
- {2 c,3 gretzky,4 1973-10-23}
- {1 2.9,2 d,3 orr,4 1973-11-23}
- {2 c,3 bossy,4 1981-10-23}
- {1 2.6,2 c,3 lefleur,4 1989-10-23}
- {3 esposito,4 1973-04-23}
- {1 23.2,2 d,3 chelios,4 1999-2-23}
- {3 richard,4 1973-10-12}
- {3 howe,4 1983-06-23}
- {0 2.2,2 d,3 messier,4 2008-11-23}
- {2 c,3 roy,4 1973-10-13}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/resources/non-numeric-2.arff
----------------------------------------------------------------------
diff --git a/integration/src/test/resources/non-numeric-2.arff b/integration/src/test/resources/non-numeric-2.arff
deleted file mode 100644
index 6df35b5..0000000
--- a/integration/src/test/resources/non-numeric-2.arff
+++ /dev/null
@@ -1,24 +0,0 @@
- % Comments
- %
- % Comments go here %
- @RELATION Mahout
-
- @ATTRIBUTE junk NUMERIC
- @ATTRIBUTE foo NUMERIC
- @ATTRIBUTE test {f,z}
- @ATTRIBUTE hockey string
- @ATTRIBUTE football date "yyyy-MM-dd"
-
-
-
- @DATA
- {2 f,3 gretzky,4 1973-10-23}
- {1 2.9,2 z,3 orr,4 1973-11-23}
- {2 f,3 bossy,4 1981-10-23}
- {1 2.6,2 f,3 lefleur,4 1989-10-23}
- {3 esposito,4 1973-04-23}
- {1 23.2,2 z,3 chelios,4 1999-2-23}
- {3 richard,4 1973-10-12}
- {3 howe,4 1983-06-23}
- {0 2.2,2 f,3 messier,4 2008-11-23}
- {2 f,3 roy,4 1973-10-13}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/resources/quoted-id.arff
----------------------------------------------------------------------
diff --git a/integration/src/test/resources/quoted-id.arff b/integration/src/test/resources/quoted-id.arff
deleted file mode 100644
index 1f724ed..0000000
--- a/integration/src/test/resources/quoted-id.arff
+++ /dev/null
@@ -1,9 +0,0 @@
-@RELATION 'quotes'
-@ATTRIBUTE 'theNumeric' NUMERIC
-@ATTRIBUTE "theInteger" INTEGER
-@ATTRIBUTE theReal REAL
-@ATTRIBUTE theNominal {"double-quote", 'single-quote', no-quote}
-@DATA
-1.0,2,3.0,"no-quote"
-4.0,5,6.0,single-quote
-7.0,8,9.0,'double-quote'

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/resources/sample-dense.arff
----------------------------------------------------------------------
diff --git a/integration/src/test/resources/sample-dense.arff b/integration/src/test/resources/sample-dense.arff
deleted file mode 100644
index dbf5dd2..0000000
--- a/integration/src/test/resources/sample-dense.arff
+++ /dev/null
@@ -1,20 +0,0 @@
- % Comments
- %
- % Comments go here %
- @RELATION golf
-
- @ATTRIBUTE outlook {sunny,overcast, rain}
- @ATTRIBUTE temperature NUMERIC
- @ATTRIBUTE humidity NUMERIC
- @ATTRIBUTE windy {false, true}
- @ATTRIBUTE class {dont_play, play}
-
-
-
- @DATA
- sunny, 65, ?, false, dont_play, {2}
- sunny, 80, 90, true, dont_play
- overcast, 83, 78, false, play ,{3}
- rain, 70, 96, false, play
- rain, 68, 80, false, play
- rain, 65, 70, true, play

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/resources/sample-sparse.arff
----------------------------------------------------------------------
diff --git a/integration/src/test/resources/sample-sparse.arff b/integration/src/test/resources/sample-sparse.arff
deleted file mode 100644
index 25e1f9c..0000000
--- a/integration/src/test/resources/sample-sparse.arff
+++ /dev/null
@@ -1,24 +0,0 @@
- % Comments
- %
- % Comments go here %
- @RELATION Mahout
-
- @ATTRIBUTE foo NUMERIC
- @ATTRIBUTE bar NUMERIC
- @ATTRIBUTE hockey NUMERIC
- @ATTRIBUTE football NUMERIC
- @ATTRIBUTE tennis NUMERIC
-
-
-
- @DATA
- {1 23.1,2 3.23,3 1.2,4 ?} {5}
- {0 2.9}
- {0 2.7,2 3.2,3 1.3,4 0.2} {10}
- {1 2.6,2 3.1,3 1.23,4 0.2}
- {1 23.0,2 3.6,3 1.2,4 0.2}
- {0 23.2,1 3.9,3 1.7,4 0.2}
- {0 2.6,1 3.2,2 1.2,4 0.3}
- {1 23.0,2 3.2,3 1.23}
- {1 2.2,2 2.94,3 0.2}
- {1 2.9,2 3.1}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/resources/sample.arff
----------------------------------------------------------------------
diff --git a/integration/src/test/resources/sample.arff b/integration/src/test/resources/sample.arff
deleted file mode 100644
index cd04b32..0000000
--- a/integration/src/test/resources/sample.arff
+++ /dev/null
@@ -1,11 +0,0 @@
-%comments
-@RELATION Mahout
-@ATTRIBUTE foo numeric
-@ATTRIBUTE bar numeric
-@ATTRIBUTE timestamp DATE "yyyy-MM-dd HH:mm:ss"
-@ATTRIBUTE junk string
-@ATTRIBUTE theNominal {c,b,a}
-@DATA
-1,2, "2009-01-01 5:55:55", foo, c
-2,3
-{0 5,1 23}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/resources/test.mbox
----------------------------------------------------------------------
diff --git a/integration/src/test/resources/test.mbox b/integration/src/test/resources/test.mbox
deleted file mode 100644
index 99017c0..0000000
--- a/integration/src/test/resources/test.mbox
+++ /dev/null
@@ -1,1038 +0,0 @@
-From dev-return-102527-apmail-cocoon-dev-archive=***@cocoon.apache.org Wed Sep 01 21:01:35 2010
-Return-Path: <dev-return-102527-apmail-cocoon-dev-archive=***@cocoon.apache.org>
-Delivered-To: apmail-cocoon-dev-***@www.apache.org
-Received: (qmail 34434 invoked from network); 1 Sep 2010 21:01:34 -0000
-Received: from unknown (HELO mail.apache.org) (140.211.11.3)
- by 140.211.11.9 with SMTP; 1 Sep 2010 21:01:34 -0000
-Received: (qmail 26895 invoked by uid 500); 1 Sep 2010 21:01:34 -0000
-Delivered-To: apmail-cocoon-dev-***@cocoon.apache.org
-Received: (qmail 26771 invoked by uid 500); 1 Sep 2010 21:01:33 -0000
-Mailing-List: contact dev-***@cocoon.apache.org; run by ezmlm
-Precedence: bulk
-list-help: <mailto:dev-***@cocoon.apache.org>
-list-unsubscribe: <mailto:dev-***@cocoon.apache.org>
-List-Post: <mailto:***@cocoon.apache.org>
-Reply-To: ***@cocoon.apache.org
-List-Id: <dev.cocoon.apache.org>
-Delivered-To: mailing list ***@cocoon.apache.org
-Received: (qmail 26764 invoked by uid 99); 1 Sep 2010 21:01:33 -0000
-Received: from Unknown (HELO nike.apache.org) (192.87.106.230)
- by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 01 Sep 2010 21:01:33 +0000
-X-ASF-Spam-Status: No, hits=-2000.0 required=10.0
- tests=ALL_TRUSTED
-X-Spam-Check-By: apache.org
-Received: from [140.211.11.22] (HELO thor.apache.org) (140.211.11.22)
- by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 01 Sep 2010 21:01:16 +0000
-Received: from thor (localhost [127.0.0.1])
- by thor.apache.org (8.13.8+Sun/8.13.8) with ESMTP id o81L0sNK020435
- for <***@cocoon.apache.org>; Wed, 1 Sep 2010 21:00:54 GMT
-Message-ID: <***@thor>
-Date: Wed, 1 Sep 2010 17:00:54 -0400 (EDT)
-From: "Douglas Hurbon (JIRA)" <***@apache.org>
-To: ***@cocoon.apache.org
-Subject: [jira] Created: (COCOON-2300) jboss-5.1.0.GA vfszip protocol in
- CharsetFactory
-MIME-Version: 1.0
-Content-Type: text/plain; charset=utf-8
-Content-Transfer-Encoding: 7bit
-X-JIRA-FingerPrint: 30527f35849b9dde25b450d4833f0394
-X-Virus-Checked: Checked by ClamAV on apache.org
-
-jboss-5.1.0.GA vfszip protocol in CharsetFactory
-------------------------------------------------
-
- Key: COCOON-2300
- URL: https://issues.apache.org/jira/browse/COCOON-2300
- Project: Cocoon
- Issue Type: Bug
- Components: Blocks: Serializers
- Affects Versions: 2.1.11
- Reporter: Douglas Hurbon
- Fix For: 2.1.12-dev (Current SVN)
-
-
-Cocoon fails to initialize on Jboss 5.1 due to the new vfszip protocol it uses for class loading. CharsetFactory expects either jar:/ or file:/
-
-Parsing the vfszip protocol in CharsetFactory solves the problem.
-
---
-This message is automatically generated by JIRA.
--
-You can reply to this email to add a comment to the issue online.
-
-
-From dev-return-102528-apmail-cocoon-dev-archive=***@cocoon.apache.org Wed Sep 01 21:03:16 2010
-Return-Path: <dev-return-102528-apmail-cocoon-dev-archive=***@cocoon.apache.org>
-Delivered-To: apmail-cocoon-dev-***@www.apache.org
-Received: (qmail 34824 invoked from network); 1 Sep 2010 21:03:16 -0000
-Received: from unknown (HELO mail.apache.org) (140.211.11.3)
- by 140.211.11.9 with SMTP; 1 Sep 2010 21:03:16 -0000
-Received: (qmail 29126 invoked by uid 500); 1 Sep 2010 21:03:16 -0000
-Delivered-To: apmail-cocoon-dev-***@cocoon.apache.org
-Received: (qmail 29044 invoked by uid 500); 1 Sep 2010 21:03:15 -0000
-Mailing-List: contact dev-***@cocoon.apache.org; run by ezmlm
-Precedence: bulk
-list-help: <mailto:dev-***@cocoon.apache.org>
-list-unsubscribe: <mailto:dev-***@cocoon.apache.org>
-List-Post: <mailto:***@cocoon.apache.org>
-Reply-To: ***@cocoon.apache.org
-List-Id: <dev.cocoon.apache.org>
-Delivered-To: mailing list ***@cocoon.apache.org
-Received: (qmail 28904 invoked by uid 99); 1 Sep 2010 21:03:15 -0000
-Received: from athena.apache.org (HELO athena.apache.org) (140.211.11.136)
- by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 01 Sep 2010 21:03:15 +0000
-X-ASF-Spam-Status: No, hits=-2000.0 required=10.0
- tests=ALL_TRUSTED
-X-Spam-Check-By: apache.org
-Received: from [140.211.11.22] (HELO thor.apache.org) (140.211.11.22)
- by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 01 Sep 2010 21:03:14 +0000
-Received: from thor (localhost [127.0.0.1])
- by thor.apache.org (8.13.8+Sun/8.13.8) with ESMTP id o81L2sFQ020591
- for <***@cocoon.apache.org>; Wed, 1 Sep 2010 21:02:54 GMT
-Message-ID: <***@thor>
-Date: Wed, 1 Sep 2010 17:02:54 -0400 (EDT)
-From: "Douglas Hurbon (JIRA)" <***@apache.org>
-To: ***@cocoon.apache.org
-Subject: [jira] Updated: (COCOON-2300) jboss-5.1.0.GA vfszip protocol in
- CharsetFactory
-In-Reply-To: <***@thor>
-MIME-Version: 1.0
-Content-Type: text/plain; charset=utf-8
-Content-Transfer-Encoding: 7bit
-X-JIRA-FingerPrint: 30527f35849b9dde25b450d4833f0394
-
-
- [ https://issues.apache.org/jira/browse/COCOON-2300?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
-
-Douglas Hurbon updated COCOON-2300:
------------------------------------
-
- Attachment: CharsetFactory.patch
-
-Patch for the CharsetFactory running on Jboss 5.1.
-
-> jboss-5.1.0.GA vfszip protocol in CharsetFactory
-> ------------------------------------------------
->
-> Key: COCOON-2300
-> URL: https://issues.apache.org/jira/browse/COCOON-2300
-> Project: Cocoon
-> Issue Type: Bug
-> Components: Blocks: Serializers
-> Affects Versions: 2.1.11
-> Reporter: Douglas Hurbon
-> Fix For: 2.1.12-dev (Current SVN)
->
-> Attachments: CharsetFactory.patch
->
->
-> Cocoon fails to initialize on Jboss 5.1 due to the new vfszip protocol it uses for class loading. CharsetFactory expects either jar:/ or file:/
-> Parsing the vfszip protocol in CharsetFactory solves the problem.
-
---
-This message is automatically generated by JIRA.
--
-You can reply to this email to add a comment to the issue online.
-
-
-From dev-return-102529-apmail-cocoon-dev-archive=***@cocoon.apache.org Wed Sep 08 14:41:10 2010
-Return-Path: <dev-return-102529-apmail-cocoon-dev-archive=***@cocoon.apache.org>
-Delivered-To: apmail-cocoon-dev-***@www.apache.org
-Received: (qmail 13040 invoked from network); 8 Sep 2010 14:41:09 -0000
-Received: from unknown (HELO mail.apache.org) (140.211.11.3)
- by 140.211.11.9 with SMTP; 8 Sep 2010 14:41:09 -0000
-Received: (qmail 76345 invoked by uid 500); 8 Sep 2010 14:41:09 -0000
-Delivered-To: apmail-cocoon-dev-***@cocoon.apache.org
-Received: (qmail 75377 invoked by uid 500); 8 Sep 2010 14:41:05 -0000
-Mailing-List: contact dev-***@cocoon.apache.org; run by ezmlm
-Precedence: bulk
-list-help: <mailto:dev-***@cocoon.apache.org>
-list-unsubscribe: <mailto:dev-***@cocoon.apache.org>
-List-Post: <mailto:***@cocoon.apache.org>
-Reply-To: ***@cocoon.apache.org
-List-Id: <dev.cocoon.apache.org>
-Delivered-To: mailing list ***@cocoon.apache.org
-Received: (qmail 75370 invoked by uid 99); 8 Sep 2010 14:41:03 -0000
-Received: from nike.apache.org (HELO nike.apache.org) (192.87.106.230)
- by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 08 Sep 2010 14:41:03 +0000
-X-ASF-Spam-Status: No, hits=-2000.0 required=10.0
- tests=ALL_TRUSTED
-X-Spam-Check-By: apache.org
-Received: from [140.211.11.22] (HELO thor.apache.org) (140.211.11.22)
- by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 08 Sep 2010 14:40:59 +0000
-Received: from thor (localhost [127.0.0.1])
- by thor.apache.org (8.13.8+Sun/8.13.8) with ESMTP id o88EebFT004291
- for <***@cocoon.apache.org>; Wed, 8 Sep 2010 14:40:38 GMT
-Message-ID: <***@thor>
-Date: Wed, 8 Sep 2010 10:40:37 -0400 (EDT)
-From: ***@apache.org
-To: ***@cocoon.apache.org
-Subject: [jira] Subscription: COCOON-open-with-patch
-MIME-Version: 1.0
-Content-Type: text/plain; charset=utf-8
-Content-Transfer-Encoding: 7bit
-X-JIRA-FingerPrint: 30527f35849b9dde25b450d4833f0394
-X-Virus-Checked: Checked by ClamAV on apache.org
-
-Issue Subscription
-Filter: COCOON-open-with-patch (114 issues)
-Subscriber: cocoon
-
-Key Summary
-COCOON-2300 jboss-5.1.0.GA vfszip protocol in CharsetFactory
- https://issues.apache.org/jira/browse/COCOON-2300
-COCOON-2298 IncludeTransformer does not handle multi-valued parameters
- https://issues.apache.org/jira/browse/COCOON-2298
-COCOON-2297 Character encoding does not follow JTidy properties
- https://issues.apache.org/jira/browse/COCOON-2297
-COCOON-2296 [PATCH] Make flowscript work with Commons JXPath 1.3
- https://issues.apache.org/jira/browse/COCOON-2296
-COCOON-2295 integrating FOP-1.0 into Cocoon-2.1.12-dev
- https://issues.apache.org/jira/browse/COCOON-2295
-COCOON-2294 Wrong version number for cocoon-serializers-impl in parent pom for revision 964648
- https://issues.apache.org/jira/browse/COCOON-2294
-COCOON-2290 CLONE - Add a read method to the SitemapComponentTestCase
- https://issues.apache.org/jira/browse/COCOON-2290
-COCOON-2288 Allow usage of SLF4J for traces
- https://issues.apache.org/jira/browse/COCOON-2288
-COCOON-2281 "Communication tools that we use" link to dev mailing list archive comes out at user mailing list archive
- https://issues.apache.org/jira/browse/COCOON-2281
-COCOON-2268 To extend the image reader we need to change the visibility to the parameter of the ImageReader
- https://issues.apache.org/jira/browse/COCOON-2268
-COCOON-2262 container.refresh() is called before embeddedServlet.init()
- https://issues.apache.org/jira/browse/COCOON-2262
-COCOON-2260 wrong parent version in pom of cocoon-flowscript-impl
- https://issues.apache.org/jira/browse/COCOON-2260
-COCOON-2249 XHTMLSerializer uses entity references &quot; and &apos; which cause JavaScript parse errors
- https://issues.apache.org/jira/browse/COCOON-2249
-COCOON-2246 HttpRequest should handle encoding in getParameter and getParameterValues in the same way
- https://issues.apache.org/jira/browse/COCOON-2246
-COCOON-2233 Update archetypes to current trunk artifact versions
- https://issues.apache.org/jira/browse/COCOON-2233
-COCOON-2222 Add SaxParser configuration properties
- https://issues.apache.org/jira/browse/COCOON-2222
-COCOON-2216 IncludeCacheManager cannot perform parallel includes
- https://issues.apache.org/jira/browse/COCOON-2216
-COCOON-2212 jx:attribute does not check name is correct before proceeding
- https://issues.apache.org/jira/browse/COCOON-2212
-COCOON-2197 Making the cocoon-auth-block acegi-security-sample work
- https://issues.apache.org/jira/browse/COCOON-2197
-COCOON-2173 AbstractCachingProcessingPipeline: Two requests can deadlock each other
- https://issues.apache.org/jira/browse/COCOON-2173
-COCOON-2162 [PATCH] Fix for Paginator when accessing out of bounds Pagination page
- https://issues.apache.org/jira/browse/COCOON-2162
-COCOON-2137 XSD Schemas for CForms Development
- https://issues.apache.org/jira/browse/COCOON-2137
-COCOON-2114 fix sorting in TraversableGenerator
- https://issues.apache.org/jira/browse/COCOON-2114
-COCOON-2108 xmodule:flow-attr Does not accept document objects
- https://issues.apache.org/jira/browse/COCOON-2108
-COCOON-2100 Retrieving mimeType returned by pipeline executed from Flow
- https://issues.apache.org/jira/browse/COCOON-2100
-COCOON-2040 Union widget does not work with booleanfield set as case widget
- https://issues.apache.org/jira/browse/COCOON-2040
-COCOON-2037 New DynamicGroup widget
- https://issues.apache.org/jira/browse/COCOON-2037
-COCOON-2032 [PATCH] Sort order in paginated repeater
- https://issues.apache.org/jira/browse/COCOON-2032
-COCOON-2030 submit-on-change doesn't work for a multivaluefield with list-type="checkbox"
- https://issues.apache.org/jira/browse/COCOON-2030
-COCOON-2018 Use thread context class loader to load custom binding classes
- https://issues.apache.org/jira/browse/COCOON-2018
-COCOON-2017 More output beautification options for serializers
- https://issues.apache.org/jira/browse/COCOON-2017
-COCOON-2015 Doctype added twice because root element (html) is inlined
- https://issues.apache.org/jira/browse/COCOON-2015
-COCOON-2002 HTML transformer only works with latin-1 characters
- https://issues.apache.org/jira/browse/COCOON-2002
-COCOON-1974 Donating ContextAttributeInputModule
- https://issues.apache.org/jira/browse/COCOON-1974
-COCOON-1973 CaptchaValidator: allow case-insensitive matching
- https://issues.apache.org/jira/browse/COCOON-1973
-COCOON-1964 Redirects inside a block called via the servlet protocol fail
- https://issues.apache.org/jira/browse/COCOON-1964
-COCOON-1963 Add a redirect action to the browser update handler
- https://issues.apache.org/jira/browse/COCOON-1963
-COCOON-1960 Pipeline errors for "generator/reader already set" should provide more information
- https://issues.apache.org/jira/browse/COCOON-1960
-COCOON-1949 [PATCH] load flowscript from file into specified Rhino context object
- https://issues.apache.org/jira/browse/COCOON-1949
-COCOON-1946 [PATCH] - Javaflow Sample errors trying to enhance Javaflow classes and showing cform templates
- https://issues.apache.org/jira/browse/COCOON-1946
-COCOON-1943 [Patch] Parameters in blocks-protocol URIs get decoded too early
- https://issues.apache.org/jira/browse/COCOON-1943
-COCOON-1932 [PATCH] correct styling of disabled suggestion lists
- https://issues.apache.org/jira/browse/COCOON-1932
-COCOON-1929 [PATCH] Reloading classloader in Cocoon 2.2
- https://issues.apache.org/jira/browse/COCOON-1929
-COCOON-1917 Request Encoding problem: multipart/form vs. url encoded
- https://issues.apache.org/jira/browse/COCOON-1917
-COCOON-1915 Nullable value with additional String or XMLizable in JavaSelectionList
- https://issues.apache.org/jira/browse/COCOON-1915
-COCOON-1914 Text as XMLizable in EmptySelectionList
- https://issues.apache.org/jira/browse/COCOON-1914
-COCOON-1899 [PATCH] Cocoon XML:DB Implementation should not depend on Xindice
- https://issues.apache.org/jira/browse/COCOON-1899
-COCOON-1898 [PATCH] XPatch support for maven-cocoon-deployer-plugin
- https://issues.apache.org/jira/browse/COCOON-1898
-COCOON-1893 XML-Binding: Problem creating a new element
- https://issues.apache.org/jira/browse/COCOON-1893
-COCOON-1877 [PATCH] Pageable Repeater
- https://issues.apache.org/jira/browse/COCOON-1877
-COCOON-1870 Lucene block does not store attributes when instructed so
- https://issues.apache.org/jira/browse/COCOON-1870
-COCOON-1846 [PATCH] BooleanField and radio do not send on-value-changed at the right time with IE
- https://issues.apache.org/jira/browse/COCOON-1846
-COCOON-1843 LDAPTransformer: add-entry tag doesn't work
- https://issues.apache.org/jira/browse/COCOON-1843
-COCOON-1842 LDAPTransformer: ClassCastException with Binary fields
- https://issues.apache.org/jira/browse/COCOON-1842
-COCOON-1810 [PATCH] JMSEventMessageListener does not work
- https://issues.apache.org/jira/browse/COCOON-1810
-COCOON-1807 Workaround for IE Bug in <button>
- https://issues.apache.org/jira/browse/COCOON-1807
-COCOON-1794 [PATCH] Propagation of namespaces to a repeaters child bindings and implementation of a move-node binding
- https://issues.apache.org/jira/browse/COCOON-1794
-COCOON-1738 double-listbox problem in repeaters
- https://issues.apache.org/jira/browse/COCOON-1738
-COCOON-1726 Implementation of Source that supports conditional GETs
- https://issues.apache.org/jira/browse/COCOON-1726
-COCOON-1717 Use custom cache keys for caching uri coplets using input modules.
- https://issues.apache.org/jira/browse/COCOON-1717
-COCOON-1697 Allow request parameters to be used in "for (var k in h)" kind of Javascript Loops
- https://issues.apache.org/jira/browse/COCOON-1697
-COCOON-1648 Add support for ISO8601 in I18nTransformer and Forms
- https://issues.apache.org/jira/browse/COCOON-1648
-COCOON-1618 [PATCH] SoapGenerator/Serializer for Axis Block
- https://issues.apache.org/jira/browse/COCOON-1618
-COCOON-1611 [PATCH] Add additional constructor to FormInstance.java to be able to pass a locale
- https://issues.apache.org/jira/browse/COCOON-1611
-COCOON-1603 [PATCH] handling of alternatives in MailTransformer
- https://issues.apache.org/jira/browse/COCOON-1603
-COCOON-1573 Improvement SetAttributeJXPathBinding and Contribution SetNodeValueJXPathBinding
- https://issues.apache.org/jira/browse/COCOON-1573
-COCOON-1556 [PATCH] Add a JXPathConvertor for conversion between beans and Strings
- https://issues.apache.org/jira/browse/COCOON-1556
-COCOON-1535 [PATCH] enhancement to {global:} input module: return all sitemap globals
- https://issues.apache.org/jira/browse/COCOON-1535
-COCOON-1527 [PATCH] Cache control logic sheets for XSP to override getKey and getValidity
- https://issues.apache.org/jira/browse/COCOON-1527
-COCOON-1526 [PATCH] processToDOM returns a read-only DOM
- https://issues.apache.org/jira/browse/COCOON-1526
-COCOON-1519 [PATCH] TeeTransformer refactoring
- https://issues.apache.org/jira/browse/COCOON-1519
-COCOON-1508 [PATCH] Avalonize TranscoderFactory
- https://issues.apache.org/jira/browse/COCOON-1508
-COCOON-1506 [PATCH] Manually specifying a mounted sitemap's context
- https://issues.apache.org/jira/browse/COCOON-1506
-COCOON-1488 [PATCH] htmlunit-based testing, needs to be ported to 2.2
- https://issues.apache.org/jira/browse/COCOON-1488
-COCOON-1467 ESQL exception handling problem
- https://issues.apache.org/jira/browse/COCOON-1467
-COCOON-1439 [poi] vertical text orientation and font cache
- https://issues.apache.org/jira/browse/COCOON-1439
-COCOON-1398 New CachingPortletAdapter
- https://issues.apache.org/jira/browse/COCOON-1398
-COCOON-1395 [PATCH] Missing ContextAttributeInputModule
- https://issues.apache.org/jira/browse/COCOON-1395
-COCOON-1394 [PATCH] Implementation of PortletRequest#getQueryString()
- https://issues.apache.org/jira/browse/COCOON-1394
-COCOON-1384 [PATCH] flow redirector should allow explicit 'cocoon:' scheme
- https://issues.apache.org/jira/browse/COCOON-1384
-COCOON-1370 [PATCH] proxy block can now use JTidy and handle multipart POST
- https://issues.apache.org/jira/browse/COCOON-1370
-COCOON-1368 [PATCH] HTTPRequestTransformer
- https://issues.apache.org/jira/browse/COCOON-1368
-COCOON-1362 [PATCH] log4j.xconf should have the same default config as logkit.xconf
- https://issues.apache.org/jira/browse/COCOON-1362
-COCOON-1360 [patch] client side validation for CForms
- https://issues.apache.org/jira/browse/COCOON-1360
-COCOON-1345 [PATCH] Extract convertors into their own block
- https://issues.apache.org/jira/browse/COCOON-1345
-COCOON-1340 [PATCH] lucene block contribution : a AnalyzerManager component
- https://issues.apache.org/jira/browse/COCOON-1340
-COCOON-1337 [PATCH] Suggestion for widget population
- https://issues.apache.org/jira/browse/COCOON-1337
-COCOON-1336 [PATCH] PortletWindowAspect: hiding portlet mode icons and new feature "force-sizable"
- https://issues.apache.org/jira/browse/COCOON-1336
-COCOON-1332 [PATCH] content-length and content-type for portlet ActionRequest
- https://issues.apache.org/jira/browse/COCOON-1332
-COCOON-1329 [PATCH] Fix for cocoon.jar bundled in ear common for portal.war and portlet.war
- https://issues.apache.org/jira/browse/COCOON-1329
-COCOON-1325 [PATCH] commons-fileupload based multipart parser
- https://issues.apache.org/jira/browse/COCOON-1325
-COCOON-1302 [Patch] Word Document Generator
- https://issues.apache.org/jira/browse/COCOON-1302
-COCOON-1295 ParallelContentAggregator, multithreaded aggregating
- https://issues.apache.org/jira/browse/COCOON-1295
-COCOON-1260 [PATCH] MultipartParser can now handle multipart/mixed
- https://issues.apache.org/jira/browse/COCOON-1260
-COCOON-1254 [Patch] OWQLTransformer + RDQLTransformer
- https://issues.apache.org/jira/browse/COCOON-1254
-COCOON-1249 [Patch] XMLDBSource should accept scheme://user:***@host:port/path URIs
- https://issues.apache.org/jira/browse/COCOON-1249
-COCOON-1232 [PATCH] NEW--ModuleDB Action for ORACLE( auto. increment )
- https://issues.apache.org/jira/browse/COCOON-1232
-COCOON-1203 [PATCH] inserver junit testing
- https://issues.apache.org/jira/browse/COCOON-1203
-COCOON-1200 [PATCH] XML CSS engine
- https://issues.apache.org/jira/browse/COCOON-1200
-COCOON-1185 [PATCH] BerkeleyDBStore
- https://issues.apache.org/jira/browse/COCOON-1185
-COCOON-1147 [PATCH] namespace issues with XMLDBTransformer
- https://issues.apache.org/jira/browse/COCOON-1147
-COCOON-1125 [PATCH] Updated CastorTransformer + samples
- https://issues.apache.org/jira/browse/COCOON-1125
-COCOON-1027 [PATCH] CocoonBean add additional features for reprocessing pipelines and interrupt processing
- https://issues.apache.org/jira/browse/COCOON-1027
-COCOON-996 [PATCH] LuceneIndexContentHandler.java produces CLOBs
- https://issues.apache.org/jira/browse/COCOON-996
-COCOON-988 [PATCH] StreamGenerator can't handle multipart request parameters correctly
- https://issues.apache.org/jira/browse/COCOON-988
-COCOON-881 [PATCH] file upload component for usage with flowscript
- https://issues.apache.org/jira/browse/COCOON-881
-COCOON-871 [PATCH] XML posting from SourceWritingTransformer by using an enhanced HTTPClientSource
- https://issues.apache.org/jira/browse/COCOON-871
-COCOON-867 [PATCH] wsinclude and htmlinclude transformers
- https://issues.apache.org/jira/browse/COCOON-867
-COCOON-865 [PATCH] New ResourceLoadAction
- https://issues.apache.org/jira/browse/COCOON-865
-COCOON-844 [PATCH] adding <wd:on-phase> and moving load() and save() to Form.
- https://issues.apache.org/jira/browse/COCOON-844
-COCOON-825 [PATCH] Fix Bug: Better handling of CLOB in esql (get-xml) and handling of Oracle 'temporary lobs'
- https://issues.apache.org/jira/browse/COCOON-825
-COCOON-719 [PATCH] Support for transactions in SQLTransformer
- https://issues.apache.org/jira/browse/COCOON-719
-COCOON-717 [PATCH] Namespace cleanup in HTMLSerializer
- https://issues.apache.org/jira/browse/COCOON-717
-COCOON-665 [PATCH] HSSFSerializer Support for FreezePane
- https://issues.apache.org/jira/browse/COCOON-665
-
-You may edit this subscription at:
-https://issues.apache.org/jira/secure/FilterSubscription!default.jspa?subId=10311&filterId=12310771
-
-
-Date: Thu, 9 Sep 2010 17:09:32 -0400 (EDT)
-From: "Douglas Hurbon (JIRA)" <***@apache.org>
-To: ***@cocoon.apache.org
-Subject: [jira] Created: (COCOON-2301) Cocoon Cron Block Configurable Clustering
-
-Cocoon Cron Block Configurable Clustering
------------------------------------------
-
- Key: COCOON-2301
- URL: https://issues.apache.org/jira/browse/COCOON-2301
- Project: Cocoon
- Issue Type: Improvement
- Components: Blocks: Cron
- Affects Versions: 2.1.11
- Reporter: Douglas Hurbon
-
-
-The QuartzJobScheduler is modified to respond to a configuration parameter, clustered=true, so that it can correctly use a clustered job store when Cocoon runs in a cluster.
-
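As a rough illustration of the intent (a sketch only: the clustered parameter name comes from the description above, while the role and element layout in cocoon.xconf are assumptions, not taken from the patch), such a component configuration might look like:

  <component role="org.apache.cocoon.components.cron.JobScheduler"
             class="org.apache.cocoon.components.cron.QuartzJobScheduler">
    <!-- assumed parameter name, per the issue description -->
    <clustered>true</clustered>
  </component>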
-
-
-Date: Thu, 9 Sep 2010 17:11:38 -0400 (EDT)
-From: "Douglas Hurbon (JIRA)" <***@apache.org>
-To: ***@cocoon.apache.org
-Subject: [jira] Updated: (COCOON-2301) Cocoon Cron Block Configurable Clustering
-
-
- [ https://issues.apache.org/jira/browse/COCOON-2301?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
-
-Douglas Hurbon updated COCOON-2301:
------------------------------------
-
- Attachment: QuartzJobScheduler.patch
-
-Patch to make cocoon_2_1_x/src/blocks/cron/java/org/apache/cocoon/components/cron/QuartzJobScheduler.java respond correctly to configuration for clustering.
-
-> Cocoon Cron Block Configurable Clustering
-> -----------------------------------------
->
-> Key: COCOON-2301
-> URL: https://issues.apache.org/jira/browse/COCOON-2301
-> Project: Cocoon
-> Issue Type: Improvement
-> Components: Blocks: Cron
-> Affects Versions: 2.1.11
-> Reporter: Douglas Hurbon
-> Attachments: QuartzJobScheduler.patch
->
->
-> The QuartzJobScheduler is modified to respond to a configuration parameter, clustered=true, so that it can correctly use a clustered job store when Cocoon runs in a cluster.
-
-
-
-Date: Wed, 15 Sep 2010 10:41:34 -0400 (EDT)
-From: ***@apache.org
-To: ***@cocoon.apache.org
-Subject: [jira] Subscription: COCOON-open-with-patch
-
-Issue Subscription
-Filter: COCOON-open-with-patch (115 issues)
-Subscriber: cocoon
-
-Key Summary
-COCOON-2301 Cocoon Cron Block Configurable Clustering
- https://issues.apache.org/jira/browse/COCOON-2301
-[The remaining 114 issues in this digest are identical to the 8 Sep list above, COCOON-2300 through COCOON-665.]
-
-
-
-Date: Fri, 17 Sep 2010 20:14:38 -0400 (EDT)
-From: "Florent ANDRE (JIRA)" <***@apache.org>
-To: ***@cocoon.apache.org
-Subject: [jira] Created: (COCOON-2302) C2.2 : unable to find daisy-..-1.5 jars in rev 959219
-
-C2.2 : unable to find daisy-..-1.5 jars in rev 959219
------------------------------------------------------
-
- Key: COCOON-2302
- URL: https://issues.apache.org/jira/browse/COCOON-2302
- Project: Cocoon
- Issue Type: Bug
- Components: - Build System: Maven
- Affects Versions: 2.2-dev (Current SVN)
- Reporter: Florent ANDRE
-
-
-Hi,
-
-A fresh checkout of Cocoon trunk gives me these errors when running mvn install.
-
-I found this repository (http://daisycms.org/maven/maven2/dev/), but it only contains the 2.5 versions of the libs.
-
-Should we upgrade the dependencies to 2.5, or point at another repository that has the 1.5 versions?
-
-Thanks.
-
-[INFO] Unable to find resource 'daisy:daisy-util:jar:1.5-dev' in repository gkossakowski-maven2 (http://people.apache.org/~gkossakowski/maven2/repository)
-[INFO] ------------------------------------------------------------------------
-[ERROR] BUILD ERROR
-[INFO] ------------------------------------------------------------------------
-[INFO] Failed to resolve artifact.
-
-Missing:
-----------
-1) daisy:daisy-repository-api:jar:1.5-dev
-
- Try downloading the file manually from the project website.
-
- Then, install it using the command:
- mvn install:install-file -DgroupId=daisy -DartifactId=daisy-repository-api -Dversion=1.5-dev -Dpackaging=jar -Dfile=/path/to/file
-
- Alternatively, if you host your own repository you can deploy the file there:
- mvn deploy:deploy-file -DgroupId=daisy -DartifactId=daisy-repository-api -Dversion=1.5-dev -Dpackaging=jar -Dfile=/path/to/file -Durl=[url] -DrepositoryId=[id]
-
- Path to dependency:
- 1) org.apache.cocoon:cocoon-sitemaptags2daisy-plugin:maven-plugin:1.0.0-SNAPSHOT
- 2) daisy:daisy-repository-api:jar:1.5-dev
-
-2) nekodtd:nekodtd:jar:0.1.11
-
- Try downloading the file manually from the project website.
-
- Then, install it using the command:
- mvn install:install-file -DgroupId=nekodtd -DartifactId=nekodtd -Dversion=0.1.11 -Dpackaging=jar -Dfile=/path/to/file
-
- Alternatively, if you host your own repository you can deploy the file there:
- mvn deploy:deploy-file -DgroupId=nekodtd -DartifactId=nekodtd -Dversion=0.1.11 -Dpackaging=jar -Dfile=/path/to/file -Durl=[url] -DrepositoryId=[id]
-
- Path to dependency:
- 1) org.apache.cocoon:cocoon-sitemaptags2daisy-plugin:maven-plugin:1.0.0-SNAPSHOT
- 2) nekodtd:nekodtd:jar:0.1.11
-
-3) daisy:daisy-repository-xmlschema-bindings:jar:1.5-dev
-
- Try downloading the file manually from the project website.
-
- Then, install it using the command:
- mvn install:install-file -DgroupId=daisy -DartifactId=daisy-repository-xmlschema-bindings -Dversion=1.5-dev -Dpackaging=jar -Dfile=/path/to/file
-
- Alternatively, if you host your own repository you can deploy the file there:
- mvn deploy:deploy-file -DgroupId=daisy -DartifactId=daisy-repository-xmlschema-bindings -Dversion=1.5-dev -Dpackaging=jar -Dfile=/path/to/file -Durl=[url] -DrepositoryId=[id]
-
- Path to dependency:
- 1) org.apache.cocoon:cocoon-sitemaptags2daisy-plugin:maven-plugin:1.0.0-SNAPSHOT
- 2) daisy:daisy-repository-xmlschema-bindings:jar:1.5-dev
-
-4) daisy:daisy-repository-client-impl:jar:1.5-dev
-
- Try downloading the file manually from the project website.
-
- Then, install it using the command:
- mvn install:install-file -DgroupId=daisy -DartifactId=daisy-repository-client-impl -Dversion=1.5-dev -Dpackaging=jar -Dfile=/path/to/file
-
- Alternatively, if you host your own repository you can deploy the file there:
- mvn deploy:deploy-file -DgroupId=daisy -DartifactId=daisy-repository-client-impl -Dversion=1.5-dev -Dpackaging=jar -Dfile=/path/to/file -Durl=[url] -DrepositoryId=[id]
-
- Path to dependency:
- 1) org.apache.cocoon:cocoon-sitemaptags2daisy-plugin:maven-plugin:1.0.0-SNAPSHOT
- 2) daisy:daisy-repository-client-impl:jar:1.5-dev
-
-5) daisy:daisy-repository-common-impl:jar:1.5-dev
-
- Try downloading the file manually from the project website.
-
- Then, install it using the command:
- mvn install:install-file -DgroupId=daisy -DartifactId=daisy-repository-common-impl -Dversion=1.5-dev -Dpackaging=jar -Dfile=/path/to/file
-
- Alternatively, if you host your own repository you can deploy the file there:
- mvn deploy:deploy-file -DgroupId=daisy -DartifactId=daisy-repository-common-impl -Dversion=1.5-dev -Dpackaging=jar -Dfile=/path/to/file -Durl=[url] -DrepositoryId=[id]
-
- Path to dependency:
- 1) org.apache.cocoon:cocoon-sitemaptags2daisy-plugin:maven-plugin:1.0.0-SNAPSHOT
- 2) daisy:daisy-repository-common-impl:jar:1.5-dev
-
-6) daisy:daisy-repository-spi:jar:1.5-dev
-
- Try downloading the file manually from the project website.
-
- Then, install it using the command:
- mvn install:install-file -DgroupId=daisy -DartifactId=daisy-repository-spi -Dversion=1.5-dev -Dpackaging=jar -Dfile=/path/to/file
-
- Alternatively, if you host your own repository you can deploy the file there:
- mvn deploy:deploy-file -DgroupId=daisy -DartifactId=daisy-repository-spi -Dversion=1.5-dev -Dpackaging=jar -Dfile=/path/to/file -Durl=[url] -DrepositoryId=[id]
-
- Path to dependency:
- 1) org.apache.cocoon:cocoon-sitemaptags2daisy-plugin:maven-plugin:1.0.0-SNAPSHOT
- 2) daisy:daisy-repository-spi:jar:1.5-dev
-
-7) daisy:daisy-jmsclient-api:jar:1.5-dev
-
- Try downloading the file manually from the project website.
-
- Then, install it using the command:
- mvn install:install-file -DgroupId=daisy -DartifactId=daisy-jmsclient-api -Dversion=1.5-dev -Dpackaging=jar -Dfile=/path/to/file
-
- Alternatively, if you host your own repository you can deploy the file there:
- mvn deploy:deploy-file -DgroupId=daisy -DartifactId=daisy-jmsclient-api -Dversion=1.5-dev -Dpackaging=jar -Dfile=/path/to/file -Durl=[url] -DrepositoryId=[id]
-
- Path to dependency:
- 1) org.apache.cocoon:cocoon-sitemaptags2daisy-plugin:maven-plugin:1.0.0-SNAPSHOT
- 2) daisy:daisy-jmsclient-api:jar:1.5-dev
-
-8) daisy:daisy-htmlcleaner:jar:1.5-dev
-
- Try downloading the file manually from the project website.
-
- Then, install it using the command:
- mvn install:install-file -DgroupId=daisy -DartifactId=daisy-htmlcleaner -Dversion=1.5-dev -Dpackaging=jar -Dfile=/path/to/file
-
- Alternatively, if you host your own repository you can deploy the file there:
- mvn deploy:deploy-file -DgroupId=daisy -DartifactId=daisy-htmlcleaner -Dversion=1.5-dev -Dpackaging=jar -Dfile=/path/to/file -Durl=[url] -DrepositoryId=[id]
-
- Path to dependency:
- 1) org.apache.cocoon:cocoon-sitemaptags2daisy-plugin:maven-plugin:1.0.0-SNAPSHOT
- 2) daisy:daisy-htmlcleaner:jar:1.5-dev
-
-9) daisy:daisy-util:jar:1.5-dev
-
- Try downloading the file manually from the project website.
-
- Then, install it using the command:
- mvn install:install-file -DgroupId=daisy -DartifactId=daisy-util -Dversion=1.5-dev -Dpackaging=jar -Dfile=/path/to/file
-
- Alternatively, if you host your own repository you can deploy the file there:
- mvn deploy:deploy-file -DgroupId=daisy -DartifactId=daisy-util -Dversion=1.5-dev -Dpackaging=jar -Dfile=/path/to/file -Durl=[url] -DrepositoryId=[id]
-
- Path to dependency:
- 1) org.apache.cocoon:cocoon-sitemaptags2daisy-plugin:maven-plugin:1.0.0-SNAPSHOT
- 2) daisy:daisy-util:jar:1.5-dev
-
-----------
-9 required artifacts are missing.
-
-for artifact:
-
-org.apache.cocoon:cocoon-sitemaptags2daisy-plugin:maven-plugin:1.0.0-SNAPSHOT
-
-from the specified remote repositories:
- apache.snapshots (http://people.apache.org/repo/m2-snapshot-repository),
- central (http://repo1.maven.org/maven2),
- maven-snapshot (http://snapshots.maven.codehaus.org/maven2/),
- cocoondev (http://cocoondev.org/repository),
- gkossakowski-maven2 (http://people.apache.org/~gkossakowski/maven2/repository)
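As a concrete instance of the workaround suggested above (the jar filename here is hypothetical), the first missing artifact would be installed into the local repository with:

  mvn install:install-file -DgroupId=daisy -DartifactId=daisy-repository-api \
    -Dversion=1.5-dev -Dpackaging=jar -Dfile=daisy-repository-api-1.5-dev.jar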
-
-
-
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/pom.xml
----------------------------------------------------------------------
diff --git a/math-scala/pom.xml b/math-scala/pom.xml
deleted file mode 100644
index 0d74e32..0000000
--- a/math-scala/pom.xml
+++ /dev/null
@@ -1,244 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
- <modelVersion>4.0.0</modelVersion>
-
- <parent>
- <groupId>org.apache.mahout</groupId>
- <artifactId>mahout</artifactId>
- <version>0.13.1-SNAPSHOT</version>
- <relativePath>../pom.xml</relativePath>
- </parent>
-
- <artifactId>mahout-math-scala_2.10</artifactId>
- <name>Mahout Math Scala bindings</name>
- <description>High performance scientific and technical computing data structures and methods,
- mostly based on CERN's
- Colt Java API
- </description>
-
- <packaging>jar</packaging>
-
- <build>
-
- <plugins>
- <!-- create test jar so other modules can reuse the math-scala test utility classes. -->
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-jar-plugin</artifactId>
- <executions>
- <execution>
- <goals>
- <goal>test-jar</goal>
- </goals>
- <phase>package</phase>
- </execution>
- </executions>
- </plugin>
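For context, a module that wants these test utilities would typically declare a test-jar dependency on this artifact; a minimal sketch, using the coordinates from this pom (the consuming module itself is hypothetical):

  <dependency>
    <groupId>org.apache.mahout</groupId>
    <artifactId>mahout-math-scala_2.10</artifactId>
    <version>0.13.1-SNAPSHOT</version>
    <type>test-jar</type>
    <scope>test</scope>
  </dependency>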
-
- <plugin>
- <artifactId>maven-javadoc-plugin</artifactId>
- </plugin>
-
- <plugin>
- <artifactId>maven-source-plugin</artifactId>
- </plugin>
-
- <plugin>
- <groupId>net.alchim31.maven</groupId>
- <artifactId>scala-maven-plugin</artifactId>
- <executions>
- <execution>
- <id>add-scala-sources</id>
- <phase>initialize</phase>
- <goals>
- <goal>add-source</goal>
- </goals>
- </execution>
- <execution>
- <id>scala-compile</id>
- <phase>process-resources</phase>
- <goals>
- <goal>compile</goal>
- </goals>
- </execution>
- <execution>
- <id>scala-test-compile</id>
- <phase>process-test-resources</phase>
- <goals>
- <goal>testCompile</goal>
- </goals>
- </execution>
- </executions>
- </plugin>
- <!-- copy jars to top directory, which is MAHOUT_HOME -->
- <plugin>
- <artifactId>maven-antrun-plugin</artifactId>
- <version>1.4</version>
- <executions>
- <execution>
- <id>copy</id>
- <phase>package</phase>
- <configuration>
- <tasks>
- <copy file="target/mahout-math-scala_${scala.compat.version}-${version}.jar" tofile="../mahout-math-scala_${scala.compat.version}-${version}.jar" />
- </tasks>
- </configuration>
- <goals>
- <goal>run</goal>
- </goals>
- </execution>
- </executions>
- </plugin>
-<!-- this is what ScalaTest recommends for enabling Scala tests -->
-
- <!-- disable surefire -->
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-surefire-plugin</artifactId>
- <configuration>
- <skipTests>true</skipTests>
- </configuration>
- </plugin>
- <!-- enable scalatest -->
- <plugin>
- <groupId>org.scalatest</groupId>
- <artifactId>scalatest-maven-plugin</artifactId>
- <configuration>
- <argLine>-Xmx4g</argLine>
- </configuration>
- <executions>
- <execution>
- <id>test</id>
- <goals>
- <goal>test</goal>
- </goals>
- </execution>
- </executions>
- </plugin>
- <!-- remove jars from top directory on clean -->
- <plugin>
- <artifactId>maven-clean-plugin</artifactId>
- <version>3.0.0</version>
- <configuration>
- <filesets>
- <fileset>
- <directory>../</directory>
- <includes>
- <include>mahout-math-scala*.jar</include>
- </includes>
- <followSymlinks>false</followSymlinks>
- </fileset>
- </filesets>
- </configuration>
- </plugin>
- </plugins>
- </build>
-
- <dependencies>
-
- <dependency>
- <groupId>org.apache.mahout</groupId>
- <artifactId>mahout-math</artifactId>
- </dependency>
-
- <dependency>
- <groupId>com.esotericsoftware.kryo</groupId>
- <artifactId>kryo</artifactId>
- <version>2.24.0</version>
- </dependency>
-
- <!-- 3rd-party -->
- <dependency>
- <groupId>log4j</groupId>
- <artifactId>log4j</artifactId>
- </dependency>
-
- <dependency>
- <groupId>com.github.scopt</groupId>
- <artifactId>scopt_${scala.compat.version}</artifactId>
- <version>3.3.0</version>
- </dependency>
-
-
- <!-- scala stuff -->
- <dependency>
- <groupId>org.scalatest</groupId>
- <artifactId>scalatest_${scala.compat.version}</artifactId>
- </dependency>
-
- <dependency>
- <groupId>org.scala-lang</groupId>
- <artifactId>scala-reflect</artifactId>
- <version>${scala.version}</version>
- </dependency>
-
- </dependencies>
- <profiles>
- <profile>
- <id>mahout-release</id>
- <build>
- <plugins>
- <plugin>
- <groupId>net.alchim31.maven</groupId>
- <artifactId>scala-maven-plugin</artifactId>
- <executions>
- <execution>
- <id>generate-scaladoc</id>
- <goals>
- <goal>doc</goal>
- </goals>
- </execution>
- <execution>
- <id>attach-scaladoc-jar</id>
- <goals>
- <goal>doc-jar</goal>
- </goals>
- </execution>
- </executions>
- </plugin>
- </plugins>
- </build>
- </profile>
- <profile>
- <id>travis</id>
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-surefire-plugin</artifactId>
- <configuration>
- <!-- Limit memory for unit tests in Travis -->
- <argLine>-Xmx3g</argLine>
- </configuration>
- </plugin>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-failsafe-plugin</artifactId>
- <configuration>
- <!-- Limit memory for integration tests in Travis -->
- <argLine>-Xmx3g</argLine>
- </configuration>
- </plugin>
- </plugins>
- </build>
- </profile>
- </profiles>
-</project>

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/main/scala/org/apache/mahout/classifier/naivebayes/NBClassifier.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/classifier/naivebayes/NBClassifier.scala b/math-scala/src/main/scala/org/apache/mahout/classifier/naivebayes/NBClassifier.scala
deleted file mode 100644
index 6f8ecb3..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/classifier/naivebayes/NBClassifier.scala
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-package org.apache.mahout.classifier.naivebayes
-
-import org.apache.mahout.math.Vector
-import scala.collection.JavaConversions._
-
-/**
- * Abstract classifier base for the Complementary and Standard classifiers
- * @param nbModel a trained NBModel
- */
-abstract class AbstractNBClassifier(nbModel: NBModel) extends java.io.Serializable {
-
- // Trained Naive Bayes Model
- val model = nbModel
-
- /** scoring method for standard and complementary classifiers */
- protected def getScoreForLabelFeature(label: Int, feature: Int): Double
-
- /** getter for model */
- protected def getModel: NBModel = {
- model
- }
-
- /**
- * Compute the score for a Vector of weighted TF-IDF features
- * @param label Label to be scored
- * @param instance Vector of weights from which to calculate the score
- * @return score for this Label
- */
- protected def getScoreForLabelInstance(label: Int, instance: Vector): Double = {
- var result: Double = 0.0
- for (e <- instance.nonZeroes) {
- result += e.get * getScoreForLabelFeature(label, e.index)
- }
- result
- }
-
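Written out, the loop above computes, for a label l and an instance vector x,

  score(l, x) = \sum_{i : x_i \neq 0} x_i \cdot w(l, i)

where w(l, i) is the classifier-specific weight returned by getScoreForLabelFeature; only the nonzero entries of x contribute.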
- /** number of categories the model has been trained on */
- def numCategories: Int = {
- model.numLabels
- }
-
- /**
- * get a scoring vector for a vector of TF or TF-IDF weights
- * @param instance vector of TF or TF-IDF weights to be classified
- * @return a vector of scores.
- */
- def classifyFull(instance: Vector): Vector = {
- classifyFull(model.createScoringVector, instance)
- }
-
- /** helper method for classifyFull(Vector) */
- def classifyFull(r: Vector, instance: Vector): Vector = {
- for (label <- 0 until model.numLabels) {
- r.setQuick(label, getScoreForLabelInstance(label, instance))
- }
- r
- }
-}
-
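As a brief usage sketch (model and doc are hypothetical names for a trained NBModel and a TF-IDF document vector; StandardNBClassifier is defined just below):

  val classifier = new StandardNBClassifier(model)
  val scores: Vector = classifier.classifyFull(doc) // one score per label
  val bestLabel: Int = scores.maxValueIndex()       // index of the highest-scoring label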
-/**
- * Standard Multinomial Naive Bayes Classifier
- * @param nbModel a trained NBModel
- */
-class StandardNBClassifier(nbModel: NBModel) extends AbstractNBClassifier(nbModel) with java.io.Serializable {
- override def getScoreForLabelFeature(label: Int, feature: Int): Double = {
- val model: NBModel = getModel
- StandardNBClassifier.computeWeight(model.weight(label, feature), model.labelWeight(label), model.alphaI, model.numFeatures)
- }
-}
-
-/** helper object for StandardNBClassifier */
-object StandardNBClassifier extends java.io.Serializable {
- /** Compute Standard Multinomial Naive Bayes weights; see Rennie et al., Section 2.1. */
- def computeWeight(featureLabelWeight: Double, labelWeight: Double, alphaI: Double, numFeatures: Double): Double = {
- val numerator: Double = featureLabelWeight + alphaI
- val denominator: Double = labelWeight + alphaI * numFeatures
- Math.log(numerator / denominator)
- }
-}
-
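In the notation of Rennie et al. (Section 2.1), computeWeight evaluates the smoothed log-likelihood

  w_{lf} = \log \frac{W_{lf} + \alpha_i}{W_l + \alpha_i \, n}

where W_{lf} is featureLabelWeight, W_l is labelWeight, \alpha_i is alphaI, and n is numFeatures.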
-/**
- * Complementary Naive Bayes Classifier
- * @param nbModel a trained NBModel
- */
-class ComplementaryNBClassifier(nbModel: NBModel) extends AbstractNBClassifier(nbModel) with java.io.Serializable {
- override def getScoreForLabelFeature(label: Int, feature: Int): Double = {
- val model: NBModel = getModel
- val weight: Double = ComplementaryNBClassifier.computeWeight(model.featureWeight(feature), model.weight(label, feature), model.totalWeightSum, model.labelWeight(label), model.alphaI, model.numFeatures)
- weight / model.thetaNormalizer(label)
- }
-}
-
-/** helper object for ComplementaryNBClassifier */
-object ComplementaryNBClassifier extends java.io.Serializable {
-
- /** Compute Complementary Naive Bayes weights; see Rennie et al., Section 3.1. */
- def computeWeight(featureWeight: Double, featureLabelWeight: Double, totalWeight: Double, labelWeight: Double, alphaI: Double, numFeatures: Double): Double = {
- val numerator: Double = featureWeight - featureLabelWeight + alphaI
- val denominator: Double = totalWeight - labelWeight + alphaI * numFeatures
- -Math.log(numerator / denominator)
- }
-}
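Likewise, the complementary weight of Rennie et al. (Section 3.1) computed above is

  w_{lf} = -\log \frac{W_f - W_{lf} + \alpha_i}{W - W_l + \alpha_i \, n}

where W_f is featureWeight and W is totalWeightSum; ComplementaryNBClassifier then divides this weight by the per-label normalizer thetaNormalizer(label).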
r***@apache.org
2018-06-27 14:51:58 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/text/WikipediaToSequenceFile.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/text/WikipediaToSequenceFile.java b/integration/src/main/java/org/apache/mahout/text/WikipediaToSequenceFile.java
deleted file mode 100644
index bed4640..0000000
--- a/integration/src/main/java/org/apache/mahout/text/WikipediaToSequenceFile.java
+++ /dev/null
@@ -1,210 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.text;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.HashSet;
-import java.util.Locale;
-import java.util.Set;
-
-import org.apache.commons.cli2.CommandLine;
-import org.apache.commons.cli2.Group;
-import org.apache.commons.cli2.Option;
-import org.apache.commons.cli2.OptionException;
-import org.apache.commons.cli2.builder.ArgumentBuilder;
-import org.apache.commons.cli2.builder.DefaultOptionBuilder;
-import org.apache.commons.cli2.builder.GroupBuilder;
-import org.apache.commons.cli2.commandline.Parser;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.DefaultStringifier;
-import org.apache.hadoop.io.Stringifier;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.Reducer;
-import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
-import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
-import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
-import org.apache.hadoop.util.GenericsUtil;
-import org.apache.mahout.common.CommandLineUtil;
-import org.apache.mahout.common.HadoopUtil;
-import org.apache.mahout.common.commandline.DefaultOptionCreator;
-import org.apache.mahout.common.iterator.FileLineIterable;
-import org.apache.mahout.text.wikipedia.WikipediaMapper;
-import org.apache.mahout.text.wikipedia.XmlInputFormat;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * Converts a Wikipedia XML dump into a {@link org.apache.hadoop.io.SequenceFile} of documents
- * keyed by category and title.
- */
-public final class WikipediaToSequenceFile {
-
- private static final Logger log = LoggerFactory.getLogger(WikipediaToSequenceFile.class);
-
- private WikipediaToSequenceFile() { }
-
- /**
- * Takes in two arguments:
- * <ol>
- * <li>The input {@link org.apache.hadoop.fs.Path} where the input documents live</li>
- * <li>The output {@link org.apache.hadoop.fs.Path} where to write the documents as a
- * {@link org.apache.hadoop.io.SequenceFile}</li>
- * </ol>
- */
- public static void main(String[] args) throws IOException {
- DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
- ArgumentBuilder abuilder = new ArgumentBuilder();
- GroupBuilder gbuilder = new GroupBuilder();
-
- Option dirInputPathOpt = DefaultOptionCreator.inputOption().create();
-
- Option dirOutputPathOpt = DefaultOptionCreator.outputOption().create();
-
- Option categoriesOpt = obuilder.withLongName("categories").withArgument(
- abuilder.withName("categories").withMinimum(1).withMaximum(1).create()).withDescription(
- "Location of the categories file. One entry per line. "
- + "Will be used to make a string match in Wikipedia Category field").withShortName("c").create();
-
- Option exactMatchOpt = obuilder.withLongName("exactMatch").withDescription(
- "If set, then the category name must exactly match the "
- + "entry in the categories file. Default is false").withShortName("e").create();
-
- Option allOpt = obuilder.withLongName("all")
- .withDescription("If set, Select all files. Default is false").withShortName("all").create();
-
- Option removeLabelOpt = obuilder.withLongName("removeLabels")
- .withDescription("If set, remove [[Category:labels]] from document text after extracting label."
- + "Default is false").withShortName("rl").create();
-
- Option helpOpt = DefaultOptionCreator.helpOption();
-
- Group group = gbuilder.withName("Options").withOption(categoriesOpt).withOption(dirInputPathOpt)
- .withOption(dirOutputPathOpt).withOption(exactMatchOpt).withOption(allOpt).withOption(helpOpt)
- .withOption(removeLabelOpt).create();
-
- Parser parser = new Parser();
- parser.setGroup(group);
- parser.setHelpOption(helpOpt);
- try {
- CommandLine cmdLine = parser.parse(args);
- if (cmdLine.hasOption(helpOpt)) {
- CommandLineUtil.printHelp(group);
- return;
- }
-
- String inputPath = (String) cmdLine.getValue(dirInputPathOpt);
- String outputPath = (String) cmdLine.getValue(dirOutputPathOpt);
-
- String catFile = "";
- if (cmdLine.hasOption(categoriesOpt)) {
- catFile = (String) cmdLine.getValue(categoriesOpt);
- }
-
- boolean all = false;
- if (cmdLine.hasOption(allOpt)) {
- all = true;
- }
-
- boolean removeLabels = false;
- if (cmdLine.hasOption(removeLabelOpt)) {
- removeLabels = true;
- }
-
- runJob(inputPath, outputPath, catFile, cmdLine.hasOption(exactMatchOpt), all, removeLabels);
- } catch (OptionException | InterruptedException | ClassNotFoundException e) {
- log.error("Exception", e);
- CommandLineUtil.printHelp(group);
- }
- }
-
- /**
- * Run the job
- *
- * @param input
- * the input pathname String
- * @param output
- * the output pathname String
- * @param catFile
- * the file containing the Wikipedia categories
- * @param exactMatchOnly
- * if true, then the Wikipedia category must match exactly instead of simply containing the
- * category string
- * @param all
- * if true select all categories
- * @param removeLabels
- * if true remove Category labels from document text after extracting.
- *
- */
- public static void runJob(String input,
- String output,
- String catFile,
- boolean exactMatchOnly,
- boolean all,
- boolean removeLabels) throws IOException, InterruptedException, ClassNotFoundException {
- Configuration conf = new Configuration();
- conf.set("xmlinput.start", "<page>");
- conf.set("xmlinput.end", "</page>");
- conf.setBoolean("exact.match.only", exactMatchOnly);
- conf.setBoolean("all.files", all);
- conf.setBoolean("remove.labels", removeLabels);
- conf.set("io.serializations",
- "org.apache.hadoop.io.serializer.JavaSerialization,"
- + "org.apache.hadoop.io.serializer.WritableSerialization");
-
- Set<String> categories = new HashSet<>();
- if (!catFile.isEmpty()) {
- for (String line : new FileLineIterable(new File(catFile))) {
- categories.add(line.trim().toLowerCase(Locale.ENGLISH));
- }
- }
-
- Stringifier<Set<String>> setStringifier =
- new DefaultStringifier<>(conf, GenericsUtil.getClass(categories));
-
- String categoriesStr = setStringifier.toString(categories);
- conf.set("wikipedia.categories", categoriesStr);
-
- Job job = new Job(conf);
- log.info("Input: {} Out: {} Categories: {} All Files: {}", input, output, catFile, all);
- job.setOutputKeyClass(Text.class);
- job.setOutputValueClass(Text.class);
- FileInputFormat.setInputPaths(job, new Path(input));
- Path outPath = new Path(output);
- FileOutputFormat.setOutputPath(job, outPath);
- job.setMapperClass(WikipediaMapper.class);
- job.setInputFormatClass(XmlInputFormat.class);
- job.setReducerClass(Reducer.class);
- job.setOutputFormatClass(SequenceFileOutputFormat.class);
- job.setJarByClass(WikipediaToSequenceFile.class);
-
- /*
- * conf.set("mapred.compress.map.output", "true"); conf.set("mapred.map.output.compression.type",
- * "BLOCK"); conf.set("mapred.output.compress", "true"); conf.set("mapred.output.compression.type",
- * "BLOCK"); conf.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
- */
- HadoopUtil.delete(conf, outPath);
-
- boolean succeeded = job.waitForCompletion(true);
- if (!succeeded) {
- throw new IllegalStateException("Job failed!");
- }
-
- }
-}
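
For reference, the deleted runJob(...) could also be driven directly, without the CLI parsing in main(). A hedged sketch; every path below is a placeholder:

import java.io.IOException;

public class RunWikipediaToSeq {
  public static void main(String[] args)
      throws IOException, InterruptedException, ClassNotFoundException {
    // Same parameters, in order, as the runJob signature above.
    org.apache.mahout.text.WikipediaToSequenceFile.runJob(
        "/wikipedia/chunks",          // input directory of XML chunks
        "/wikipedia/seqfiles",        // output SequenceFile directory
        "/wikipedia/categories.txt",  // one category per line ("" selects none)
        false,                        // exactMatchOnly
        true,                         // all: keep every document
        false);                       // removeLabels
  }
}
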

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaAnalyzer.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaAnalyzer.java b/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaAnalyzer.java
deleted file mode 100644
index d50323d..0000000
--- a/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaAnalyzer.java
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.text.wikipedia;
-
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.core.LowerCaseFilter;
-import org.apache.lucene.analysis.core.StopAnalyzer;
-import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.standard.StandardFilter;
-import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
-import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer;
-
-
-public class WikipediaAnalyzer extends StopwordAnalyzerBase {
-
- public WikipediaAnalyzer() {
- super(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
- }
-
- public WikipediaAnalyzer(CharArraySet stopSet) {
- super(stopSet);
- }
-
- @Override
- protected TokenStreamComponents createComponents(String fieldName) {
- Tokenizer tokenizer = new WikipediaTokenizer();
- TokenStream result = new StandardFilter(tokenizer);
- result = new LowerCaseFilter(result);
- result = new StopFilter(result, getStopwordSet());
- return new TokenStreamComponents(tokenizer, result);
- }
-}
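
The chain built above (WikipediaTokenizer -> StandardFilter -> LowerCaseFilter -> StopFilter) is consumed like any other Lucene analyzer. A minimal sketch, assuming only the Lucene APIs this class already imports; the field name and sample text are made up:

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.mahout.text.wikipedia.WikipediaAnalyzer;

public class AnalyzerDemo {
  public static void main(String[] args) throws IOException {
    try (Analyzer analyzer = new WikipediaAnalyzer()) {
      TokenStream stream = analyzer.tokenStream("body",
          new StringReader("The [[quick]] brown fox jumped over the lazy dog"));
      CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
      stream.reset();
      while (stream.incrementToken()) {
        System.out.println(term.toString()); // lower-cased tokens, English stop words removed
      }
      stream.end();
      stream.close();
    }
  }
}
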

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaDatasetCreatorDriver.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaDatasetCreatorDriver.java b/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaDatasetCreatorDriver.java
deleted file mode 100644
index 8214407..0000000
--- a/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaDatasetCreatorDriver.java
+++ /dev/null
@@ -1,190 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.text.wikipedia;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.HashSet;
-import java.util.Locale;
-import java.util.Set;
-
-import org.apache.commons.cli2.CommandLine;
-import org.apache.commons.cli2.Group;
-import org.apache.commons.cli2.Option;
-import org.apache.commons.cli2.OptionException;
-import org.apache.commons.cli2.builder.ArgumentBuilder;
-import org.apache.commons.cli2.builder.DefaultOptionBuilder;
-import org.apache.commons.cli2.builder.GroupBuilder;
-import org.apache.commons.cli2.commandline.Parser;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.DefaultStringifier;
-import org.apache.hadoop.io.Stringifier;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
-import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
-import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
-import org.apache.hadoop.util.GenericsUtil;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.mahout.common.ClassUtils;
-import org.apache.mahout.common.CommandLineUtil;
-import org.apache.mahout.common.HadoopUtil;
-import org.apache.mahout.common.commandline.DefaultOptionCreator;
-import org.apache.mahout.common.iterator.FileLineIterable;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * Create and run the Wikipedia Dataset Creator.
- */
-public final class WikipediaDatasetCreatorDriver {
- private static final Logger log = LoggerFactory.getLogger(WikipediaDatasetCreatorDriver.class);
-
- private WikipediaDatasetCreatorDriver() { }
-
- /**
- * Takes in two arguments:
- * <ol>
- * <li>The input {@link org.apache.hadoop.fs.Path} where the input documents live</li>
- * <li>The output {@link org.apache.hadoop.fs.Path} where to write the generated dataset as text</li>
- * </ol>
- */
- public static void main(String[] args) throws IOException, InterruptedException {
- DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
- ArgumentBuilder abuilder = new ArgumentBuilder();
- GroupBuilder gbuilder = new GroupBuilder();
-
- Option dirInputPathOpt = DefaultOptionCreator.inputOption().create();
-
- Option dirOutputPathOpt = DefaultOptionCreator.outputOption().create();
-
- Option categoriesOpt = obuilder.withLongName("categories").withRequired(true).withArgument(
- abuilder.withName("categories").withMinimum(1).withMaximum(1).create()).withDescription(
- "Location of the categories file. One entry per line. "
- + "Will be used to make a string match in Wikipedia Category field").withShortName("c").create();
-
- Option exactMatchOpt = obuilder.withLongName("exactMatch").withDescription(
- "If set, then the category name must exactly match the "
- + "entry in the categories file. Default is false").withShortName("e").create();
- Option analyzerOpt = obuilder.withLongName("analyzer").withRequired(false).withArgument(
- abuilder.withName("analyzer").withMinimum(1).withMaximum(1).create()).withDescription(
- "The analyzer to use, must have a no argument constructor").withShortName("a").create();
- Option helpOpt = DefaultOptionCreator.helpOption();
-
- Group group = gbuilder.withName("Options").withOption(categoriesOpt).withOption(dirInputPathOpt)
- .withOption(dirOutputPathOpt).withOption(exactMatchOpt).withOption(analyzerOpt).withOption(helpOpt)
- .create();
-
- Parser parser = new Parser();
- parser.setGroup(group);
- try {
- CommandLine cmdLine = parser.parse(args);
- if (cmdLine.hasOption(helpOpt)) {
- CommandLineUtil.printHelp(group);
- return;
- }
-
- String inputPath = (String) cmdLine.getValue(dirInputPathOpt);
- String outputPath = (String) cmdLine.getValue(dirOutputPathOpt);
- String catFile = (String) cmdLine.getValue(categoriesOpt);
- Class<? extends Analyzer> analyzerClass = WikipediaAnalyzer.class;
- if (cmdLine.hasOption(analyzerOpt)) {
- String className = cmdLine.getValue(analyzerOpt).toString();
- analyzerClass = Class.forName(className).asSubclass(Analyzer.class);
- // try instantiating it, b/c there isn't any point in setting it if
- // you can't instantiate it
- ClassUtils.instantiateAs(analyzerClass, Analyzer.class);
- }
- runJob(inputPath, outputPath, catFile, cmdLine.hasOption(exactMatchOpt),
- analyzerClass);
- } catch (OptionException | ClassNotFoundException e) {
- log.error("Exception", e);
- CommandLineUtil.printHelp(group);
- }
- }
-
- /**
- * Run the job
- *
- * @param input
- * the input pathname String
- * @param output
- * the output pathname String
- * @param catFile
- * the file containing the Wikipedia categories
- * @param exactMatchOnly
- * if true, then the Wikipedia category must match exactly instead of simply containing the
- * category string
- */
- public static void runJob(String input,
- String output,
- String catFile,
- boolean exactMatchOnly,
- Class<? extends Analyzer> analyzerClass)
- throws IOException, InterruptedException, ClassNotFoundException {
- Configuration conf = new Configuration();
- conf.set("key.value.separator.in.input.line", " ");
- conf.set("xmlinput.start", "<page>");
- conf.set("xmlinput.end", "</page>");
- conf.setBoolean("exact.match.only", exactMatchOnly);
- conf.set("analyzer.class", analyzerClass.getName());
- conf.set("io.serializations",
- "org.apache.hadoop.io.serializer.JavaSerialization,"
- + "org.apache.hadoop.io.serializer.WritableSerialization");
- // Don't ever forget this: Hadoop conf parameters can make or break a piece of code.
-
- Set<String> categories = new HashSet<>();
- for (String line : new FileLineIterable(new File(catFile))) {
- categories.add(line.trim().toLowerCase(Locale.ENGLISH));
- }
-
- Stringifier<Set<String>> setStringifier =
- new DefaultStringifier<>(conf, GenericsUtil.getClass(categories));
-
- String categoriesStr = setStringifier.toString(categories);
-
- conf.set("wikipedia.categories", categoriesStr);
-
- Job job = new Job(conf);
- log.info("Input: {} Out: {} Categories: {}", input, output, catFile);
- job.setJarByClass(WikipediaDatasetCreatorDriver.class);
- job.setOutputKeyClass(Text.class);
- job.setOutputValueClass(Text.class);
- job.setMapperClass(WikipediaDatasetCreatorMapper.class);
- //TODO: job.setNumMapTasks(100);
- job.setInputFormatClass(XmlInputFormat.class);
- job.setReducerClass(WikipediaDatasetCreatorReducer.class);
- job.setOutputFormatClass(TextOutputFormat.class);
-
- FileInputFormat.setInputPaths(job, new Path(input));
- Path outPath = new Path(output);
- FileOutputFormat.setOutputPath(job, outPath);
- HadoopUtil.delete(conf, outPath);
-
- boolean succeeded = job.waitForCompletion(true);
- if (!succeeded) {
- throw new IllegalStateException("Job failed!");
- }
- }
-}
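
The one non-obvious wiring step above is how a Set of categories travels through the job Configuration as a single string. A small sketch of the DefaultStringifier round trip, using the same calls as runJob and the mapper's setup (category values are made up):

import java.util.HashSet;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.DefaultStringifier;
import org.apache.hadoop.util.GenericsUtil;

public class StringifierRoundTrip {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // JavaSerialization must be enabled, just as runJob does.
    conf.set("io.serializations",
        "org.apache.hadoop.io.serializer.JavaSerialization,"
        + "org.apache.hadoop.io.serializer.WritableSerialization");

    Set<String> categories = new HashSet<>();
    categories.add("science");
    categories.add("history");

    // Serialize the set to a string and stash it in the Configuration...
    DefaultStringifier<Set<String>> stringifier =
        new DefaultStringifier<>(conf, GenericsUtil.getClass(categories));
    conf.set("wikipedia.categories", stringifier.toString(categories));

    // ...then recover it on the mapper side.
    Set<String> recovered = stringifier.fromString(conf.get("wikipedia.categories"));
    System.out.println(recovered);
  }
}
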

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaDatasetCreatorMapper.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaDatasetCreatorMapper.java b/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaDatasetCreatorMapper.java
deleted file mode 100644
index 50e5f37..0000000
--- a/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaDatasetCreatorMapper.java
+++ /dev/null
@@ -1,142 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.text.wikipedia;
-
-import com.google.common.io.Closeables;
-import org.apache.commons.lang3.StringEscapeUtils;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.DefaultStringifier;
-import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Mapper;
-import org.apache.hadoop.util.GenericsUtil;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.mahout.common.ClassUtils;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.IOException;
-import java.io.StringReader;
-import java.util.ArrayList;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Locale;
-import java.util.Set;
-import java.util.regex.Pattern;
-
-/**
- * Maps over the Wikipedia XML format and outputs all documents having a category listed in the
- * input category file.
- *
- */
-public class WikipediaDatasetCreatorMapper extends Mapper<LongWritable, Text, Text, Text> {
-
- private static final Logger log = LoggerFactory.getLogger(WikipediaDatasetCreatorMapper.class);
-
- private static final Pattern SPACE_NON_ALPHA_PATTERN = Pattern.compile("[\\s\\W]");
- private static final Pattern OPEN_TEXT_TAG_PATTERN = Pattern.compile("<text xml:space=\"preserve\">");
- private static final Pattern CLOSE_TEXT_TAG_PATTERN = Pattern.compile("</text>");
-
- private List<String> inputCategories;
- private List<Pattern> inputCategoryPatterns;
- private boolean exactMatchOnly;
- private Analyzer analyzer;
-
- @Override
- protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
- String document = value.toString();
- document = StringEscapeUtils.unescapeHtml4(CLOSE_TEXT_TAG_PATTERN.matcher(
- OPEN_TEXT_TAG_PATTERN.matcher(document).replaceFirst("")).replaceAll(""));
- String catMatch = findMatchingCategory(document);
- if (!"Unknown".equals(catMatch)) {
- StringBuilder contents = new StringBuilder(1000);
- TokenStream stream = analyzer.tokenStream(catMatch, new StringReader(document));
- CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
- stream.reset();
- while (stream.incrementToken()) {
- contents.append(termAtt.buffer(), 0, termAtt.length()).append(' ');
- }
- context.write(
- new Text(SPACE_NON_ALPHA_PATTERN.matcher(catMatch).replaceAll("_")),
- new Text(contents.toString()));
- stream.end();
- Closeables.close(stream, true);
- }
- }
-
- @Override
- protected void setup(Context context) throws IOException, InterruptedException {
- super.setup(context);
-
- Configuration conf = context.getConfiguration();
-
- if (inputCategories == null) {
- Set<String> newCategories = new HashSet<>();
- DefaultStringifier<Set<String>> setStringifier =
- new DefaultStringifier<>(conf, GenericsUtil.getClass(newCategories));
- String categoriesStr = conf.get("wikipedia.categories", setStringifier.toString(newCategories));
- Set<String> inputCategoriesSet = setStringifier.fromString(categoriesStr);
- inputCategories = new ArrayList<>(inputCategoriesSet);
- inputCategoryPatterns = new ArrayList<>(inputCategories.size());
- for (String inputCategory : inputCategories) {
- inputCategoryPatterns.add(Pattern.compile(".*\\b" + inputCategory + "\\b.*"));
- }
-
- }
-
- exactMatchOnly = conf.getBoolean("exact.match.only", false);
-
- if (analyzer == null) {
- String analyzerStr = conf.get("analyzer.class", WikipediaAnalyzer.class.getName());
- analyzer = ClassUtils.instantiateAs(analyzerStr, Analyzer.class);
- }
-
- log.info("Configure: Input Categories size: {} Exact Match: {} Analyzer: {}",
- inputCategories.size(), exactMatchOnly, analyzer.getClass().getName());
- }
-
- private String findMatchingCategory(String document) {
- int startIndex = 0;
- int categoryIndex;
- while ((categoryIndex = document.indexOf("[[Category:", startIndex)) != -1) {
- categoryIndex += 11;
- int endIndex = document.indexOf("]]", categoryIndex);
- if (endIndex >= document.length() || endIndex < 0) {
- break;
- }
- String category = document.substring(categoryIndex, endIndex).toLowerCase(Locale.ENGLISH).trim();
- // categories.add(category.toLowerCase());
- if (exactMatchOnly && inputCategories.contains(category)) {
- return category;
- }
- if (!exactMatchOnly) {
- for (int i = 0; i < inputCategories.size(); i++) {
- String inputCategory = inputCategories.get(i);
- Pattern inputCategoryPattern = inputCategoryPatterns.get(i);
- if (inputCategoryPattern.matcher(category).matches()) { // inexact match with word boundary.
- return inputCategory;
- }
- }
- }
- startIndex = endIndex;
- }
- return "Unknown";
- }
-}
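
The word-boundary patterns compiled in setup() are what separate the "inexact" match from a plain substring test. A self-contained illustration (the category name is made up):

import java.util.regex.Pattern;

public class CategoryMatchDemo {
  public static void main(String[] args) {
    // The mapper compiles one such pattern per input category.
    Pattern p = Pattern.compile(".*\\bscience\\b.*");

    System.out.println(p.matcher("science").matches());            // true: exact
    System.out.println(p.matcher("history of science").matches()); // true: word-boundary hit
    System.out.println(p.matcher("pseudoscience").matches());      // false: no boundary inside the word
  }
}
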

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaDatasetCreatorReducer.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaDatasetCreatorReducer.java b/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaDatasetCreatorReducer.java
deleted file mode 100644
index bf921fc..0000000
--- a/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaDatasetCreatorReducer.java
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.text.wikipedia;
-
-import java.io.IOException;
-
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Reducer;
-
-/**
- * Can also be used as a local Combiner
- */
-public class WikipediaDatasetCreatorReducer extends Reducer<Text, Text, Text, Text> {
-
- @Override
- protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
- // Key is label,word, value is the number of times we've seen this label
- // word per local node. Output is the same
- for (Text value : values) {
- context.write(key, value);
- }
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaMapper.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaMapper.java b/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaMapper.java
deleted file mode 100644
index abd3a04..0000000
--- a/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaMapper.java
+++ /dev/null
@@ -1,179 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.text.wikipedia;
-
-import java.io.IOException;
-import java.util.HashSet;
-import java.util.Locale;
-import java.util.Set;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.apache.commons.lang3.StringEscapeUtils;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.DefaultStringifier;
-import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Mapper;
-import org.apache.hadoop.util.GenericsUtil;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * Maps over the Wikipedia XML format and outputs all documents having a category listed in the
- * input category file.
- *
- */
-public class WikipediaMapper extends Mapper<LongWritable, Text, Text, Text> {
-
- private static final Logger log = LoggerFactory.getLogger(WikipediaMapper.class);
-
- private static final Pattern SPACE_NON_ALPHA_PATTERN = Pattern.compile("[\\s]");
-
- private static final String START_DOC = "<text xml:space=\"preserve\">";
-
- private static final String END_DOC = "</text>";
-
- private static final Pattern TITLE = Pattern.compile("<title>(.*)<\\/title>");
-
- private static final String REDIRECT = "<redirect />";
-
- private Set<String> inputCategories;
-
- private boolean exactMatchOnly;
-
- private boolean all;
-
- private boolean removeLabels;
-
- @Override
- protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
-
- String content = value.toString();
- if (content.contains(REDIRECT)) {
- return;
- }
- String document;
- String title;
- try {
- document = getDocument(content);
- title = getTitle(content);
- } catch (RuntimeException e) {
- // TODO: reporter.getCounter("Wikipedia", "Parse errors").increment(1);
- return;
- }
-
- String catMatch = findMatchingCategory(document);
- if (!all) {
- if ("Unknown".equals(catMatch)) {
- return;
- }
- }
-
- document = StringEscapeUtils.unescapeHtml4(document);
- if (removeLabels) {
- document = removeCategoriesFromText(document);
- // Reject documents with malformed tags
- if (document == null) {
- return;
- }
- }
-
- // write out in Bayes input style: key: /Category/document_name
- String category = "/" + catMatch.toLowerCase(Locale.ENGLISH) + "/" +
- SPACE_NON_ALPHA_PATTERN.matcher(title).replaceAll("_");
-
- context.write(new Text(category), new Text(document));
- }
-
- @Override
- protected void setup(Context context) throws IOException, InterruptedException {
- super.setup(context);
- Configuration conf = context.getConfiguration();
-
- Set<String> newCategories = new HashSet<>();
- DefaultStringifier<Set<String>> setStringifier =
- new DefaultStringifier<>(conf, GenericsUtil.getClass(newCategories));
-
- String categoriesStr = conf.get("wikipedia.categories");
- inputCategories = setStringifier.fromString(categoriesStr);
- exactMatchOnly = conf.getBoolean("exact.match.only", false);
- all = conf.getBoolean("all.files", false);
- removeLabels = conf.getBoolean("remove.labels", false);
- log.info("Configure: Input Categories size: {} All: {} Exact Match: {} Remove Labels from Text: {}",
- inputCategories.size(), all, exactMatchOnly, removeLabels);
- }
-
- private static String getDocument(String xml) {
- int start = xml.indexOf(START_DOC) + START_DOC.length();
- int end = xml.indexOf(END_DOC, start);
- return xml.substring(start, end);
- }
-
- private static String getTitle(CharSequence xml) {
- Matcher m = TITLE.matcher(xml);
- return m.find() ? m.group(1) : "";
- }
-
- private String findMatchingCategory(String document) {
- int startIndex = 0;
- int categoryIndex;
- while ((categoryIndex = document.indexOf("[[Category:", startIndex)) != -1) {
- categoryIndex += 11;
- int endIndex = document.indexOf("]]", categoryIndex);
- if (endIndex >= document.length() || endIndex < 0) {
- break;
- }
- String category = document.substring(categoryIndex, endIndex).toLowerCase(Locale.ENGLISH).trim();
- if (exactMatchOnly && inputCategories.contains(category)) {
- return category.toLowerCase(Locale.ENGLISH);
- }
- if (!exactMatchOnly) {
- for (String inputCategory : inputCategories) {
- if (category.contains(inputCategory)) { // we have an inexact match
- return inputCategory.toLowerCase(Locale.ENGLISH);
- }
- }
- }
- startIndex = endIndex;
- }
- return "Unknown";
- }
-
- private String removeCategoriesFromText(String document) {
- int startIndex = 0;
- int categoryIndex;
- try {
- while ((categoryIndex = document.indexOf("[[Category:", startIndex)) != -1) {
- int endIndex = document.indexOf("]]", categoryIndex);
- if (endIndex >= document.length() || endIndex < 0) {
- break;
- }
- document = document.replace(document.substring(categoryIndex, endIndex + 2), "");
- if (categoryIndex < document.length()) {
- startIndex = categoryIndex;
- } else {
- break;
- }
- }
- } catch(StringIndexOutOfBoundsException e) {
- return null;
- }
- return document;
- }
-}
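
The "Bayes input style" key mentioned in map() is easiest to see with a toy example (the title and category are made up):

import java.util.Locale;
import java.util.regex.Pattern;

public class BayesKeyDemo {
  private static final Pattern SPACE = Pattern.compile("[\\s]");

  public static void main(String[] args) {
    String catMatch = "Science";
    String title = "Albert Einstein";
    // Same key construction as the mapper: /category/title_with_underscores
    String key = "/" + catMatch.toLowerCase(Locale.ENGLISH) + "/"
        + SPACE.matcher(title).replaceAll("_");
    System.out.println(key); // /science/Albert_Einstein
  }
}
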

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaXmlSplitter.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaXmlSplitter.java b/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaXmlSplitter.java
deleted file mode 100644
index fc065fe..0000000
--- a/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaXmlSplitter.java
+++ /dev/null
@@ -1,234 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.text.wikipedia;
-
-import java.io.BufferedWriter;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.net.URI;
-import java.text.DecimalFormat;
-import java.text.NumberFormat;
-
-import org.apache.commons.cli2.CommandLine;
-import org.apache.commons.cli2.Group;
-import org.apache.commons.cli2.Option;
-import org.apache.commons.cli2.OptionException;
-import org.apache.commons.cli2.builder.ArgumentBuilder;
-import org.apache.commons.cli2.builder.DefaultOptionBuilder;
-import org.apache.commons.cli2.builder.GroupBuilder;
-import org.apache.commons.cli2.commandline.Parser;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.compress.BZip2Codec;
-import org.apache.hadoop.io.compress.CompressionCodec;
-import org.apache.mahout.common.CommandLineUtil;
-import org.apache.mahout.common.iterator.FileLineIterator;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * <p>Splits a Wikipedia XML dump into chunks of a configurable size so that the chunks can be
- * processed in parallel. Each chunk is written as a well-formed {@code <mediawiki>} document
- * that repeats the site-info header.</p>
- *
- * <p>To run the Wikipedia examples (assumes you've built the Mahout Job jar):</p>
- *
- * <ol>
- * <li>Download the Wikipedia Dataset. Use the Ant target: {@code ant enwiki-files}</li>
- * <li>Chunk the data using the WikipediaXmlSplitter (from the Hadoop home):
- * {@code bin/hadoop jar $MAHOUT_HOME/target/mahout-examples-0.x
- * org.apache.mahout.classifier.bayes.WikipediaXmlSplitter
- * -d $MAHOUT_HOME/examples/temp/enwiki-latest-pages-articles.xml
- * -o $MAHOUT_HOME/examples/work/wikipedia/chunks/ -c 64}</li>
- * </ol>
- */
-public final class WikipediaXmlSplitter {
-
- private static final Logger log = LoggerFactory.getLogger(WikipediaXmlSplitter.class);
-
- private WikipediaXmlSplitter() { }
-
- public static void main(String[] args) throws IOException {
- DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
- ArgumentBuilder abuilder = new ArgumentBuilder();
- GroupBuilder gbuilder = new GroupBuilder();
-
- Option dumpFileOpt = obuilder.withLongName("dumpFile").withRequired(true).withArgument(
- abuilder.withName("dumpFile").withMinimum(1).withMaximum(1).create()).withDescription(
- "The path to the wikipedia dump file (.bz2 or uncompressed)").withShortName("d").create();
-
- Option outputDirOpt = obuilder.withLongName("outputDir").withRequired(true).withArgument(
- abuilder.withName("outputDir").withMinimum(1).withMaximum(1).create()).withDescription(
- "The output directory to place the splits in:\n"
- + "local files:\n\t/var/data/wikipedia-xml-chunks or\n\tfile:///var/data/wikipedia-xml-chunks\n"
- + "Hadoop DFS:\n\thdfs://wikipedia-xml-chunks\n"
- + "AWS S3 (blocks):\n\ts3://bucket-name/wikipedia-xml-chunks\n"
- + "AWS S3 (native files):\n\ts3n://bucket-name/wikipedia-xml-chunks\n")
-
- .withShortName("o").create();
-
- Option s3IdOpt = obuilder.withLongName("s3ID").withRequired(false).withArgument(
- abuilder.withName("s3Id").withMinimum(1).withMaximum(1).create()).withDescription("Amazon S3 ID key")
- .withShortName("i").create();
- Option s3SecretOpt = obuilder.withLongName("s3Secret").withRequired(false).withArgument(
- abuilder.withName("s3Secret").withMinimum(1).withMaximum(1).create()).withDescription(
- "Amazon S3 secret key").withShortName("s").create();
-
- Option chunkSizeOpt = obuilder.withLongName("chunkSize").withRequired(true).withArgument(
- abuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create()).withDescription(
- "The Size of the chunk, in megabytes").withShortName("c").create();
- Option numChunksOpt = obuilder
- .withLongName("numChunks")
- .withRequired(false)
- .withArgument(abuilder.withName("numChunks").withMinimum(1).withMaximum(1).create())
- .withDescription(
- "The maximum number of chunks to create. If specified, program will only create a subset of the chunks")
- .withShortName("n").create();
- Group group = gbuilder.withName("Options").withOption(dumpFileOpt).withOption(outputDirOpt).withOption(
- chunkSizeOpt).withOption(numChunksOpt).withOption(s3IdOpt).withOption(s3SecretOpt).create();
-
- Parser parser = new Parser();
- parser.setGroup(group);
- CommandLine cmdLine;
- try {
- cmdLine = parser.parse(args);
- } catch (OptionException e) {
- log.error("Error while parsing options", e);
- CommandLineUtil.printHelp(group);
- return;
- }
-
- Configuration conf = new Configuration();
- String dumpFilePath = (String) cmdLine.getValue(dumpFileOpt);
- String outputDirPath = (String) cmdLine.getValue(outputDirOpt);
-
- if (cmdLine.hasOption(s3IdOpt)) {
- String id = (String) cmdLine.getValue(s3IdOpt);
- conf.set("fs.s3n.awsAccessKeyId", id);
- conf.set("fs.s3.awsAccessKeyId", id);
- }
- if (cmdLine.hasOption(s3SecretOpt)) {
- String secret = (String) cmdLine.getValue(s3SecretOpt);
- conf.set("fs.s3n.awsSecretAccessKey", secret);
- conf.set("fs.s3.awsSecretAccessKey", secret);
- }
- // do not compute crc file when using local FS
- conf.set("fs.file.impl", "org.apache.hadoop.fs.RawLocalFileSystem");
- FileSystem fs = FileSystem.get(URI.create(outputDirPath), conf);
-
- int chunkSize = 1024 * 1024 * Integer.parseInt((String) cmdLine.getValue(chunkSizeOpt));
-
- int numChunks = Integer.MAX_VALUE;
- if (cmdLine.hasOption(numChunksOpt)) {
- numChunks = Integer.parseInt((String) cmdLine.getValue(numChunksOpt));
- }
-
- String header = "<mediawiki xmlns=\"http://www.mediawiki.org/xml/export-0.3/\" "
- + "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
- + "xsi:schemaLocation=\"http://www.mediawiki.org/xml/export-0.3/ "
- + "http://www.mediawiki.org/xml/export-0.3.xsd\" " + "version=\"0.3\" "
- + "xml:lang=\"en\">\n" + " <siteinfo>\n" + "<sitename>Wikipedia</sitename>\n"
- + " <base>http://en.wikipedia.org/wiki/Main_Page</base>\n"
- + " <generator>MediaWiki 1.13alpha</generator>\n" + " <case>first-letter</case>\n"
- + " <namespaces>\n" + " <namespace key=\"-2\">Media</namespace>\n"
- + " <namespace key=\"-1\">Special</namespace>\n" + " <namespace key=\"0\" />\n"
- + " <namespace key=\"1\">Talk</namespace>\n"
- + " <namespace key=\"2\">User</namespace>\n"
- + " <namespace key=\"3\">User talk</namespace>\n"
- + " <namespace key=\"4\">Wikipedia</namespace>\n"
- + " <namespace key=\"5\">Wikipedia talk</namespace>\n"
- + " <namespace key=\"6\">Image</namespace>\n"
- + " <namespace key=\"7\">Image talk</namespace>\n"
- + " <namespace key=\"8\">MediaWiki</namespace>\n"
- + " <namespace key=\"9\">MediaWiki talk</namespace>\n"
- + " <namespace key=\"10\">Template</namespace>\n"
- + " <namespace key=\"11\">Template talk</namespace>\n"
- + " <namespace key=\"12\">Help</namespace>\n"
- + " <namespace key=\"13\">Help talk</namespace>\n"
- + " <namespace key=\"14\">Category</namespace>\n"
- + " <namespace key=\"15\">Category talk</namespace>\n"
- + " <namespace key=\"100\">Portal</namespace>\n"
- + " <namespace key=\"101\">Portal talk</namespace>\n" + " </namespaces>\n"
- + " </siteinfo>\n";
-
- StringBuilder content = new StringBuilder();
- content.append(header);
- NumberFormat decimalFormatter = new DecimalFormat("0000");
- File dumpFile = new File(dumpFilePath);
-
- // If the specified path for the input file is incorrect, return immediately
- if (!dumpFile.exists()) {
- log.error("Input file path {} doesn't exist", dumpFilePath);
- return;
- }
-
- FileLineIterator it;
- if (dumpFilePath.endsWith(".bz2")) {
- // default compression format from http://download.wikimedia.org
- CompressionCodec codec = new BZip2Codec();
- it = new FileLineIterator(codec.createInputStream(new FileInputStream(dumpFile)));
- } else {
- // assume the user has previously de-compressed the dump file
- it = new FileLineIterator(dumpFile);
- }
- int fileNumber = 0;
- while (it.hasNext()) {
- String thisLine = it.next();
- if (thisLine.trim().startsWith("<page>")) {
- boolean end = false;
- while (!thisLine.trim().startsWith("</page>")) {
- content.append(thisLine).append('\n');
- if (it.hasNext()) {
- thisLine = it.next();
- } else {
- end = true;
- break;
- }
- }
- content.append(thisLine).append('\n');
-
- if (content.length() > chunkSize || end) {
- content.append("</mediawiki>");
- fileNumber++;
- String filename = outputDirPath + "/chunk-" + decimalFormatter.format(fileNumber) + ".xml";
- try (BufferedWriter chunkWriter =
- new BufferedWriter(new OutputStreamWriter(fs.create(new Path(filename)), "UTF-8"))) {
- chunkWriter.write(content.toString(), 0, content.length());
- }
- if (fileNumber >= numChunks) {
- break;
- }
- content = new StringBuilder();
- content.append(header);
- }
- }
- }
- }
-}
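
The splitting loop boils down to: append <page> blocks onto a copy of the header, and flush whenever the buffer crosses the chunk size. A toy-scale sketch of just that decision (header, threshold, and page bodies are made up; the real splitter also flushes the final partial chunk at end of input):

import java.util.Arrays;
import java.util.List;

public class ChunkingSketch {
  public static void main(String[] args) {
    String header = "<mediawiki>\n";   // stand-in for the real site-info header
    int chunkSize = 40;                // tiny threshold, for illustration only
    List<String> pages = Arrays.asList(
        "<page>A</page>\n", "<page>B</page>\n", "<page>C</page>\n");

    StringBuilder content = new StringBuilder(header);
    int fileNumber = 0;
    for (String page : pages) {
      content.append(page);
      if (content.length() > chunkSize) {  // flush: close this chunk, start the next
        content.append("</mediawiki>");
        System.out.printf("chunk-%04d.xml:%n%s%n", ++fileNumber, content);
        content = new StringBuilder(header);
      }
    }
  }
}
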

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/text/wikipedia/XmlInputFormat.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/text/wikipedia/XmlInputFormat.java b/integration/src/main/java/org/apache/mahout/text/wikipedia/XmlInputFormat.java
deleted file mode 100644
index afd350f..0000000
--- a/integration/src/main/java/org/apache/mahout/text/wikipedia/XmlInputFormat.java
+++ /dev/null
@@ -1,164 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.text.wikipedia;
-
-import com.google.common.io.Closeables;
-import org.apache.commons.io.Charsets;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FSDataInputStream;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.DataOutputBuffer;
-import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.InputSplit;
-import org.apache.hadoop.mapreduce.RecordReader;
-import org.apache.hadoop.mapreduce.TaskAttemptContext;
-import org.apache.hadoop.mapreduce.lib.input.FileSplit;
-import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.IOException;
-
-/**
- * Reads records that are delimited by a specific begin/end tag.
- */
-public class XmlInputFormat extends TextInputFormat {
-
- private static final Logger log = LoggerFactory.getLogger(XmlInputFormat.class);
-
- public static final String START_TAG_KEY = "xmlinput.start";
- public static final String END_TAG_KEY = "xmlinput.end";
-
- @Override
- public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context) {
- try {
- return new XmlRecordReader((FileSplit) split, context.getConfiguration());
- } catch (IOException ioe) {
- log.warn("Error while creating XmlRecordReader", ioe);
- return null;
- }
- }
-
- /**
- * RecordReader that reads through a given XML document and emits, as records, the XML blocks
- * delimited by the configured start and end tags.
- *
- */
- public static class XmlRecordReader extends RecordReader<LongWritable, Text> {
-
- private final byte[] startTag;
- private final byte[] endTag;
- private final long start;
- private final long end;
- private final FSDataInputStream fsin;
- private final DataOutputBuffer buffer = new DataOutputBuffer();
- private LongWritable currentKey;
- private Text currentValue;
-
- public XmlRecordReader(FileSplit split, Configuration conf) throws IOException {
- startTag = conf.get(START_TAG_KEY).getBytes(Charsets.UTF_8);
- endTag = conf.get(END_TAG_KEY).getBytes(Charsets.UTF_8);
-
- // open the file and seek to the start of the split
- start = split.getStart();
- end = start + split.getLength();
- Path file = split.getPath();
- FileSystem fs = file.getFileSystem(conf);
- fsin = fs.open(split.getPath());
- fsin.seek(start);
- }
-
- private boolean next(LongWritable key, Text value) throws IOException {
- if (fsin.getPos() < end && readUntilMatch(startTag, false)) {
- try {
- buffer.write(startTag);
- if (readUntilMatch(endTag, true)) {
- key.set(fsin.getPos());
- value.set(buffer.getData(), 0, buffer.getLength());
- return true;
- }
- } finally {
- buffer.reset();
- }
- }
- return false;
- }
-
- @Override
- public void close() throws IOException {
- Closeables.close(fsin, true);
- }
-
- @Override
- public float getProgress() throws IOException {
- return (fsin.getPos() - start) / (float) (end - start);
- }
-
- private boolean readUntilMatch(byte[] match, boolean withinBlock) throws IOException {
- int i = 0;
- while (true) {
- int b = fsin.read();
- // end of file:
- if (b == -1) {
- return false;
- }
- // save to buffer:
- if (withinBlock) {
- buffer.write(b);
- }
-
- // check if we're matching:
- if (b == match[i]) {
- i++;
- if (i >= match.length) {
- return true;
- }
- } else {
- i = 0;
- }
- // see if we've passed the stop point:
- if (!withinBlock && i == 0 && fsin.getPos() >= end) {
- return false;
- }
- }
- }
-
- @Override
- public LongWritable getCurrentKey() throws IOException, InterruptedException {
- return currentKey;
- }
-
- @Override
- public Text getCurrentValue() throws IOException, InterruptedException {
- return currentValue;
- }
-
- @Override
- public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
- }
-
- @Override
- public boolean nextKeyValue() throws IOException, InterruptedException {
- currentKey = new LongWritable();
- currentValue = new Text();
- return next(currentKey, currentValue);
- }
- }
-}
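
readUntilMatch() is a single-pass byte matcher: buffer the byte if we are inside a block, advance the match index on a hit, reset it on a miss. A self-contained sketch of the same logic against an in-memory stream (the class name and sample input are made up):

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;

public class ReadUntilMatchSketch {
  // Same matching logic as XmlRecordReader.readUntilMatch, minus the split bookkeeping.
  static boolean readUntilMatch(InputStream in, byte[] match, StringBuilder sink)
      throws IOException {
    int i = 0;
    int b;
    while ((b = in.read()) != -1) {
      if (sink != null) {
        sink.append((char) b);   // "withinBlock" case; (char) cast is fine for this ASCII demo
      }
      if (b == match[i]) {
        if (++i >= match.length) {
          return true;           // full delimiter consumed
        }
      } else {
        i = 0;                   // mismatch: restart the delimiter scan
      }
    }
    return false;                // end of stream before a match
  }

  public static void main(String[] args) throws IOException {
    InputStream in = new ByteArrayInputStream(
        "junk<page>hello</page>junk".getBytes(StandardCharsets.UTF_8));
    if (readUntilMatch(in, "<page>".getBytes(StandardCharsets.UTF_8), null)) {
      StringBuilder record = new StringBuilder("<page>");
      readUntilMatch(in, "</page>".getBytes(StandardCharsets.UTF_8), record);
      System.out.println(record); // <page>hello</page>
    }
  }
}
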

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/Bump125.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/Bump125.java b/integration/src/main/java/org/apache/mahout/utils/Bump125.java
deleted file mode 100644
index 1c55090..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/Bump125.java
+++ /dev/null
@@ -1,62 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils;
-
-/**
- * Helps with making nice intervals at arbitrary scale.
- *
- * One use case is where we are producing progress or error messages every time an incoming
- * record is received. It is generally bad form to produce a message for <i>every</i> input,
- * so it is better to produce a message for each of the first 10 records, then every
- * second record up to 20, every 5 records up to 50, and every 10 records up to 100,
- * more or less. The pattern then repeats at larger scales. The total number of messages
- * scales with the log of the number of input lines, which is much more manageable than
- * direct output, and because the early records all get messages, we get indications early.
- */
-public class Bump125 {
- private static final int[] BUMPS = {1, 2, 5};
-
- static int scale(double value, double base) {
- double scale = value / base;
- // scan for correct step
- int i = 0;
- while (i < BUMPS.length - 1 && BUMPS[i + 1] <= scale) {
- i++;
- }
- return BUMPS[i];
- }
-
- static long base(double value) {
- return Math.max(1, (long) Math.pow(10, (int) Math.floor(Math.log10(value))));
- }
-
- private long counter = 0;
-
- public long increment() {
- long delta;
- if (counter >= 10) {
- long base = base(counter / 4.0);
- int scale = scale(counter / 4.0, base);
- delta = base * scale;
- } else {
- delta = 1;
- }
- counter += delta;
- return counter;
- }
-}
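
For a feel of the schedule, this is what the first 25 calls to increment() return (a sketch assuming the class above is still on the classpath):

import org.apache.mahout.utils.Bump125;

public class Bump125Demo {
  public static void main(String[] args) {
    Bump125 bump = new Bump125();
    StringBuilder sequence = new StringBuilder();
    for (int i = 0; i < 25; i++) {
      sequence.append(bump.increment()).append(' ');
    }
    // 1 2 3 4 5 6 7 8 9 10 12 14 16 18 20 25 30 35 40 50 60 70 80 90 100
    System.out.println(sequence);
  }
}
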

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/MatrixDumper.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/MatrixDumper.java b/integration/src/main/java/org/apache/mahout/utils/MatrixDumper.java
deleted file mode 100644
index f63de83..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/MatrixDumper.java
+++ /dev/null
@@ -1,138 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils;
-
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.OutputStream;
-import java.io.PrintStream;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.commons.io.Charsets;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.util.ToolRunner;
-import org.apache.mahout.common.AbstractJob;
-import org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterator;
-import org.apache.mahout.math.Matrix;
-import org.apache.mahout.math.MatrixWritable;
-
-/**
- * Export a Matrix in various text formats:
- * * CSV file
- *
- * Input format: Hadoop SequenceFile with Text key and MatrixWritable value, 1 pair
- * TODO:
- * Needs a class for the key value; should not hard-code to Text.
- * Options for row and column headers; stats software can be picky.
- * Assumes only one matrix in a file.
- */
-public final class MatrixDumper extends AbstractJob {
-
- private MatrixDumper() { }
-
- public static void main(String[] args) throws Exception {
- ToolRunner.run(new MatrixDumper(), args);
- }
-
- @Override
- public int run(String[] args) throws Exception {
-
- addInputOption();
- addOption("output", "o", "Output path", null); // AbstractJob output feature requires param
- Map<String, List<String>> parsedArgs = parseArguments(args);
- if (parsedArgs == null) {
- return -1;
- }
- String outputFile = hasOption("output") ? getOption("output") : null;
- exportCSV(getInputPath(), outputFile, false);
- return 0;
- }
-
- private static void exportCSV(Path inputPath, String outputFile, boolean doLabels) throws IOException {
- SequenceFileValueIterator<MatrixWritable> it =
- new SequenceFileValueIterator<>(inputPath, true, new Configuration());
- Matrix m = it.next().get();
- it.close();
- PrintStream ps = getPrintStream(outputFile);
- String[] columnLabels = getLabels(m.numCols(), m.getColumnLabelBindings(), "col");
- String[] rowLabels = getLabels(m.numRows(), m.getRowLabelBindings(), "row");
- if (doLabels) {
- ps.print("rowid,");
- ps.print(columnLabels[0]);
- for (int c = 1; c < m.numCols(); c++) {
- ps.print(',' + columnLabels[c]);
- }
- ps.println();
- }
- for (int r = 0; r < m.numRows(); r++) {
- if (doLabels) {
- ps.print(rowLabels[r] + ',');
- }
- ps.print(Double.toString(m.getQuick(r,0)));
- for (int c = 1; c < m.numCols(); c++) {
- ps.print(",");
- ps.print(Double.toString(m.getQuick(r,c)));
- }
- ps.println();
- }
- if (ps != System.out) {
- ps.close();
- }
- }
-
- private static PrintStream getPrintStream(String outputPath) throws IOException {
- if (outputPath == null) {
- return System.out;
- }
- File outputFile = new File(outputPath);
- if (outputFile.exists()) {
- outputFile.delete();
- }
- outputFile.createNewFile();
- OutputStream os = new FileOutputStream(outputFile);
- return new PrintStream(os, false, Charsets.UTF_8.displayName());
- }
-
- /**
- * Returns the label set, sorted by matrix order.
- * If there are no labels, fabricates them using the starter string.
- * @param length the number of labels to generate when none are bound
- */
- private static String[] getLabels(int length, Map<String,Integer> labels, String start) {
- if (labels != null) {
- return sortLabels(labels);
- }
- String[] sorted = new String[length];
- for (int i = 0; i < length; i++) {
- sorted[i] = start + (i + 1);
- }
- return sorted;
- }
-
- private static String[] sortLabels(Map<String,Integer> labels) {
- String[] sorted = new String[labels.size()];
- for (Map.Entry<String,Integer> entry : labels.entrySet()) {
- sorted[entry.getValue()] = entry.getKey();
- }
- return sorted;
- }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/SequenceFileDumper.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/SequenceFileDumper.java b/integration/src/main/java/org/apache/mahout/utils/SequenceFileDumper.java
deleted file mode 100644
index e01868a..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/SequenceFileDumper.java
+++ /dev/null
@@ -1,168 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils;
-
-import java.io.File;
-import java.io.OutputStreamWriter;
-import java.io.Writer;
-import java.util.ArrayList;
-import java.util.List;
-
-import com.google.common.io.Closeables;
-import com.google.common.io.Files;
-import org.apache.commons.io.Charsets;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.FileUtil;
-import org.apache.hadoop.fs.Path;
-import org.apache.mahout.common.AbstractJob;
-import org.apache.mahout.common.Pair;
-import org.apache.mahout.common.iterator.sequencefile.PathFilters;
-import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterator;
-import org.apache.mahout.math.list.IntArrayList;
-import org.apache.mahout.math.map.OpenObjectIntHashMap;
-
-public final class SequenceFileDumper extends AbstractJob {
-
- public SequenceFileDumper() {
- setConf(new Configuration());
- }
-
- @Override
- public int run(String[] args) throws Exception {
-
- addInputOption();
- addOutputOption();
- addOption("substring", "b", "The number of chars to print out per value", false);
- addOption(buildOption("count", "c", "Report the count only", false, false, null));
- addOption("numItems", "n", "Output at most <n> key value pairs", false);
- addOption(buildOption("facets", "fa", "Output the counts per key. Note, if there are a lot of unique keys, "
- + "this can take up a fair amount of memory", false, false, null));
- addOption(buildOption("quiet", "q", "Print only file contents.", false, false, null));
-
- if (parseArguments(args, false, true) == null) {
- return -1;
- }
-
- Path[] pathArr;
- Configuration conf = new Configuration();
- Path input = getInputPath();
- FileSystem fs = input.getFileSystem(conf);
- if (fs.getFileStatus(input).isDir()) {
- pathArr = FileUtil.stat2Paths(fs.listStatus(input, PathFilters.logsCRCFilter()));
- } else {
- pathArr = new Path[1];
- pathArr[0] = input;
- }
-
-
- Writer writer;
- boolean shouldClose;
- if (hasOption("output")) {
- shouldClose = true;
- writer = Files.newWriter(new File(getOption("output")), Charsets.UTF_8);
- } else {
- shouldClose = false;
- writer = new OutputStreamWriter(System.out, Charsets.UTF_8);
- }
- try {
- for (Path path : pathArr) {
- if (!hasOption("quiet")) {
- writer.append("Input Path: ").append(String.valueOf(path)).append('\n');
- }
-
- int sub = Integer.MAX_VALUE;
- if (hasOption("substring")) {
- sub = Integer.parseInt(getOption("substring"));
- }
- boolean countOnly = hasOption("count");
- SequenceFileIterator<?, ?> iterator = new SequenceFileIterator<>(path, true, conf);
- if (!hasOption("quiet")) {
- writer.append("Key class: ").append(iterator.getKeyClass().toString());
- writer.append(" Value Class: ").append(iterator.getValueClass().toString()).append('\n');
- }
- OpenObjectIntHashMap<String> facets = null;
- if (hasOption("facets")) {
- facets = new OpenObjectIntHashMap<>();
- }
- long count = 0;
- if (countOnly) {
- while (iterator.hasNext()) {
- Pair<?, ?> record = iterator.next();
- String key = record.getFirst().toString();
- if (facets != null) {
- facets.adjustOrPutValue(key, 1, 1); //either insert or add 1
- }
- count++;
- }
- writer.append("Count: ").append(String.valueOf(count)).append('\n');
- } else {
- long numItems = Long.MAX_VALUE;
- if (hasOption("numItems")) {
- numItems = Long.parseLong(getOption("numItems"));
- if (!hasOption("quiet")) {
- writer.append("Max Items to dump: ").append(String.valueOf(numItems)).append("\n");
- }
- }
- while (iterator.hasNext() && count < numItems) {
- Pair<?, ?> record = iterator.next();
- String key = record.getFirst().toString();
- writer.append("Key: ").append(key);
- String str = record.getSecond().toString();
- writer.append(": Value: ").append(str.length() > sub
- ? str.substring(0, sub) : str);
- writer.write('\n');
- if (facets != null) {
- facets.adjustOrPutValue(key, 1, 1); //either insert or add 1
- }
- count++;
- }
- if (!hasOption("quiet")) {
- writer.append("Count: ").append(String.valueOf(count)).append('\n');
- }
- }
- if (facets != null) {
- List<String> keyList = new ArrayList<>(facets.size());
-
- IntArrayList valueList = new IntArrayList(facets.size());
- facets.pairsSortedByKey(keyList, valueList);
- writer.append("-----Facets---\n");
- writer.append("Key\t\tCount\n");
- int i = 0;
- for (String key : keyList) {
- writer.append(key).append("\t\t").append(String.valueOf(valueList.get(i++))).append('\n');
- }
- }
- }
- writer.flush();
-
- } finally {
- if (shouldClose) {
- Closeables.close(writer, false);
- }
- }
-
-
- return 0;
- }
-
- public static void main(String[] args) throws Exception {
- new SequenceFileDumper().run(args);
- }
-
-}
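
For reference, a minimal sketch of invoking this dumper programmatically, as main() does; the input path is hypothetical, and the flags mirror the options registered in run() above (assuming the usual AbstractJob --input handling):

    // Count the records in a sequence file, then dump the first ten values,
    // truncating each value to 80 characters. Paths are hypothetical.
    new SequenceFileDumper().run(new String[] {"--input", "/tmp/vectors/part-r-00000", "--count"});
    new SequenceFileDumper().run(new String[] {"--input", "/tmp/vectors/part-r-00000",
        "--numItems", "10", "--substring", "80"});
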
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java b/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java
deleted file mode 100644
index 180a1e1..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java
+++ /dev/null
@@ -1,155 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.arff;
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.IOException;
-import java.io.Reader;
-import java.io.StringReader;
-import java.nio.charset.Charset;
-import java.text.DateFormat;
-import java.text.SimpleDateFormat;
-import java.util.Iterator;
-import java.util.Locale;
-
-import com.google.common.io.Files;
-import org.apache.commons.io.Charsets;
-import org.apache.mahout.math.Vector;
-
-/**
- * Read in ARFF (http://www.cs.waikato.ac.nz/~ml/weka/arff.html) and create {@link Vector}s
- * <p/>
- * Attribute type handling:
- * <ul>
- * <li>Numeric -> As is</li>
- * <li>Nominal -> ordinal(value) i.e. @attribute lumber {'\'(-inf-0.5]\'','\'(0.5-inf)\''}
- * will convert -inf-0.5 -> 0, and 0.5-inf -> 1</li>
- * <li>Dates -> Convert to time as a long</li>
- * <li>Strings -> Create a map of String -> long</li>
- * </ul>
- * NOTE: This class does not set the label bindings on every vector. If you want the label
- * bindings, call {@link MapBackedARFFModel#getLabelBindings()}, as they are the same for every vector.
- */
-public class ARFFVectorIterable implements Iterable<Vector> {
-
- private final BufferedReader buff;
- private final ARFFModel model;
-
- public ARFFVectorIterable(File file, ARFFModel model) throws IOException {
- this(file, Charsets.UTF_8, model);
- }
-
- public ARFFVectorIterable(File file, Charset encoding, ARFFModel model) throws IOException {
- this(Files.newReader(file, encoding), model);
- }
-
- public ARFFVectorIterable(String arff, ARFFModel model) throws IOException {
- this(new StringReader(arff), model);
- }
-
- public ARFFVectorIterable(Reader reader, ARFFModel model) throws IOException {
- if (reader instanceof BufferedReader) {
- buff = (BufferedReader) reader;
- } else {
- buff = new BufferedReader(reader);
- }
- //grab the attributes, then start the iterator at the first line of data
- this.model = model;
-
- int labelNumber = 0;
- String line;
- while ((line = buff.readLine()) != null) {
- line = line.trim();
- if (!line.startsWith(ARFFModel.ARFF_COMMENT) && !line.isEmpty()) {
- Integer labelNumInt = labelNumber;
- String[] lineParts = line.split("[\\s\\t]+", 2);
-
- // is it a relation name?
- if (lineParts[0].equalsIgnoreCase(ARFFModel.RELATION)) {
- model.setRelation(ARFFType.removeQuotes(lineParts[1]));
- }
- // or an attribute
- else if (lineParts[0].equalsIgnoreCase(ARFFModel.ATTRIBUTE)) {
- String label;
- ARFFType type;
-
- // split the name of the attribute and its description
- String[] attrParts = lineParts[1].split("[\\s\\t]+", 2);
- if (attrParts.length < 2) {
- throw new UnsupportedOperationException("No type for attribute found: " + lineParts[1]);
- }
-
- // label is attribute name
- label = ARFFType.removeQuotes(attrParts[0].toLowerCase());
- if (attrParts[1].equalsIgnoreCase(ARFFType.NUMERIC.getIndicator())) {
- type = ARFFType.NUMERIC;
- } else if (attrParts[1].equalsIgnoreCase(ARFFType.INTEGER.getIndicator())) {
- type = ARFFType.INTEGER;
- } else if (attrParts[1].equalsIgnoreCase(ARFFType.REAL.getIndicator())) {
- type = ARFFType.REAL;
- } else if (attrParts[1].equalsIgnoreCase(ARFFType.STRING.getIndicator())) {
- type = ARFFType.STRING;
- } else if (attrParts[1].toLowerCase().startsWith(ARFFType.NOMINAL.getIndicator())) {
- type = ARFFType.NOMINAL;
- // nominal example:
- // @ATTRIBUTE class {Iris-setosa,'Iris versicolor',Iris-virginica}
- String[] classes = ARFFIterator.splitCSV(attrParts[1].substring(1, attrParts[1].length() - 1));
- for (int i = 0; i < classes.length; i++) {
- model.addNominal(label, ARFFType.removeQuotes(classes[i]), i + 1);
- }
- } else if (attrParts[1].toLowerCase().startsWith(ARFFType.DATE.getIndicator())) {
- type = ARFFType.DATE;
- //TODO: DateFormatter map
- DateFormat format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", Locale.ENGLISH);
- String formStr = attrParts[1].substring(ARFFType.DATE.getIndicator().length()).trim();
- if (!formStr.isEmpty()) {
- if (formStr.startsWith("\"")) {
- formStr = formStr.substring(1, formStr.length() - 1);
- }
- format = new SimpleDateFormat(formStr, Locale.ENGLISH);
- }
- model.addDateFormat(labelNumInt, format);
- //@attribute <name> date [<date-format>]
- } else {
- throw new UnsupportedOperationException("Invalid attribute: " + attrParts[1]);
- }
- model.addLabel(label, labelNumInt);
- model.addType(labelNumInt, type);
- labelNumber++;
- } else if (lineParts[0].equalsIgnoreCase(ARFFModel.DATA)) {
- break; //skip it
- }
- }
- }
-
- }
-
- @Override
- public Iterator<Vector> iterator() {
- return new ARFFIterator(buff, model);
- }
-
- /**
- * Returns info about the ARFF content that was parsed.
- *
- * @return the model
- */
- public ARFFModel getModel() {
- return model;
- }
-}
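
For context, a minimal sketch of parsing a small inline ARFF relation with the iterable above; it assumes ARFFIterator's default parsing of the @DATA section and that the arff utility classes are on the classpath:

    String arff = "@RELATION demo\n"
        + "@ATTRIBUTE width NUMERIC\n"
        + "@ATTRIBUTE color {red,green,blue}\n"
        + "@DATA\n"
        + "2.5,red\n"
        + "1.0,blue\n";
    ARFFModel model = new MapBackedARFFModel();
    for (Vector v : new ARFFVectorIterable(arff, model)) {
      System.out.println(v.asFormatString());
    }
    // Nominal values map to ordinals starting at 1 (red=1, green=2, blue=3),
    // per the addNominal(label, value, i + 1) call in the header parsing above.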

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/vectors/arff/Driver.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/arff/Driver.java b/integration/src/main/java/org/apache/mahout/utils/vectors/arff/Driver.java
deleted file mode 100644
index ccecbb1..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/arff/Driver.java
+++ /dev/null
@@ -1,263 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- * <p/>
- * http://www.apache.org/licenses/LICENSE-2.0
- * <p/>
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.arff;
-
-import java.io.File;
-import java.io.FilenameFilter;
-import java.io.IOException;
-import java.io.Writer;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.HashMap;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.Map;
-import java.util.Map.Entry;
-import java.util.Set;
-
-import com.google.common.io.Files;
-import org.apache.commons.cli2.CommandLine;
-import org.apache.commons.cli2.Group;
-import org.apache.commons.cli2.Option;
-import org.apache.commons.cli2.OptionException;
-import org.apache.commons.cli2.builder.ArgumentBuilder;
-import org.apache.commons.cli2.builder.DefaultOptionBuilder;
-import org.apache.commons.cli2.builder.GroupBuilder;
-import org.apache.commons.cli2.commandline.Parser;
-import org.apache.commons.io.Charsets;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.io.SequenceFile;
-import org.apache.mahout.common.CommandLineUtil;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.VectorWritable;
-import org.apache.mahout.utils.vectors.io.SequenceFileVectorWriter;
-import org.apache.mahout.utils.vectors.io.VectorWriter;
-import org.codehaus.jackson.map.ObjectMapper;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-public final class Driver {
-
- private static final Logger log = LoggerFactory.getLogger(Driver.class);
-
- /** used for JSON serialization/deserialization */
- private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
-
- private Driver() {
- }
-
- public static void main(String[] args) throws IOException {
- DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
- ArgumentBuilder abuilder = new ArgumentBuilder();
- GroupBuilder gbuilder = new GroupBuilder();
-
- Option inputOpt = obuilder
- .withLongName("input")
- .withRequired(true)
- .withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create())
- .withDescription(
- "The file or directory containing the ARFF files. If it is a directory, all .arff files will be converted")
- .withShortName("d").create();
-
- Option outputOpt = obuilder.withLongName("output").withRequired(true).withArgument(
- abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription(
- "The output directory. Files will have the same name as the input, but with the extension .mvc")
- .withShortName("o").create();
-
- Option maxOpt = obuilder.withLongName("max").withRequired(false).withArgument(
- abuilder.withName("max").withMinimum(1).withMaximum(1).create()).withDescription(
- "The maximum number of vectors to output. If not specified, then it will loop over all docs")
- .withShortName("m").create();
-
- Option dictOutOpt = obuilder.withLongName("dictOut").withRequired(true).withArgument(
- abuilder.withName("dictOut").withMinimum(1).withMaximum(1).create()).withDescription(
- "The file to output the label bindings").withShortName("t").create();
-
- Option jsonDictonaryOpt = obuilder.withLongName("json-dictonary").withRequired(false)
- .withDescription("Write dictonary in JSON format").withShortName("j").create();
-
- Option delimiterOpt = obuilder.withLongName("delimiter").withRequired(false).withArgument(
- abuilder.withName("delimiter").withMinimum(1).withMaximum(1).create()).withDescription(
- "The delimiter for outputing the dictionary").withShortName("l").create();
-
- Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
- .create();
- Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(maxOpt)
- .withOption(helpOpt).withOption(dictOutOpt).withOption(jsonDictonaryOpt).withOption(delimiterOpt)
- .create();
-
- try {
- Parser parser = new Parser();
- parser.setGroup(group);
- CommandLine cmdLine = parser.parse(args);
-
- if (cmdLine.hasOption(helpOpt)) {
-
- CommandLineUtil.printHelp(group);
- return;
- }
- if (cmdLine.hasOption(inputOpt)) {
- File input = new File(cmdLine.getValue(inputOpt).toString());
- long maxDocs = Long.MAX_VALUE;
- if (cmdLine.hasOption(maxOpt)) {
- maxDocs = Long.parseLong(cmdLine.getValue(maxOpt).toString());
- }
- if (maxDocs < 0) {
- throw new IllegalArgumentException("maxDocs must be >= 0");
- }
- String outDir = cmdLine.getValue(outputOpt).toString();
- log.info("Output Dir: {}", outDir);
-
- String delimiter = cmdLine.hasOption(delimiterOpt) ? cmdLine.getValue(delimiterOpt).toString() : "\t";
- File dictOut = new File(cmdLine.getValue(dictOutOpt).toString());
- boolean jsonDictonary = cmdLine.hasOption(jsonDictonaryOpt);
- ARFFModel model = new MapBackedARFFModel();
- if (input.exists() && input.isDirectory()) {
- File[] files = input.listFiles(new FilenameFilter() {
- @Override
- public boolean accept(File file, String name) {
- return name.endsWith(".arff");
- }
- });
-
- for (File file : files) {
- writeFile(outDir, file, maxDocs, model, dictOut, delimiter, jsonDictonary);
- }
- } else {
- writeFile(outDir, input, maxDocs, model, dictOut, delimiter, jsonDictonary);
- }
- }
-
- } catch (OptionException e) {
- log.error("Exception", e);
- CommandLineUtil.printHelp(group);
- }
- }
-
- protected static void writeLabelBindings(File dictOut, ARFFModel arffModel, String delimiter, boolean jsonDictonary)
- throws IOException {
- try (Writer writer = Files.newWriterSupplier(dictOut, Charsets.UTF_8, true).getOutput()) {
- if (jsonDictonary) {
- writeLabelBindingsJSON(writer, arffModel);
- } else {
- writeLabelBindings(writer, arffModel, delimiter);
- }
- }
- }
-
- protected static void writeLabelBindingsJSON(Writer writer, ARFFModel arffModel) throws IOException {
-
- // Turn the map of labels into a list, ordered by order of appearance
- List<Entry<String, Integer>> attributes = new ArrayList<>();
- attributes.addAll(arffModel.getLabelBindings().entrySet());
- Collections.sort(attributes, new Comparator<Map.Entry<String, Integer>>() {
- @Override
- public int compare(Entry<String, Integer> t, Entry<String, Integer> t1) {
- return t.getValue().compareTo(t1.getValue());
- }
- });
-
- // write a map for each object
- List<Map<String, Object>> jsonObjects = new LinkedList<>();
- for (int i = 0; i < attributes.size(); i++) {
-
- Entry<String, Integer> modelRepresentation = attributes.get(i);
- Map<String, Object> jsonRepresentation = new HashMap<>();
- jsonObjects.add(jsonRepresentation);
- // the last one is the class label
- jsonRepresentation.put("label", i < (attributes.size() - 1) ? String.valueOf(false) : String.valueOf(true));
- String attribute = modelRepresentation.getKey();
- jsonRepresentation.put("attribute", attribute);
- Map<String, Integer> nominalValues = arffModel.getNominalMap().get(attribute);
-
- if (nominalValues != null) {
- String[] values = nominalValues.keySet().toArray(new String[0]);
-
- jsonRepresentation.put("values", values);
- jsonRepresentation.put("type", "categorical");
- } else {
- jsonRepresentation.put("type", "numerical");
- }
- }
- writer.write(OBJECT_MAPPER.writeValueAsString(jsonObjects));
- }
-
- protected static void writeLabelBindings(Writer writer, ARFFModel arffModel, String delimiter) throws IOException {
-
- Map<String, Integer> labels = arffModel.getLabelBindings();
- writer.write("Label bindings for Relation " + arffModel.getRelation() + '\n');
- for (Map.Entry<String, Integer> entry : labels.entrySet()) {
- writer.write(entry.getKey());
- writer.write(delimiter);
- writer.write(String.valueOf(entry.getValue()));
- writer.write('\n');
- }
- writer.write('\n');
- writer.write("Values for nominal attributes\n");
- // emit allowed values for NOMINAL/categorical/enumerated attributes
- Map<String, Map<String, Integer>> nominalMap = arffModel.getNominalMap();
- // how many nominal attributes
- writer.write(String.valueOf(nominalMap.size()) + "\n");
-
- for (Entry<String, Map<String, Integer>> entry : nominalMap.entrySet()) {
- // the label of this attribute
- writer.write(entry.getKey() + "\n");
- Set<Entry<String, Integer>> attributeValues = entry.getValue().entrySet();
- // how many values does this attribute have
- writer.write(attributeValues.size() + "\n");
- for (Map.Entry<String, Integer> value : attributeValues) {
- // the value and the value index
- writer.write(String.format("%s%s%s\n", value.getKey(), delimiter, value.getValue().toString()));
- }
- }
- }
-
- protected static void writeFile(String outDir,
- File file,
- long maxDocs,
- ARFFModel arffModel,
- File dictOut,
- String delimiter,
- boolean jsonDictonary) throws IOException {
- log.info("Converting File: {}", file);
- ARFFModel model = new MapBackedARFFModel(arffModel.getWords(), arffModel.getWordCount() + 1, arffModel
- .getNominalMap());
- Iterable<Vector> iterable = new ARFFVectorIterable(file, model);
- String outFile = outDir + '/' + file.getName() + ".mvc";
-
- try (VectorWriter vectorWriter = getSeqFileWriter(outFile)) {
- long numDocs = vectorWriter.write(iterable, maxDocs);
- writeLabelBindings(dictOut, model, delimiter, jsonDictonary);
- log.info("Wrote: {} vectors", numDocs);
- }
- }
-
- private static VectorWriter getSeqFileWriter(String outFile) throws IOException {
- Path path = new Path(outFile);
- Configuration conf = new Configuration();
- FileSystem fs = FileSystem.get(conf);
- SequenceFile.Writer seqWriter = SequenceFile.createWriter(fs, conf, path, LongWritable.class,
- VectorWritable.class);
- return new SequenceFileVectorWriter(seqWriter);
- }
-
-}
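
A minimal sketch of driving the ARFF-to-vector conversion above from code; all paths are hypothetical, and the flags mirror the options built in main():

    // Convert one ARFF file to a sequence file of vectors, writing the
    // label bindings to a dictionary file. Paths are hypothetical; the
    // caller must be able to throw IOException.
    Driver.main(new String[] {
        "--input", "/tmp/arff/iris.arff",
        "--output", "/tmp/vectors",
        "--dictOut", "/tmp/vectors/dictionary.txt",
        "--max", "1000"
    });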

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java b/integration/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java
deleted file mode 100644
index e911b1a..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java
+++ /dev/null
@@ -1,282 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.arff;
-
-import java.text.DateFormat;
-import java.text.NumberFormat;
-import java.text.ParseException;
-import java.text.ParsePosition;
-import java.text.SimpleDateFormat;
-import java.util.Collections;
-import java.util.Date;
-import java.util.HashMap;
-import java.util.Locale;
-import java.util.Map;
-import java.util.regex.Pattern;
-
-/**
- * Holds ARFF information in {@link Map}.
- */
-public class MapBackedARFFModel implements ARFFModel {
-
- private static final Pattern QUOTE_PATTERN = Pattern.compile("\"");
-
- private long wordCount = 1;
-
- private String relation;
-
- private final Map<String,Integer> labelBindings;
- private final Map<Integer,String> idxLabel;
- private final Map<Integer,ARFFType> typeMap; // key is the vector index, value is the type
- private final Map<Integer,DateFormat> dateMap;
- private final Map<String,Map<String,Integer>> nominalMap;
- private final Map<String,Long> words;
-
- public MapBackedARFFModel() {
- this(new HashMap<String,Long>(), 1, new HashMap<String,Map<String,Integer>>());
- }
-
- public MapBackedARFFModel(Map<String,Long> words, long wordCount, Map<String,Map<String,Integer>> nominalMap) {
- this.words = words;
- this.wordCount = wordCount;
- labelBindings = new HashMap<>();
- idxLabel = new HashMap<>();
- typeMap = new HashMap<>();
- dateMap = new HashMap<>();
- this.nominalMap = nominalMap;
-
- }
-
- @Override
- public String getRelation() {
- return relation;
- }
-
- @Override
- public void setRelation(String relation) {
- this.relation = relation;
- }
-
- /**
- * Convert a piece of String data at a given attribute index into a numeric value
- *
- * @param data
- * The data to convert
- * @param idx
- * The position in the ARFF data
- * @return A double representing the data
- */
- @Override
- public double getValue(String data, int idx) {
- ARFFType type = typeMap.get(idx);
- if (type == null) {
- throw new IllegalArgumentException("Attribute type cannot be NULL, attribute index was: " + idx);
- }
- data = QUOTE_PATTERN.matcher(data).replaceAll("");
- data = data.trim();
- double result;
- switch (type) {
- case NUMERIC:
- case INTEGER:
- case REAL:
- result = processNumeric(data);
- break;
- case DATE:
- result = processDate(data, idx);
- break;
- case STRING:
- // may have quotes
- result = processString(data);
- break;
- case NOMINAL:
- String label = idxLabel.get(idx);
- result = processNominal(label, data);
- break;
- default:
- throw new IllegalStateException("Unknown type: " + type);
- }
- return result;
- }
-
- protected double processNominal(String label, String data) {
- double result;
- Map<String,Integer> classes = nominalMap.get(label);
- if (classes != null) {
- Integer ord = classes.get(ARFFType.removeQuotes(data));
- if (ord != null) {
- result = ord;
- } else {
- throw new IllegalStateException("Invalid nominal: " + data + " for label: " + label);
- }
- } else {
- throw new IllegalArgumentException("Invalid nominal label: " + label + " Data: " + data);
- }
-
- return result;
- }
-
- // Not sure how scalable this is going to be
- protected double processString(String data) {
- data = QUOTE_PATTERN.matcher(data).replaceAll("");
- // map it to a long
- Long theLong = words.get(data);
- if (theLong == null) {
- theLong = wordCount++;
- words.put(data, theLong);
- }
- return theLong;
- }
-
- protected static double processNumeric(String data) {
- if (isNumeric(data)) {
- return Double.parseDouble(data);
- }
- return Double.NaN;
- }
-
- public static boolean isNumeric(String str) {
- NumberFormat formatter = NumberFormat.getInstance();
- ParsePosition parsePosition = new ParsePosition(0);
- formatter.parse(str, parsePosition);
- return str.length() == parsePosition.getIndex();
- }
-
- protected double processDate(String data, int idx) {
- DateFormat format = dateMap.get(idx);
- if (format == null) {
- format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", Locale.ENGLISH);
- }
- double result;
- try {
- Date date = format.parse(data);
- result = date.getTime(); // note: the long timestamp may lose precision when widened to double
- } catch (ParseException e) {
- throw new IllegalArgumentException(e);
- }
- return result;
- }
-
- /**
- * The vector attributes (labels in Mahout speak), unmodifiable
- *
- * @return the map
- */
- @Override
- public Map<String,Integer> getLabelBindings() {
- return Collections.unmodifiableMap(labelBindings);
- }
-
- /**
- * The map of types encountered
- *
- * @return the map
- */
- public Map<Integer,ARFFType> getTypeMap() {
- return Collections.unmodifiableMap(typeMap);
- }
-
- /**
- * Map of Date formatters used
- *
- * @return the map
- */
- public Map<Integer,DateFormat> getDateMap() {
- return Collections.unmodifiableMap(dateMap);
- }
-
- /**
- * Map nominals to ids. Should only be modified by calling {@link ARFFModel#addNominal(String, String, int)}
- *
- * @return the map
- */
- @Override
- public Map<String,Map<String,Integer>> getNominalMap() {
- return nominalMap;
- }
-
- /**
- * Map of words to the long id used for those words; note that this returns the live backing map, not an immutable view
- *
- * @return The map
- */
- @Override
- public Map<String,Long> getWords() {
- return words;
- }
-
- @Override
- public Integer getNominalValue(String label, String nominal) {
- return nominalMap.get(label).get(nominal);
- }
-
- @Override
- public void addNominal(String label, String nominal, int idx) {
- Map<String,Integer> noms = nominalMap.get(label);
- if (noms == null) {
- noms = new HashMap<>();
- nominalMap.put(label, noms);
- }
- noms.put(nominal, idx);
- }
-
- @Override
- public DateFormat getDateFormat(Integer idx) {
- return dateMap.get(idx);
- }
-
- @Override
- public void addDateFormat(Integer idx, DateFormat format) {
- dateMap.put(idx, format);
- }
-
- @Override
- public Integer getLabelIndex(String label) {
- return labelBindings.get(label);
- }
-
- @Override
- public void addLabel(String label, Integer idx) {
- labelBindings.put(label, idx);
- idxLabel.put(idx, label);
- }
-
- @Override
- public ARFFType getARFFType(Integer idx) {
- return typeMap.get(idx);
- }
-
- @Override
- public void addType(Integer idx, ARFFType type) {
- typeMap.put(idx, type);
- }
-
- /**
- * The count of the number of words seen
- *
- * @return the count
- */
- @Override
- public long getWordCount() {
- return wordCount;
- }
-
- @Override
- public int getLabelSize() {
- return labelBindings.size();
- }
-}
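
A short sketch of the per-type conversions getValue performs, using only the model API shown above (attribute names are illustrative; ARFFType is assumed importable from the same package):

    MapBackedARFFModel model = new MapBackedARFFModel();
    model.addLabel("temp", 0);
    model.addType(0, ARFFType.NUMERIC);
    model.addLabel("color", 1);
    model.addType(1, ARFFType.NOMINAL);
    model.addNominal("color", "red", 1);
    model.addNominal("color", "blue", 2);

    double t = model.getValue("21.5", 0); // 21.5, via processNumeric
    double c = model.getValue("blue", 1); // 2.0, the nominal's ordinal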

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterator.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterator.java b/integration/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterator.java
deleted file mode 100644
index 3c583fd..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterator.java
+++ /dev/null
@@ -1,69 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.csv;
-
-import java.io.IOException;
-import java.io.Reader;
-
-import com.google.common.collect.AbstractIterator;
-import org.apache.commons.csv.CSVParser;
-import org.apache.commons.csv.CSVStrategy;
-import org.apache.mahout.math.DenseVector;
-import org.apache.mahout.math.Vector;
-
-/**
- * Iterates a CSV file and produces {@link org.apache.mahout.math.Vector}.
- * <br/>
- * The Iterator returned throws {@link UnsupportedOperationException} for the {@link java.util.Iterator#remove()}
- * method.
- * <p/>
- * Assumes DenseVector for now, but in the future may have the option of mapping columns to sparse format
- * <p/>
- * The Iterator is not thread-safe.
- */
-public class CSVVectorIterator extends AbstractIterator<Vector> {
-
- private final CSVParser parser;
-
- public CSVVectorIterator(Reader reader) {
- parser = new CSVParser(reader);
- }
-
- public CSVVectorIterator(Reader reader, CSVStrategy strategy) {
- parser = new CSVParser(reader, strategy);
- }
-
- @Override
- protected Vector computeNext() {
- String[] line;
- try {
- line = parser.getLine();
- } catch (IOException e) {
- throw new IllegalStateException(e);
- }
- if (line == null) {
- return endOfData();
- }
- Vector result = new DenseVector(line.length);
- for (int i = 0; i < line.length; i++) {
- result.setQuick(i, Double.parseDouble(line[i]));
- }
- return result;
- }
-
-}
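
A minimal usage sketch: turning two CSV rows into dense vectors with the iterator above (hasNext/next come from Guava's AbstractIterator):

    CSVVectorIterator it = new CSVVectorIterator(new StringReader("1.0,2.0,3.0\n4.0,5.0,6.0\n"));
    while (it.hasNext()) {
      Vector v = it.next(); // a DenseVector of length 3
      System.out.println(v.asFormatString());
    }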

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/vectors/io/DelimitedTermInfoWriter.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/io/DelimitedTermInfoWriter.java b/integration/src/main/java/org/apache/mahout/utils/vectors/io/DelimitedTermInfoWriter.java
deleted file mode 100644
index b5f9f2b..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/io/DelimitedTermInfoWriter.java
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.io;
-
-import java.io.IOException;
-import java.io.Writer;
-import java.util.Iterator;
-
-import com.google.common.io.Closeables;
-import org.apache.mahout.utils.vectors.TermEntry;
-import org.apache.mahout.utils.vectors.TermInfo;
-
-/**
- * Write {@link TermInfo} to a {@link Writer} in a textual, delimited format with header.
- */
-public class DelimitedTermInfoWriter implements TermInfoWriter {
-
- private final Writer writer;
- private final String delimiter;
- private final String field;
-
- public DelimitedTermInfoWriter(Writer writer, String delimiter, String field) {
- this.writer = writer;
- this.delimiter = delimiter;
- this.field = field;
- }
-
- @Override
- public void write(TermInfo ti) throws IOException {
-
- Iterator<TermEntry> entIter = ti.getAllEntries();
- try {
- writer.write(String.valueOf(ti.totalTerms(field)));
- writer.write('\n');
- writer.write("#term" + delimiter + "doc freq" + delimiter + "idx");
- writer.write('\n');
- while (entIter.hasNext()) {
- TermEntry entry = entIter.next();
- writer.write(entry.getTerm());
- writer.write(delimiter);
- writer.write(String.valueOf(entry.getDocFreq()));
- writer.write(delimiter);
- writer.write(String.valueOf(entry.getTermIdx()));
- writer.write('\n');
- }
- } finally {
- Closeables.close(writer, false);
- }
- }
-
- /**
- * Does NOT close the underlying writer
- */
- @Override
- public void close() {
-
- }
-}
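
A short sketch of writing a TermInfo as tab-delimited text; termInfo stands in for an existing TermInfo such as a CachedTermInfo:

    Writer out = new OutputStreamWriter(System.out, Charsets.UTF_8);
    TermInfoWriter tiw = new DelimitedTermInfoWriter(out, "\t", "body");
    tiw.write(termInfo);

Note the inverted contract in the class above: write() closes the underlying Writer in its finally block, while close() deliberately does not.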

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java b/integration/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java
deleted file mode 100644
index 0d763a1..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java
+++ /dev/null
@@ -1,75 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.io;
-
-import java.io.IOException;
-
-import com.google.common.io.Closeables;
-import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.io.SequenceFile;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.VectorWritable;
-
-
-/**
- * Writes out Vectors to a SequenceFile.
- *
- * Closes the writer when done
- */
-public class SequenceFileVectorWriter implements VectorWriter {
- private final SequenceFile.Writer writer;
- private long recNum = 0;
- public SequenceFileVectorWriter(SequenceFile.Writer writer) {
- this.writer = writer;
- }
-
- @Override
- public long write(Iterable<Vector> iterable, long maxDocs) throws IOException {
-
- for (Vector point : iterable) {
- if (recNum >= maxDocs) {
- break;
- }
- if (point != null) {
- writer.append(new LongWritable(recNum++), new VectorWritable(point));
- }
-
- }
- return recNum;
- }
-
- @Override
- public void write(Vector vector) throws IOException {
- writer.append(new LongWritable(recNum++), new VectorWritable(vector));
-
- }
-
- @Override
- public long write(Iterable<Vector> iterable) throws IOException {
- return write(iterable, Long.MAX_VALUE);
- }
-
- @Override
- public void close() throws IOException {
- Closeables.close(writer, false);
- }
-
- public SequenceFile.Writer getWriter() {
- return writer;
- }
-}
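
A minimal sketch of writing an Iterable<Vector> through this class, mirroring the getSeqFileWriter helper in the ARFF Driver above; the path is hypothetical and vectors stands in for an existing Iterable<Vector>:

    Configuration conf = new Configuration();
    Path path = new Path("/tmp/vectors.seq"); // hypothetical
    SequenceFile.Writer sw = SequenceFile.createWriter(FileSystem.get(conf), conf, path,
        LongWritable.class, VectorWritable.class);
    try (VectorWriter vw = new SequenceFileVectorWriter(sw)) {
      long written = vw.write(vectors); // keys are the record numbers 0..n-1
    }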

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/vectors/io/TermInfoWriter.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/io/TermInfoWriter.java b/integration/src/main/java/org/apache/mahout/utils/vectors/io/TermInfoWriter.java
deleted file mode 100644
index e165b45..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/io/TermInfoWriter.java
+++ /dev/null
@@ -1,29 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.io;
-
-import java.io.Closeable;
-import java.io.IOException;
-
-import org.apache.mahout.utils.vectors.TermInfo;
-
-public interface TermInfoWriter extends Closeable {
-
- void write(TermInfo ti) throws IOException;
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/vectors/io/TextualVectorWriter.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/io/TextualVectorWriter.java b/integration/src/main/java/org/apache/mahout/utils/vectors/io/TextualVectorWriter.java
deleted file mode 100644
index cc27d1d..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/io/TextualVectorWriter.java
+++ /dev/null
@@ -1,70 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.io;
-
-import java.io.IOException;
-import java.io.Writer;
-
-import com.google.common.io.Closeables;
-import org.apache.mahout.math.Vector;
-
-/**
- * Write out the vectors to any {@link Writer} using {@link Vector#asFormatString()},
- * one per line by default.
- */
-public class TextualVectorWriter implements VectorWriter {
-
- private final Writer writer;
-
- public TextualVectorWriter(Writer writer) {
- this.writer = writer;
- }
-
- protected Writer getWriter() {
- return writer;
- }
-
- @Override
- public long write(Iterable<Vector> iterable) throws IOException {
- return write(iterable, Long.MAX_VALUE);
- }
-
- @Override
- public long write(Iterable<Vector> iterable, long maxDocs) throws IOException {
- long result = 0;
- for (Vector vector : iterable) {
- if (result >= maxDocs) {
- break;
- }
- write(vector);
- result++;
- }
- return result;
- }
-
- @Override
- public void write(Vector vector) throws IOException {
- writer.write(vector.asFormatString());
- writer.write('\n');
- }
-
- @Override
- public void close() throws IOException {
- Closeables.close(writer, false);
- }
-}
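
And the textual counterpart, a sketch that prints at most 100 vectors, one asFormatString() per line; vectors again stands in for an existing Iterable<Vector>:

    try (VectorWriter vw = new TextualVectorWriter(
        new OutputStreamWriter(System.out, Charsets.UTF_8))) {
      vw.write(vectors, 100);
    }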

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java b/integration/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java
deleted file mode 100644
index 923e270..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java
+++ /dev/null
@@ -1,52 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.io;
-
-import java.io.Closeable;
-import java.io.IOException;
-
-import org.apache.mahout.math.Vector;
-
-public interface VectorWriter extends Closeable {
- /**
- * Write all values in the Iterable to the output
- * @param iterable The {@link Iterable} to loop over
- * @return the number of docs written
- * @throws IOException if there was a problem writing
- *
- */
- long write(Iterable<Vector> iterable) throws IOException;
-
- /**
- * Write out a vector
- *
- * @param vector The {@link org.apache.mahout.math.Vector} to write
- * @throws IOException
- */
- void write(Vector vector) throws IOException;
-
- /**
- * Write the first {@code maxDocs} to the output.
- * @param iterable The {@link Iterable} to loop over
- * @param maxDocs the maximum number of docs to write
- * @return The number of docs written
- * @throws IOException if there was a problem writing
- */
- long write(Iterable<Vector> iterable, long maxDocs) throws IOException;
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/AbstractLuceneIterator.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/AbstractLuceneIterator.java b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/AbstractLuceneIterator.java
deleted file mode 100644
index ff61a70..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/AbstractLuceneIterator.java
+++ /dev/null
@@ -1,140 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.lucene;
-
-import com.google.common.collect.AbstractIterator;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.TermsEnum;
-import org.apache.lucene.util.BytesRef;
-import org.apache.mahout.math.NamedVector;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.utils.Bump125;
-import org.apache.mahout.utils.vectors.TermInfo;
-import org.apache.mahout.vectorizer.Weight;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.IOException;
-
-/**
- * Iterate over a Lucene index, extracting term vectors.
- * Subclasses define how much information to retrieve from the Lucene index.
- */
-public abstract class AbstractLuceneIterator extends AbstractIterator<Vector> {
- private static final Logger log = LoggerFactory.getLogger(AbstractLuceneIterator.class);
- protected final IndexReader indexReader;
- protected final String field;
- protected final TermInfo terminfo;
- protected final double normPower;
- protected final Weight weight;
- protected final Bump125 bump = new Bump125();
- protected int nextDocId;
- protected int maxErrorDocs;
- protected int numErrorDocs;
- protected long nextLogRecord = bump.increment();
- protected int skippedErrorMessages;
-
- public AbstractLuceneIterator(TermInfo terminfo, double normPower, IndexReader indexReader, Weight weight,
- double maxPercentErrorDocs, String field) {
- this.terminfo = terminfo;
- this.normPower = normPower;
- this.indexReader = indexReader;
-
- this.weight = weight;
- this.nextDocId = 0;
- this.maxErrorDocs = (int) (maxPercentErrorDocs * indexReader.numDocs());
- this.field = field;
- }
-
- /**
- * Given the document index, derive a name for the vector. This may involve
- * reading the document from Lucene and setting up any other state that the
- * subclass wants. This will be called once for each document that the
- * iterator processes.
- * @param documentIndex the lucene document index.
- * @return the name to store in the vector.
- */
- protected abstract String getVectorName(int documentIndex) throws IOException;
-
- @Override
- protected Vector computeNext() {
- try {
- int doc;
- Terms termFreqVector;
- String name;
-
- do {
- doc = this.nextDocId;
- nextDocId++;
-
- if (doc >= indexReader.maxDoc()) {
- return endOfData();
- }
-
- termFreqVector = indexReader.getTermVector(doc, field);
- name = getVectorName(doc);
-
- if (termFreqVector == null) {
- numErrorDocs++;
- if (numErrorDocs >= maxErrorDocs) {
- log.error("There are too many documents that do not have a term vector for {}", field);
- throw new IllegalStateException("There are too many documents that do not have a term vector for "
- + field);
- }
- if (numErrorDocs >= nextLogRecord) {
- if (skippedErrorMessages == 0) {
- log.warn("{} does not have a term vector for {}", name, field);
- } else {
- log.warn("{} documents do not have a term vector for {}", numErrorDocs, field);
- }
- nextLogRecord = bump.increment();
- skippedErrorMessages = 0;
- } else {
- skippedErrorMessages++;
- }
- }
- } while (termFreqVector == null);
-
- // The loop exits with termFreqVector and name set.
-
- TermsEnum te = termFreqVector.iterator();
- BytesRef term;
- TFDFMapper mapper = new TFDFMapper(indexReader.numDocs(), weight, this.terminfo);
- mapper.setExpectations(field, termFreqVector.size());
- while ((term = te.next()) != null) {
- mapper.map(term, (int) te.totalTermFreq());
- }
- Vector result = mapper.getVector();
- if (result == null) {
- // TODO is this right? last version would produce null in the iteration in this case, though it
- // seems like that may not be desirable
- return null;
- }
-
- if (normPower == LuceneIterable.NO_NORMALIZING) {
- result = new NamedVector(result, name);
- } else {
- result = new NamedVector(result.normalize(normPower), name);
- }
- return result;
- } catch (IOException ioe) {
- throw new IllegalStateException(ioe);
- }
- }
-}
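
Since the class is abstract, a concrete subclass only has to supply getVectorName. A minimal sketch that names each vector by its Lucene doc id; TFIDF is assumed here as the Weight implementation, and 0.05 allows up to 5% of documents to lack a term vector:

    public class DocIdLuceneIterator extends AbstractLuceneIterator {
      public DocIdLuceneIterator(TermInfo terminfo, IndexReader reader, String field) {
        super(terminfo, LuceneIterable.NO_NORMALIZING, reader, new TFIDF(), 0.05, field);
      }

      @Override
      protected String getVectorName(int documentIndex) {
        // No document lookup needed; the doc id itself is the name.
        return String.valueOf(documentIndex);
      }
    }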

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java
deleted file mode 100644
index 0b59ed6..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java
+++ /dev/null
@@ -1,79 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.lucene;
-
-import java.io.IOException;
-import java.util.Iterator;
-import java.util.LinkedHashMap;
-import java.util.Map;
-
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.MultiFields;
-import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.TermsEnum;
-import org.apache.lucene.util.BytesRef;
-import org.apache.mahout.utils.vectors.TermEntry;
-import org.apache.mahout.utils.vectors.TermInfo;
-
-
-/**
- * Caches TermEntries from a single field. Materializes all values in the TermEnum to memory (much like FieldCache)
- */
-public class CachedTermInfo implements TermInfo {
-
- private final Map<String, TermEntry> termEntries;
- private final String field;
-
- public CachedTermInfo(IndexReader reader, String field, int minDf, int maxDfPercent) throws IOException {
- this.field = field;
- Terms t = MultiFields.getTerms(reader, field);
- TermsEnum te = t.iterator();
-
- int numDocs = reader.numDocs();
- double percent = numDocs * maxDfPercent / 100.0;
- // a LinkedHashMap keeps the terms in insertion order
- termEntries = new LinkedHashMap<>();
- int count = 0;
- BytesRef text;
- while ((text = te.next()) != null) {
- int df = te.docFreq();
- if (df >= minDf && df <= percent) {
- TermEntry entry = new TermEntry(text.utf8ToString(), count++, df);
- termEntries.put(entry.getTerm(), entry);
- }
- }
- }
-
- @Override
- public int totalTerms(String field) {
- return termEntries.size();
- }
-
- @Override
- public TermEntry getTermEntry(String field, String term) {
- if (!this.field.equals(field)) {
- return null;
- }
- return termEntries.get(term);
- }
-
- @Override
- public Iterator<TermEntry> getAllEntries() {
- return termEntries.values().iterator();
- }
-}
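
A short usage sketch, opening an index the same way ClusterLabels does below; the index path is hypothetical, and terms are kept when their document frequency is at least 1 and at most 90% of documents:

    IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/tmp/index")));
    TermInfo ti = new CachedTermInfo(reader, "body", 1, 90);
    Iterator<TermEntry> entries = ti.getAllEntries();
    while (entries.hasNext()) {
      TermEntry e = entries.next();
      System.out.println(e.getTerm() + " df=" + e.getDocFreq());
    }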

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java
deleted file mode 100644
index b2568e7..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java
+++ /dev/null
@@ -1,381 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.lucene;
-
-import java.io.File;
-import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.io.Writer;
-import java.nio.file.Paths;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.HashSet;
-import java.util.LinkedHashMap;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.TreeSet;
-
-import com.google.common.io.Closeables;
-import com.google.common.io.Files;
-import org.apache.commons.cli2.CommandLine;
-import org.apache.commons.cli2.Group;
-import org.apache.commons.cli2.Option;
-import org.apache.commons.cli2.OptionException;
-import org.apache.commons.cli2.builder.ArgumentBuilder;
-import org.apache.commons.cli2.builder.DefaultOptionBuilder;
-import org.apache.commons.cli2.builder.GroupBuilder;
-import org.apache.commons.cli2.commandline.Parser;
-import org.apache.commons.io.Charsets;
-import org.apache.hadoop.fs.Path;
-import org.apache.lucene.index.DirectoryReader;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.MultiFields;
-import org.apache.lucene.index.PostingsEnum;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.TermsEnum;
-import org.apache.lucene.search.DocIdSetIterator;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.FSDirectory;
-import org.apache.lucene.util.Bits;
-import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.FixedBitSet;
-import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable;
-import org.apache.mahout.common.CommandLineUtil;
-import org.apache.mahout.common.commandline.DefaultOptionCreator;
-import org.apache.mahout.math.NamedVector;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.stats.LogLikelihood;
-import org.apache.mahout.utils.clustering.ClusterDumper;
-import org.apache.mahout.utils.vectors.TermEntry;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * Get labels for the cluster using Log Likelihood Ratio (LLR).
- * <p/>
- *"The most useful way to think of this (LLR) is as the percentage of in-cluster documents that have the
- * feature (term) versus the percentage out, keeping in mind that both percentages are uncertain since we have
- * only a sample of all possible documents." - Ted Dunning
- * <p/>
- * More about LLR can be found at : http://tdunning.blogspot.com/2008/03/surprise-and-coincidence.html
- */
-public class ClusterLabels {
-
- private static final Logger log = LoggerFactory.getLogger(ClusterLabels.class);
-
- public static final int DEFAULT_MIN_IDS = 50;
- public static final int DEFAULT_MAX_LABELS = 25;
-
- private final String indexDir;
- private final String contentField;
- private String idField;
- private final Map<Integer, List<WeightedPropertyVectorWritable>> clusterIdToPoints;
- private String output;
- private final int minNumIds;
- private final int maxLabels;
-
- public ClusterLabels(Path seqFileDir,
- Path pointsDir,
- String indexDir,
- String contentField,
- int minNumIds,
- int maxLabels) {
- this.indexDir = indexDir;
- this.contentField = contentField;
- this.minNumIds = minNumIds;
- this.maxLabels = maxLabels;
- ClusterDumper clusterDumper = new ClusterDumper(seqFileDir, pointsDir);
- this.clusterIdToPoints = clusterDumper.getClusterIdToPoints();
- }
-
- public void getLabels() throws IOException {
-
- try (Writer writer = (this.output == null) ?
- new OutputStreamWriter(System.out, Charsets.UTF_8) : Files.newWriter(new File(this.output), Charsets.UTF_8)){
- for (Map.Entry<Integer, List<WeightedPropertyVectorWritable>> integerListEntry : clusterIdToPoints.entrySet()) {
- List<WeightedPropertyVectorWritable> wpvws = integerListEntry.getValue();
- List<TermInfoClusterInOut> termInfos = getClusterLabels(integerListEntry.getKey(), wpvws);
- if (termInfos != null) {
- writer.write('\n');
- writer.write("Top labels for Cluster ");
- writer.write(String.valueOf(integerListEntry.getKey()));
- writer.write(" containing ");
- writer.write(String.valueOf(wpvws.size()));
- writer.write(" vectors");
- writer.write('\n');
- writer.write("Term \t\t LLR \t\t In-ClusterDF \t\t Out-ClusterDF ");
- writer.write('\n');
- for (TermInfoClusterInOut termInfo : termInfos) {
- writer.write(termInfo.getTerm());
- writer.write("\t\t");
- writer.write(String.valueOf(termInfo.getLogLikelihoodRatio()));
- writer.write("\t\t");
- writer.write(String.valueOf(termInfo.getInClusterDF()));
- writer.write("\t\t");
- writer.write(String.valueOf(termInfo.getOutClusterDF()));
- writer.write('\n');
- }
- }
- }
- }
- }
-
- /**
- * Get the list of labels, sorted by best score.
- */
- protected List<TermInfoClusterInOut> getClusterLabels(Integer integer,
- Collection<WeightedPropertyVectorWritable> wpvws) throws IOException {
-
- if (wpvws.size() < minNumIds) {
- log.info("Skipping small cluster {} with size: {}", integer, wpvws.size());
- return null;
- }
-
- log.info("Processing Cluster {} with {} documents", integer, wpvws.size());
- Directory dir = FSDirectory.open(Paths.get(this.indexDir));
- IndexReader reader = DirectoryReader.open(dir);
-
-
- log.info("# of documents in the index {}", reader.numDocs());
-
- Collection<String> idSet = new HashSet<>();
- for (WeightedPropertyVectorWritable wpvw : wpvws) {
- Vector vector = wpvw.getVector();
- if (vector instanceof NamedVector) {
- idSet.add(((NamedVector) vector).getName());
- }
- }
-
- int numDocs = reader.numDocs();
-
- FixedBitSet clusterDocBitset = getClusterDocBitset(reader, idSet, this.idField);
-
- log.info("Populating term infos from the index");
-
- /*
- * This code mirrors that of CachedTermInfo, with one major change: how the document frequency is obtained.
- *
- * The document frequency for a term should count only the in-cluster documents, but the frequency
- * obtained from the TermsEnum reflects the entire index. To get the in-cluster frequency, we query the
- * index for the documents containing each term and intersect that set with the cluster's documents;
- * the cardinality of the intersection is the in-cluster document frequency.
- */
- Terms t = MultiFields.getTerms(reader, contentField);
- TermsEnum te = t.iterator();
- Map<String, TermEntry> termEntryMap = new LinkedHashMap<>();
- Bits liveDocs = MultiFields.getLiveDocs(reader); //WARNING: returns null if there are no deletions
-
-
- int count = 0;
- BytesRef term;
- while ((term = te.next()) != null) {
- FixedBitSet termBitset = new FixedBitSet(reader.maxDoc());
- PostingsEnum docsEnum = MultiFields.getTermDocsEnum(reader, contentField, term);
- int docID;
- while ((docID = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
- // count the document if there are no deletions (liveDocs == null) or if it is still live
- if (liveDocs == null || liveDocs.get(docID)) {
- termBitset.set(docID);
- }
- }
- // AND the term's bitset with cluster doc bitset to get the term's in-cluster frequency.
- // This modifies the termBitset, but that's fine as we are not using it anywhere else.
- termBitset.and(clusterDocBitset);
- int inclusterDF = (int) termBitset.cardinality();
-
- TermEntry entry = new TermEntry(term.utf8ToString(), count++, inclusterDF);
- termEntryMap.put(entry.getTerm(), entry);
-
- }
-
- List<TermInfoClusterInOut> clusteredTermInfo = new LinkedList<>();
-
- int clusterSize = wpvws.size();
-
- for (TermEntry termEntry : termEntryMap.values()) {
-
- int corpusDF = reader.docFreq(new Term(this.contentField, termEntry.getTerm()));
- int outDF = corpusDF - termEntry.getDocFreq();
- int inDF = termEntry.getDocFreq();
- double logLikelihoodRatio = scoreDocumentFrequencies(inDF, outDF, clusterSize, numDocs);
- TermInfoClusterInOut termInfoCluster =
- new TermInfoClusterInOut(termEntry.getTerm(), inDF, outDF, logLikelihoodRatio);
- clusteredTermInfo.add(termInfoCluster);
- }
-
- Collections.sort(clusteredTermInfo);
- // Cleanup
- Closeables.close(reader, true);
- termEntryMap.clear();
-
- return clusteredTermInfo.subList(0, Math.min(clusteredTermInfo.size(), maxLabels));
- }
-
- private static FixedBitSet getClusterDocBitset(IndexReader reader,
- Collection<String> idSet,
- String idField) throws IOException {
- int numDocs = reader.numDocs();
-
- FixedBitSet bitset = new FixedBitSet(numDocs);
-
- Set<String> idFieldSelector = null;
- if (idField != null) {
- idFieldSelector = new TreeSet<>();
- idFieldSelector.add(idField);
- }
-
-
- for (int i = 0; i < numDocs; i++) {
- String id;
- // Use Lucene's internal ID if idField is not specified. Else, get it from the document.
- if (idField == null) {
- id = Integer.toString(i);
- } else {
- id = reader.document(i, idFieldSelector).get(idField);
- }
- if (idSet.contains(id)) {
- bitset.set(i);
- }
- }
- log.info("Created bitset for in-cluster documents : {}", bitset.cardinality());
- return bitset;
- }
-
- private static double scoreDocumentFrequencies(long inDF, long outDF, long clusterSize, long corpusSize) {
- long k12 = clusterSize - inDF;
- long k22 = corpusSize - clusterSize - outDF;
-
- return LogLikelihood.logLikelihoodRatio(inDF, k12, outDF, k22);
- }
-
- public String getIdField() {
- return idField;
- }
-
- public void setIdField(String idField) {
- this.idField = idField;
- }
-
- public String getOutput() {
- return output;
- }
-
- public void setOutput(String output) {
- this.output = output;
- }
-
- public static void main(String[] args) {
-
- DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
- ArgumentBuilder abuilder = new ArgumentBuilder();
- GroupBuilder gbuilder = new GroupBuilder();
-
- Option indexOpt = obuilder.withLongName("dir").withRequired(true).withArgument(
- abuilder.withName("dir").withMinimum(1).withMaximum(1).create())
- .withDescription("The Lucene index directory").withShortName("d").create();
-
- Option outputOpt = obuilder.withLongName("output").withRequired(false).withArgument(
- abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription(
- "The output file. If not specified, the result is printed on console.").withShortName("o").create();
-
- Option fieldOpt = obuilder.withLongName("field").withRequired(true).withArgument(
- abuilder.withName("field").withMinimum(1).withMaximum(1).create())
- .withDescription("The content field in the index").withShortName("f").create();
-
- Option idFieldOpt = obuilder.withLongName("idField").withRequired(false).withArgument(
- abuilder.withName("idField").withMinimum(1).withMaximum(1).create()).withDescription(
- "The field for the document ID in the index. If null, then the Lucene internal doc "
- + "id is used which is prone to error if the underlying index changes").withShortName("i").create();
-
- Option seqOpt = obuilder.withLongName("seqFileDir").withRequired(true).withArgument(
- abuilder.withName("seqFileDir").withMinimum(1).withMaximum(1).create()).withDescription(
- "The directory containing Sequence Files for the Clusters").withShortName("s").create();
-
- Option pointsOpt = obuilder.withLongName("pointsDir").withRequired(true).withArgument(
- abuilder.withName("pointsDir").withMinimum(1).withMaximum(1).create()).withDescription(
- "The directory containing points sequence files mapping input vectors to their cluster. ")
- .withShortName("p").create();
- Option minClusterSizeOpt = obuilder.withLongName("minClusterSize").withRequired(false).withArgument(
- abuilder.withName("minClusterSize").withMinimum(1).withMaximum(1).create()).withDescription(
- "The minimum number of points required in a cluster to print the labels for").withShortName("m").create();
- Option maxLabelsOpt = obuilder.withLongName("maxLabels").withRequired(false).withArgument(
- abuilder.withName("maxLabels").withMinimum(1).withMaximum(1).create()).withDescription(
- "The maximum number of labels to print per cluster").withShortName("x").create();
- Option helpOpt = DefaultOptionCreator.helpOption();
-
- Group group = gbuilder.withName("Options").withOption(indexOpt).withOption(idFieldOpt).withOption(outputOpt)
- .withOption(fieldOpt).withOption(seqOpt).withOption(pointsOpt).withOption(helpOpt)
- .withOption(maxLabelsOpt).withOption(minClusterSizeOpt).create();
-
- try {
- Parser parser = new Parser();
- parser.setGroup(group);
- CommandLine cmdLine = parser.parse(args);
-
- if (cmdLine.hasOption(helpOpt)) {
- CommandLineUtil.printHelp(group);
- return;
- }
-
- Path seqFileDir = new Path(cmdLine.getValue(seqOpt).toString());
- Path pointsDir = new Path(cmdLine.getValue(pointsOpt).toString());
- String indexDir = cmdLine.getValue(indexOpt).toString();
- String contentField = cmdLine.getValue(fieldOpt).toString();
-
- String idField = null;
-
- if (cmdLine.hasOption(idFieldOpt)) {
- idField = cmdLine.getValue(idFieldOpt).toString();
- }
- String output = null;
- if (cmdLine.hasOption(outputOpt)) {
- output = cmdLine.getValue(outputOpt).toString();
- }
- int maxLabels = DEFAULT_MAX_LABELS;
- if (cmdLine.hasOption(maxLabelsOpt)) {
- maxLabels = Integer.parseInt(cmdLine.getValue(maxLabelsOpt).toString());
- }
- int minSize = DEFAULT_MIN_IDS;
- if (cmdLine.hasOption(minClusterSizeOpt)) {
- minSize = Integer.parseInt(cmdLine.getValue(minClusterSizeOpt).toString());
- }
- ClusterLabels clusterLabel = new ClusterLabels(seqFileDir, pointsDir, indexDir, contentField, minSize, maxLabels);
-
- if (idField != null) {
- clusterLabel.setIdField(idField);
- }
- if (output != null) {
- clusterLabel.setOutput(output);
- }
-
- clusterLabel.getLabels();
-
- } catch (OptionException e) {
- log.error("Exception", e);
- CommandLineUtil.printHelp(group);
- } catch (IOException e) {
- log.error("Exception", e);
- }
- }
-
-}

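A minimal sketch of the 2x2 contingency table that scoreDocumentFrequencies() above builds before calling org.apache.mahout.math.stats.LogLikelihood (already imported by this file); the counts here are made-up illustration values, not taken from any real index:

    long inDF = 40;          // in-cluster documents containing the term
    long outDF = 10;         // out-of-cluster documents containing the term
    long clusterSize = 100;  // documents in the cluster
    long corpusSize = 1000;  // documents in the whole index

    long k11 = inDF;                             // term present, in cluster
    long k12 = clusterSize - inDF;               // term absent, in cluster
    long k21 = outDF;                            // term present, out of cluster
    long k22 = corpusSize - clusterSize - outDF; // term absent, out of cluster

    double llr = LogLikelihood.logLikelihoodRatio(k11, k12, k21, k22);
    // a higher LLR means the term is more surprisingly concentrated in the cluster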
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java
deleted file mode 100644
index 876816f..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java
+++ /dev/null
@@ -1,349 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- * <p/>
- * http://www.apache.org/licenses/LICENSE-2.0
- * <p/>
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.lucene;
-
-import java.io.File;
-import java.io.IOException;
-import java.io.Writer;
-import java.nio.file.Paths;
-import java.util.Iterator;
-
-import com.google.common.base.Preconditions;
-import com.google.common.io.Files;
-import org.apache.commons.cli2.CommandLine;
-import org.apache.commons.cli2.Group;
-import org.apache.commons.cli2.Option;
-import org.apache.commons.cli2.OptionException;
-import org.apache.commons.cli2.builder.ArgumentBuilder;
-import org.apache.commons.cli2.builder.DefaultOptionBuilder;
-import org.apache.commons.cli2.builder.GroupBuilder;
-import org.apache.commons.cli2.commandline.Parser;
-import org.apache.commons.io.Charsets;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.io.SequenceFile;
-import org.apache.hadoop.io.Text;
-import org.apache.lucene.index.DirectoryReader;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.FSDirectory;
-import org.apache.mahout.common.CommandLineUtil;
-import org.apache.mahout.math.VectorWritable;
-import org.apache.mahout.utils.vectors.TermEntry;
-import org.apache.mahout.utils.vectors.TermInfo;
-import org.apache.mahout.utils.vectors.io.DelimitedTermInfoWriter;
-import org.apache.mahout.utils.vectors.io.SequenceFileVectorWriter;
-import org.apache.mahout.utils.vectors.io.VectorWriter;
-import org.apache.mahout.vectorizer.TF;
-import org.apache.mahout.vectorizer.TFIDF;
-import org.apache.mahout.vectorizer.Weight;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-public final class Driver {
-
- private static final Logger log = LoggerFactory.getLogger(Driver.class);
-
- private String luceneDir;
- private String outFile;
- private String field;
- private String idField;
- private String dictOut;
- private String seqDictOut = "";
- private String weightType = "tfidf";
- private String delimiter = "\t";
- private double norm = LuceneIterable.NO_NORMALIZING;
- private long maxDocs = Long.MAX_VALUE;
- private int minDf = 1;
- private int maxDFPercent = 99;
- private double maxPercentErrorDocs = 0.0;
-
- public void dumpVectors() throws IOException {
-
- File file = new File(luceneDir);
- Preconditions.checkArgument(file.isDirectory(),
- "Lucene directory: " + file.getAbsolutePath()
- + " does not exist or is not a directory");
- Preconditions.checkArgument(maxDocs >= 0, "maxDocs must be >= 0");
- Preconditions.checkArgument(minDf >= 1, "minDf must be >= 1");
- Preconditions.checkArgument(maxDFPercent <= 99, "maxDFPercent must be <= 99");
-
- Directory dir = FSDirectory.open(Paths.get(file.getAbsolutePath()));
- IndexReader reader = DirectoryReader.open(dir);
-
-
- Weight weight;
- if ("tf".equalsIgnoreCase(weightType)) {
- weight = new TF();
- } else if ("tfidf".equalsIgnoreCase(weightType)) {
- weight = new TFIDF();
- } else {
- throw new IllegalArgumentException("Weight type " + weightType + " is not supported");
- }
-
- TermInfo termInfo = new CachedTermInfo(reader, field, minDf, maxDFPercent);
-
- LuceneIterable iterable;
- if (norm == LuceneIterable.NO_NORMALIZING) {
- iterable = new LuceneIterable(reader, idField, field, termInfo, weight, LuceneIterable.NO_NORMALIZING,
- maxPercentErrorDocs);
- } else {
- iterable = new LuceneIterable(reader, idField, field, termInfo, weight, norm, maxPercentErrorDocs);
- }
-
- log.info("Output File: {}", outFile);
-
- try (VectorWriter vectorWriter = getSeqFileWriter(outFile)) {
- long numDocs = vectorWriter.write(iterable, maxDocs);
- log.info("Wrote: {} vectors", numDocs);
- }
-
- File dictOutFile = new File(dictOut);
- log.info("Dictionary Output file: {}", dictOutFile);
- Writer writer = Files.newWriter(dictOutFile, Charsets.UTF_8);
- try (DelimitedTermInfoWriter tiWriter = new DelimitedTermInfoWriter(writer, delimiter, field)) {
- tiWriter.write(termInfo);
- }
-
- if (!"".equals(seqDictOut)) {
- log.info("SequenceFile Dictionary Output file: {}", seqDictOut);
-
- Path path = new Path(seqDictOut);
- Configuration conf = new Configuration();
- FileSystem fs = FileSystem.get(conf);
- try (SequenceFile.Writer seqWriter = SequenceFile.createWriter(fs, conf, path, Text.class, IntWritable.class)) {
- Text term = new Text();
- IntWritable termIndex = new IntWritable();
- Iterator<TermEntry> termEntries = termInfo.getAllEntries();
- while (termEntries.hasNext()) {
- TermEntry termEntry = termEntries.next();
- term.set(termEntry.getTerm());
- termIndex.set(termEntry.getTermIdx());
- seqWriter.append(term, termIndex);
- }
- }
- }
- }
-
- public static void main(String[] args) throws IOException {
-
- DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
- ArgumentBuilder abuilder = new ArgumentBuilder();
- GroupBuilder gbuilder = new GroupBuilder();
-
- Option inputOpt = obuilder.withLongName("dir").withRequired(true).withArgument(
- abuilder.withName("dir").withMinimum(1).withMaximum(1).create())
- .withDescription("The Lucene directory").withShortName("d").create();
-
- Option outputOpt = obuilder.withLongName("output").withRequired(true).withArgument(
- abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription("The output file")
- .withShortName("o").create();
-
- Option fieldOpt = obuilder.withLongName("field").withRequired(true).withArgument(
- abuilder.withName("field").withMinimum(1).withMaximum(1).create()).withDescription(
- "The field in the index").withShortName("f").create();
-
- Option idFieldOpt = obuilder.withLongName("idField").withRequired(false).withArgument(
- abuilder.withName("idField").withMinimum(1).withMaximum(1).create()).withDescription(
- "The field in the index containing the index. If null, then the Lucene internal doc "
- + "id is used which is prone to error if the underlying index changes").create();
-
- Option dictOutOpt = obuilder.withLongName("dictOut").withRequired(true).withArgument(
- abuilder.withName("dictOut").withMinimum(1).withMaximum(1).create()).withDescription(
- "The output of the dictionary").withShortName("t").create();
-
- Option seqDictOutOpt = obuilder.withLongName("seqDictOut").withRequired(false).withArgument(
- abuilder.withName("seqDictOut").withMinimum(1).withMaximum(1).create()).withDescription(
- "The output of the dictionary as sequence file").withShortName("st").create();
-
- Option weightOpt = obuilder.withLongName("weight").withRequired(false).withArgument(
- abuilder.withName("weight").withMinimum(1).withMaximum(1).create()).withDescription(
- "The kind of weight to use. Currently TF or TFIDF").withShortName("w").create();
-
- Option delimiterOpt = obuilder.withLongName("delimiter").withRequired(false).withArgument(
- abuilder.withName("delimiter").withMinimum(1).withMaximum(1).create()).withDescription(
- "The delimiter for outputting the dictionary").withShortName("l").create();
-
- Option powerOpt = obuilder.withLongName("norm").withRequired(false).withArgument(
- abuilder.withName("norm").withMinimum(1).withMaximum(1).create()).withDescription(
- "The norm to use, expressed as either a double or \"INF\" if you want to use the Infinite norm. "
- + "Must be greater or equal to 0. The default is not to normalize").withShortName("n").create();
-
- Option maxOpt = obuilder.withLongName("max").withRequired(false).withArgument(
- abuilder.withName("max").withMinimum(1).withMaximum(1).create()).withDescription(
- "The maximum number of vectors to output. If not specified, then it will loop over all docs")
- .withShortName("m").create();
-
- Option minDFOpt = obuilder.withLongName("minDF").withRequired(false).withArgument(
- abuilder.withName("minDF").withMinimum(1).withMaximum(1).create()).withDescription(
- "The minimum document frequency. Default is 1").withShortName("md").create();
-
- Option maxDFPercentOpt = obuilder.withLongName("maxDFPercent").withRequired(false).withArgument(
- abuilder.withName("maxDFPercent").withMinimum(1).withMaximum(1).create()).withDescription(
- "The max percentage of docs for the DF. Can be used to remove really high frequency terms."
- + " Expressed as an integer between 0 and 100. Default is 99.").withShortName("x").create();
-
- Option maxPercentErrorDocsOpt = obuilder.withLongName("maxPercentErrorDocs").withRequired(false).withArgument(
- abuilder.withName("maxPercentErrorDocs").withMinimum(1).withMaximum(1).create()).withDescription(
- "The max percentage of docs that can have a null term vector. These are noise document and can occur if the "
- + "analyzer used strips out all terms in the target field. This percentage is expressed as a value "
- + "between 0 and 1. The default is 0.").withShortName("err").create();
-
- Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
- .create();
-
- Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(idFieldOpt).withOption(
- outputOpt).withOption(delimiterOpt).withOption(helpOpt).withOption(fieldOpt).withOption(maxOpt)
- .withOption(dictOutOpt).withOption(seqDictOutOpt).withOption(powerOpt).withOption(maxDFPercentOpt)
- .withOption(weightOpt).withOption(minDFOpt).withOption(maxPercentErrorDocsOpt).create();
-
- try {
- Parser parser = new Parser();
- parser.setGroup(group);
- CommandLine cmdLine = parser.parse(args);
-
- if (cmdLine.hasOption(helpOpt)) {
-
- CommandLineUtil.printHelp(group);
- return;
- }
-
- if (cmdLine.hasOption(inputOpt)) { // Lucene case
- Driver luceneDriver = new Driver();
- luceneDriver.setLuceneDir(cmdLine.getValue(inputOpt).toString());
-
- if (cmdLine.hasOption(maxOpt)) {
- luceneDriver.setMaxDocs(Long.parseLong(cmdLine.getValue(maxOpt).toString()));
- }
-
- if (cmdLine.hasOption(weightOpt)) {
- luceneDriver.setWeightType(cmdLine.getValue(weightOpt).toString());
- }
-
- luceneDriver.setField(cmdLine.getValue(fieldOpt).toString());
-
- if (cmdLine.hasOption(minDFOpt)) {
- luceneDriver.setMinDf(Integer.parseInt(cmdLine.getValue(minDFOpt).toString()));
- }
-
- if (cmdLine.hasOption(maxDFPercentOpt)) {
- luceneDriver.setMaxDFPercent(Integer.parseInt(cmdLine.getValue(maxDFPercentOpt).toString()));
- }
-
- if (cmdLine.hasOption(powerOpt)) {
- String power = cmdLine.getValue(powerOpt).toString();
- if ("INF".equals(power)) {
- luceneDriver.setNorm(Double.POSITIVE_INFINITY);
- } else {
- luceneDriver.setNorm(Double.parseDouble(power));
- }
- }
-
- if (cmdLine.hasOption(idFieldOpt)) {
- luceneDriver.setIdField(cmdLine.getValue(idFieldOpt).toString());
- }
-
- if (cmdLine.hasOption(maxPercentErrorDocsOpt)) {
- luceneDriver.setMaxPercentErrorDocs(Double.parseDouble(cmdLine.getValue(maxPercentErrorDocsOpt).toString()));
- }
-
- luceneDriver.setOutFile(cmdLine.getValue(outputOpt).toString());
-
- luceneDriver.setDelimiter(cmdLine.hasOption(delimiterOpt) ? cmdLine.getValue(delimiterOpt).toString() : "\t");
-
- luceneDriver.setDictOut(cmdLine.getValue(dictOutOpt).toString());
-
- if (cmdLine.hasOption(seqDictOutOpt)) {
- luceneDriver.setSeqDictOut(cmdLine.getValue(seqDictOutOpt).toString());
- }
-
- luceneDriver.dumpVectors();
- }
- } catch (OptionException e) {
- log.error("Exception", e);
- CommandLineUtil.printHelp(group);
- }
- }
-
- private static VectorWriter getSeqFileWriter(String outFile) throws IOException {
- Path path = new Path(outFile);
- Configuration conf = new Configuration();
- FileSystem fs = FileSystem.get(conf);
- // TODO: Make this parameter driven
-
- SequenceFile.Writer seqWriter = SequenceFile.createWriter(fs, conf, path, LongWritable.class,
- VectorWritable.class);
-
- return new SequenceFileVectorWriter(seqWriter);
- }
-
- public void setLuceneDir(String luceneDir) {
- this.luceneDir = luceneDir;
- }
-
- public void setMaxDocs(long maxDocs) {
- this.maxDocs = maxDocs;
- }
-
- public void setWeightType(String weightType) {
- this.weightType = weightType;
- }
-
- public void setField(String field) {
- this.field = field;
- }
-
- public void setMinDf(int minDf) {
- this.minDf = minDf;
- }
-
- public void setMaxDFPercent(int maxDFPercent) {
- this.maxDFPercent = maxDFPercent;
- }
-
- public void setNorm(double norm) {
- this.norm = norm;
- }
-
- public void setIdField(String idField) {
- this.idField = idField;
- }
-
- public void setOutFile(String outFile) {
- this.outFile = outFile;
- }
-
- public void setDelimiter(String delimiter) {
- this.delimiter = delimiter;
- }
-
- public void setDictOut(String dictOut) {
- this.dictOut = dictOut;
- }
-
- public void setSeqDictOut(String seqDictOut) {
- this.seqDictOut = seqDictOut;
- }
-
- public void setMaxPercentErrorDocs(double maxPercentErrorDocs) {
- this.maxPercentErrorDocs = maxPercentErrorDocs;
- }
-}

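A hedged sketch of driving the class above programmatically instead of through main(); the command-line options simply populate the same setters. All paths and field names here are placeholders, and IOException handling is elided:

    Driver driver = new Driver();
    driver.setLuceneDir("/path/to/lucene/index"); // placeholder path
    driver.setField("body");                      // assumed content field name
    driver.setIdField("id");                      // optional; omit to fall back to Lucene doc ids
    driver.setWeightType("tfidf");                // "tf" or "tfidf"
    driver.setOutFile("/tmp/vectors.seq");        // SequenceFile of LongWritable/VectorWritable
    driver.setDictOut("/tmp/dictionary.txt");     // delimited term-to-index dictionary
    driver.dumpVectors();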
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java
deleted file mode 100644
index 1af0ed0..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java
+++ /dev/null
@@ -1,80 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.lucene;
-
-import org.apache.lucene.index.IndexReader;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.utils.vectors.TermInfo;
-import org.apache.mahout.vectorizer.Weight;
-
-import java.util.Iterator;
-
-/**
- * {@link Iterable} counterpart to {@link LuceneIterator}.
- */
-public final class LuceneIterable implements Iterable<Vector> {
-
- public static final double NO_NORMALIZING = -1.0;
-
- private final IndexReader indexReader;
- private final String field;
- private final String idField;
- private final TermInfo terminfo;
- private final double normPower;
- private final double maxPercentErrorDocs;
- private final Weight weight;
-
- public LuceneIterable(IndexReader reader, String idField, String field, TermInfo terminfo, Weight weight) {
- this(reader, idField, field, terminfo, weight, NO_NORMALIZING);
- }
-
- public LuceneIterable(IndexReader indexReader, String idField, String field, TermInfo terminfo, Weight weight,
- double normPower) {
- this(indexReader, idField, field, terminfo, weight, normPower, 0);
- }
-
- /**
- * Produce a LuceneIterable that can create the Vector plus normalize it.
- *
- * @param indexReader {@link org.apache.lucene.index.IndexReader} to read the documents from.
- * @param idField field containing the id. May be null.
- * @param field field to use for the Vector
- * @param normPower the normalization value. Must be nonnegative, or {@link #NO_NORMALIZING}
- * @param maxPercentErrorDocs the percentage of documents in the lucene index that can have a null term vector
- */
- public LuceneIterable(IndexReader indexReader,
- String idField,
- String field,
- TermInfo terminfo,
- Weight weight,
- double normPower,
- double maxPercentErrorDocs) {
- this.indexReader = indexReader;
- this.idField = idField;
- this.field = field;
- this.terminfo = terminfo;
- this.normPower = normPower;
- this.maxPercentErrorDocs = maxPercentErrorDocs;
- this.weight = weight;
- }
-
- @Override
- public Iterator<Vector> iterator() {
- return new LuceneIterator(indexReader, idField, field, terminfo, weight, normPower, maxPercentErrorDocs);
- }
-}
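A minimal usage sketch for the Iterable above, assuming a CachedTermInfo built over the same reader and field (the pattern Driver.dumpVectors() follows); the index path and field names are placeholders:

    IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")));
    TermInfo termInfo = new CachedTermInfo(reader, "body", 1, 99); // minDf = 1, maxDFPercent = 99
    for (Vector v : new LuceneIterable(reader, "id", "body", termInfo, new TFIDF())) {
      // one weighted term vector per document, named by the "id" field when present
    }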
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/Matrices.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/Matrices.java b/math/src/main/java/org/apache/mahout/math/Matrices.java
deleted file mode 100644
index 5d8b5c5..0000000
--- a/math/src/main/java/org/apache/mahout/math/Matrices.java
+++ /dev/null
@@ -1,167 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math;
-
-import com.google.common.base.Preconditions;
-import org.apache.mahout.common.RandomUtils;
-import org.apache.mahout.math.function.DoubleFunction;
-import org.apache.mahout.math.function.Functions;
-import org.apache.mahout.math.function.IntIntFunction;
-
-import java.util.Random;
-
-public final class Matrices {
-
- /**
- * Create a matrix view based on a function generator.
- * <p>
- * The generator needs to be idempotent, i.e. returning same value
- * for each combination of (row, column) argument sent to generator's
- * {@link IntIntFunction#apply(int, int)} call.
- *
- * @param rows Number of rows in a view
- * @param columns Number of columns in a view.
- * @param gf view generator
- * @param denseLike type of matrix returned by {@link org.apache.mahout.math.Matrix#like()}.
- * @return new matrix view.
- */
- public static Matrix functionalMatrixView(final int rows,
- final int columns,
- final IntIntFunction gf,
- final boolean denseLike) {
- return new FunctionalMatrixView(rows, columns, gf, denseLike);
- }
-
- /**
- * Shorter form of {@link Matrices#functionalMatrixView(int, int,
- * org.apache.mahout.math.function.IntIntFunction, boolean)}.
- */
- public static Matrix functionalMatrixView(final int rows,
- final int columns,
- final IntIntFunction gf) {
- return new FunctionalMatrixView(rows, columns, gf);
- }
-
- /**
- * A read-only transposed view of a matrix argument.
- *
- * @param m original matrix
- * @return transposed view of original matrix
- */
- public static Matrix transposedView(final Matrix m) {
-
- Preconditions.checkArgument(!(m instanceof SparseColumnMatrix));
-
- if (m instanceof TransposedMatrixView) {
- return ((TransposedMatrixView) m).getDelegate();
- } else {
- return new TransposedMatrixView(m);
- }
- }
-
- /**
- * Random Gaussian matrix view.
- *
- * @param seed generator seed
- */
- public static Matrix gaussianView(final int rows,
- final int columns,
- long seed) {
- return functionalMatrixView(rows, columns, gaussianGenerator(seed), true);
- }
-
-
- /**
- * Matrix view based on uniform [-1,1) distribution.
- *
- * @param seed generator seed
- */
- public static Matrix symmetricUniformView(final int rows,
- final int columns,
- int seed) {
- return functionalMatrixView(rows, columns, uniformSymmetricGenerator(seed), true);
- }
-
- /**
- * Matrix view based on uniform [0,1) distribution.
- *
- * @param seed generator seed
- */
- public static Matrix uniformView(final int rows,
- final int columns,
- int seed) {
- return functionalMatrixView(rows, columns, uniformGenerator(seed), true);
- }
-
- /**
- * Generator for a matrix populated by random Gaussian values (Gaussian matrix view)
- *
- * @param seed The seed for the matrix.
- * @return Gaussian {@link IntIntFunction} generating matrix view with normal values
- */
- public static IntIntFunction gaussianGenerator(final long seed) {
- final Random rnd = RandomUtils.getRandom(seed);
- return new IntIntFunction() {
- @Override
- public double apply(int first, int second) {
- rnd.setSeed(seed ^ (((long) first << 32) | (second & 0xffffffffL)));
- return rnd.nextGaussian();
- }
- };
- }
-
- private static final double UNIFORM_DIVISOR = Math.pow(2.0, 64);
-
- /**
- * Uniform [-1,1) matrix generator function.
- * <p>
- * WARNING: to keep things performant, it is stateful and so not thread-safe.
- * You'd need to create a copy per thread (with same seed) if shared between threads.
- *
- * @param seed - random seed initializer
- * @return Uniform {@link IntIntFunction} generator
- */
- public static IntIntFunction uniformSymmetricGenerator(final int seed) {
- return new IntIntFunction() {
- private byte[] data = new byte[8];
-
- @Override
- public double apply(int row, int column) {
- long d = ((long) row << Integer.SIZE) | (column & 0xffffffffL);
- for (int i = 0; i < 8; i++, d >>>= 8) data[i] = (byte) d;
- long hash = MurmurHash.hash64A(data, seed);
- return hash / UNIFORM_DIVISOR;
- }
- };
- }
-
- /**
- * Uniform [0,1) matrix generator function
- *
- * @param seed generator seed
- */
- public static IntIntFunction uniformGenerator(final int seed) {
- return Functions.chain(new DoubleFunction() {
- @Override
- public double apply(double x) {
- return (x + 1.0) / 2.0;
- }
- }, uniformSymmetricGenerator(seed));
- }
-
-}

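A short sketch of the functional-view idea above: the generator is evaluated on demand, so even a huge view allocates no backing storage, and the idempotence requirement guarantees that repeated reads of the same cell agree:

    Matrix gauss = Matrices.gaussianView(1000, 1000, 42L); // virtual: O(1) memory
    double x = gauss.get(5, 7); // recomputed from the seeded generator on each call
    double y = gauss.get(5, 7); // x == y, because the generator is idempotent
    Matrix t = Matrices.transposedView(gauss);             // also a view, no copying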
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/Matrix.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/Matrix.java b/math/src/main/java/org/apache/mahout/math/Matrix.java
deleted file mode 100644
index 57fab78..0000000
--- a/math/src/main/java/org/apache/mahout/math/Matrix.java
+++ /dev/null
@@ -1,413 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math;
-
-import org.apache.mahout.math.flavor.MatrixFlavor;
-import org.apache.mahout.math.function.DoubleDoubleFunction;
-import org.apache.mahout.math.function.DoubleFunction;
-import org.apache.mahout.math.function.VectorFunction;
-
-import java.util.Map;
-
-/** The basic interface including numerous convenience functions */
-public interface Matrix extends Cloneable, VectorIterable {
-
- /** @return a formatted String suitable for output */
- String asFormatString();
-
- /**
- * Assign the value to all elements of the receiver
- *
- * @param value a double value
- * @return the modified receiver
- */
- Matrix assign(double value);
-
- /**
- * Assign the values to the receiver
- *
- * @param values a double[] of values
- * @return the modified receiver
- * @throws CardinalityException if the cardinalities differ
- */
- Matrix assign(double[][] values);
-
- /**
- * Assign the other vector values to the receiver
- *
- * @param other a Matrix
- * @return the modified receiver
- * @throws CardinalityException if the cardinalities differ
- */
- Matrix assign(Matrix other);
-
- /**
- * Apply the function to each element of the receiver
- *
- * @param function a DoubleFunction to apply
- * @return the modified receiver
- */
- Matrix assign(DoubleFunction function);
-
- /**
- * Apply the function to each element of the receiver and the corresponding element of the other argument
- *
- * @param other a Matrix containing the second arguments to the function
- * @param function a DoubleDoubleFunction to apply
- * @return the modified receiver
- * @throws CardinalityException if the cardinalities differ
- */
- Matrix assign(Matrix other, DoubleDoubleFunction function);
-
- /**
- * Assign the other vector values to the column of the receiver
- *
- * @param column the int row to assign
- * @param other a Vector
- * @return the modified receiver
- * @throws CardinalityException if the cardinalities differ
- */
- Matrix assignColumn(int column, Vector other);
-
- /**
- * Assign the other vector values to the row of the receiver
- *
- * @param row the int row to assign
- * @param other a Vector
- * @return the modified receiver
- * @throws CardinalityException if the cardinalities differ
- */
- Matrix assignRow(int row, Vector other);
-
- /**
- * Collects the results of a function applied to each row of a matrix.
- * @param f The function to be applied to each row.
- * @return The vector of results.
- */
- Vector aggregateRows(VectorFunction f);
-
- /**
- * Collects the results of a function applied to each column of a matrix.
- * @param f The function to be applied to each column.
- * @return The vector of results.
- */
- Vector aggregateColumns(VectorFunction f);
-
- /**
- * Collects the results of a function applied to each element of a matrix and then
- * aggregated.
- * @param combiner A function that combines the results of the mapper.
- * @param mapper A function to apply to each element.
- * @return The result.
- */
- double aggregate(DoubleDoubleFunction combiner, DoubleFunction mapper);
-
- /**
- * @return The number of columns in the matrix.
- */
- int columnSize();
-
- /**
- * @return The number of rows in the matrix.
- */
- int rowSize();
-
- /**
- * Return a copy of the recipient
- *
- * @return a new Matrix
- */
- Matrix clone();
-
- /**
- * Returns the matrix determinant, computed via Laplace expansion
- *
- * @return the matrix determinant
- */
- double determinant();
-
- /**
- * Return a new matrix containing the values of the recipient divided by the argument
- *
- * @param x a double value
- * @return a new Matrix
- */
- Matrix divide(double x);
-
- /**
- * Return the value at the given indexes
- *
- * @param row an int row index
- * @param column an int column index
- * @return the double at the index
- * @throws IndexException if the index is out of bounds
- */
- double get(int row, int column);
-
- /**
- * Return the value at the given indexes, without checking bounds
- *
- * @param row an int row index
- * @param column an int column index
- * @return the double at the index
- */
- double getQuick(int row, int column);
-
- /**
- * Return an empty matrix of the same underlying class as the receiver
- *
- * @return a Matrix
- */
- Matrix like();
-
- /**
- * Returns an empty matrix of the same underlying class as the receiver and of the specified size.
- *
- * @param rows the int number of rows
- * @param columns the int number of columns
- */
- Matrix like(int rows, int columns);
-
- /**
- * Return a new matrix containing the element by element difference of the recipient and the argument
- *
- * @param x a Matrix
- * @return a new Matrix
- * @throws CardinalityException if the cardinalities differ
- */
- Matrix minus(Matrix x);
-
- /**
- * Return a new matrix containing the sum of each value of the recipient and the argument
- *
- * @param x a double
- * @return a new Matrix
- */
- Matrix plus(double x);
-
- /**
- * Return a new matrix containing the element by element sum of the recipient and the argument
- *
- * @param x a Matrix
- * @return a new Matrix
- * @throws CardinalityException if the cardinalities differ
- */
- Matrix plus(Matrix x);
-
- /**
- * Set the value at the given index
- *
- * @param row an int row index into the receiver
- * @param column an int column index into the receiver
- * @param value a double value to set
- * @throws IndexException if the index is out of bounds
- */
- void set(int row, int column, double value);
-
- void set(int row, double[] data);
-
- /**
- * Set the value at the given index, without checking bounds
- *
- * @param row an int row index into the receiver
- * @param column an int column index into the receiver
- * @param value a double value to set
- */
- void setQuick(int row, int column, double value);
-
- /**
- * Return the number of values in the recipient
- *
- * @return an int[2] containing [row, column] count
- */
- int[] getNumNondefaultElements();
-
- /**
- * Return a new matrix containing the product of each value of the recipient and the argument
- *
- * @param x a double argument
- * @return a new Matrix
- */
- Matrix times(double x);
-
- /**
- * Return a new matrix containing the product of the recipient and the argument
- *
- * @param x a Matrix argument
- * @return a new Matrix
- * @throws CardinalityException if the cardinalities are incompatible
- */
- Matrix times(Matrix x);
-
- /**
- * Return a new matrix that is the transpose of the receiver
- *
- * @return the transpose
- */
- Matrix transpose();
-
- /**
- * Return the sum of all the elements of the receiver
- *
- * @return a double
- */
- double zSum();
-
- /**
- * Return a map of the current column label bindings of the receiver
- *
- * @return a {@code Map<String, Integer>}
- */
- Map<String, Integer> getColumnLabelBindings();
-
- /**
- * Return a map of the current row label bindings of the receiver
- *
- * @return a {@code Map<String, Integer>}
- */
- Map<String, Integer> getRowLabelBindings();
-
- /**
- * Sets a map of column label bindings in the receiver
- *
- * @param bindings a {@code Map<String, Integer>} of label bindings
- */
- void setColumnLabelBindings(Map<String, Integer> bindings);
-
- /**
- * Sets a map of row label bindings in the receiver
- *
- * @param bindings a {@code Map<String, Integer>} of label bindings
- */
- void setRowLabelBindings(Map<String, Integer> bindings);
-
- /**
- * Return the value at the given labels
- *
- * @param rowLabel a String row label
- * @param columnLabel a String column label
- * @return the double at the index
- *
- * @throws IndexException if the index is out of bounds
- */
- double get(String rowLabel, String columnLabel);
-
- /**
- * Set the value at the given index
- *
- * @param rowLabel a String row label
- * @param columnLabel a String column label
- * @param value a double value to set
- * @throws IndexException if the index is out of bounds
- */
- void set(String rowLabel, String columnLabel, double value);
-
- /**
- * Set the value at the given index, updating the row and column label bindings
- *
- * @param rowLabel a String row label
- * @param columnLabel a String column label
- * @param row an int row index
- * @param column an int column index
- * @param value a double value
- */
- void set(String rowLabel, String columnLabel, int row, int column, double value);
-
- /**
- * Sets the row values at the given row label
- *
- * @param rowLabel a String row label
- * @param rowData a double[] array of row data
- */
- void set(String rowLabel, double[] rowData);
-
- /**
- * Sets the row values at the given row index and updates the row labels
- *
- * @param rowLabel the String row label
- * @param row an int the row index
- * @param rowData a double[] array of row data
- */
- void set(String rowLabel, int row, double[] rowData);
-
- /*
- * Need stories for these but keeping them here for now.
- *
- */
- // void getNonZeros(IntArrayList jx, DoubleArrayList values);
- // void foreachNonZero(IntDoubleFunction f);
- // double aggregate(DoubleDoubleFunction aggregator, DoubleFunction map);
- // double aggregate(Matrix other, DoubleDoubleFunction aggregator,
- // DoubleDoubleFunction map);
- // NewMatrix assign(Matrix y, DoubleDoubleFunction function, IntArrayList
- // nonZeroIndexes);
-
- /**
- * Return a view into part of a matrix. Changes to the view will change the
- * original matrix.
- *
- * @param offset an int[2] offset into the receiver
- * @param size the int[2] size of the desired result
- * @return a matrix that shares storage with part of the original matrix.
- * @throws CardinalityException if the length is greater than the cardinality of the receiver
- * @throws IndexException if the offset is negative or the offset+length is outside of the receiver
- */
- Matrix viewPart(int[] offset, int[] size);
-
- /**
- * Return a view into part of a matrix. Changes to the view will change the
- * original matrix.
- *
- * @param rowOffset The first row of the view
- * @param rowsRequested The number of rows in the view
- * @param columnOffset The first column in the view
- * @param columnsRequested The number of columns in the view
- * @return a matrix that shares storage with part of the original matrix.
- * @throws CardinalityException if the length is greater than the cardinality of the receiver
- * @throws IndexException if the offset is negative or the offset+length is outside of the
- * receiver
- */
- Matrix viewPart(int rowOffset, int rowsRequested, int columnOffset, int columnsRequested);
-
- /**
- * Return a reference to a row. Changes to the view will change the original matrix.
- * @param row The index of the row to return.
- * @return A vector that shares storage with the original.
- */
- Vector viewRow(int row);
-
- /**
- * Return a reference to a column. Changes to the view will change the original matrix.
- * @param column The index of the column to return.
- * @return A vector that shares storage with the original.
- */
- Vector viewColumn(int column);
-
- /**
- * Returns a reference to the diagonal of a matrix. Changes to the view will change
- * the original matrix.
- * @return A vector that shares storage with the original matrix.
- */
- Vector viewDiagonal();
-
- /**
- * Get the matrix's structural flavor (performance hints for operations). This is an optional
- * operation and may throw {@link java.lang.UnsupportedOperationException}.
- */
- MatrixFlavor getFlavor();
-}

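A small sketch of the view contract documented above: views share storage with the receiver, so writes through a view mutate the original. DenseMatrix is used here only as a representative implementation of this interface:

    Matrix m = new DenseMatrix(4, 4).assign(0);
    m.viewDiagonal().assign(1);            // m is now the 4x4 identity
    Matrix block = m.viewPart(0, 2, 0, 2); // top-left 2x2 block, shared storage
    block.set(0, 1, 5.0);                  // visible through m.get(0, 1)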
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/MatrixSlice.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/MatrixSlice.java b/math/src/main/java/org/apache/mahout/math/MatrixSlice.java
deleted file mode 100644
index 51378c1..0000000
--- a/math/src/main/java/org/apache/mahout/math/MatrixSlice.java
+++ /dev/null
@@ -1,36 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math;
-
-public class MatrixSlice extends DelegatingVector {
- private int index;
-
- public MatrixSlice(Vector v, int index) {
- super(v);
- this.index = index;
- }
-
- public Vector vector() {
- return getVector();
- }
-
- public int index() {
- return index;
- }
-}
-

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/MatrixTimesOps.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/MatrixTimesOps.java b/math/src/main/java/org/apache/mahout/math/MatrixTimesOps.java
deleted file mode 100644
index 30d2afb..0000000
--- a/math/src/main/java/org/apache/mahout/math/MatrixTimesOps.java
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math;
-
-/**
- * Optional interface for optimized matrix multiplications.
- * Some concrete Matrix implementations may mix this in.
- */
-public interface MatrixTimesOps {
- /**
- * computes matrix product of (this * that)
- */
- Matrix timesRight(Matrix that);
-
- /**
- * Computes matrix product of (that * this)
- */
- Matrix timesLeft(Matrix that);
-
-}

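A sketch of how a caller can exploit this optional mixin, assuming a and b are conformable Matrix instances; the instanceof probe is the whole protocol:

    Matrix product = (a instanceof MatrixTimesOps)
        ? ((MatrixTimesOps) a).timesRight(b) // implementation-specific fast path
        : a.times(b);                        // generic fallback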
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/MatrixVectorView.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/MatrixVectorView.java b/math/src/main/java/org/apache/mahout/math/MatrixVectorView.java
deleted file mode 100644
index 6ad44b5..0000000
--- a/math/src/main/java/org/apache/mahout/math/MatrixVectorView.java
+++ /dev/null
@@ -1,292 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math;
-
-import java.util.Iterator;
-import java.util.NoSuchElementException;
-
-/**
- * Provides a virtual vector that is really a row or column or diagonal of a matrix.
- */
-public class MatrixVectorView extends AbstractVector {
- private Matrix matrix;
- private int row;
- private int column;
- private int rowStride;
- private int columnStride;
- private boolean isDense = true;
-
- public MatrixVectorView(Matrix matrix, int row, int column, int rowStride, int columnStride, boolean isDense) {
- this(matrix, row, column, rowStride, columnStride);
- this.isDense = isDense;
- }
-
- public MatrixVectorView(Matrix matrix, int row, int column, int rowStride, int columnStride) {
- super(viewSize(matrix, row, column, rowStride, columnStride));
- if (row < 0 || row >= matrix.rowSize()) {
- throw new IndexException(row, matrix.rowSize());
- }
- if (column < 0 || column >= matrix.columnSize()) {
- throw new IndexException(column, matrix.columnSize());
- }
-
- this.matrix = matrix;
- this.row = row;
- this.column = column;
- this.rowStride = rowStride;
- this.columnStride = columnStride;
- }
-
- private static int viewSize(Matrix matrix, int row, int column, int rowStride, int columnStride) {
- if (rowStride != 0 && columnStride != 0) {
- int n1 = (matrix.numRows() - row) / rowStride;
- int n2 = (matrix.numCols() - column) / columnStride;
- return Math.min(n1, n2);
- } else if (rowStride > 0) {
- return (matrix.numRows() - row) / rowStride;
- } else {
- return (matrix.numCols() - column) / columnStride;
- }
- }
-
- /**
- * @return true iff the {@link Vector} implementation should be considered
- * dense -- that it explicitly represents every value
- */
- @Override
- public boolean isDense() {
- return isDense;
- }
-
- /**
- * @return true iff {@link Vector} should be considered to be iterable in
- * index order in an efficient way. In particular this implies that {@link #iterator()} and
- * {@link #iterateNonZero()} return elements in ascending order by index.
- */
- @Override
- public boolean isSequentialAccess() {
- return true;
- }
-
- /**
- * Iterates over all elements <p>
- * NOTE: Implementations may choose to reuse the Element returned
- * for performance reasons, so if you need a copy of it, you should call {@link #getElement(int)} for
- * the given index
- *
- * @return An {@link java.util.Iterator} over all elements
- */
- @Override
- public Iterator<Element> iterator() {
- final LocalElement r = new LocalElement(0);
- return new Iterator<Element>() {
- private int i;
-
- @Override
- public boolean hasNext() {
- return i < size();
- }
-
- @Override
- public Element next() {
- if (i >= size()) {
- throw new NoSuchElementException();
- }
- r.index = i++;
- return r;
- }
-
- @Override
- public void remove() {
- throw new UnsupportedOperationException("Can't remove from a view");
- }
- };
- }
-
- /**
- * Iterates over all non-zero elements. <p>
- * NOTE: Implementations may choose to reuse the Element
- * returned for performance reasons, so if you need a copy of it, you should call {@link
- * #getElement(int)} for the given index
- *
- * @return An {@link java.util.Iterator} over all non-zero elements
- */
- @Override
- public Iterator<Element> iterateNonZero() {
-
- return new Iterator<Element>() {
- class NonZeroElement implements Element {
- int index;
-
- @Override
- public double get() {
- return getQuick(index);
- }
-
- @Override
- public int index() {
- return index;
- }
-
- @Override
- public void set(double value) {
- invalidateCachedLength();
- setQuick(index, value);
- }
- }
-
- private final NonZeroElement element = new NonZeroElement();
- private int index = -1;
- private int lookAheadIndex = -1;
-
- @Override
- public boolean hasNext() {
- if (lookAheadIndex == index) { // User calls hasNext() after a next()
- lookAhead();
- } // else user called hasNext() repeatedly.
- return lookAheadIndex < size();
- }
-
- private void lookAhead() {
- lookAheadIndex++;
- while (lookAheadIndex < size() && getQuick(lookAheadIndex) == 0.0) {
- lookAheadIndex++;
- }
- }
-
- @Override
- public Element next() {
- if (lookAheadIndex == index) { // If user called next() without checking hasNext().
- lookAhead();
- }
-
- index = lookAheadIndex;
-
- if (index >= size()) { // If the end is reached.
- throw new NoSuchElementException();
- }
-
- element.index = index;
- return element;
- }
-
- @Override
- public void remove() {
- throw new UnsupportedOperationException();
- }
- };
- }
-
- /**
- * Return the value at the given index, without checking bounds
- *
- * @param index an int index
- * @return the double at the index
- */
- @Override
- public double getQuick(int index) {
- return matrix.getQuick(row + rowStride * index, column + columnStride * index);
- }
-
- /**
- * Return an empty vector of the same underlying class as the receiver
- *
- * @return a Vector
- */
- @Override
- public Vector like() {
- return matrix.like(size(), 1).viewColumn(0);
- }
-
- @Override
- public Vector like(int cardinality) {
- return matrix.like(cardinality, 1).viewColumn(0);
- }
-
- /**
- * Set the value at the given index, without checking bounds
- *
- * @param index an int index into the receiver
- * @param value a double value to set
- */
- @Override
- public void setQuick(int index, double value) {
- matrix.setQuick(row + rowStride * index, column + columnStride * index, value);
- }
-
- /**
- * Return the number of values in the recipient
- *
- * @return an int
- */
- @Override
- public int getNumNondefaultElements() {
- return size();
- }
-
- @Override
- public double getLookupCost() {
- // TODO: what is a genuine value here?
- return 1;
- }
-
- @Override
- public double getIteratorAdvanceCost() {
- // TODO: what is a genuine value here?
- return 1;
- }
-
- @Override
- public boolean isAddConstantTime() {
- // TODO: what is a genuine value here?
- return true;
- }
-
- @Override
- protected Matrix matrixLike(int rows, int columns) {
- return matrix.like(rows, columns);
- }
-
- @Override
- public Vector clone() {
- MatrixVectorView r = (MatrixVectorView) super.clone();
- r.matrix = matrix.clone();
- r.row = row;
- r.column = column;
- r.rowStride = rowStride;
- r.columnStride = columnStride;
- return r;
- }
-
- /**
- * Used internally by assign() to update multiple indices and values at once.
- * Only really useful for sparse vectors (especially SequentialAccessSparseVector).
- * <p>
- * If someone ever adds a new type of sparse vectors, this method must merge (index, value) pairs into the vector.
- *
- * @param updates a mapping of indices to values to merge in the vector.
- */
- @Override
- public void mergeUpdates(OrderedIntDoubleMapping updates) {
- int[] indices = updates.getIndices();
- double[] values = updates.getValues();
- for (int i = 0; i < updates.getNumMappings(); ++i) {
- matrix.setQuick(row + rowStride * indices[i], column + columnStride * indices[i], values[i]);
- }
- }
-}

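For context: MatrixVectorView is the strided view that backs diagonal views of a matrix. A minimal sketch of how it is typically reached through the public API (assuming Mahout's Matrix.viewDiagonal(), which creates such a view with unit strides; writes go through to the backing matrix):

    import org.apache.mahout.math.DenseMatrix;
    import org.apache.mahout.math.Matrix;
    import org.apache.mahout.math.Vector;

    public class DiagonalViewDemo {
      public static void main(String[] args) {
        Matrix m = new DenseMatrix(3, 3);
        m.set(0, 0, 1.0);
        m.set(2, 2, 4.0);

        // The diagonal view walks (row + i, column + i), i.e. the getQuick() above.
        Vector diag = m.viewDiagonal();
        for (Vector.Element e : diag.nonZeroes()) {
          System.out.println(e.index() + " -> " + e.get()); // 0 -> 1.0, 2 -> 4.0
        }

        diag.set(1, 9.0); // writes through: m.get(1, 1) is now 9.0
      }
    }
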
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/MatrixView.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/MatrixView.java b/math/src/main/java/org/apache/mahout/math/MatrixView.java
deleted file mode 100644
index 951515b..0000000
--- a/math/src/main/java/org/apache/mahout/math/MatrixView.java
+++ /dev/null
@@ -1,160 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math;
-
-import org.apache.mahout.math.flavor.MatrixFlavor;
-
-/** Implements subset view of a Matrix */
-public class MatrixView extends AbstractMatrix {
-
- private Matrix matrix;
-
- // the offset into the Matrix
- private int[] offset;
-
- /**
- * Construct a view of the matrix with given offset and cardinality
- *
- * @param matrix an underlying Matrix
- * @param offset the int[2] offset into the underlying matrix
- * @param size the int[2] size of the view
- */
- public MatrixView(Matrix matrix, int[] offset, int[] size) {
- super(size[ROW], size[COL]);
- int rowOffset = offset[ROW];
- if (rowOffset < 0) {
- throw new IndexException(rowOffset, rowSize());
- }
-
- int rowsRequested = size[ROW];
- if (rowOffset + rowsRequested > matrix.rowSize()) {
- throw new IndexException(rowOffset + rowsRequested, matrix.rowSize());
- }
-
- int columnOffset = offset[COL];
- if (columnOffset < 0) {
- throw new IndexException(columnOffset, columnSize());
- }
-
- int columnsRequested = size[COL];
- if (columnOffset + columnsRequested > matrix.columnSize()) {
- throw new IndexException(columnOffset + columnsRequested, matrix.columnSize());
- }
- this.matrix = matrix;
- this.offset = offset;
- }
-
- @Override
- public Matrix clone() {
- MatrixView clone = (MatrixView) super.clone();
- clone.matrix = matrix.clone();
- clone.offset = offset.clone();
- return clone;
- }
-
- @Override
- public double getQuick(int row, int column) {
- return matrix.getQuick(offset[ROW] + row, offset[COL] + column);
- }
-
- @Override
- public Matrix like() {
- return matrix.like(rowSize(), columnSize());
- }
-
- @Override
- public Matrix like(int rows, int columns) {
- return matrix.like(rows, columns);
- }
-
- @Override
- public void setQuick(int row, int column, double value) {
- matrix.setQuick(offset[ROW] + row, offset[COL] + column, value);
- }
-
- @Override
- public int[] getNumNondefaultElements() {
- return new int[]{rowSize(), columnSize()};
-
- }
-
- @Override
- public Matrix viewPart(int[] offset, int[] size) {
- if (offset[ROW] < 0) {
- throw new IndexException(offset[ROW], 0);
- }
- if (offset[ROW] + size[ROW] > rowSize()) {
- throw new IndexException(offset[ROW] + size[ROW], rowSize());
- }
- if (offset[COL] < 0) {
- throw new IndexException(offset[COL], 0);
- }
- if (offset[COL] + size[COL] > columnSize()) {
- throw new IndexException(offset[COL] + size[COL], columnSize());
- }
- int[] origin = this.offset.clone();
- origin[ROW] += offset[ROW];
- origin[COL] += offset[COL];
- return new MatrixView(matrix, origin, size);
- }
-
- @Override
- public Matrix assignColumn(int column, Vector other) {
- if (rowSize() != other.size()) {
- throw new CardinalityException(rowSize(), other.size());
- }
- for (int row = 0; row < rowSize(); row++) {
- matrix.setQuick(row + offset[ROW], column + offset[COL], other
- .getQuick(row));
- }
- return this;
- }
-
- @Override
- public Matrix assignRow(int row, Vector other) {
- if (columnSize() != other.size()) {
- throw new CardinalityException(columnSize(), other.size());
- }
- for (int col = 0; col < columnSize(); col++) {
- matrix
- .setQuick(row + offset[ROW], col + offset[COL], other.getQuick(col));
- }
- return this;
- }
-
- @Override
- public Vector viewColumn(int column) {
- if (column < 0 || column >= columnSize()) {
- throw new IndexException(column, columnSize());
- }
- return matrix.viewColumn(column + offset[COL]).viewPart(offset[ROW], rowSize());
- }
-
- @Override
- public Vector viewRow(int row) {
- if (row < 0 || row >= rowSize()) {
- throw new IndexException(row, rowSize());
- }
- return matrix.viewRow(row + offset[ROW]).viewPart(offset[COL], columnSize());
- }
-
- @Override
- public MatrixFlavor getFlavor() {
- return matrix.getFlavor();
- }
-}

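For context, a minimal sketch of how MatrixView is normally obtained, via viewPart() with a {row, column} offset and a {rows, columns} size (reads and writes go through to the backing matrix):

    import org.apache.mahout.math.DenseMatrix;
    import org.apache.mahout.math.Matrix;

    public class MatrixViewDemo {
      public static void main(String[] args) {
        Matrix m = new DenseMatrix(4, 4);
        m.set(1, 1, 7.0);

        // offset {1, 1}, size {2, 2}: a 2x2 window whose (0, 0) is m's (1, 1)
        Matrix sub = m.viewPart(new int[]{1, 1}, new int[]{2, 2});
        System.out.println(sub.get(0, 0)); // 7.0

        sub.set(1, 1, 3.0);              // writes through to m.get(2, 2)
        System.out.println(m.get(2, 2)); // 3.0
      }
    }
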
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/MurmurHash.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/MurmurHash.java b/math/src/main/java/org/apache/mahout/math/MurmurHash.java
deleted file mode 100644
index 13f3a07..0000000
--- a/math/src/main/java/org/apache/mahout/math/MurmurHash.java
+++ /dev/null
@@ -1,158 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math;
-
-import com.google.common.primitives.Ints;
-
-import java.nio.ByteBuffer;
-import java.nio.ByteOrder;
-
-/**
- * <p>This is a very fast, non-cryptographic hash suitable for general hash-based
- * lookup. See http://murmurhash.googlepages.com/ for more details.
- * </p>
- * <p>The C version of MurmurHash 2.0 found at that site was ported
- * to Java by Andrzej Bialecki (ab at getopt org).</p>
- */
-public final class MurmurHash {
-
- private MurmurHash() {}
-
- /**
- * Hashes an int.
- * @param data The int to hash.
- * @param seed The seed for the hash.
- * @return The 32 bit hash of the bytes in question.
- */
- public static int hash(int data, int seed) {
- return hash(ByteBuffer.wrap(Ints.toByteArray(data)), seed);
- }
-
- /**
- * Hashes bytes in an array.
- * @param data The bytes to hash.
- * @param seed The seed for the hash.
- * @return The 32 bit hash of the bytes in question.
- */
- public static int hash(byte[] data, int seed) {
- return hash(ByteBuffer.wrap(data), seed);
- }
-
- /**
- * Hashes bytes in part of an array.
- * @param data The data to hash.
- * @param offset Where to start munging.
- * @param length How many bytes to process.
- * @param seed The seed to start with.
- * @return The 32-bit hash of the data in question.
- */
- public static int hash(byte[] data, int offset, int length, int seed) {
- return hash(ByteBuffer.wrap(data, offset, length), seed);
- }
-
- /**
- * Hashes the bytes in a buffer from the current position to the limit.
- * @param buf The bytes to hash.
- * @param seed The seed for the hash.
- * @return The 32 bit murmur hash of the bytes in the buffer.
- */
- public static int hash(ByteBuffer buf, int seed) {
- // save byte order for later restoration
- ByteOrder byteOrder = buf.order();
- buf.order(ByteOrder.LITTLE_ENDIAN);
-
- int m = 0x5bd1e995;
- int r = 24;
-
- int h = seed ^ buf.remaining();
-
- while (buf.remaining() >= 4) {
- int k = buf.getInt();
-
- k *= m;
- k ^= k >>> r;
- k *= m;
-
- h *= m;
- h ^= k;
- }
-
- if (buf.remaining() > 0) {
- ByteBuffer finish = ByteBuffer.allocate(4).order(ByteOrder.LITTLE_ENDIAN);
- // for big-endian version, use this first:
- // finish.position(4-buf.remaining());
- finish.put(buf).rewind();
- h ^= finish.getInt();
- h *= m;
- }
-
- h ^= h >>> 13;
- h *= m;
- h ^= h >>> 15;
-
- buf.order(byteOrder);
- return h;
- }
-
-
- public static long hash64A(byte[] data, int seed) {
- return hash64A(ByteBuffer.wrap(data), seed);
- }
-
- public static long hash64A(byte[] data, int offset, int length, int seed) {
- return hash64A(ByteBuffer.wrap(data, offset, length), seed);
- }
-
- public static long hash64A(ByteBuffer buf, int seed) {
- ByteOrder byteOrder = buf.order();
- buf.order(ByteOrder.LITTLE_ENDIAN);
-
- long m = 0xc6a4a7935bd1e995L;
- int r = 47;
-
- long h = seed ^ (buf.remaining() * m);
-
- while (buf.remaining() >= 8) {
- long k = buf.getLong();
-
- k *= m;
- k ^= k >>> r;
- k *= m;
-
- h ^= k;
- h *= m;
- }
-
- if (buf.remaining() > 0) {
- ByteBuffer finish = ByteBuffer.allocate(8).order(ByteOrder.LITTLE_ENDIAN);
- // for big-endian version, do this first:
- // finish.position(8-buf.remaining());
- finish.put(buf).rewind();
- h ^= finish.getLong();
- h *= m;
- }
-
- h ^= h >>> r;
- h *= m;
- h ^= h >>> r;
-
- buf.order(byteOrder);
- return h;
- }
-
-}

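A minimal usage sketch for the deleted MurmurHash class (the seed value here is arbitrary):

    import java.nio.charset.StandardCharsets;

    import org.apache.mahout.math.MurmurHash;

    public class MurmurHashDemo {
      public static void main(String[] args) {
        byte[] key = "hello".getBytes(StandardCharsets.UTF_8);

        int h32 = MurmurHash.hash(key, 42);     // 32-bit hash, seed 42
        long h64 = MurmurHash.hash64A(key, 42); // 64-bit variant

        // Same seed and same bytes always give the same hash, which is what
        // makes this usable for hash-based lookup and partitioning.
        System.out.printf("h32=%d h64=%d%n", h32, h64);
      }
    }
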
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/MurmurHash3.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/MurmurHash3.java b/math/src/main/java/org/apache/mahout/math/MurmurHash3.java
deleted file mode 100644
index bd0bb6b..0000000
--- a/math/src/main/java/org/apache/mahout/math/MurmurHash3.java
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * This code is public domain.
- *
- * The MurmurHash3 algorithm was created by Austin Appleby and put into the public domain.
- * See http://code.google.com/p/smhasher/
- *
- * This java port was authored by
- * Yonik Seeley and was placed into the public domain per
- * https://github.com/yonik/java_util/blob/master/src/util/hash/MurmurHash3.java.
- */
-
-package org.apache.mahout.math;
-
-/**
- * <p>
- * This produces exactly the same hash values as the final C++
- * version of MurmurHash3 and is thus suitable for producing the same hash values across
- * platforms.
- * <p>
- * The 32 bit x86 version of this hash should be the fastest variant for relatively short keys like ids.
- * <p>
- * Note - The x86 and x64 versions do _not_ produce the same results, as the
- * algorithms are optimized for their respective platforms.
- * <p>
- * See also http://github.com/yonik/java_util for future updates to this file.
- */
-public final class MurmurHash3 {
-
- private MurmurHash3() {}
-
- /** Returns the MurmurHash3_x86_32 hash. */
- public static int murmurhash3x8632(byte[] data, int offset, int len, int seed) {
-
- int c1 = 0xcc9e2d51;
- int c2 = 0x1b873593;
-
- int h1 = seed;
- int roundedEnd = offset + (len & 0xfffffffc); // round down to 4 byte block
-
- for (int i = offset; i < roundedEnd; i += 4) {
- // little endian load order
- int k1 = (data[i] & 0xff) | ((data[i + 1] & 0xff) << 8) | ((data[i + 2] & 0xff) << 16) | (data[i + 3] << 24);
- k1 *= c1;
- k1 = (k1 << 15) | (k1 >>> 17); // ROTL32(k1,15);
- k1 *= c2;
-
- h1 ^= k1;
- h1 = (h1 << 13) | (h1 >>> 19); // ROTL32(h1,13);
- h1 = h1 * 5 + 0xe6546b64;
- }
-
- // tail
- int k1 = 0;
-
- switch(len & 0x03) {
- case 3:
- k1 = (data[roundedEnd + 2] & 0xff) << 16;
- // fallthrough
- case 2:
- k1 |= (data[roundedEnd + 1] & 0xff) << 8;
- // fallthrough
- case 1:
- k1 |= data[roundedEnd] & 0xff;
- k1 *= c1;
- k1 = (k1 << 15) | (k1 >>> 17); // ROTL32(k1,15);
- k1 *= c2;
- h1 ^= k1;
- default:
- }
-
- // finalization
- h1 ^= len;
-
- // fmix(h1);
- h1 ^= h1 >>> 16;
- h1 *= 0x85ebca6b;
- h1 ^= h1 >>> 13;
- h1 *= 0xc2b2ae35;
- h1 ^= h1 >>> 16;
-
- return h1;
- }
-
-}

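And a minimal sketch for MurmurHash3 (the seed is arbitrary; offset and length let you hash a slice of an array without copying):

    import java.nio.charset.StandardCharsets;

    import org.apache.mahout.math.MurmurHash3;

    public class MurmurHash3Demo {
      public static void main(String[] args) {
        byte[] data = "hello".getBytes(StandardCharsets.UTF_8);
        int h = MurmurHash3.murmurhash3x8632(data, 0, data.length, 104729);
        System.out.println(h); // same value on any platform, per the javadoc above
      }
    }
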
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/NamedVector.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/NamedVector.java b/math/src/main/java/org/apache/mahout/math/NamedVector.java
deleted file mode 100644
index d4fa609..0000000
--- a/math/src/main/java/org/apache/mahout/math/NamedVector.java
+++ /dev/null
@@ -1,328 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math;
-
-import org.apache.mahout.math.function.DoubleDoubleFunction;
-import org.apache.mahout.math.function.DoubleFunction;
-
-public class NamedVector implements Vector {
-
- private Vector delegate;
- private String name;
-
- public NamedVector() {
- }
-
- public NamedVector(NamedVector other) {
- this.delegate = other.getDelegate();
- this.name = other.getName();
- }
-
- public NamedVector(Vector delegate, String name) {
- if (delegate == null || name == null) {
- throw new IllegalArgumentException();
- }
- this.delegate = delegate;
- this.name = name;
- }
-
- public String getName() {
- return name;
- }
-
- public Vector getDelegate() {
- return delegate;
- }
-
- @Override
- public int hashCode() {
- return delegate.hashCode();
- }
-
- /**
- * To avoid breaking transitivity of equals with other {@link Vector}s, this does not compare the name.
- */
- @SuppressWarnings("EqualsWhichDoesntCheckParameterClass")
- @Override
- public boolean equals(Object other) {
- return delegate.equals(other);
- }
-
- @SuppressWarnings("CloneDoesntCallSuperClone")
- @Override
- public NamedVector clone() {
- return new NamedVector(delegate.clone(), name);
- }
-
- @Override
- public Iterable<Element> all() {
- return delegate.all();
- }
-
- @Override
- public Iterable<Element> nonZeroes() {
- return delegate.nonZeroes();
- }
-
- @Override
- public String asFormatString() {
- return toString();
- }
-
- @Override
- public String toString() {
- StringBuilder bldr = new StringBuilder();
- bldr.append(name).append(':').append(delegate.toString());
- return bldr.toString();
- }
-
- @Override
- public Vector assign(double value) {
- return delegate.assign(value);
- }
-
- @Override
- public Vector assign(double[] values) {
- return delegate.assign(values);
- }
-
- @Override
- public Vector assign(Vector other) {
- return delegate.assign(other);
- }
-
- @Override
- public Vector assign(DoubleFunction function) {
- return delegate.assign(function);
- }
-
- @Override
- public Vector assign(Vector other, DoubleDoubleFunction function) {
- return delegate.assign(other, function);
- }
-
- @Override
- public Vector assign(DoubleDoubleFunction f, double y) {
- return delegate.assign(f, y);
- }
-
- @Override
- public int size() {
- return delegate.size();
- }
-
- @Override
- public boolean isDense() {
- return delegate.isDense();
- }
-
- @Override
- public boolean isSequentialAccess() {
- return delegate.isSequentialAccess();
- }
-
- @Override
- public Element getElement(int index) {
- return delegate.getElement(index);
- }
-
- /**
- * Merge a set of (index, value) pairs into the vector.
- *
- * @param updates an ordered mapping of indices to values to be merged in.
- */
- @Override
- public void mergeUpdates(OrderedIntDoubleMapping updates) {
- delegate.mergeUpdates(updates);
- }
-
- @Override
- public Vector divide(double x) {
- return delegate.divide(x);
- }
-
- @Override
- public double dot(Vector x) {
- return delegate.dot(x);
- }
-
- @Override
- public double get(int index) {
- return delegate.get(index);
- }
-
- @Override
- public double getQuick(int index) {
- return delegate.getQuick(index);
- }
-
- @Override
- public NamedVector like() {
- return new NamedVector(delegate.like(), name);
- }
-
- @Override
- public Vector like(int cardinality) {
- return new NamedVector(delegate.like(cardinality), name);
- }
-
- @Override
- public Vector minus(Vector x) {
- return delegate.minus(x);
- }
-
- @Override
- public Vector normalize() {
- return delegate.normalize();
- }
-
- @Override
- public Vector normalize(double power) {
- return delegate.normalize(power);
- }
-
- @Override
- public Vector logNormalize() {
- return delegate.logNormalize();
- }
-
- @Override
- public Vector logNormalize(double power) {
- return delegate.logNormalize(power);
- }
-
- @Override
- public double norm(double power) {
- return delegate.norm(power);
- }
-
- @Override
- public double maxValue() {
- return delegate.maxValue();
- }
-
- @Override
- public int maxValueIndex() {
- return delegate.maxValueIndex();
- }
-
- @Override
- public double minValue() {
- return delegate.minValue();
- }
-
- @Override
- public int minValueIndex() {
- return delegate.minValueIndex();
- }
-
- @Override
- public Vector plus(double x) {
- return delegate.plus(x);
- }
-
- @Override
- public Vector plus(Vector x) {
- return delegate.plus(x);
- }
-
- @Override
- public void set(int index, double value) {
- delegate.set(index, value);
- }
-
- @Override
- public void setQuick(int index, double value) {
- delegate.setQuick(index, value);
- }
-
- @Override
- public void incrementQuick(int index, double increment) {
- delegate.incrementQuick(index, increment);
- }
-
- @Override
- public int getNumNonZeroElements() {
- return delegate.getNumNonZeroElements();
- }
-
- @Override
- public int getNumNondefaultElements() {
- return delegate.getNumNondefaultElements();
- }
-
- @Override
- public Vector times(double x) {
- return delegate.times(x);
- }
-
- @Override
- public Vector times(Vector x) {
- return delegate.times(x);
- }
-
- @Override
- public Vector viewPart(int offset, int length) {
- return delegate.viewPart(offset, length);
- }
-
- @Override
- public double zSum() {
- return delegate.zSum();
- }
-
- @Override
- public Matrix cross(Vector other) {
- return delegate.cross(other);
- }
-
- @Override
- public double aggregate(DoubleDoubleFunction aggregator, DoubleFunction map) {
- return delegate.aggregate(aggregator, map);
- }
-
- @Override
- public double aggregate(Vector other, DoubleDoubleFunction aggregator, DoubleDoubleFunction combiner) {
- return delegate.aggregate(other, aggregator, combiner);
- }
-
- @Override
- public double getLengthSquared() {
- return delegate.getLengthSquared();
- }
-
- @Override
- public double getDistanceSquared(Vector v) {
- return delegate.getDistanceSquared(v);
- }
-
- @Override
- public double getLookupCost() {
- return delegate.getLookupCost();
- }
-
- @Override
- public double getIteratorAdvanceCost() {
- return delegate.getIteratorAdvanceCost();
- }
-
- @Override
- public boolean isAddConstantTime() {
- return delegate.isAddConstantTime();
- }
-}

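A minimal sketch of NamedVector usage; everything except the name delegates to the wrapped vector:

    import org.apache.mahout.math.DenseVector;
    import org.apache.mahout.math.NamedVector;
    import org.apache.mahout.math.Vector;

    public class NamedVectorDemo {
      public static void main(String[] args) {
        Vector v = new DenseVector(new double[]{1.0, 2.0, 3.0});
        NamedVector named = new NamedVector(v, "doc-17");

        System.out.println(named.getName()); // doc-17
        System.out.println(named.norm(2.0)); // identical to v.norm(2.0)
        System.out.println(named);           // "doc-17:" followed by the delegate
      }
    }
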
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/OldQRDecomposition.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/OldQRDecomposition.java b/math/src/main/java/org/apache/mahout/math/OldQRDecomposition.java
deleted file mode 100644
index e1552e4..0000000
--- a/math/src/main/java/org/apache/mahout/math/OldQRDecomposition.java
+++ /dev/null
@@ -1,234 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- * Copyright 1999 CERN - European Organization for Nuclear Research.
- * Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
- * is hereby granted without fee, provided that the above copyright notice appear in all copies and
- * that both that copyright notice and this permission notice appear in supporting documentation.
- * CERN makes no representations about the suitability of this software for any purpose.
- * It is provided "as is" without expressed or implied warranty.
- */
-package org.apache.mahout.math;
-
-import org.apache.mahout.math.function.Functions;
-
-import java.util.Locale;
-
-
-/**
- For an <tt>m x n</tt> matrix <tt>A</tt> with <tt>m >= n</tt>, the QR decomposition is an <tt>m x n</tt>
- orthogonal matrix <tt>Q</tt> and an <tt>n x n</tt> upper triangular matrix <tt>R</tt> so that
- <tt>A = Q*R</tt>.
- <P>
- The QR decomposition always exists, even if the matrix does not have
- full rank, so the constructor will never fail. The primary use of the
- QR decomposition is in the least squares solution of nonsquare systems
- of simultaneous linear equations. This will fail if <tt>isFullRank()</tt>
- returns <tt>false</tt>.
- */
-
-/** partially deprecated until unit tests are in place. Until this time, this class/interface is unsupported. */
-public class OldQRDecomposition implements QR {
-
- /** Array for internal storage of decomposition. */
- private final Matrix qr;
-
- /** Row and column dimensions. */
- private final int originalRows;
- private final int originalColumns;
-
- /** Array for internal storage of diagonal of R. */
- private final Vector rDiag;
-
- /**
- * Constructs and returns a new QR decomposition object; computed by Householder reflections; The decomposed matrices
- * can be retrieved via instance methods of the returned decomposition object.
- *
- * @param a A rectangular matrix.
- * @throws IllegalArgumentException if {@code A.rows() < A.columns()}
- */
-
- public OldQRDecomposition(Matrix a) {
-
- // Initialize.
- qr = a.clone();
- originalRows = a.numRows();
- originalColumns = a.numCols();
- rDiag = new DenseVector(originalColumns);
-
- // precompute and cache some views to avoid regenerating them time and again
- Vector[] QRcolumnsPart = new Vector[originalColumns];
- for (int k = 0; k < originalColumns; k++) {
- QRcolumnsPart[k] = qr.viewColumn(k).viewPart(k, originalRows - k);
- }
-
- // Main loop.
- for (int k = 0; k < originalColumns; k++) {
- //DoubleMatrix1D QRcolk = QR.viewColumn(k).viewPart(k,m-k);
- // Compute 2-norm of k-th column without under/overflow.
- double nrm = 0;
- //if (k<m) nrm = QRcolumnsPart[k].aggregate(hypot,F.identity);
-
- for (int i = k; i < originalRows; i++) { // fixes bug reported by ***@osu.edu
- nrm = Algebra.hypot(nrm, qr.getQuick(i, k));
- }
-
-
- if (nrm != 0.0) {
- // Form k-th Householder vector.
- if (qr.getQuick(k, k) < 0) {
- nrm = -nrm;
- }
- QRcolumnsPart[k].assign(Functions.div(nrm));
- /*
- for (int i = k; i < m; i++) {
- QR[i][k] /= nrm;
- }
- */
-
- qr.setQuick(k, k, qr.getQuick(k, k) + 1);
-
- // Apply transformation to remaining columns.
- for (int j = k + 1; j < originalColumns; j++) {
- Vector QRcolj = qr.viewColumn(j).viewPart(k, originalRows - k);
- double s = QRcolumnsPart[k].dot(QRcolj);
- /*
- // fixes bug reported by John Chambers
- DoubleMatrix1D QRcolj = QR.viewColumn(j).viewPart(k,m-k);
- double s = QRcolumnsPart[k].zDotProduct(QRcolumns[j]);
- double s = 0.0;
- for (int i = k; i < m; i++) {
- s += QR[i][k]*QR[i][j];
- }
- */
- s = -s / qr.getQuick(k, k);
- //QRcolumnsPart[j].assign(QRcolumns[k], F.plusMult(s));
-
- for (int i = k; i < originalRows; i++) {
- qr.setQuick(i, j, qr.getQuick(i, j) + s * qr.getQuick(i, k));
- }
-
- }
- }
- rDiag.setQuick(k, -nrm);
- }
- }
-
- /**
- * Generates and returns the (economy-sized) orthogonal factor <tt>Q</tt>.
- *
- * @return <tt>Q</tt>
- */
- @Override
- public Matrix getQ() {
- int columns = Math.min(originalColumns, originalRows);
- Matrix q = qr.like(originalRows, columns);
- for (int k = columns - 1; k >= 0; k--) {
- Vector QRcolk = qr.viewColumn(k).viewPart(k, originalRows - k);
- q.set(k, k, 1);
- for (int j = k; j < columns; j++) {
- if (qr.get(k, k) != 0) {
- Vector Qcolj = q.viewColumn(j).viewPart(k, originalRows - k);
- double s = -QRcolk.dot(Qcolj) / qr.get(k, k);
- Qcolj.assign(QRcolk, Functions.plusMult(s));
- }
- }
- }
- return q;
- }
-
- /**
- * Returns the upper triangular factor, <tt>R</tt>.
- *
- * @return <tt>R</tt>
- */
- @Override
- public Matrix getR() {
- int rows = Math.min(originalRows, originalColumns);
- Matrix r = qr.like(rows, originalColumns);
- for (int i = 0; i < rows; i++) {
- for (int j = 0; j < originalColumns; j++) {
- if (i < j) {
- r.setQuick(i, j, qr.getQuick(i, j));
- } else if (i == j) {
- r.setQuick(i, j, rDiag.getQuick(i));
- } else {
- r.setQuick(i, j, 0);
- }
- }
- }
- return r;
- }
-
- /**
- * Returns whether the matrix <tt>A</tt> has full rank.
- *
- * @return true if <tt>R</tt>, and hence <tt>A</tt>, has full rank.
- */
- @Override
- public boolean hasFullRank() {
- for (int j = 0; j < originalColumns; j++) {
- if (rDiag.getQuick(j) == 0) {
- return false;
- }
- }
- return true;
- }
-
- /**
- * Least squares solution of <tt>A*X = B</tt>; returns <tt>X</tt>.
- *
- * @param B A matrix with as many rows as <tt>A</tt> and any number of columns.
- * @return <tt>X</tt> that minimizes the two norm of <tt>Q*R*X - B</tt>.
- * @throws IllegalArgumentException if <tt>B.rows() != A.rows()</tt>.
- */
- @Override
- public Matrix solve(Matrix B) {
- if (B.numRows() != originalRows) {
- throw new IllegalArgumentException("Matrix row dimensions must agree.");
- }
-
- int columns = B.numCols();
- Matrix x = B.like(originalColumns, columns);
-
-    // this could all be done a bit more efficiently if we didn't actually
-    // form explicit versions of Q^T and R, but this code isn't so bad
-    // and it is much easier to understand
- Matrix qt = getQ().transpose();
- Matrix y = qt.times(B);
-
- Matrix r = getR();
- for (int k = Math.min(originalColumns, originalRows) - 1; k >= 0; k--) {
- // X[k,] = Y[k,] / R[k,k], note that X[k,] starts with 0 so += is same as =
- x.viewRow(k).assign(y.viewRow(k), Functions.plusMult(1 / r.get(k, k)));
-
- // Y[0:(k-1),] -= R[0:(k-1),k] * X[k,]
- Vector rColumn = r.viewColumn(k).viewPart(0, k);
- for (int c = 0; c < columns; c++) {
- y.viewColumn(c).viewPart(0, k).assign(rColumn, Functions.plusMult(-x.get(k, c)));
- }
- }
- return x;
- }
-
- /**
- * Returns a rough string rendition of a QR.
- */
- @Override
- public String toString() {
- return String.format(Locale.ENGLISH, "QR(%d,%d,fullRank=%s)", originalColumns, originalRows, hasFullRank());
- }
-}

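A minimal sketch of solving an overdetermined least-squares system with the deleted OldQRDecomposition (the values are illustrative):

    import org.apache.mahout.math.DenseMatrix;
    import org.apache.mahout.math.Matrix;
    import org.apache.mahout.math.OldQRDecomposition;

    public class QRDemo {
      public static void main(String[] args) {
        // 3 equations, 2 unknowns: minimize ||A*X - B||_2
        Matrix a = new DenseMatrix(new double[][]{{1, 1}, {1, 2}, {1, 3}});
        Matrix b = new DenseMatrix(new double[][]{{6}, {0}, {0}});

        OldQRDecomposition qr = new OldQRDecomposition(a);
        if (qr.hasFullRank()) {
          Matrix x = qr.solve(b); // back-substitution against R, as in solve() above
          System.out.println(x);
        }
      }
    }
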
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/OrderedIntDoubleMapping.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/OrderedIntDoubleMapping.java b/math/src/main/java/org/apache/mahout/math/OrderedIntDoubleMapping.java
deleted file mode 100644
index 7c6ad11..0000000
--- a/math/src/main/java/org/apache/mahout/math/OrderedIntDoubleMapping.java
+++ /dev/null
@@ -1,265 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math;
-
-import java.io.Serializable;
-
-public final class OrderedIntDoubleMapping implements Serializable, Cloneable {
-
- static final double DEFAULT_VALUE = 0.0;
-
- private int[] indices;
- private double[] values;
- private int numMappings;
-
- // If true, doesn't allow DEFAULT_VALUEs in the mapping (adding a zero discards it). Otherwise, a DEFAULT_VALUE is
- // treated like any other value.
- private boolean noDefault = true;
-
- OrderedIntDoubleMapping(boolean noDefault) {
- this();
- this.noDefault = noDefault;
- }
-
- OrderedIntDoubleMapping() {
- // no-arg constructor for deserializer
- this(11);
- }
-
- OrderedIntDoubleMapping(int capacity) {
- indices = new int[capacity];
- values = new double[capacity];
- numMappings = 0;
- }
-
- OrderedIntDoubleMapping(int[] indices, double[] values, int numMappings) {
- this.indices = indices;
- this.values = values;
- this.numMappings = numMappings;
- }
-
- public int[] getIndices() {
- return indices;
- }
-
- public int indexAt(int offset) {
- return indices[offset];
- }
-
- public void setIndexAt(int offset, int index) {
- indices[offset] = index;
- }
-
- public double[] getValues() {
- return values;
- }
-
- public void setValueAt(int offset, double value) {
- values[offset] = value;
- }
-
-
- public int getNumMappings() {
- return numMappings;
- }
-
- private void growTo(int newCapacity) {
- if (newCapacity > indices.length) {
- int[] newIndices = new int[newCapacity];
- System.arraycopy(indices, 0, newIndices, 0, numMappings);
- indices = newIndices;
- double[] newValues = new double[newCapacity];
- System.arraycopy(values, 0, newValues, 0, numMappings);
- values = newValues;
- }
- }
-
- private int find(int index) {
- int low = 0;
- int high = numMappings - 1;
- while (low <= high) {
- int mid = low + (high - low >>> 1);
- int midVal = indices[mid];
- if (midVal < index) {
- low = mid + 1;
- } else if (midVal > index) {
- high = mid - 1;
- } else {
- return mid;
- }
- }
- return -(low + 1);
- }
-
- public double get(int index) {
- int offset = find(index);
- return offset >= 0 ? values[offset] : DEFAULT_VALUE;
- }
-
- public void set(int index, double value) {
- if (numMappings == 0 || index > indices[numMappings - 1]) {
- if (!noDefault || value != DEFAULT_VALUE) {
- if (numMappings >= indices.length) {
- growTo(Math.max((int) (1.2 * numMappings), numMappings + 1));
- }
- indices[numMappings] = index;
- values[numMappings] = value;
- ++numMappings;
- }
- } else {
- int offset = find(index);
- if (offset >= 0) {
- insertOrUpdateValueIfPresent(offset, value);
- } else {
- insertValueIfNotDefault(index, offset, value);
- }
- }
- }
-
- /**
- * Merges the updates in linear time by allocating new arrays and iterating through the existing indices and values
- * and the updates' indices and values at the same time while selecting the minimum index to set at each step.
- * @param updates another list of mappings to be merged in.
- */
- public void merge(OrderedIntDoubleMapping updates) {
- int[] updateIndices = updates.getIndices();
- double[] updateValues = updates.getValues();
-
- int newNumMappings = numMappings + updates.getNumMappings();
- int newCapacity = Math.max((int) (1.2 * newNumMappings), newNumMappings + 1);
- int[] newIndices = new int[newCapacity];
- double[] newValues = new double[newCapacity];
-
- int k = 0;
- int i = 0, j = 0;
- for (; i < numMappings && j < updates.getNumMappings(); ++k) {
- if (indices[i] < updateIndices[j]) {
- newIndices[k] = indices[i];
- newValues[k] = values[i];
- ++i;
- } else if (indices[i] > updateIndices[j]) {
- newIndices[k] = updateIndices[j];
- newValues[k] = updateValues[j];
- ++j;
- } else {
- newIndices[k] = updateIndices[j];
- newValues[k] = updateValues[j];
- ++i;
- ++j;
- }
- }
-
- for (; i < numMappings; ++i, ++k) {
- newIndices[k] = indices[i];
- newValues[k] = values[i];
- }
- for (; j < updates.getNumMappings(); ++j, ++k) {
- newIndices[k] = updateIndices[j];
- newValues[k] = updateValues[j];
- }
-
- indices = newIndices;
- values = newValues;
- numMappings = k;
- }
-
- @Override
- public int hashCode() {
- int result = 0;
- for (int i = 0; i < numMappings; i++) {
- result = 31 * result + indices[i];
- result = 31 * result + (int) Double.doubleToRawLongBits(values[i]);
- }
- return result;
- }
-
- @Override
- public boolean equals(Object o) {
- if (o instanceof OrderedIntDoubleMapping) {
- OrderedIntDoubleMapping other = (OrderedIntDoubleMapping) o;
- if (numMappings == other.numMappings) {
- for (int i = 0; i < numMappings; i++) {
- if (indices[i] != other.indices[i] || values[i] != other.values[i]) {
- return false;
- }
- }
- return true;
- }
- }
- return false;
- }
-
- @Override
- public String toString() {
- StringBuilder result = new StringBuilder(10 * numMappings);
- for (int i = 0; i < numMappings; i++) {
- result.append('(');
- result.append(indices[i]);
- result.append(',');
- result.append(values[i]);
- result.append(')');
- }
- return result.toString();
- }
-
- @SuppressWarnings("CloneDoesntCallSuperClone")
- @Override
- public OrderedIntDoubleMapping clone() {
- return new OrderedIntDoubleMapping(indices.clone(), values.clone(), numMappings);
- }
-
- public void increment(int index, double increment) {
- int offset = find(index);
- if (offset >= 0) {
- double newValue = values[offset] + increment;
- insertOrUpdateValueIfPresent(offset, newValue);
- } else {
- insertValueIfNotDefault(index, offset, increment);
- }
- }
-
- private void insertValueIfNotDefault(int index, int offset, double value) {
- if (!noDefault || value != DEFAULT_VALUE) {
- if (numMappings >= indices.length) {
- growTo(Math.max((int) (1.2 * numMappings), numMappings + 1));
- }
- int at = -offset - 1;
- if (numMappings > at) {
- for (int i = numMappings - 1, j = numMappings; i >= at; i--, j--) {
- indices[j] = indices[i];
- values[j] = values[i];
- }
- }
- indices[at] = index;
- values[at] = value;
- numMappings++;
- }
- }
-
- private void insertOrUpdateValueIfPresent(int offset, double newValue) {
- if (noDefault && newValue == DEFAULT_VALUE) {
- for (int i = offset + 1, j = offset; i < numMappings; i++, j++) {
- indices[j] = indices[i];
- values[j] = values[i];
- }
- numMappings--;
- } else {
- values[offset] = newValue;
- }
- }
-}

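A minimal sketch of the merge() semantics (the constructors are package-private, so this would have to live in org.apache.mahout.math; on overlapping indices the update wins):

    package org.apache.mahout.math;

    public class MergeDemo {
      public static void main(String[] args) {
        OrderedIntDoubleMapping base = new OrderedIntDoubleMapping(4);
        base.set(1, 1.0);
        base.set(5, 5.0);

        OrderedIntDoubleMapping updates = new OrderedIntDoubleMapping(4);
        updates.set(3, 3.0);
        updates.set(5, -5.0); // index 5 exists in both; the update's value wins

        base.merge(updates);  // one linear pass over the two sorted index lists
        System.out.println(base); // (1,1.0)(3,3.0)(5,-5.0)
      }
    }
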
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/OrthonormalityVerifier.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/OrthonormalityVerifier.java b/math/src/main/java/org/apache/mahout/math/OrthonormalityVerifier.java
deleted file mode 100644
index e8dd2b1..0000000
--- a/math/src/main/java/org/apache/mahout/math/OrthonormalityVerifier.java
+++ /dev/null
@@ -1,46 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math;
-
-import com.google.common.collect.Lists;
-
-import java.util.List;
-
-public final class OrthonormalityVerifier {
-
- private OrthonormalityVerifier() {
- }
-
- public static VectorIterable pairwiseInnerProducts(Iterable<MatrixSlice> basis) {
- DenseMatrix out = null;
- for (MatrixSlice slice1 : basis) {
- List<Double> dots = Lists.newArrayList();
- for (MatrixSlice slice2 : basis) {
- dots.add(slice1.vector().dot(slice2.vector()));
- }
- if (out == null) {
- out = new DenseMatrix(dots.size(), dots.size());
- }
- for (int i = 0; i < dots.size(); i++) {
- out.set(slice1.index(), i, dots.get(i));
- }
- }
- return out;
- }
-
-}

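A minimal sketch of pairwiseInnerProducts(); a Matrix iterates its rows as MatrixSlices, so it can be passed directly (assuming Mahout's VectorIterable contract):

    import org.apache.mahout.math.DenseMatrix;
    import org.apache.mahout.math.Matrix;
    import org.apache.mahout.math.OrthonormalityVerifier;
    import org.apache.mahout.math.VectorIterable;

    public class OrthoDemo {
      public static void main(String[] args) {
        // Rows of the identity are trivially orthonormal.
        Matrix basis = new DenseMatrix(new double[][]{{1, 0}, {0, 1}});

        VectorIterable gram = OrthonormalityVerifier.pairwiseInnerProducts(basis);
        // For an orthonormal basis this Gram matrix is (approximately) the identity.
        System.out.println(gram);
      }
    }
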
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/PermutedVectorView.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/PermutedVectorView.java b/math/src/main/java/org/apache/mahout/math/PermutedVectorView.java
deleted file mode 100644
index e46f326..0000000
--- a/math/src/main/java/org/apache/mahout/math/PermutedVectorView.java
+++ /dev/null
@@ -1,250 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math;
-
-import java.util.Iterator;
-
-import com.google.common.collect.AbstractIterator;
-
-/**
- * Provides a permuted view of a vector.
- */
-public class PermutedVectorView extends AbstractVector {
- private final Vector vector; // the vector containing the data
- private final int[] pivot; // convert from external index to internal
- private final int[] unpivot; // convert from internal index to external
-
- public PermutedVectorView(Vector vector, int[] pivot, int[] unpivot) {
- super(vector.size());
- this.vector = vector;
- this.pivot = pivot;
- this.unpivot = unpivot;
- }
-
- public PermutedVectorView(Vector vector, int[] pivot) {
- this(vector, pivot, reversePivotPermutation(pivot));
- }
-
- private static int[] reversePivotPermutation(int[] pivot) {
- int[] unpivot1 = new int[pivot.length];
- for (int i = 0; i < pivot.length; i++) {
- unpivot1[pivot[i]] = i;
- }
- return unpivot1;
- }
-
- /**
- * Subclasses must override to return an appropriately sparse or dense result
- *
- * @param rows the row cardinality
- * @param columns the column cardinality
- * @return a Matrix
- */
- @Override
- protected Matrix matrixLike(int rows, int columns) {
- if (vector.isDense()) {
- return new DenseMatrix(rows, columns);
- } else {
- return new SparseRowMatrix(rows, columns);
- }
- }
-
- /**
- * Used internally by assign() to update multiple indices and values at once.
- * Only really useful for sparse vectors (especially SequentialAccessSparseVector).
- * <p>
- * If someone ever adds a new type of sparse vectors, this method must merge (index, value) pairs into the vector.
- *
- * @param updates a mapping of indices to values to merge in the vector.
- */
- @Override
- public void mergeUpdates(OrderedIntDoubleMapping updates) {
- for (int i = 0; i < updates.getNumMappings(); ++i) {
- updates.setIndexAt(i, pivot[updates.indexAt(i)]);
- }
- vector.mergeUpdates(updates);
- }
-
- /**
- * @return true iff this implementation should be considered dense -- that it explicitly
- * represents every value
- */
- @Override
- public boolean isDense() {
- return vector.isDense();
- }
-
- /**
- * If the view is permuted, the elements cannot be accessed in the same order.
- *
- * @return true iff this implementation should be considered to be iterable in index order in an
- * efficient way. In particular this implies that {@link #iterator()} and {@link
- * #iterateNonZero()} return elements in ascending order by index.
- */
- @Override
- public boolean isSequentialAccess() {
- return false;
- }
-
- /**
- * Iterates over all elements <p> NOTE: Implementations may choose to reuse the Element
- * returned for performance reasons, so if you need a copy of it, you should call {@link
- * #getElement(int)} for the given index
- *
- * @return An {@link java.util.Iterator} over all elements
- */
- @Override
- public Iterator<Element> iterator() {
- return new AbstractIterator<Element>() {
- private final Iterator<Element> i = vector.all().iterator();
-
- @Override
- protected Vector.Element computeNext() {
- if (i.hasNext()) {
- final Element x = i.next();
- return new Element() {
- private final int index = unpivot[x.index()];
-
- @Override
- public double get() {
- return x.get();
- }
-
- @Override
- public int index() {
- return index;
- }
-
- @Override
- public void set(double value) {
- x.set(value);
- }
- };
- } else {
- return endOfData();
- }
- }
- };
- }
-
- /**
- * Iterates over all non-zero elements. <p> NOTE: Implementations may choose to reuse the Element
- * returned for performance reasons, so if you need a copy of it, you should call {@link
- * #getElement(int)} for the given index
- *
- * @return An {@link java.util.Iterator} over all non-zero elements
- */
- @Override
- public Iterator<Element> iterateNonZero() {
- return new AbstractIterator<Element>() {
- private final Iterator<Element> i = vector.nonZeroes().iterator();
-
- @Override
- protected Vector.Element computeNext() {
- if (i.hasNext()) {
- final Element x = i.next();
- return new Element() {
- private final int index = unpivot[x.index()];
-
- @Override
- public double get() {
- return x.get();
- }
-
- @Override
- public int index() {
- return index;
- }
-
- @Override
- public void set(double value) {
- x.set(value);
- }
- };
- } else {
- return endOfData();
- }
- }
- };
- }
-
- /**
- * Return the value at the given index, without checking bounds
- *
- * @param index an int index
- * @return the double at the index
- */
- @Override
- public double getQuick(int index) {
- return vector.getQuick(pivot[index]);
- }
-
- /**
- * Return an empty vector of the same underlying class as the receiver
- *
- * @return a Vector
- */
- @Override
- public Vector like() {
- return vector.like();
- }
-
- @Override
- public Vector like(int cardinality) {
- return vector.like(cardinality);
- }
-
- /**
- * Set the value at the given index, without checking bounds
- *
- * @param index an int index into the receiver
- * @param value a double value to set
- */
- @Override
- public void setQuick(int index, double value) {
- vector.setQuick(pivot[index], value);
- }
-
- /** Return the number of values in the recipient */
- @Override
- public int getNumNondefaultElements() {
- return vector.getNumNondefaultElements();
- }
-
- @Override
- public int getNumNonZeroElements() {
-    // Return the nonzero count of the backing vector so we don't have to walk this view's iterator.
- return vector.getNumNonZeroElements();
- }
-
- @Override
- public double getLookupCost() {
- return vector.getLookupCost();
- }
-
- @Override
- public double getIteratorAdvanceCost() {
- return vector.getIteratorAdvanceCost();
- }
-
- @Override
- public boolean isAddConstantTime() {
- return vector.isAddConstantTime();
- }
-}

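A minimal sketch of PermutedVectorView; external index i reads and writes internal index pivot[i]:

    import org.apache.mahout.math.DenseVector;
    import org.apache.mahout.math.PermutedVectorView;
    import org.apache.mahout.math.Vector;

    public class PermuteDemo {
      public static void main(String[] args) {
        Vector v = new DenseVector(new double[]{10, 20, 30});
        int[] pivot = {2, 0, 1};

        Vector view = new PermutedVectorView(v, pivot);
        System.out.println(view.get(0)); // 30.0, i.e. v.get(pivot[0])

        view.set(1, 99.0);               // writes v.set(pivot[1], 99.0)
        System.out.println(v.get(0));    // 99.0
      }
    }
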
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math/src/main/java/org/apache/mahout/math/PersistentObject.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/PersistentObject.java b/math/src/main/java/org/apache/mahout/math/PersistentObject.java
deleted file mode 100644
index f1d4293..0000000
--- a/math/src/main/java/org/apache/mahout/math/PersistentObject.java
+++ /dev/null
@@ -1,58 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-/*
-Copyright 1999 CERN - European Organization for Nuclear Research.
-Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
-is hereby granted without fee, provided that the above copyright notice appear in all copies and
-that both that copyright notice and this permission notice appear in supporting documentation.
-CERN makes no representations about the suitability of this software for any purpose.
-It is provided "as is" without expressed or implied warranty.
-*/
-package org.apache.mahout.math;
-
-/**
- * This empty class is the common root for all persistent capable classes.
- * If this class inherits from <tt>java.lang.Object</tt> then all subclasses are serializable with
- * the standard Java serialization mechanism.
- * If this class inherits from <tt>com.objy.db.app.ooObj</tt> then all subclasses are
- * <i>additionally</i> serializable with the Objectivity ODBMS persistence mechanism.
- * Thus, by modifying the inheritance of this class the entire tree of subclasses can
- * be switched to Objectivity compatibility (and back) with minimum effort.
- */
-public abstract class PersistentObject implements java.io.Serializable, Cloneable {
-
- /** Not yet commented. */
- protected PersistentObject() {
- }
-
- /**
- * Returns a copy of the receiver. This default implementation does nothing except make the otherwise
- * <tt>protected</tt> clone method <tt>public</tt>.
- *
- * @return a copy of the receiver.
- */
- @Override
- public Object clone() {
- try {
- return super.clone();
- } catch (CloneNotSupportedException exc) {
- throw new InternalError(); //should never happen since we are cloneable
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/SQL92JDBCInMemoryItemSimilarity.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/SQL92JDBCInMemoryItemSimilarity.java b/integration/src/main/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/SQL92JDBCInMemoryItemSimilarity.java
deleted file mode 100644
index b311a5e..0000000
--- a/integration/src/main/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/SQL92JDBCInMemoryItemSimilarity.java
+++ /dev/null
@@ -1,51 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.impl.similarity.jdbc;
-
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.impl.common.jdbc.AbstractJDBCComponent;
-
-import javax.sql.DataSource;
-
-public class SQL92JDBCInMemoryItemSimilarity extends AbstractJDBCInMemoryItemSimilarity {
-
- static final String DEFAULT_GET_ALL_ITEMSIMILARITIES_SQL =
- "SELECT " + AbstractJDBCItemSimilarity.DEFAULT_ITEM_A_ID_COLUMN + ", "
- + AbstractJDBCItemSimilarity.DEFAULT_ITEM_B_ID_COLUMN + ", "
- + AbstractJDBCItemSimilarity.DEFAULT_SIMILARITY_COLUMN + " FROM "
- + AbstractJDBCItemSimilarity.DEFAULT_SIMILARITY_TABLE;
-
-
- public SQL92JDBCInMemoryItemSimilarity() throws TasteException {
- this(AbstractJDBCComponent.lookupDataSource(AbstractJDBCComponent.DEFAULT_DATASOURCE_NAME),
- DEFAULT_GET_ALL_ITEMSIMILARITIES_SQL);
- }
-
- public SQL92JDBCInMemoryItemSimilarity(String dataSourceName) throws TasteException {
- this(AbstractJDBCComponent.lookupDataSource(dataSourceName), DEFAULT_GET_ALL_ITEMSIMILARITIES_SQL);
- }
-
- public SQL92JDBCInMemoryItemSimilarity(DataSource dataSource) {
- this(dataSource, DEFAULT_GET_ALL_ITEMSIMILARITIES_SQL);
- }
-
- public SQL92JDBCInMemoryItemSimilarity(DataSource dataSource, String getAllItemSimilaritiesSQL) {
- super(dataSource, getAllItemSimilaritiesSQL);
- }
-
-}

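A minimal usage sketch (the DataSource factory is hypothetical; wire in whatever JDBC pool you use). The in-memory variant runs the SELECT above once and caches every (itemA, itemB, similarity) row:

    import javax.sql.DataSource;

    import org.apache.mahout.cf.taste.common.TasteException;
    import org.apache.mahout.cf.taste.impl.similarity.jdbc.SQL92JDBCInMemoryItemSimilarity;

    public class InMemorySimilarityDemo {
      public static void main(String[] args) throws TasteException {
        DataSource ds = obtainDataSource(); // hypothetical: your pooled JDBC DataSource
        SQL92JDBCInMemoryItemSimilarity similarity = new SQL92JDBCInMemoryItemSimilarity(ds);
        System.out.println(similarity.itemSimilarity(123L, 456L));
      }

      private static DataSource obtainDataSource() {
        throw new UnsupportedOperationException("wire in your JDBC pool here");
      }
    }
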
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/SQL92JDBCItemSimilarity.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/SQL92JDBCItemSimilarity.java b/integration/src/main/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/SQL92JDBCItemSimilarity.java
deleted file mode 100644
index f449561..0000000
--- a/integration/src/main/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/SQL92JDBCItemSimilarity.java
+++ /dev/null
@@ -1,57 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.impl.similarity.jdbc;
-
-import org.apache.mahout.cf.taste.common.TasteException;
-
-import javax.sql.DataSource;
-
-public class SQL92JDBCItemSimilarity extends AbstractJDBCItemSimilarity {
-
- public SQL92JDBCItemSimilarity() throws TasteException {
- this(DEFAULT_DATASOURCE_NAME);
- }
-
- public SQL92JDBCItemSimilarity(String dataSourceName) throws TasteException {
- this(lookupDataSource(dataSourceName));
- }
-
- public SQL92JDBCItemSimilarity(DataSource dataSource) {
- this(dataSource,
- DEFAULT_SIMILARITY_TABLE,
- DEFAULT_ITEM_A_ID_COLUMN,
- DEFAULT_ITEM_B_ID_COLUMN,
- DEFAULT_SIMILARITY_COLUMN);
- }
-
- public SQL92JDBCItemSimilarity(DataSource dataSource,
- String similarityTable,
- String itemAIDColumn,
- String itemBIDColumn,
- String similarityColumn) {
- super(dataSource,
- similarityTable,
- itemAIDColumn,
- itemBIDColumn, similarityColumn,
- "SELECT " + similarityColumn + " FROM " + similarityTable + " WHERE "
- + itemAIDColumn + "=? AND " + itemBIDColumn + "=?",
- "SELECT " + itemAIDColumn + ", " + itemBIDColumn + " FROM " + similarityTable + " WHERE "
- + itemAIDColumn + "=? OR " + itemBIDColumn + "=?");
- }
-
-}

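By contrast, this variant issues the per-pair SELECT above on every lookup. A minimal sketch (again with a hypothetical DataSource factory):

    import javax.sql.DataSource;

    import org.apache.mahout.cf.taste.common.TasteException;
    import org.apache.mahout.cf.taste.impl.similarity.jdbc.SQL92JDBCItemSimilarity;

    public class SimilarityDemo {
      public static void main(String[] args) throws TasteException {
        DataSource ds = obtainDataSource(); // hypothetical: your pooled JDBC DataSource
        // Uses the default table and column names from AbstractJDBCItemSimilarity.
        SQL92JDBCItemSimilarity similarity = new SQL92JDBCItemSimilarity(ds);
        System.out.println(similarity.itemSimilarity(123L, 456L));
      }

      private static DataSource obtainDataSource() {
        throw new UnsupportedOperationException("wire in your JDBC pool here");
      }
    }
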
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/cf/taste/web/RecommenderServlet.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/cf/taste/web/RecommenderServlet.java b/integration/src/main/java/org/apache/mahout/cf/taste/web/RecommenderServlet.java
deleted file mode 100644
index a5a89c6..0000000
--- a/integration/src/main/java/org/apache/mahout/cf/taste/web/RecommenderServlet.java
+++ /dev/null
@@ -1,215 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.web;
-
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.apache.mahout.cf.taste.model.Preference;
-import org.apache.mahout.cf.taste.model.PreferenceArray;
-import org.apache.mahout.cf.taste.recommender.RecommendedItem;
-import org.apache.mahout.cf.taste.recommender.Recommender;
-
-import javax.servlet.ServletConfig;
-import javax.servlet.ServletException;
-import javax.servlet.http.HttpServlet;
-import javax.servlet.http.HttpServletRequest;
-import javax.servlet.http.HttpServletResponse;
-import java.io.IOException;
-import java.io.PrintWriter;
-import java.util.List;
-
-/**
- * <p>A servlet which returns recommendations, as its name implies. The servlet accepts GET and POST
- * HTTP requests, and looks for the following parameters:</p>
- *
- * <ul>
- * <li><em>userID</em>: the user ID for which to produce recommendations</li>
- * <li><em>howMany</em>: the number of recommendations to produce</li>
- * <li><em>debug</em>: (optional) output a lot of information that is useful in debugging.
- * Defaults to false, of course.</li>
- * <li><em>format</em>: (optional) the response format: "text" (the default), "xml", or "json"</li>
- * </ul>
- *
- * <p>The response is text, and contains a list of the IDs of recommended items, in descending
- * order of relevance, one per line.</p>
- *
- * <p>For example, you can get 10 recommendations for user 123 from the following URL (assuming
- * you are running taste in a web application running locally on port 8080):<br/>
- * {@code http://localhost:8080/taste/RecommenderServlet?userID=123&howMany=10}</p>
- *
- * <p>This servlet requires one {@code init-param} in {@code web.xml}: it must find
- * a parameter named "recommender-class" which is the name of a class that implements
- * {@link Recommender} and has a no-arg constructor. The servlet will instantiate and use
- * this {@link Recommender} to produce recommendations.</p>
- */
-public final class RecommenderServlet extends HttpServlet {
-
- private static final int NUM_TOP_PREFERENCES = 20;
- private static final int DEFAULT_HOW_MANY = 20;
-
- private Recommender recommender;
-
- @Override
- public void init(ServletConfig config) throws ServletException {
- super.init(config);
- String recommenderClassName = config.getInitParameter("recommender-class");
- if (recommenderClassName == null) {
- throw new ServletException("Servlet init-param \"recommender-class\" is not defined");
- }
- RecommenderSingleton.initializeIfNeeded(recommenderClassName);
- recommender = RecommenderSingleton.getInstance().getRecommender();
- }
-
- @Override
- public void doGet(HttpServletRequest request,
- HttpServletResponse response) throws ServletException {
-
- String userIDString = request.getParameter("userID");
- if (userIDString == null) {
- throw new ServletException("userID was not specified");
- }
- long userID = Long.parseLong(userIDString);
- String howManyString = request.getParameter("howMany");
- int howMany = howManyString == null ? DEFAULT_HOW_MANY : Integer.parseInt(howManyString);
- boolean debug = Boolean.parseBoolean(request.getParameter("debug"));
- String format = request.getParameter("format");
- if (format == null) {
- format = "text";
- }
-
- try {
- List<RecommendedItem> items = recommender.recommend(userID, howMany);
- if ("text".equals(format)) {
- writePlainText(response, userID, debug, items);
- } else if ("xml".equals(format)) {
- writeXML(response, items);
- } else if ("json".equals(format)) {
- writeJSON(response, items);
- } else {
- throw new ServletException("Bad format parameter: " + format);
- }
- } catch (TasteException | IOException te) {
- throw new ServletException(te);
- }
-
- }
-
- private static void writeXML(HttpServletResponse response, Iterable<RecommendedItem> items) throws IOException {
- response.setContentType("application/xml");
- response.setCharacterEncoding("UTF-8");
- response.setHeader("Cache-Control", "no-cache");
- PrintWriter writer = response.getWriter();
- writer.print("<?xml version=\"1.0\" encoding=\"UTF-8\"?><recommendedItems>");
- for (RecommendedItem recommendedItem : items) {
- writer.print("<item><value>");
- writer.print(recommendedItem.getValue());
- writer.print("</value><id>");
- writer.print(recommendedItem.getItemID());
- writer.print("</id></item>");
- }
- writer.println("</recommendedItems>");
- }
-
- private static void writeJSON(HttpServletResponse response, Iterable<RecommendedItem> items) throws IOException {
- response.setContentType("application/json");
- response.setCharacterEncoding("UTF-8");
- response.setHeader("Cache-Control", "no-cache");
- PrintWriter writer = response.getWriter();
- writer.print("{\"recommendedItems\":{\"item\":[");
- boolean first = true;
- for (RecommendedItem recommendedItem : items) {
- if (first) {
- first = false;
- } else {
- writer.print(',');
- }
- writer.print("{\"value\":\"");
- writer.print(recommendedItem.getValue());
- writer.print("\",\"id\":\"");
- writer.print(recommendedItem.getItemID());
- writer.print("\"}");
- }
- writer.println("]}}");
- }
-
- private void writePlainText(HttpServletResponse response,
- long userID,
- boolean debug,
- Iterable<RecommendedItem> items) throws IOException, TasteException {
- response.setContentType("text/plain");
- response.setCharacterEncoding("UTF-8");
- response.setHeader("Cache-Control", "no-cache");
- PrintWriter writer = response.getWriter();
- if (debug) {
- writeDebugRecommendations(userID, items, writer);
- } else {
- writeRecommendations(items, writer);
- }
- }
-
- private static void writeRecommendations(Iterable<RecommendedItem> items, PrintWriter writer) {
- for (RecommendedItem recommendedItem : items) {
- writer.print(recommendedItem.getValue());
- writer.print('\t');
- writer.println(recommendedItem.getItemID());
- }
- }
-
- private void writeDebugRecommendations(long userID, Iterable<RecommendedItem> items, PrintWriter writer)
- throws TasteException {
- DataModel dataModel = recommender.getDataModel();
- writer.print("User:");
- writer.println(userID);
- writer.print("Recommender: ");
- writer.println(recommender);
- writer.println();
- writer.print("Top ");
- writer.print(NUM_TOP_PREFERENCES);
- writer.println(" Preferences:");
- PreferenceArray rawPrefs = dataModel.getPreferencesFromUser(userID);
- int length = rawPrefs.length();
- PreferenceArray sortedPrefs = rawPrefs.clone();
- sortedPrefs.sortByValueReversed();
- // Cap this at NUM_TOP_PREFERENCES just to be brief
- int max = Math.min(NUM_TOP_PREFERENCES, length);
- for (int i = 0; i < max; i++) {
- Preference pref = sortedPrefs.get(i);
- writer.print(pref.getValue());
- writer.print('\t');
- writer.println(pref.getItemID());
- }
- writer.println();
- writer.println("Recommendations:");
- for (RecommendedItem recommendedItem : items) {
- writer.print(recommendedItem.getValue());
- writer.print('\t');
- writer.println(recommendedItem.getItemID());
- }
- }
-
- @Override
- public void doPost(HttpServletRequest request,
- HttpServletResponse response) throws ServletException {
- doGet(request, response);
- }
-
- @Override
- public String toString() {
- return "RecommenderServlet[recommender:" + recommender + ']';
- }
-
-}

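Since the deleted servlet's default plain-text response is one "value<TAB>itemID" line per recommendation, a small client sketch shows how it was meant to be consumed; the host, port, and context path below are assumptions mirroring the Javadoc example:

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.charset.StandardCharsets;

public final class RecommenderClientSketch {
  public static void main(String[] args) throws Exception {
    // Adjust host/port/context to match the actual deployment.
    URL url = new URL("http://localhost:8080/taste/RecommenderServlet?userID=123&howMany=10");
    try (BufferedReader in = new BufferedReader(
        new InputStreamReader(url.openStream(), StandardCharsets.UTF_8))) {
      String line;
      while ((line = in.readLine()) != null) {
        String[] parts = line.split("\t"); // estimated preference value, then item ID
        System.out.println("item " + parts[1] + " scored " + parts[0]);
      }
    }
  }
}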
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/cf/taste/web/RecommenderSingleton.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/cf/taste/web/RecommenderSingleton.java b/integration/src/main/java/org/apache/mahout/cf/taste/web/RecommenderSingleton.java
deleted file mode 100644
index 265d7c0..0000000
--- a/integration/src/main/java/org/apache/mahout/cf/taste/web/RecommenderSingleton.java
+++ /dev/null
@@ -1,57 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.web;
-
-import org.apache.mahout.cf.taste.recommender.Recommender;
-import org.apache.mahout.common.ClassUtils;
-
-/**
- * <p>A singleton which holds an instance of a {@link Recommender}. This is used to share
- * a {@link Recommender} between {@link RecommenderServlet} and {@code RecommenderService.jws}.</p>
- */
-public final class RecommenderSingleton {
-
- private final Recommender recommender;
-
- private static RecommenderSingleton instance;
-
- public static synchronized RecommenderSingleton getInstance() {
- if (instance == null) {
- throw new IllegalStateException("Not initialized");
- }
- return instance;
- }
-
- public static synchronized void initializeIfNeeded(String recommenderClassName) {
- if (instance == null) {
- instance = new RecommenderSingleton(recommenderClassName);
- }
- }
-
- private RecommenderSingleton(String recommenderClassName) {
- if (recommenderClassName == null) {
- throw new IllegalArgumentException("Recommender class name is null");
- }
- recommender = ClassUtils.instantiateAs(recommenderClassName, Recommender.class);
- }
-
- public Recommender getRecommender() {
- return recommender;
- }
-
-}

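The holder above is a classic lazily initialized singleton: initializeIfNeeded is an idempotent, synchronized one-time setup, while getInstance fails fast if it is called first. The intended call order, sketched with a hypothetical implementation class name:

// Typically run once, e.g. from RecommenderServlet.init():
RecommenderSingleton.initializeIfNeeded("com.example.MyRecommender"); // hypothetical class
Recommender shared = RecommenderSingleton.getInstance().getRecommender();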
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/cf/taste/web/RecommenderWrapper.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/cf/taste/web/RecommenderWrapper.java b/integration/src/main/java/org/apache/mahout/cf/taste/web/RecommenderWrapper.java
deleted file mode 100644
index e927098..0000000
--- a/integration/src/main/java/org/apache/mahout/cf/taste/web/RecommenderWrapper.java
+++ /dev/null
@@ -1,126 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.web;
-
-import com.google.common.io.Files;
-import com.google.common.io.InputSupplier;
-import com.google.common.io.Resources;
-import org.apache.mahout.cf.taste.common.Refreshable;
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.apache.mahout.cf.taste.recommender.IDRescorer;
-import org.apache.mahout.cf.taste.recommender.RecommendedItem;
-import org.apache.mahout.cf.taste.recommender.Recommender;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.File;
-import java.io.IOException;
-import java.io.InputStream;
-import java.net.URL;
-import java.util.Collection;
-import java.util.List;
-
-/**
- * Users of the packaging and deployment mechanism in this module need
- * to produce a {@link Recommender} implementation with a no-arg constructor,
- * which will internally build the desired {@link Recommender} and delegate
- * to it. This wrapper simplifies that process. Simply extend this class and
- * implement {@link #buildRecommender()}.
- */
-public abstract class RecommenderWrapper implements Recommender {
-
- private static final Logger log = LoggerFactory.getLogger(RecommenderWrapper.class);
-
- private final Recommender delegate;
-
- protected RecommenderWrapper() throws TasteException, IOException {
- this.delegate = buildRecommender();
- }
-
- /**
- * @return the {@link Recommender} which should be used to produce recommendations
- * by this wrapper implementation
- */
- protected abstract Recommender buildRecommender() throws IOException, TasteException;
-
- @Override
- public List<RecommendedItem> recommend(long userID, int howMany) throws TasteException {
- return delegate.recommend(userID, howMany);
- }
-
- @Override
- public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer) throws TasteException {
- return delegate.recommend(userID, howMany, rescorer);
- }
-
- @Override
- public float estimatePreference(long userID, long itemID) throws TasteException {
- return delegate.estimatePreference(userID, itemID);
- }
-
- @Override
- public void setPreference(long userID, long itemID, float value) throws TasteException {
- delegate.setPreference(userID, itemID, value);
- }
-
- @Override
- public void removePreference(long userID, long itemID) throws TasteException {
- delegate.removePreference(userID, itemID);
- }
-
- @Override
- public DataModel getDataModel() {
- return delegate.getDataModel();
- }
-
- @Override
- public void refresh(Collection<Refreshable> alreadyRefreshed) {
- delegate.refresh(alreadyRefreshed);
- }
-
- /**
- * Reads the given resource into a temporary file. This is intended to be used
- * to read data files which are stored as a resource available on the classpath,
- * such as in a JAR file. However for convenience the resource name will also
- * be interpreted as a relative path to a local file, if no such resource is
- * found. This facilitates testing.
- *
- * @param resourceName name of resource in classpath, or relative path to file
- * @return temporary {@link File} with resource data
- * @throws IOException if an error occurs while reading or writing data
- */
- public static File readResourceToTempFile(String resourceName) throws IOException {
- String absoluteResource = resourceName.startsWith("/") ? resourceName : '/' + resourceName;
- log.info("Loading resource {}", absoluteResource);
- InputSupplier<? extends InputStream> inSupplier;
- try {
- URL resourceURL = Resources.getResource(RecommenderWrapper.class, absoluteResource);
- inSupplier = Resources.newInputStreamSupplier(resourceURL);
- } catch (IllegalArgumentException iae) {
- File resourceFile = new File(resourceName);
- log.info("Falling back to load file {}", resourceFile.getAbsolutePath());
- inSupplier = Files.newInputStreamSupplier(resourceFile);
- }
- File tempFile = File.createTempFile("taste", null);
- tempFile.deleteOnExit();
- Files.copy(inSupplier, tempFile);
- return tempFile;
- }
-
-}
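A sketch of the subclass the Javadoc above asks for, wiring buildRecommender() to a Taste FileDataModel; the resource name "ratings.csv" and the choice of an item-based recommender are illustrative assumptions, not part of the original code:

import java.io.File;
import java.io.IOException;
import org.apache.mahout.cf.taste.common.TasteException;
import org.apache.mahout.cf.taste.impl.model.file.FileDataModel;
import org.apache.mahout.cf.taste.impl.recommender.GenericItemBasedRecommender;
import org.apache.mahout.cf.taste.impl.similarity.PearsonCorrelationSimilarity;
import org.apache.mahout.cf.taste.model.DataModel;
import org.apache.mahout.cf.taste.recommender.Recommender;
import org.apache.mahout.cf.taste.web.RecommenderWrapper;

public final class MyRecommender extends RecommenderWrapper {

  public MyRecommender() throws TasteException, IOException {
    // No-arg constructor, as required by the servlet's reflective instantiation.
  }

  @Override
  protected Recommender buildRecommender() throws IOException, TasteException {
    File data = readResourceToTempFile("ratings.csv"); // placeholder resource name
    DataModel model = new FileDataModel(data);
    return new GenericItemBasedRecommender(model, new PearsonCorrelationSimilarity(model));
  }
}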

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/classifier/ConfusionMatrixDumper.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/classifier/ConfusionMatrixDumper.java b/integration/src/main/java/org/apache/mahout/classifier/ConfusionMatrixDumper.java
deleted file mode 100644
index 03a3000..0000000
--- a/integration/src/main/java/org/apache/mahout/classifier/ConfusionMatrixDumper.java
+++ /dev/null
@@ -1,425 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier;
-
-import com.google.common.collect.Lists;
-import org.apache.commons.io.Charsets;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.SequenceFile;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.util.ToolRunner;
-import org.apache.mahout.common.AbstractJob;
-import org.apache.mahout.common.commandline.DefaultOptionCreator;
-import org.apache.mahout.math.Matrix;
-import org.apache.mahout.math.MatrixWritable;
-
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.OutputStream;
-import java.io.PrintStream;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-
-/**
- * Export a ConfusionMatrix in various text formats: a toString() version, a grayscale HTML table, a summary HTML
- * table, and a table of counts, all with optional HTML wrappers.
- *
- * Input format: Hadoop SequenceFile with a Text key and a MatrixWritable value, exactly one pair.
- *
- * Intended to consume the ConfusionMatrix SequenceFile output by the Bayes TestClassifier class.
- */
-public final class ConfusionMatrixDumper extends AbstractJob {
-
- private static final String TAB_SEPARATOR = "|";
-
- // HTML wrapper - default CSS
- private static final String HEADER = "<html>"
- + "<head>\n"
- + "<title>TITLE</title>\n"
- + "</head>"
- + "<body>\n"
- + "<style type='text/css'> \n"
- + "table\n"
- + "{\n"
- + "border:3px solid black; text-align:left;\n"
- + "}\n"
- + "th.normalHeader\n"
- + "{\n"
- + "border:1px solid black;border-collapse:collapse;text-align:center;"
- + "background-color:white\n"
- + "}\n"
- + "th.tallHeader\n"
- + "{\n"
- + "border:1px solid black;border-collapse:collapse;text-align:center;"
- + "background-color:white; height:6em\n"
- + "}\n"
- + "tr.label\n"
- + "{\n"
- + "border:1px solid black;border-collapse:collapse;text-align:center;"
- + "background-color:white\n"
- + "}\n"
- + "tr.row\n"
- + "{\n"
- + "border:1px solid gray;text-align:center;background-color:snow\n"
- + "}\n"
- + "td\n"
- + "{\n"
- + "min-width:2em\n"
- + "}\n"
- + "td.cell\n"
- + "{\n"
- + "border:1px solid black;text-align:right;background-color:snow\n"
- + "}\n"
- + "td.empty\n"
- + "{\n"
- + "border:0px;text-align:right;background-color:snow\n"
- + "}\n"
- + "td.white\n"
- + "{\n"
- + "border:0px solid black;text-align:right;background-color:white\n"
- + "}\n"
- + "td.black\n"
- + "{\n"
- + "border:0px solid red;text-align:right;background-color:black\n"
- + "}\n"
- + "td.gray1\n"
- + "{\n"
- + "border:0px solid green;text-align:right; background-color:LightGray\n"
- + "}\n" + "td.gray2\n" + "{\n"
- + "border:0px solid blue;text-align:right;background-color:gray\n"
- + "}\n" + "td.gray3\n" + "{\n"
- + "border:0px solid red;text-align:right;background-color:DarkGray\n"
- + "}\n" + "th" + "{\n" + " text-align: center;\n"
- + " vertical-align: bottom;\n"
- + " padding-bottom: 3px;\n" + " padding-left: 5px;\n"
- + " padding-right: 5px;\n" + "}\n" + " .verticalText\n"
- + " {\n" + " text-align: center;\n"
- + " vertical-align: middle;\n" + " width: 20px;\n"
- + " margin: 0px;\n" + " padding: 0px;\n"
- + " padding-left: 3px;\n" + " padding-right: 3px;\n"
- + " padding-top: 10px;\n" + " white-space: nowrap;\n"
- + " -webkit-transform: rotate(-90deg); \n"
- + " -moz-transform: rotate(-90deg); \n" + " };\n"
- + "</style>\n";
- private static final String FOOTER = "</body></html>";
-
- // CSS style names.
- private static final String CSS_TABLE = "table";
- private static final String CSS_LABEL = "label";
- private static final String CSS_TALL_HEADER = "tall";
- private static final String CSS_VERTICAL = "verticalText";
- private static final String CSS_CELL = "cell";
- private static final String CSS_EMPTY = "empty";
- private static final String[] CSS_GRAY_CELLS = {"white", "gray1", "gray2", "gray3", "black"};
-
- private ConfusionMatrixDumper() {}
-
- public static void main(String[] args) throws Exception {
- ToolRunner.run(new ConfusionMatrixDumper(), args);
- }
-
- @Override
- public int run(String[] args) throws IOException {
- addInputOption();
- addOption("output", "o", "Output path", null); // AbstractJob output feature requires param
- addOption(DefaultOptionCreator.overwriteOption().create());
- addFlag("html", null, "Create complete HTML page");
- addFlag("text", null, "Dump simple text");
- Map<String,List<String>> parsedArgs = parseArguments(args);
- if (parsedArgs == null) {
- return -1;
- }
-
- Path inputPath = getInputPath();
- String outputFile = hasOption("output") ? getOption("output") : null;
- boolean text = parsedArgs.containsKey("--text");
- boolean wrapHtml = parsedArgs.containsKey("--html");
- PrintStream out = getPrintStream(outputFile);
- if (text) {
- exportText(inputPath, out);
- } else {
- exportTable(inputPath, out, wrapHtml);
- }
- out.flush();
- if (out != System.out) {
- out.close();
- }
- return 0;
- }
-
- private static void exportText(Path inputPath, PrintStream out) throws IOException {
- MatrixWritable mw = new MatrixWritable();
- Text key = new Text();
- readSeqFile(inputPath, key, mw);
- Matrix m = mw.get();
- ConfusionMatrix cm = new ConfusionMatrix(m);
- out.println(String.format("%-40s", "Label") + TAB_SEPARATOR + String.format("%-10s", "Total")
- + TAB_SEPARATOR + String.format("%-10s", "Correct") + TAB_SEPARATOR
- + String.format("%-6s", "%") + TAB_SEPARATOR);
- out.println(String.format("%-70s", "-").replace(' ', '-'));
- List<String> labels = stripDefault(cm);
- for (String label : labels) {
- int correct = cm.getCorrect(label);
- double accuracy = cm.getAccuracy(label);
- int count = getCount(cm, label);
- out.println(String.format("%-40s", label) + TAB_SEPARATOR + String.format("%-10s", count)
- + TAB_SEPARATOR + String.format("%-10s", correct) + TAB_SEPARATOR
- + String.format("%-6s", (int) Math.round(accuracy)) + TAB_SEPARATOR);
- }
- out.println(String.format("%-70s", "-").replace(' ', '-'));
- out.println(cm.toString());
- }
-
- private static void exportTable(Path inputPath, PrintStream out, boolean wrapHtml) throws IOException {
- MatrixWritable mw = new MatrixWritable();
- Text key = new Text();
- readSeqFile(inputPath, key, mw);
- String fileName = inputPath.getName();
- fileName = fileName.substring(fileName.lastIndexOf('/') + 1, fileName.length());
- Matrix m = mw.get();
- ConfusionMatrix cm = new ConfusionMatrix(m);
- if (wrapHtml) {
- printHeader(out, fileName);
- }
- out.println("<p/>");
- printSummaryTable(cm, out);
- out.println("<p/>");
- printGrayTable(cm, out);
- out.println("<p/>");
- printCountsTable(cm, out);
- out.println("<p/>");
- printTextInBox(cm, out);
- out.println("<p/>");
- if (wrapHtml) {
- printFooter(out);
- }
- }
-
- private static List<String> stripDefault(ConfusionMatrix cm) {
- List<String> stripped = Lists.newArrayList(cm.getLabels().iterator());
- String defaultLabel = cm.getDefaultLabel();
- int unclassified = cm.getTotal(defaultLabel);
- if (unclassified > 0) {
- return stripped;
- }
- stripped.remove(defaultLabel);
- return stripped;
- }
-
- // TODO: test - this should work with HDFS files
- private static void readSeqFile(Path path, Text key, MatrixWritable m) throws IOException {
- Configuration conf = new Configuration();
- FileSystem fs = FileSystem.get(conf);
- try (SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf)) {
- reader.next(key, m);
- }
- }
-
- // TODO: test - this might not work with HDFS files?
- // after all, it does no seeks
- private static PrintStream getPrintStream(String outputFilename) throws IOException {
- if (outputFilename != null) {
- File outputFile = new File(outputFilename);
- if (outputFile.exists()) {
- outputFile.delete();
- }
- outputFile.createNewFile();
- OutputStream os = new FileOutputStream(outputFile);
- return new PrintStream(os, false, Charsets.UTF_8.displayName());
- } else {
- return System.out;
- }
- }
-
- private static int getLabelTotal(ConfusionMatrix cm, String rowLabel) {
- Iterator<String> iter = cm.getLabels().iterator();
- int count = 0;
- while (iter.hasNext()) {
- count += cm.getCount(rowLabel, iter.next());
- }
- return count;
- }
-
- // HTML generator code
-
- private static void printTextInBox(ConfusionMatrix cm, PrintStream out) {
- out.println("<div style='width:90%;overflow:scroll;'>");
- out.println("<pre>");
- out.println(cm.toString());
- out.println("</pre>");
- out.println("</div>");
- }
-
- public static void printSummaryTable(ConfusionMatrix cm, PrintStream out) {
- format("<table class='%s'>\n", out, CSS_TABLE);
- format("<tr class='%s'>", out, CSS_LABEL);
- out.println("<td>Label</td><td>Total</td><td>Correct</td><td>%</td>");
- out.println("</tr>");
- List<String> labels = stripDefault(cm);
- for (String label : labels) {
- printSummaryRow(cm, out, label);
- }
- out.println("</table>");
- }
-
- private static void printSummaryRow(ConfusionMatrix cm, PrintStream out, String label) {
- format("<tr class='%s'>", out, CSS_CELL);
- int correct = cm.getCorrect(label);
- double accuracy = cm.getAccuracy(label);
- int count = getCount(cm, label);
- format("<td class='%s'>%s</td><td>%d</td><td>%d</td><td>%d</td>", out, CSS_CELL, label, count, correct,
- (int) Math.round(accuracy));
- out.println("</tr>");
- }
-
- private static int getCount(ConfusionMatrix cm, String label) {
- int count = 0;
- for (String s : cm.getLabels()) {
- count += cm.getCount(label, s);
- }
- return count;
- }
-
- public static void printGrayTable(ConfusionMatrix cm, PrintStream out) {
- format("<table class='%s'>\n", out, CSS_TABLE);
- printCountsHeader(cm, out, true);
- printGrayRows(cm, out);
- out.println("</table>");
- }
-
- /**
- * Print each value in a four-value grayscale based on count/max. Gives a mostly white matrix with grays in the
- * misclassified cells and black on the diagonal. TODO: using sqrt(count/max) as the rating would be more stringent
- */
- private static void printGrayRows(ConfusionMatrix cm, PrintStream out) {
- List<String> labels = stripDefault(cm);
- for (String label : labels) {
- printGrayRow(cm, out, labels, label);
- }
- }
-
- private static void printGrayRow(ConfusionMatrix cm,
- PrintStream out,
- Iterable<String> labels,
- String rowLabel) {
- format("<tr class='%s'>", out, CSS_LABEL);
- format("<td>%s</td>", out, rowLabel);
- int total = getLabelTotal(cm, rowLabel);
- for (String columnLabel : labels) {
- printGrayCell(cm, out, total, rowLabel, columnLabel);
- }
- out.println("</tr>");
- }
-
- // assign white/light/medium/dark to 0,1/4,1/2,3/4 of total number of inputs
- // assign black to count = total, meaning complete success
- // alternative rating is to use sqrt(total) instead of total - this is more drastic
- private static void printGrayCell(ConfusionMatrix cm,
- PrintStream out,
- int total,
- String rowLabel,
- String columnLabel) {
-
- int count = cm.getCount(rowLabel, columnLabel);
- if (count == 0) {
- out.format("<td class='%s'/>", CSS_EMPTY);
- } else {
- // 0 is white, full is black, everything else gray
- int rating = (int) ((count / (double) total) * 4);
- String css = CSS_GRAY_CELLS[rating];
- format("<td class='%s' title='%s'>%s</td>", out, css, columnLabel, count);
- }
- }
-
- public static void printCountsTable(ConfusionMatrix cm, PrintStream out) {
- format("<table class='%s'>\n", out, CSS_TABLE);
- printCountsHeader(cm, out, false);
- printCountsRows(cm, out);
- out.println("</table>");
- }
-
- private static void printCountsRows(ConfusionMatrix cm, PrintStream out) {
- List<String> labels = stripDefault(cm);
- for (String label : labels) {
- printCountsRow(cm, out, labels, label);
- }
- }
-
- private static void printCountsRow(ConfusionMatrix cm,
- PrintStream out,
- Iterable<String> labels,
- String rowLabel) {
- out.println("<tr>");
- format("<td class='%s'>%s</td>", out, CSS_LABEL, rowLabel);
- for (String columnLabel : labels) {
- printCountsCell(cm, out, rowLabel, columnLabel);
- }
- out.println("</tr>");
- }
-
- private static void printCountsCell(ConfusionMatrix cm, PrintStream out, String rowLabel, String columnLabel) {
- int count = cm.getCount(rowLabel, columnLabel);
- String s = count == 0 ? "" : Integer.toString(count);
- format("<td class='%s' title='%s'>%s</td>", out, CSS_CELL, columnLabel, s);
- }
-
- private static void printCountsHeader(ConfusionMatrix cm, PrintStream out, boolean vertical) {
- List<String> labels = stripDefault(cm);
- int longest = getLongestHeader(labels);
- if (vertical) {
- // do vertical - rotating the header text in CSS is fiddly
- out.format("<tr class='%s' style='height:%dem'><th>&nbsp;</th>%n", CSS_TALL_HEADER, longest / 2);
- for (String label : labels) {
- out.format("<th><div class='%s'>%s</div></th>", CSS_VERTICAL, label);
- }
- out.println("</tr>");
- } else {
- // header - empty cell in upper left
- out.format("<tr class='%s'><td class='%s'></td>%n", CSS_TABLE, CSS_LABEL);
- for (String label : labels) {
- out.format("<td>%s</td>", label);
- }
- out.format("</tr>");
- }
- }
-
- private static int getLongestHeader(Iterable<String> labels) {
- int max = 0;
- for (String label : labels) {
- max = Math.max(label.length(), max);
- }
- return max;
- }
-
- private static void format(String format, PrintStream out, Object... args) {
- String format2 = String.format(format, args);
- out.println(format2);
- }
-
- public static void printHeader(PrintStream out, CharSequence title) {
- out.println(HEADER.replace("TITLE", title));
- }
-
- public static void printFooter(PrintStream out) {
- out.println(FOOTER);
- }
-
-}

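The dumper above is driven entirely by its command line; a sketch of a programmatic invocation follows. All paths are placeholders, and the input must be the single-pair SequenceFile written by the classifier test step:

import org.apache.mahout.classifier.ConfusionMatrixDumper;

public final class DumpConfusionMatrixSketch {
  public static void main(String[] args) throws Exception {
    ConfusionMatrixDumper.main(new String[] {
        "-i", "/tmp/testdata/confusion-matrix", // SequenceFile<Text, MatrixWritable>
        "-o", "/tmp/confusion-matrix.html",
        "--html"                                // wrap the tables in a complete HTML page
    });
  }
}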
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/clustering/cdbw/CDbwEvaluator.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/clustering/cdbw/CDbwEvaluator.java b/integration/src/main/java/org/apache/mahout/clustering/cdbw/CDbwEvaluator.java
deleted file mode 100644
index 545c1ff..0000000
--- a/integration/src/main/java/org/apache/mahout/clustering/cdbw/CDbwEvaluator.java
+++ /dev/null
@@ -1,387 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.clustering.cdbw;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.mahout.clustering.Cluster;
-import org.apache.mahout.clustering.GaussianAccumulator;
-import org.apache.mahout.clustering.OnlineGaussianAccumulator;
-import org.apache.mahout.clustering.evaluation.RepresentativePointsDriver;
-import org.apache.mahout.clustering.evaluation.RepresentativePointsMapper;
-import org.apache.mahout.clustering.iterator.ClusterWritable;
-import org.apache.mahout.common.ClassUtils;
-import org.apache.mahout.common.distance.DistanceMeasure;
-import org.apache.mahout.common.iterator.sequencefile.PathFilters;
-import org.apache.mahout.common.iterator.sequencefile.PathType;
-import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable;
-import org.apache.mahout.math.RandomAccessSparseVector;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.Vector.Element;
-import org.apache.mahout.math.VectorWritable;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.TreeMap;
-
-/**
- * This class calculates the CDbw metric as defined in
- * http://www.db-net.aueb.gr/index.php/corporate/content/download/227/833/file/HV_poster2002.pdf
- */
-public final class CDbwEvaluator {
-
- private static final Logger log = LoggerFactory.getLogger(CDbwEvaluator.class);
-
- private final Map<Integer,List<VectorWritable>> representativePoints;
- private final Map<Integer,Double> stDevs = new HashMap<>();
- private final List<Cluster> clusters;
- private final DistanceMeasure measure;
- private Double interClusterDensity = null;
- // these are symmetric so we only compute half of them
- private Map<Integer,Map<Integer,Double>> minimumDistances = null;
- // these are symmetric too
- private Map<Integer,Map<Integer,Double>> interClusterDensities = null;
- // these are symmetric too
- private Map<Integer,Map<Integer,int[]>> closestRepPointIndices = null;
-
- /**
- * For testing only
- *
- * @param representativePoints
- * a Map<Integer,List<VectorWritable>> of representative points keyed by clusterId
- * @param clusters
- * a List<Cluster> of the clusters
- * @param measure
- * an appropriate DistanceMeasure
- */
- public CDbwEvaluator(Map<Integer,List<VectorWritable>> representativePoints, List<Cluster> clusters,
- DistanceMeasure measure) {
- this.representativePoints = representativePoints;
- this.clusters = clusters;
- this.measure = measure;
- for (Integer cId : representativePoints.keySet()) {
- computeStd(cId);
- }
- }
-
- /**
- * Initialize a new instance from job information
- *
- * @param conf
- * a Configuration with appropriate parameters
- * @param clustersIn
- * a Path to the input clusters directory
- */
- public CDbwEvaluator(Configuration conf, Path clustersIn) {
- measure = ClassUtils
- .instantiateAs(conf.get(RepresentativePointsDriver.DISTANCE_MEASURE_KEY), DistanceMeasure.class);
- representativePoints = RepresentativePointsMapper.getRepresentativePoints(conf);
- clusters = loadClusters(conf, clustersIn);
- for (Integer cId : representativePoints.keySet()) {
- computeStd(cId);
- }
- }
-
- /**
- * Load the clusters from their sequence files
- *
- * @param clustersIn
- * a Path to the directory containing input cluster files
- * @return a List<Cluster> of the clusters
- */
- private static List<Cluster> loadClusters(Configuration conf, Path clustersIn) {
- List<Cluster> clusters = new ArrayList<>();
- for (ClusterWritable clusterWritable : new SequenceFileDirValueIterable<ClusterWritable>(clustersIn, PathType.LIST,
- PathFilters.logsCRCFilter(), conf)) {
- Cluster cluster = clusterWritable.getValue();
- clusters.add(cluster);
- }
- return clusters;
- }
-
- /**
- * Compute the standard deviation of the representative points for the given cluster and store it in stDevs,
- * indexed by cI.
- *
- * @param cI
- * an int clusterId.
- */
- private void computeStd(int cI) {
- List<VectorWritable> repPts = representativePoints.get(cI);
- GaussianAccumulator accumulator = new OnlineGaussianAccumulator();
- for (VectorWritable vw : repPts) {
- accumulator.observe(vw.get(), 1.0);
- }
- accumulator.compute();
- double d = accumulator.getAverageStd();
- stDevs.put(cI, d);
- }
-
- /**
- * Compute the density of points near the midpoint between the two closest representative points of the clusters
- * (eqn 2), used for the inter-cluster density calculation.
- *
- * @param uIJ
- * the Vector midpoint between the closest representative points of the clusters
- * @param cI
- * the int clusterId of the i-th cluster
- * @param cJ
- * the int clusterId of the j-th cluster
- * @param avgStd
- * the double average standard deviation of the two clusters
- * @return a double
- */
- private double density(Vector uIJ, int cI, int cJ, double avgStd) {
- List<VectorWritable> repI = representativePoints.get(cI);
- List<VectorWritable> repJ = representativePoints.get(cJ);
- double sum = 0.0;
- // count the number of representative points of the clusters which are within the
- // average std of the two clusters from the midpoint uIJ (eqn 3)
- for (VectorWritable vwI : repI) {
- if (uIJ != null && measure.distance(uIJ, vwI.get()) <= avgStd) {
- sum++;
- }
- }
- for (VectorWritable vwJ : repJ) {
- if (uIJ != null && measure.distance(uIJ, vwJ.get()) <= avgStd) {
- sum++;
- }
- }
- int nI = repI.size();
- int nJ = repJ.size();
- return sum / (nI + nJ);
- }
-
- /**
- * Compute the CDbw validity metric (eqn 8). The goal of this metric is to reward clusterings which have a high
- * intraClusterDensity and also a high cluster separation.
- *
- * @return a double
- */
- public double getCDbw() {
- return intraClusterDensity() * separation();
- }
-
- /**
- * The average density within clusters is defined as the percentage of representative points that reside in the
- * neighborhood of the clusters' centers. The goal is the density within clusters to be significantly high. (eqn 5)
- *
- * @return a double
- */
- public double intraClusterDensity() {
- double avgDensity = 0;
- int count = 0;
- for (Element elem : intraClusterDensities().nonZeroes()) {
- double value = elem.get();
- if (!Double.isNaN(value)) {
- avgDensity += value;
- count++;
- }
- }
- return avgDensity / count;
- }
-
- /**
- * This function evaluates the density of points in the regions between each pair of clusters (eqn 1). The goal
- * is for the density in the areas between clusters to be significantly low.
- *
- * @return a Map<Integer,Map<Integer,Double>> of the inter-cluster densities
- */
- public Map<Integer,Map<Integer,Double>> interClusterDensities() {
- if (interClusterDensities != null) {
- return interClusterDensities;
- }
- interClusterDensities = new TreeMap<>();
- // find the closest representative points between the clusters
- for (int i = 0; i < clusters.size(); i++) {
- int cI = clusters.get(i).getId();
- Map<Integer,Double> map = new TreeMap<>();
- interClusterDensities.put(cI, map);
- for (int j = i + 1; j < clusters.size(); j++) {
- int cJ = clusters.get(j).getId();
- double minDistance = minimumDistance(cI, cJ); // the distance between the closest representative points
- Vector uIJ = midpointVector(cI, cJ); // the midpoint between the closest representative points
- double stdSum = stDevs.get(cI) + stDevs.get(cJ);
- double density = density(uIJ, cI, cJ, stdSum / 2);
- double interDensity = minDistance * density / stdSum;
- map.put(cJ, interDensity);
- if (log.isDebugEnabled()) {
- log.debug("minDistance[{},{}]={}", cI, cJ, minDistance);
- log.debug("interDensity[{},{}]={}", cI, cJ, density);
- log.debug("density[{},{}]={}", cI, cJ, interDensity);
- }
- }
- }
- return interClusterDensities;
- }
-
- /**
- * Calculate the separation of clusters (eqn 4), taking into account both the distances between the clusters'
- * closest points and the inter-cluster density. The goal is for the distances between clusters to be high while
- * the representative point density in the areas between them is low.
- *
- * @return a double
- */
- public double separation() {
- double minDistanceSum = 0;
- Map<Integer,Map<Integer,Double>> distances = minimumDistances();
- for (Map<Integer,Double> map : distances.values()) {
- for (Double dist : map.values()) {
- if (!Double.isInfinite(dist)) {
- minDistanceSum += dist * 2; // account for other half of calculated triangular minimumDistances matrix
- }
- }
- }
- return minDistanceSum / (1.0 + interClusterDensity());
- }
-
- /**
- * This function evaluates the average density of points in the regions between clusters (eqn 1). The goal is
- * for the density in the areas between clusters to be significantly low.
- *
- * @return a double
- */
- public double interClusterDensity() {
- if (interClusterDensity != null) {
- return interClusterDensity;
- }
- double sum = 0.0;
- int count = 0;
- Map<Integer,Map<Integer,Double>> distances = interClusterDensities();
- for (Map<Integer,Double> row : distances.values()) {
- for (Double density : row.values()) {
- if (!Double.isNaN(density)) {
- sum += density;
- count++;
- }
- }
- }
- log.debug("interClusterDensity={}", sum);
- interClusterDensity = sum / count;
- return interClusterDensity;
- }
-
- /**
- * The average density within clusters is defined as the percentage of representative points that reside in the
- * neighborhood of the clusters' centers. The goal is for the density within clusters to be significantly high (eqn 5).
- *
- * @return a Vector of the intra-densities of each clusterId
- */
- public Vector intraClusterDensities() {
- Vector densities = new RandomAccessSparseVector(Integer.MAX_VALUE);
- // compute the average standard deviation of the clusters
- double stdev = 0.0;
- for (Integer cI : representativePoints.keySet()) {
- stdev += stDevs.get(cI);
- }
- int c = representativePoints.size();
- stdev /= c;
- for (Cluster cluster : clusters) {
- Integer cI = cluster.getId();
- List<VectorWritable> repPtsI = representativePoints.get(cI);
- int r = repPtsI.size();
- double sumJ = 0.0;
- // compute the term density (eqn 6)
- for (VectorWritable pt : repPtsI) {
- // compute f(x, vIJ) (eqn 7)
- Vector repJ = pt.get();
- double densityIJ = measure.distance(cluster.getCenter(), repJ) <= stdev ? 1.0 : 0.0;
- // accumulate sumJ
- sumJ += densityIJ / stdev;
- }
- densities.set(cI, sumJ / r);
- }
- return densities;
- }
-
- /**
- * Calculate and cache the distances between the clusters' closest representative points, along with the indices
- * of those closest representative points for later use.
- *
- * @return a Map<Integer,Map<Integer,Double>> of the closest distances, keyed by clusterId
- */
- private Map<Integer,Map<Integer,Double>> minimumDistances() {
- if (minimumDistances != null) {
- return minimumDistances;
- }
- minimumDistances = new TreeMap<>();
- closestRepPointIndices = new TreeMap<>();
- for (int i = 0; i < clusters.size(); i++) {
- Integer cI = clusters.get(i).getId();
- Map<Integer,Double> map = new TreeMap<>();
- Map<Integer,int[]> treeMap = new TreeMap<>();
- closestRepPointIndices.put(cI, treeMap);
- minimumDistances.put(cI, map);
- List<VectorWritable> closRepI = representativePoints.get(cI);
- for (int j = i + 1; j < clusters.size(); j++) {
- // find min{d(closRepI, closRepJ)}
- Integer cJ = clusters.get(j).getId();
- List<VectorWritable> closRepJ = representativePoints.get(cJ);
- double minDistance = Double.MAX_VALUE;
- int[] midPointIndices = null;
- for (int xI = 0; xI < closRepI.size(); xI++) {
- VectorWritable aRepI = closRepI.get(xI);
- for (int xJ = 0; xJ < closRepJ.size(); xJ++) {
- VectorWritable aRepJ = closRepJ.get(xJ);
- double distance = measure.distance(aRepI.get(), aRepJ.get());
- if (distance < minDistance) {
- minDistance = distance;
- midPointIndices = new int[] {xI, xJ};
- }
- }
- }
- map.put(cJ, minDistance);
- treeMap.put(cJ, midPointIndices);
- }
- }
- return minimumDistances;
- }
-
- private double minimumDistance(int cI, int cJ) {
- Map<Integer,Double> distances = minimumDistances().get(cI);
- if (distances != null) {
- return distances.get(cJ);
- } else {
- return minimumDistances().get(cJ).get(cI);
- }
- }
-
- private Vector midpointVector(int cI, int cJ) {
- Map<Integer,Double> distances = minimumDistances().get(cI);
- if (distances != null) {
- int[] ks = closestRepPointIndices.get(cI).get(cJ);
- if (ks == null) {
- return null;
- }
- return representativePoints.get(cI).get(ks[0]).get().plus(representativePoints.get(cJ).get(ks[1]).get())
- .divide(2);
- } else {
- int[] ks = closestRepPointIndices.get(cJ).get(cI);
- if (ks == null) {
- return null;
- }
- return representativePoints.get(cJ).get(ks[1]).get().plus(representativePoints.get(cI).get(ks[0]).get())
- .divide(2);
- }
-
- }
-}

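Given the Configuration/Path constructor above, a usage sketch might look as follows. It assumes RepresentativePointsDriver has already run with this Configuration (that is what seeds the representative points and the distance-measure key); the clusters path is a placeholder:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.mahout.clustering.cdbw.CDbwEvaluator;

public final class CDbwSketch {
  public static void main(String[] args) {
    Configuration conf = new Configuration(); // must carry RepresentativePointsDriver state
    CDbwEvaluator evaluator = new CDbwEvaluator(conf, new Path("output/clusters-10-final"));
    // CDbw (eqn 8) = intraClusterDensity() * separation()
    System.out.println("CDbw = " + evaluator.getCDbw());
  }
}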
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/clustering/conversion/InputDriver.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/clustering/conversion/InputDriver.java b/integration/src/main/java/org/apache/mahout/clustering/conversion/InputDriver.java
deleted file mode 100644
index 6a2b376..0000000
--- a/integration/src/main/java/org/apache/mahout/clustering/conversion/InputDriver.java
+++ /dev/null
@@ -1,114 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.clustering.conversion;
-
-import java.io.IOException;
-
-import org.apache.commons.cli2.CommandLine;
-import org.apache.commons.cli2.Group;
-import org.apache.commons.cli2.Option;
-import org.apache.commons.cli2.OptionException;
-import org.apache.commons.cli2.builder.ArgumentBuilder;
-import org.apache.commons.cli2.builder.DefaultOptionBuilder;
-import org.apache.commons.cli2.builder.GroupBuilder;
-import org.apache.commons.cli2.commandline.Parser;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
-import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
-import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
-import org.apache.mahout.common.CommandLineUtil;
-import org.apache.mahout.common.commandline.DefaultOptionCreator;
-import org.apache.mahout.math.VectorWritable;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * This class converts text files containing space-delimited floating-point numbers into Mahout sequence files
- * of VectorWritable, suitable as input to the clustering jobs in particular and to any Mahout job requiring
- * such input in general.
- *
- */
-public final class InputDriver {
-
- private static final Logger log = LoggerFactory.getLogger(InputDriver.class);
-
- private InputDriver() {
- }
-
- public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
- DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
- ArgumentBuilder abuilder = new ArgumentBuilder();
- GroupBuilder gbuilder = new GroupBuilder();
-
- Option inputOpt = DefaultOptionCreator.inputOption().withRequired(false).create();
- Option outputOpt = DefaultOptionCreator.outputOption().withRequired(false).create();
- Option vectorOpt = obuilder.withLongName("vector").withRequired(false).withArgument(
- abuilder.withName("v").withMinimum(1).withMaximum(1).create()).withDescription(
- "The vector implementation to use.").withShortName("v").create();
-
- Option helpOpt = DefaultOptionCreator.helpOption();
-
- Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(
- vectorOpt).withOption(helpOpt).create();
-
- try {
- Parser parser = new Parser();
- parser.setGroup(group);
- CommandLine cmdLine = parser.parse(args);
- if (cmdLine.hasOption(helpOpt)) {
- CommandLineUtil.printHelp(group);
- return;
- }
-
- Path input = new Path(cmdLine.getValue(inputOpt, "testdata").toString());
- Path output = new Path(cmdLine.getValue(outputOpt, "output").toString());
- String vectorClassName = cmdLine.getValue(vectorOpt,
- "org.apache.mahout.math.RandomAccessSparseVector").toString();
- runJob(input, output, vectorClassName);
- } catch (OptionException e) {
- log.error("Exception parsing command line: ", e);
- CommandLineUtil.printHelp(group);
- }
- }
-
- public static void runJob(Path input, Path output, String vectorClassName)
- throws IOException, InterruptedException, ClassNotFoundException {
- Configuration conf = new Configuration();
- conf.set("vector.implementation.class.name", vectorClassName);
- Job job = new Job(conf, "Input Driver running over input: " + input);
-
- job.setOutputKeyClass(Text.class);
- job.setOutputValueClass(VectorWritable.class);
- job.setOutputFormatClass(SequenceFileOutputFormat.class);
- job.setMapperClass(InputMapper.class);
- job.setNumReduceTasks(0);
- job.setJarByClass(InputDriver.class);
-
- FileInputFormat.addInputPath(job, input);
- FileOutputFormat.setOutputPath(job, output);
-
- boolean succeeded = job.waitForCompletion(true);
- if (!succeeded) {
- throw new IllegalStateException("Job failed!");
- }
- }
-
-}

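A programmatic sketch of kicking off the conversion without the CLI parsing; the paths are placeholders, and the vector class is the same default the main() method falls back to:

import org.apache.hadoop.fs.Path;
import org.apache.mahout.clustering.conversion.InputDriver;

public final class ConvertTextToVectorsSketch {
  public static void main(String[] args) throws Exception {
    // Each line of space-delimited doubles under "testdata" becomes one
    // VectorWritable in a SequenceFile under "output/vectors".
    InputDriver.runJob(new Path("testdata"), new Path("output/vectors"),
        "org.apache.mahout.math.RandomAccessSparseVector");
  }
}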
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/clustering/conversion/InputMapper.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/clustering/conversion/InputMapper.java b/integration/src/main/java/org/apache/mahout/clustering/conversion/InputMapper.java
deleted file mode 100644
index e4c72c6..0000000
--- a/integration/src/main/java/org/apache/mahout/clustering/conversion/InputMapper.java
+++ /dev/null
@@ -1,81 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.clustering.conversion;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Mapper;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.VectorWritable;
-
-import java.io.IOException;
-import java.lang.reflect.Constructor;
-import java.lang.reflect.InvocationTargetException;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.regex.Pattern;
-
-public class InputMapper extends Mapper<LongWritable, Text, Text, VectorWritable> {
-
- private static final Pattern SPACE = Pattern.compile(" ");
-
- private Constructor<?> constructor;
-
- @Override
- protected void map(LongWritable key, Text values, Context context) throws IOException, InterruptedException {
-
- String[] numbers = SPACE.split(values.toString());
- // sometimes there are multiple separator spaces
- Collection<Double> doubles = new ArrayList<>();
- for (String value : numbers) {
- if (!value.isEmpty()) {
- doubles.add(Double.valueOf(value));
- }
- }
- // ignore empty lines in data file
- if (!doubles.isEmpty()) {
- try {
- Vector result = (Vector) constructor.newInstance(doubles.size());
- int index = 0;
- for (Double d : doubles) {
- result.set(index++, d);
- }
- VectorWritable vectorWritable = new VectorWritable(result);
- context.write(new Text(String.valueOf(index)), vectorWritable);
-
- } catch (InstantiationException | IllegalAccessException | InvocationTargetException e) {
- throw new IllegalStateException(e);
- }
- }
- }
-
- @Override
- protected void setup(Context context) throws IOException, InterruptedException {
- super.setup(context);
- Configuration conf = context.getConfiguration();
- String vectorImplClassName = conf.get("vector.implementation.class.name");
- try {
- Class<? extends Vector> outputClass = conf.getClassByName(vectorImplClassName).asSubclass(Vector.class);
- constructor = outputClass.getConstructor(int.class);
- } catch (NoSuchMethodException | ClassNotFoundException e) {
- throw new IllegalStateException(e);
- }
- }
-
-}

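The mapper's only non-obvious move is building vectors reflectively through the configured class's (int) constructor. The same pattern, isolated into a standalone sketch; the class name passed in below is just an example:

import java.lang.reflect.Constructor;
import org.apache.mahout.math.Vector;

public final class ReflectiveVectorFactory {

  public static Vector newVector(String className, int cardinality) {
    try {
      // The Mahout Vector implementations used here expose an (int cardinality) constructor.
      Class<? extends Vector> clazz = Class.forName(className).asSubclass(Vector.class);
      Constructor<? extends Vector> ctor = clazz.getConstructor(int.class);
      return ctor.newInstance(cardinality);
    } catch (ReflectiveOperationException e) {
      throw new IllegalStateException(e);
    }
  }

  public static void main(String[] args) {
    System.out.println(newVector("org.apache.mahout.math.DenseVector", 3));
  }
}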
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/clustering/evaluation/ClusterEvaluator.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/clustering/evaluation/ClusterEvaluator.java b/integration/src/main/java/org/apache/mahout/clustering/evaluation/ClusterEvaluator.java
deleted file mode 100644
index 757f38c..0000000
--- a/integration/src/main/java/org/apache/mahout/clustering/evaluation/ClusterEvaluator.java
+++ /dev/null
@@ -1,196 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.clustering.evaluation;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.mahout.clustering.Cluster;
-import org.apache.mahout.clustering.iterator.ClusterWritable;
-import org.apache.mahout.common.ClassUtils;
-import org.apache.mahout.common.distance.DistanceMeasure;
-import org.apache.mahout.common.iterator.sequencefile.PathFilters;
-import org.apache.mahout.common.iterator.sequencefile.PathType;
-import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable;
-import org.apache.mahout.math.RandomAccessSparseVector;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.Vector.Element;
-import org.apache.mahout.math.VectorWritable;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Map;
-import java.util.TreeMap;
-
-public class ClusterEvaluator {
-
- private static final Logger log = LoggerFactory.getLogger(ClusterEvaluator.class);
-
- private final Map<Integer,List<VectorWritable>> representativePoints;
-
- private final List<Cluster> clusters;
-
- private final DistanceMeasure measure;
-
- /**
- * For testing only
- *
- * @param representativePoints
- * a Map<Integer,List<VectorWritable>> of representative points keyed by clusterId
- * @param clusters
- * a List<Cluster> of the clusters
- * @param measure
- * an appropriate DistanceMeasure
- */
- public ClusterEvaluator(Map<Integer,List<VectorWritable>> representativePoints, List<Cluster> clusters,
- DistanceMeasure measure) {
- this.representativePoints = representativePoints;
- this.clusters = clusters;
- this.measure = measure;
- }
-
- /**
- * Initialize a new instance from job information
- *
- * @param conf
- * a Configuration with appropriate parameters
- * @param clustersIn
- * a Path to the input clusters directory
- */
- public ClusterEvaluator(Configuration conf, Path clustersIn) {
- measure = ClassUtils
- .instantiateAs(conf.get(RepresentativePointsDriver.DISTANCE_MEASURE_KEY), DistanceMeasure.class);
- representativePoints = RepresentativePointsMapper.getRepresentativePoints(conf);
- clusters = loadClusters(conf, clustersIn);
- }
-
- /**
- * Load the clusters from their sequence files
- *
- * @param clustersIn
- * a Path to the directory containing input cluster files
- * @return a List<Cluster> of the clusters
- */
- private static List<Cluster> loadClusters(Configuration conf, Path clustersIn) {
- List<Cluster> clusters = new ArrayList<>();
- for (ClusterWritable clusterWritable : new SequenceFileDirValueIterable<ClusterWritable>(clustersIn, PathType.LIST,
- PathFilters.logsCRCFilter(), conf)) {
- Cluster cluster = clusterWritable.getValue();
- clusters.add(cluster);
- }
- return clusters;
- }
-
- /**
- * Computes the inter-cluster density as defined in "Mahout In Action"
- *
- * @return the interClusterDensity
- */
- public double interClusterDensity() {
- double max = Double.NEGATIVE_INFINITY;
- double min = Double.POSITIVE_INFINITY;
- double sum = 0;
- int count = 0;
- Map<Integer,Vector> distances = interClusterDistances();
- for (Vector row : distances.values()) {
- for (Element element : row.nonZeroes()) {
- double d = element.get();
- min = Math.min(d, min);
- max = Math.max(d, max);
- sum += d;
- count++;
- }
- }
- double density = (sum / count - min) / (max - min);
- log.info("Scaled Inter-Cluster Density = {}", density);
- return density;
- }
-
- /**
- * Computes the inter-cluster distances
- *
- * @return a Map<Integer,Vector> of distances from each cluster's center to the centers of all later clusters, keyed by clusterId
- */
- public Map<Integer,Vector> interClusterDistances() {
- Map<Integer,Vector> distances = new TreeMap<>();
- for (int i = 0; i < clusters.size(); i++) {
- Cluster clusterI = clusters.get(i);
- RandomAccessSparseVector row = new RandomAccessSparseVector(Integer.MAX_VALUE);
- distances.put(clusterI.getId(), row);
- for (int j = i + 1; j < clusters.size(); j++) {
- Cluster clusterJ = clusters.get(j);
- double d = measure.distance(clusterI.getCenter(), clusterJ.getCenter());
- row.set(clusterJ.getId(), d);
- }
- }
- return distances;
- }
-
- /**
- * Computes the average intra-cluster density as the average of each cluster's intra-cluster density
- *
- * @return the average intraClusterDensity
- */
- public double intraClusterDensity() {
- double avgDensity = 0;
- int count = 0;
- for (Element elem : intraClusterDensities().nonZeroes()) {
- double value = elem.get();
- if (!Double.isNaN(value)) {
- avgDensity += value;
- count++;
- }
- }
- avgDensity = count == 0 ? 0 : avgDensity / count; // guard on count: all densities may be NaN
- log.info("Average Intra-Cluster Density = {}", avgDensity);
- return avgDensity;
- }
-
- /**
- * Computes the intra-cluster densities for all clusters as the average distance of the representative points from
- * each other
- *
- * @return a Vector of the intraClusterDensity of the representativePoints by clusterId
- */
- public Vector intraClusterDensities() {
- Vector densities = new RandomAccessSparseVector(Integer.MAX_VALUE);
- for (Cluster cluster : clusters) {
- int count = 0;
- double max = Double.NEGATIVE_INFINITY;
- double min = Double.POSITIVE_INFINITY;
- double sum = 0;
- List<VectorWritable> repPoints = representativePoints.get(cluster.getId());
- for (int i = 0; i < repPoints.size(); i++) {
- for (int j = i + 1; j < repPoints.size(); j++) {
- Vector v1 = repPoints.get(i).get();
- Vector v2 = repPoints.get(j).get();
- double d = measure.distance(v1, v2);
- min = Math.min(d, min);
- max = Math.max(d, max);
- sum += d;
- count++;
- }
- }
- double density = (sum / count - min) / (max - min);
- densities.set(cluster.getId(), density);
- log.info("Intra-Cluster Density[{}] = {}", cluster.getId(), density);
- }
- return densities;
- }
-}
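
For reference while reviewing the deletion, here is a minimal usage sketch of the ClusterEvaluator API above, assuming the Mahout classes in this diff are still on the classpath. The empty cluster/point setup is hypothetical; in practice both come from a prior clustering run and RepresentativePointsDriver.

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.mahout.clustering.Cluster;
import org.apache.mahout.clustering.evaluation.ClusterEvaluator;
import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
import org.apache.mahout.math.VectorWritable;

public class ClusterEvaluatorSketch {
  public static void main(String[] args) {
    // Hypothetical inputs: populate from a clustering run before evaluating.
    List<Cluster> clusters = new ArrayList<>();
    Map<Integer, List<VectorWritable>> repPoints = new HashMap<>();

    ClusterEvaluator evaluator =
        new ClusterEvaluator(repPoints, clusters, new EuclideanDistanceMeasure());

    // Both densities are scaled by the observed (max - min) distances.
    System.out.println("inter = " + evaluator.interClusterDensity());
    System.out.println("intra = " + evaluator.intraClusterDensity());
  }
}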

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsDriver.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsDriver.java b/integration/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsDriver.java
deleted file mode 100644
index 2fe37ef..0000000
--- a/integration/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsDriver.java
+++ /dev/null
@@ -1,243 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.clustering.evaluation;
-
-import java.io.IOException;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.Map.Entry;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.SequenceFile;
-import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
-import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
-import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
-import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
-import org.apache.hadoop.util.ToolRunner;
-import org.apache.mahout.clustering.AbstractCluster;
-import org.apache.mahout.clustering.Cluster;
-import org.apache.mahout.clustering.classify.WeightedVectorWritable;
-import org.apache.mahout.clustering.iterator.ClusterWritable;
-import org.apache.mahout.common.AbstractJob;
-import org.apache.mahout.common.ClassUtils;
-import org.apache.mahout.common.Pair;
-import org.apache.mahout.common.commandline.DefaultOptionCreator;
-import org.apache.mahout.common.distance.DistanceMeasure;
-import org.apache.mahout.common.iterator.sequencefile.PathFilters;
-import org.apache.mahout.common.iterator.sequencefile.PathType;
-import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
-import org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterable;
-import org.apache.mahout.math.VectorWritable;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-public final class RepresentativePointsDriver extends AbstractJob {
-
- public static final String STATE_IN_KEY = "org.apache.mahout.clustering.stateIn";
-
- public static final String DISTANCE_MEASURE_KEY = "org.apache.mahout.clustering.measure";
-
- private static final Logger log = LoggerFactory.getLogger(RepresentativePointsDriver.class);
-
- private RepresentativePointsDriver() {}
-
- public static void main(String[] args) throws Exception {
- ToolRunner.run(new Configuration(), new RepresentativePointsDriver(), args);
- }
-
- @Override
- public int run(String[] args) throws ClassNotFoundException, IOException, InterruptedException {
- addInputOption();
- addOutputOption();
- addOption("clusteredPoints", "cp", "The path to the clustered points", true);
- addOption(DefaultOptionCreator.distanceMeasureOption().create());
- addOption(DefaultOptionCreator.maxIterationsOption().create());
- addOption(DefaultOptionCreator.methodOption().create());
- if (parseArguments(args) == null) {
- return -1;
- }
-
- Path input = getInputPath();
- Path output = getOutputPath();
- String distanceMeasureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
- int maxIterations = Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION));
- boolean runSequential = getOption(DefaultOptionCreator.METHOD_OPTION).equalsIgnoreCase(
- DefaultOptionCreator.SEQUENTIAL_METHOD);
- DistanceMeasure measure = ClassUtils.instantiateAs(distanceMeasureClass, DistanceMeasure.class);
- Path clusteredPoints = new Path(getOption("clusteredPoints"));
- run(getConf(), input, clusteredPoints, output, measure, maxIterations, runSequential);
- return 0;
- }
-
- /**
- * Utility to print out representative points
- *
- * @param output
- * the Path to the directory containing representativePoints-i folders
- * @param numIterations
- * the int number of iterations to print
- */
- public static void printRepresentativePoints(Path output, int numIterations) {
- for (int i = 0; i <= numIterations; i++) {
- Path out = new Path(output, "representativePoints-" + i);
- System.out.println("Representative Points for iteration " + i);
- Configuration conf = new Configuration();
- for (Pair<IntWritable,VectorWritable> record : new SequenceFileDirIterable<IntWritable,VectorWritable>(out,
- PathType.LIST, PathFilters.logsCRCFilter(), null, true, conf)) {
- System.out.println("\tC-" + record.getFirst().get() + ": "
- + AbstractCluster.formatVector(record.getSecond().get(), null));
- }
- }
- }
-
- public static void run(Configuration conf, Path clustersIn, Path clusteredPointsIn, Path output,
- DistanceMeasure measure, int numIterations, boolean runSequential) throws IOException, InterruptedException,
- ClassNotFoundException {
- Path stateIn = new Path(output, "representativePoints-0");
- writeInitialState(stateIn, clustersIn);
-
- for (int iteration = 0; iteration < numIterations; iteration++) {
- log.info("Representative Points Iteration {}", iteration);
- // point the output to a new directory per iteration
- Path stateOut = new Path(output, "representativePoints-" + (iteration + 1));
- runIteration(conf, clusteredPointsIn, stateIn, stateOut, measure, runSequential);
- // now point the input to the old output directory
- stateIn = stateOut;
- }
-
- conf.set(STATE_IN_KEY, stateIn.toString());
- conf.set(DISTANCE_MEASURE_KEY, measure.getClass().getName());
- }
-
- private static void writeInitialState(Path output, Path clustersIn) throws IOException {
- Configuration conf = new Configuration();
- FileSystem fs = FileSystem.get(output.toUri(), conf);
- for (FileStatus dir : fs.globStatus(clustersIn)) {
- Path inPath = dir.getPath();
- for (FileStatus part : fs.listStatus(inPath, PathFilters.logsCRCFilter())) {
- Path inPart = part.getPath();
- Path path = new Path(output, inPart.getName());
- try (SequenceFile.Writer writer =
- new SequenceFile.Writer(fs, conf, path, IntWritable.class, VectorWritable.class)){
- for (ClusterWritable clusterWritable : new SequenceFileValueIterable<ClusterWritable>(inPart, true, conf)) {
- Cluster cluster = clusterWritable.getValue();
- if (log.isDebugEnabled()) {
- log.debug("C-{}: {}", cluster.getId(), AbstractCluster.formatVector(cluster.getCenter(), null));
- }
- writer.append(new IntWritable(cluster.getId()), new VectorWritable(cluster.getCenter()));
- }
- }
- }
- }
- }
-
- private static void runIteration(Configuration conf, Path clusteredPointsIn, Path stateIn, Path stateOut,
- DistanceMeasure measure, boolean runSequential) throws IOException, InterruptedException, ClassNotFoundException {
- if (runSequential) {
- runIterationSeq(conf, clusteredPointsIn, stateIn, stateOut, measure);
- } else {
- runIterationMR(conf, clusteredPointsIn, stateIn, stateOut, measure);
- }
- }
-
- /**
- * Run the job using supplied arguments as a sequential process
- *
- * @param conf
- * the Configuration to use
- * @param clusteredPointsIn
- * the directory pathname for input points
- * @param stateIn
- * the directory pathname for input state
- * @param stateOut
- * the directory pathname for output state
- * @param measure
- * the DistanceMeasure to use
- */
- private static void runIterationSeq(Configuration conf, Path clusteredPointsIn, Path stateIn, Path stateOut,
- DistanceMeasure measure) throws IOException {
-
- Map<Integer,List<VectorWritable>> repPoints = RepresentativePointsMapper.getRepresentativePoints(conf, stateIn);
- Map<Integer,WeightedVectorWritable> mostDistantPoints = new HashMap<>();
- FileSystem fs = FileSystem.get(clusteredPointsIn.toUri(), conf);
- for (Pair<IntWritable,WeightedVectorWritable> record
- : new SequenceFileDirIterable<IntWritable,WeightedVectorWritable>(clusteredPointsIn, PathType.LIST,
- PathFilters.logsCRCFilter(), null, true, conf)) {
- RepresentativePointsMapper.mapPoint(record.getFirst(), record.getSecond(), measure, repPoints, mostDistantPoints);
- }
- int part = 0;
- try (SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, new Path(stateOut, "part-m-" + part++),
- IntWritable.class, VectorWritable.class)){
- for (Entry<Integer,List<VectorWritable>> entry : repPoints.entrySet()) {
- for (VectorWritable vw : entry.getValue()) {
- writer.append(new IntWritable(entry.getKey()), vw);
- }
- }
- }
- try (SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, new Path(stateOut, "part-m-" + part++),
- IntWritable.class, VectorWritable.class)){
- for (Map.Entry<Integer,WeightedVectorWritable> entry : mostDistantPoints.entrySet()) {
- writer.append(new IntWritable(entry.getKey()), new VectorWritable(entry.getValue().getVector()));
- }
- }
- }
-
- /**
- * Run the job using supplied arguments as a Map/Reduce process
- *
- * @param conf
- * the Configuration to use
- * @param input
- * the directory pathname for input points
- * @param stateIn
- * the directory pathname for input state
- * @param stateOut
- * the directory pathname for output state
- * @param measure
- * the DistanceMeasure to use
- */
- private static void runIterationMR(Configuration conf, Path input, Path stateIn, Path stateOut,
- DistanceMeasure measure) throws IOException, InterruptedException, ClassNotFoundException {
- conf.set(STATE_IN_KEY, stateIn.toString());
- conf.set(DISTANCE_MEASURE_KEY, measure.getClass().getName());
- Job job = new Job(conf, "Representative Points Driver running over input: " + input);
- job.setJarByClass(RepresentativePointsDriver.class);
- job.setOutputKeyClass(IntWritable.class);
- job.setOutputValueClass(VectorWritable.class);
- job.setMapOutputKeyClass(IntWritable.class);
- job.setMapOutputValueClass(WeightedVectorWritable.class);
-
- FileInputFormat.setInputPaths(job, input);
- FileOutputFormat.setOutputPath(job, stateOut);
-
- job.setMapperClass(RepresentativePointsMapper.class);
- job.setReducerClass(RepresentativePointsReducer.class);
- job.setInputFormatClass(SequenceFileInputFormat.class);
- job.setOutputFormatClass(SequenceFileOutputFormat.class);
-
- boolean succeeded = job.waitForCompletion(true);
- if (!succeeded) {
- throw new IllegalStateException("Job failed!");
- }
- }
-}
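
A hedged sketch of driving the removed RepresentativePointsDriver programmatically; the paths are hypothetical placeholders for a k-means run's output, and the iteration count is arbitrary.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.mahout.clustering.evaluation.RepresentativePointsDriver;
import org.apache.mahout.common.distance.EuclideanDistanceMeasure;

public class RepresentativePointsSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path clustersIn = new Path("output/clusters-10-final");    // hypothetical
    Path clusteredPoints = new Path("output/clusteredPoints"); // hypothetical
    Path output = new Path("output/representative");

    // Five refinement iterations, run sequentially (no MapReduce cluster).
    RepresentativePointsDriver.run(conf, clustersIn, clusteredPoints, output,
        new EuclideanDistanceMeasure(), 5, true);
    RepresentativePointsDriver.printRepresentativePoints(output, 5);
  }
}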
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsMapper.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsMapper.java b/integration/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsMapper.java
deleted file mode 100644
index 0ae79ad..0000000
--- a/integration/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsMapper.java
+++ /dev/null
@@ -1,117 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.clustering.evaluation;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.mapreduce.Mapper;
-import org.apache.mahout.clustering.classify.WeightedVectorWritable;
-import org.apache.mahout.common.ClassUtils;
-import org.apache.mahout.common.Pair;
-import org.apache.mahout.common.distance.DistanceMeasure;
-import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
-import org.apache.mahout.common.iterator.sequencefile.PathFilters;
-import org.apache.mahout.common.iterator.sequencefile.PathType;
-import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
-import org.apache.mahout.math.VectorWritable;
-
-public class RepresentativePointsMapper
- extends Mapper<IntWritable, WeightedVectorWritable, IntWritable, WeightedVectorWritable> {
-
- private Map<Integer, List<VectorWritable>> representativePoints;
- private final Map<Integer, WeightedVectorWritable> mostDistantPoints = new HashMap<>();
- private DistanceMeasure measure = new EuclideanDistanceMeasure();
-
- @Override
- protected void cleanup(Context context) throws IOException, InterruptedException {
- for (Map.Entry<Integer, WeightedVectorWritable> entry : mostDistantPoints.entrySet()) {
- context.write(new IntWritable(entry.getKey()), entry.getValue());
- }
- super.cleanup(context);
- }
-
- @Override
- protected void map(IntWritable clusterId, WeightedVectorWritable point, Context context)
- throws IOException, InterruptedException {
- mapPoint(clusterId, point, measure, representativePoints, mostDistantPoints);
- }
-
- public static void mapPoint(IntWritable clusterId,
- WeightedVectorWritable point,
- DistanceMeasure measure,
- Map<Integer, List<VectorWritable>> representativePoints,
- Map<Integer, WeightedVectorWritable> mostDistantPoints) {
- int key = clusterId.get();
- WeightedVectorWritable currentMDP = mostDistantPoints.get(key);
-
- List<VectorWritable> repPoints = representativePoints.get(key);
- double totalDistance = 0.0;
- if (repPoints != null) {
- for (VectorWritable refPoint : repPoints) {
- totalDistance += measure.distance(refPoint.get(), point.getVector());
- }
- }
- if (currentMDP == null || currentMDP.getWeight() < totalDistance) {
- mostDistantPoints.put(key, new WeightedVectorWritable(totalDistance, point.getVector().clone()));
- }
- }
-
- @Override
- protected void setup(Context context) throws IOException, InterruptedException {
- super.setup(context);
- Configuration conf = context.getConfiguration();
- measure =
- ClassUtils.instantiateAs(conf.get(RepresentativePointsDriver.DISTANCE_MEASURE_KEY), DistanceMeasure.class);
- representativePoints = getRepresentativePoints(conf);
- }
-
- public void configure(Map<Integer, List<VectorWritable>> referencePoints, DistanceMeasure measure) {
- this.representativePoints = referencePoints;
- this.measure = measure;
- }
-
- public static Map<Integer, List<VectorWritable>> getRepresentativePoints(Configuration conf) {
- String statePath = conf.get(RepresentativePointsDriver.STATE_IN_KEY);
- return getRepresentativePoints(conf, new Path(statePath));
- }
-
- public static Map<Integer, List<VectorWritable>> getRepresentativePoints(Configuration conf, Path statePath) {
- Map<Integer, List<VectorWritable>> representativePoints = new HashMap<>();
- for (Pair<IntWritable,VectorWritable> record
- : new SequenceFileDirIterable<IntWritable,VectorWritable>(statePath,
- PathType.LIST,
- PathFilters.logsCRCFilter(),
- conf)) {
- int keyValue = record.getFirst().get();
- List<VectorWritable> repPoints = representativePoints.get(keyValue);
- if (repPoints == null) {
- repPoints = new ArrayList<>();
- representativePoints.put(keyValue, repPoints);
- }
- repPoints.add(record.getSecond());
- }
- return representativePoints;
- }
-}
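
The heart of the mapper is the static mapPoint() above: for each cluster it keeps the candidate whose summed distance to the current representative set is largest. A self-contained sketch with made-up vectors:

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.io.IntWritable;
import org.apache.mahout.clustering.classify.WeightedVectorWritable;
import org.apache.mahout.clustering.evaluation.RepresentativePointsMapper;
import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.VectorWritable;

public class MapPointSketch {
  public static void main(String[] args) {
    // Cluster 0 currently has a single representative at the origin.
    Map<Integer, List<VectorWritable>> repPoints = new HashMap<>();
    List<VectorWritable> existing = new ArrayList<>();
    existing.add(new VectorWritable(new DenseVector(new double[] {0, 0})));
    repPoints.put(0, existing);

    Map<Integer, WeightedVectorWritable> mostDistant = new HashMap<>();
    EuclideanDistanceMeasure measure = new EuclideanDistanceMeasure();

    // Two candidates; the farther one should displace the nearer one.
    RepresentativePointsMapper.mapPoint(new IntWritable(0),
        new WeightedVectorWritable(1.0, new DenseVector(new double[] {1, 1})),
        measure, repPoints, mostDistant);
    RepresentativePointsMapper.mapPoint(new IntWritable(0),
        new WeightedVectorWritable(1.0, new DenseVector(new double[] {5, 5})),
        measure, repPoints, mostDistant);

    System.out.println(mostDistant.get(0).getVector()); // the (5, 5) candidate won
  }
}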

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsReducer.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsReducer.java b/integration/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsReducer.java
deleted file mode 100644
index 27ca861..0000000
--- a/integration/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsReducer.java
+++ /dev/null
@@ -1,70 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.clustering.evaluation;
-
-import java.io.IOException;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.mapreduce.Reducer;
-import org.apache.mahout.clustering.classify.WeightedVectorWritable;
-import org.apache.mahout.math.VectorWritable;
-
-public class RepresentativePointsReducer
- extends Reducer<IntWritable, WeightedVectorWritable, IntWritable, VectorWritable> {
-
- private Map<Integer, List<VectorWritable>> representativePoints;
-
- @Override
- protected void cleanup(Context context) throws IOException, InterruptedException {
- for (Map.Entry<Integer, List<VectorWritable>> entry : representativePoints.entrySet()) {
- IntWritable iw = new IntWritable(entry.getKey());
- for (VectorWritable vw : entry.getValue()) {
- context.write(iw, vw);
- }
- }
- super.cleanup(context);
- }
-
- @Override
- protected void reduce(IntWritable key, Iterable<WeightedVectorWritable> values, Context context)
- throws IOException, InterruptedException {
- // find the most distant point
- WeightedVectorWritable mdp = null;
- for (WeightedVectorWritable dpw : values) {
- if (mdp == null || mdp.getWeight() < dpw.getWeight()) {
- mdp = new WeightedVectorWritable(dpw.getWeight(), dpw.getVector());
- }
- }
- context.write(new IntWritable(key.get()), new VectorWritable(mdp.getVector()));
- }
-
- @Override
- protected void setup(Context context) throws IOException, InterruptedException {
- super.setup(context);
- Configuration conf = context.getConfiguration();
- representativePoints = RepresentativePointsMapper.getRepresentativePoints(conf);
- }
-
- public void configure(Map<Integer, List<VectorWritable>> representativePoints) {
- this.representativePoints = representativePoints;
- }
-
-}
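
The reduce step simply keeps the maximum-weight candidate per cluster, appending one new representative per iteration. The same selection in isolation, with hypothetical weights:

import java.util.Arrays;
import java.util.List;

import org.apache.mahout.clustering.classify.WeightedVectorWritable;
import org.apache.mahout.math.DenseVector;

public class MostDistantSketch {
  public static void main(String[] args) {
    List<WeightedVectorWritable> candidates = Arrays.asList(
        new WeightedVectorWritable(2.0, new DenseVector(new double[] {1, 1})),
        new WeightedVectorWritable(7.5, new DenseVector(new double[] {5, 5})));

    WeightedVectorWritable mdp = null;
    for (WeightedVectorWritable dpw : candidates) {
      if (mdp == null || mdp.getWeight() < dpw.getWeight()) {
        mdp = dpw; // largest summed distance so far wins
      }
    }
    System.out.println("new representative: " + mdp.getVector());
  }
}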

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java b/integration/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java
deleted file mode 100644
index 392909e..0000000
--- a/integration/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java
+++ /dev/null
@@ -1,229 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.clustering.lda;
-
-import com.google.common.io.Closeables;
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.io.Writer;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.PriorityQueue;
-import java.util.Queue;
-import org.apache.commons.cli2.CommandLine;
-import org.apache.commons.cli2.Group;
-import org.apache.commons.cli2.Option;
-import org.apache.commons.cli2.OptionException;
-import org.apache.commons.cli2.builder.ArgumentBuilder;
-import org.apache.commons.cli2.builder.DefaultOptionBuilder;
-import org.apache.commons.cli2.builder.GroupBuilder;
-import org.apache.commons.cli2.commandline.Parser;
-import org.apache.commons.io.Charsets;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.DoubleWritable;
-import org.apache.mahout.common.CommandLineUtil;
-import org.apache.mahout.common.IntPairWritable;
-import org.apache.mahout.common.Pair;
-import org.apache.mahout.common.commandline.DefaultOptionCreator;
-import org.apache.mahout.common.iterator.sequencefile.PathType;
-import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
-import org.apache.mahout.utils.vectors.VectorHelper;
-
-/**
- * Class to print out the top K words for each topic.
- */
-public final class LDAPrintTopics {
-
- private LDAPrintTopics() { }
-
- // Expands the queue list to have a Queue for topic K
- private static void ensureQueueSize(Collection<Queue<Pair<String,Double>>> queues, int k) {
- for (int i = queues.size(); i <= k; ++i) {
- queues.add(new PriorityQueue<Pair<String,Double>>());
- }
- }
-
- public static void main(String[] args) throws Exception {
- DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
- ArgumentBuilder abuilder = new ArgumentBuilder();
- GroupBuilder gbuilder = new GroupBuilder();
-
- Option inputOpt = DefaultOptionCreator.inputOption().create();
-
- Option dictOpt = obuilder.withLongName("dict").withRequired(true).withArgument(
- abuilder.withName("dict").withMinimum(1).withMaximum(1).create()).withDescription(
- "Dictionary to read in, in the same format as one created by "
- + "org.apache.mahout.utils.vectors.lucene.Driver").withShortName("d").create();
-
- Option outOpt = DefaultOptionCreator.outputOption().create();
-
- Option wordOpt = obuilder.withLongName("words").withRequired(false).withArgument(
- abuilder.withName("words").withMinimum(0).withMaximum(1).withDefault("20").create()).withDescription(
- "Number of words to print").withShortName("w").create();
- Option dictTypeOpt = obuilder.withLongName("dictionaryType").withRequired(false).withArgument(
- abuilder.withName("dictionaryType").withMinimum(1).withMaximum(1).create()).withDescription(
- "The dictionary file type (text|sequencefile)").withShortName("dt").create();
- Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
- .create();
-
- Group group = gbuilder.withName("Options").withOption(dictOpt).withOption(outOpt).withOption(wordOpt)
- .withOption(inputOpt).withOption(dictTypeOpt).create();
- try {
- Parser parser = new Parser();
- parser.setGroup(group);
- CommandLine cmdLine = parser.parse(args);
-
- if (cmdLine.hasOption(helpOpt)) {
- CommandLineUtil.printHelp(group);
- return;
- }
-
- String input = cmdLine.getValue(inputOpt).toString();
- String dictFile = cmdLine.getValue(dictOpt).toString();
- int numWords = 20;
- if (cmdLine.hasOption(wordOpt)) {
- numWords = Integer.parseInt(cmdLine.getValue(wordOpt).toString());
- }
- Configuration config = new Configuration();
-
- String dictionaryType = "text";
- if (cmdLine.hasOption(dictTypeOpt)) {
- dictionaryType = cmdLine.getValue(dictTypeOpt).toString();
- }
-
- List<String> wordList;
- if ("text".equals(dictionaryType)) {
- wordList = Arrays.asList(VectorHelper.loadTermDictionary(new File(dictFile)));
- } else if ("sequencefile".equals(dictionaryType)) {
- wordList = Arrays.asList(VectorHelper.loadTermDictionary(config, dictFile));
- } else {
- throw new IllegalArgumentException("Invalid dictionary format");
- }
-
- List<Queue<Pair<String,Double>>> topWords = topWordsForTopics(input, config, wordList, numWords);
-
- File output = null;
- if (cmdLine.hasOption(outOpt)) {
- output = new File(cmdLine.getValue(outOpt).toString());
- if (!output.exists() && !output.mkdirs()) {
- throw new IOException("Could not create directory: " + output);
- }
- }
- printTopWords(topWords, output);
- } catch (OptionException e) {
- CommandLineUtil.printHelp(group);
- throw e;
- }
- }
-
- // Adds the word if the queue is below capacity, or the score is high enough
- private static void maybeEnqueue(Queue<Pair<String,Double>> q, String word, double score, int numWordsToPrint) {
- if (q.size() >= numWordsToPrint && score > q.peek().getSecond()) {
- q.poll();
- }
- if (q.size() < numWordsToPrint) {
- q.add(new Pair<>(word, score));
- }
- }
-
- private static void printTopWords(List<Queue<Pair<String,Double>>> topWords, File outputDir)
- throws IOException {
- for (int i = 0; i < topWords.size(); ++i) {
- Collection<Pair<String,Double>> topK = topWords.get(i);
- Writer out = null;
- boolean printingToSystemOut = false;
- try {
- if (outputDir != null) {
- out = new OutputStreamWriter(new FileOutputStream(new File(outputDir, "topic_" + i)), Charsets.UTF_8);
- } else {
- out = new OutputStreamWriter(System.out, Charsets.UTF_8);
- printingToSystemOut = true;
- out.write("Topic " + i);
- out.write('\n');
- out.write("===========");
- out.write('\n');
- }
- List<Pair<String,Double>> topKasList = new ArrayList<>(topK.size());
- for (Pair<String,Double> wordWithScore : topK) {
- topKasList.add(wordWithScore);
- }
- Collections.sort(topKasList, new Comparator<Pair<String,Double>>() {
- @Override
- public int compare(Pair<String,Double> pair1, Pair<String,Double> pair2) {
- return pair2.getSecond().compareTo(pair1.getSecond());
- }
- });
- for (Pair<String,Double> wordWithScore : topKasList) {
- out.write(wordWithScore.getFirst() + " [p(" + wordWithScore.getFirst() + "|topic_" + i + ") = "
- + wordWithScore.getSecond() + "]");
- out.write('\n');
- }
- } finally {
- if (!printingToSystemOut) {
- Closeables.close(out, false);
- } else {
- out.flush();
- }
- }
- }
- }
-
- private static List<Queue<Pair<String,Double>>> topWordsForTopics(String dir,
- Configuration job,
- List<String> wordList,
- int numWordsToPrint) {
- List<Queue<Pair<String,Double>>> queues = new ArrayList<>();
- Map<Integer,Double> expSums = new HashMap<>();
- for (Pair<IntPairWritable,DoubleWritable> record
- : new SequenceFileDirIterable<IntPairWritable, DoubleWritable>(
- new Path(dir, "part-*"), PathType.GLOB, null, null, true, job)) {
- IntPairWritable key = record.getFirst();
- int topic = key.getFirst();
- int word = key.getSecond();
- ensureQueueSize(queues, topic);
- if (word >= 0 && topic >= 0) {
- double score = record.getSecond().get();
- if (expSums.get(topic) == null) {
- expSums.put(topic, 0.0);
- }
- expSums.put(topic, expSums.get(topic) + Math.exp(score));
- String realWord = wordList.get(word);
- maybeEnqueue(queues.get(topic), realWord, score, numWordsToPrint);
- }
- }
- for (int i = 0; i < queues.size(); i++) {
- Queue<Pair<String,Double>> queue = queues.get(i);
- Queue<Pair<String,Double>> newQueue = new PriorityQueue<>(queue.size());
- double norm = expSums.get(i);
- for (Pair<String,Double> pair : queue) {
- newQueue.add(new Pair<>(pair.getFirst(), Math.exp(pair.getSecond()) / norm));
- }
- queues.set(i, newQueue);
- }
- return queues;
- }
-}
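
maybeEnqueue() above implements bounded top-K selection over a priority queue. A standalone sketch of the same idea, ordering the heap explicitly by score so the lowest-scoring entry is always the eviction candidate:

import java.util.Comparator;
import java.util.PriorityQueue;
import java.util.Queue;

import org.apache.mahout.common.Pair;

public class TopKSketch {
  // Bounded min-heap: evict the lowest score once full and a better one arrives.
  static void offer(Queue<Pair<String, Double>> q, String word, double score, int k) {
    if (q.size() >= k && score > q.peek().getSecond()) {
      q.poll();
    }
    if (q.size() < k) {
      q.add(new Pair<>(word, score));
    }
  }

  public static void main(String[] args) {
    Queue<Pair<String, Double>> q = new PriorityQueue<>(2,
        new Comparator<Pair<String, Double>>() {
          @Override
          public int compare(Pair<String, Double> a, Pair<String, Double> b) {
            return a.getSecond().compareTo(b.getSecond());
          }
        });
    offer(q, "hadoop", 0.9, 2);
    offer(q, "mahout", 0.7, 2);
    offer(q, "the", 0.1, 2); // rejected: queue is full and 0.1 is too low
    System.out.println(q);   // retains the two highest-scoring words
  }
}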

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java b/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java
deleted file mode 100644
index 12ed471..0000000
--- a/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java
+++ /dev/null
@@ -1,164 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.mahout.text;
-
-import org.apache.lucene.analysis.TokenFilter;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.core.LowerCaseFilter;
-import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.en.PorterStemFilter;
-import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
-import org.apache.lucene.analysis.standard.StandardFilter;
-import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
-
-import java.io.IOException;
-import java.util.Arrays;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-/**
- * Custom Lucene Analyzer designed for aggressive feature reduction
- * for clustering the ASF Mail Archives using an extended set of
- * stop words, excluding non-alpha-numeric tokens, and porter stemming.
- */
-public final class MailArchivesClusteringAnalyzer extends StopwordAnalyzerBase {
- // extended set of stop words composed of common mail terms like "hi",
- // HTML tags, and Java keywords, as many of the messages in the archives
- // are subversion check-in notifications
-
- private static final CharArraySet STOP_SET = new CharArraySet(Arrays.asList(
- "3d","7bit","a0","about","above","abstract","across","additional","after",
- "afterwards","again","against","align","all","almost","alone","along",
- "already","also","although","always","am","among","amongst","amoungst",
- "amount","an","and","another","any","anybody","anyhow","anyone","anything",
- "anyway","anywhere","are","arial","around","as","ascii","assert","at",
- "back","background","base64","bcc","be","became","because","become","becomes",
- "becoming","been","before","beforehand","behind","being","below","beside",
- "besides","between","beyond","bgcolor","blank","blockquote","body","boolean",
- "border","both","br","break","but","by","can","cannot","cant","case","catch",
- "cc","cellpadding","cellspacing","center","char","charset","cheers","class",
- "co","color","colspan","com","con","const","continue","could","couldnt",
- "cry","css","de","dear","default","did","didnt","different","div","do",
- "does","doesnt","done","dont","double","down","due","during","each","eg",
- "eight","either","else","elsewhere","empty","encoding","enough","enum",
- "etc","eu","even","ever","every","everyone","everything","everywhere",
- "except","extends","face","family","few","ffffff","final","finally","float",
- "font","for","former","formerly","fri","from","further","get","give","go",
- "good","got","goto","gt","h1","ha","had","has","hasnt","have","he","head",
- "height","hello","helvetica","hence","her","here","hereafter","hereby",
- "herein","hereupon","hers","herself","hi","him","himself","his","how",
- "however","hr","href","html","http","https","id","ie","if","ill","im",
- "image","img","implements","import","in","inc","instanceof","int","interface",
- "into","is","isnt","iso-8859-1","it","its","itself","ive","just","keep",
- "last","latter","latterly","least","left","less","li","like","long","look",
- "lt","ltd","mail","mailto","many","margin","may","me","meanwhile","message",
- "meta","might","mill","mine","mon","more","moreover","most","mostly","mshtml",
- "mso","much","must","my","myself","name","namely","native","nbsp","need",
- "neither","never","nevertheless","new","next","nine","no","nobody","none",
- "noone","nor","not","nothing","now","nowhere","null","of","off","often",
- "ok","on","once","only","onto","or","org","other","others","otherwise",
- "our","ours","ourselves","out","over","own","package","pad","per","perhaps",
- "plain","please","pm","printable","private","protected","public","put",
- "quot","quote","r1","r2","rather","re","really","regards","reply","return",
- "right","said","same","sans","sat","say","saying","see","seem","seemed",
- "seeming","seems","serif","serious","several","she","short","should","show",
- "side","since","sincere","six","sixty","size","so","solid","some","somehow",
- "someone","something","sometime","sometimes","somewhere","span","src",
- "static","still","strictfp","string","strong","style","stylesheet","subject",
- "such","sun","super","sure","switch","synchronized","table","take","target",
- "td","text","th","than","thanks","that","the","their","them","themselves",
- "then","thence","there","thereafter","thereby","therefore","therein","thereupon",
- "these","they","thick","thin","think","third","this","those","though",
- "three","through","throughout","throw","throws","thru","thu","thus","tm",
- "to","together","too","top","toward","towards","tr","transfer","transient",
- "try","tue","type","ul","un","under","unsubscribe","until","up","upon",
- "us","use","used","uses","using","valign","verdana","very","via","void",
- "volatile","want","was","we","wed","weight","well","were","what","whatever",
- "when","whence","whenever","where","whereafter","whereas","whereby","wherein",
- "whereupon","wherever","whether","which","while","whither","who","whoever",
- "whole","whom","whose","why","width","will","with","within","without",
- "wont","would","wrote","www","yes","yet","you","your","yours","yourself",
- "yourselves"
- ), false);
-
- // Regex used to exclude non-alpha-numeric tokens
- private static final Pattern ALPHA_NUMERIC = Pattern.compile("^[a-z][a-z0-9_]+$");
- private static final Matcher MATCHER = ALPHA_NUMERIC.matcher("");
-
- public MailArchivesClusteringAnalyzer() {
- super(STOP_SET);
- }
-
- public MailArchivesClusteringAnalyzer(CharArraySet stopSet) {
- super(stopSet);
- }
-
- @Override
- protected TokenStreamComponents createComponents(String fieldName) {
- Tokenizer tokenizer = new StandardTokenizer();
- TokenStream result = new StandardFilter(tokenizer);
- result = new LowerCaseFilter(result);
- result = new ASCIIFoldingFilter(result);
- result = new AlphaNumericMaxLengthFilter(result);
- result = new StopFilter(result, STOP_SET);
- result = new PorterStemFilter(result);
- return new TokenStreamComponents(tokenizer, result);
- }
-
- /**
- * Matches alpha-numeric tokens between 2 and 28 chars long.
- */
- static class AlphaNumericMaxLengthFilter extends TokenFilter {
- private final CharTermAttribute termAtt;
- private final char[] output = new char[28];
-
- AlphaNumericMaxLengthFilter(TokenStream in) {
- super(in);
- termAtt = addAttribute(CharTermAttribute.class);
- }
-
- @Override
- public final boolean incrementToken() throws IOException {
- // return the first alpha-numeric token between 2 and 28 chars long
- while (input.incrementToken()) {
- int length = termAtt.length();
- if (length >= 2 && length <= 28) {
- char[] buf = termAtt.buffer();
- int at = 0;
- for (int c = 0; c < length; c++) {
- char ch = buf[c];
- if (ch != '\'') {
- output[at++] = ch;
- }
- }
- String term = new String(output, 0, at);
- MATCHER.reset(term);
- if (MATCHER.matches() && !term.startsWith("a0")) {
- termAtt.setEmpty();
- termAtt.append(term);
- return true;
- }
- }
- }
- return false;
- }
- }
-}
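
A usage sketch of the analyzer above, following the standard Lucene TokenStream consumption pattern; the field name and sample text are arbitrary.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.mahout.text.MailArchivesClusteringAnalyzer;

public class AnalyzerSketch {
  public static void main(String[] args) throws IOException {
    Analyzer analyzer = new MailArchivesClusteringAnalyzer();
    // Stop words ("the", "http", ...) are dropped, apostrophes stripped,
    // and surviving terms are Porter-stemmed.
    try (TokenStream ts = analyzer.tokenStream("body", "The patches don't break http links")) {
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        System.out.println(term.toString());
      }
      ts.end();
    }
    analyzer.close();
  }
}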

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/text/MultipleTextFileInputFormat.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/text/MultipleTextFileInputFormat.java b/integration/src/main/java/org/apache/mahout/text/MultipleTextFileInputFormat.java
deleted file mode 100644
index 44df006..0000000
--- a/integration/src/main/java/org/apache/mahout/text/MultipleTextFileInputFormat.java
+++ /dev/null
@@ -1,46 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.text;
-
-import java.io.IOException;
-
-import org.apache.hadoop.io.BytesWritable;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.mapreduce.InputSplit;
-import org.apache.hadoop.mapreduce.RecordReader;
-import org.apache.hadoop.mapreduce.TaskAttemptContext;
-import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;
-import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader;
-import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
-
-/**
- *
- * Combines a large number of text files into larger combined input splits,
- * reading each file whole via the WholeFileRecordReader class.
- *
- */
-public class MultipleTextFileInputFormat extends CombineFileInputFormat<IntWritable, BytesWritable> {
-
- @Override
- public RecordReader<IntWritable, BytesWritable> createRecordReader(InputSplit inputSplit,
- TaskAttemptContext taskAttemptContext)
- throws IOException {
- return new CombineFileRecordReader<>((CombineFileSplit) inputSplit,
- taskAttemptContext, WholeFileRecordReader.class);
- }
-}
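
A sketch of wiring this format into a Hadoop job. The input path, split cap, and mapper are hypothetical; each record delivered to the mapper is one whole file.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.mahout.text.MultipleTextFileInputFormat;

public class CombineJobSketch {
  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration(), "combine-small-files");
    job.setInputFormatClass(MultipleTextFileInputFormat.class);
    job.setMapOutputKeyClass(IntWritable.class);     // position key
    job.setMapOutputValueClass(BytesWritable.class); // raw file bytes
    FileInputFormat.setInputPaths(job, new Path("input/small-text-files"));
    // Cap each combined split so one mapper handles roughly 64 MB of files.
    FileInputFormat.setMaxInputSplitSize(job, 64L * 1024 * 1024);
    // job.setMapperClass(...); // a Mapper<IntWritable, BytesWritable, ?, ?>
  }
}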

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/text/PrefixAdditionFilter.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/text/PrefixAdditionFilter.java b/integration/src/main/java/org/apache/mahout/text/PrefixAdditionFilter.java
deleted file mode 100644
index 37ebc44..0000000
--- a/integration/src/main/java/org/apache/mahout/text/PrefixAdditionFilter.java
+++ /dev/null
@@ -1,67 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.text;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.mahout.common.iterator.FileLineIterable;
-import org.apache.mahout.utils.io.ChunkedWriter;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.nio.charset.Charset;
-import java.util.Map;
-
-/**
- * Default parser for parsing text into sequence files.
- */
-public final class PrefixAdditionFilter extends SequenceFilesFromDirectoryFilter {
-
- public PrefixAdditionFilter(Configuration conf,
- String keyPrefix,
- Map<String, String> options,
- ChunkedWriter writer,
- Charset charset,
- FileSystem fs) {
- super(conf, keyPrefix, options, writer, charset, fs);
- }
-
- @Override
- protected void process(FileStatus fst, Path current) throws IOException {
- FileSystem fs = getFs();
- ChunkedWriter writer = getWriter();
- if (fst.isDir()) {
- String dirPath = getPrefix() + Path.SEPARATOR + current.getName() + Path.SEPARATOR + fst.getPath().getName();
- fs.listStatus(fst.getPath(),
- new PrefixAdditionFilter(getConf(), dirPath, getOptions(), writer, getCharset(), fs));
- } else {
- try (InputStream in = fs.open(fst.getPath())){
- StringBuilder file = new StringBuilder();
- for (String aFit : new FileLineIterable(in, getCharset(), false)) {
- file.append(aFit).append('\n');
- }
- String name = current.getName().equals(fst.getPath().getName())
- ? current.getName()
- : current.getName() + Path.SEPARATOR + fst.getPath().getName();
- writer.write(getPrefix() + Path.SEPARATOR + name, file.toString());
- }
- }
- }
-}
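
The filter does its work as a side effect of FileSystem.listStatus(), exactly as runSequential() below drives it; a minimal sketch under that assumption, with hypothetical paths and a 64 MB chunk size:

import java.nio.charset.StandardCharsets;
import java.util.HashMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.mahout.text.PrefixAdditionFilter;
import org.apache.mahout.utils.io.ChunkedWriter;

public class PrefixFilterSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path input = new Path("input/docs");     // hypothetical
    Path output = new Path("output/chunks"); // hypothetical
    FileSystem fs = FileSystem.get(input.toUri(), conf);

    // accept() recurses into subdirectories and writes each file's text,
    // keyed by "docs" plus its relative path, into 64 MB SequenceFile chunks.
    try (ChunkedWriter writer = new ChunkedWriter(conf, 64, output)) {
      fs.listStatus(input, new PrefixAdditionFilter(conf, "docs",
          new HashMap<String, String>(), writer, StandardCharsets.UTF_8, fs));
    }
  }
}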

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java b/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java
deleted file mode 100644
index 311ab8d..0000000
--- a/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java
+++ /dev/null
@@ -1,214 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.text;
-
-import java.io.IOException;
-import java.nio.charset.Charset;
-import java.util.HashMap;
-import java.util.Map;
-
-import org.apache.commons.lang3.StringUtils;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.fs.PathFilter;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
-import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
-import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
-import org.apache.hadoop.util.ToolRunner;
-import org.apache.mahout.common.AbstractJob;
-import org.apache.mahout.common.ClassUtils;
-import org.apache.mahout.common.HadoopUtil;
-import org.apache.mahout.common.commandline.DefaultOptionCreator;
-import org.apache.mahout.utils.io.ChunkedWriter;
-
-/**
- * Converts a directory of text documents into SequenceFiles of the specified chunkSize. This class takes in a
- * parent directory containing sub folders of text documents and recursively reads the files and creates the
- * {@link org.apache.hadoop.io.SequenceFile}s of docid => content. The docid is set as the relative path of the
- * document from the parent directory prepended with a specified prefix. You can also specify the input encoding
- * of the text files. The content of the output SequenceFiles are encoded as UTF-8 text.
- */
-public class SequenceFilesFromDirectory extends AbstractJob {
-
- private static final String PREFIX_ADDITION_FILTER = PrefixAdditionFilter.class.getName();
-
- private static final String[] CHUNK_SIZE_OPTION = {"chunkSize", "chunk"};
- public static final String[] FILE_FILTER_CLASS_OPTION = {"fileFilterClass", "filter"};
- private static final String[] CHARSET_OPTION = {"charset", "c"};
-
- private static final int MAX_JOB_SPLIT_LOCATIONS = 1000000;
-
- public static final String[] KEY_PREFIX_OPTION = {"keyPrefix", "prefix"};
- public static final String BASE_INPUT_PATH = "baseinputpath";
-
- public static void main(String[] args) throws Exception {
- ToolRunner.run(new SequenceFilesFromDirectory(), args);
- }
-
- /*
- * callback main after processing MapReduce parameters
- */
- @Override
- public int run(String[] args) throws Exception {
- addOptions();
- addOption(DefaultOptionCreator.methodOption().create());
- addOption(DefaultOptionCreator.overwriteOption().create());
-
- if (parseArguments(args) == null) {
- return -1;
- }
-
- Map<String, String> options = parseOptions();
- Path output = getOutputPath();
- if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
- HadoopUtil.delete(getConf(), output);
- }
-
- if (getOption(DefaultOptionCreator.METHOD_OPTION,
- DefaultOptionCreator.MAPREDUCE_METHOD).equals(DefaultOptionCreator.SEQUENTIAL_METHOD)) {
- runSequential(getConf(), getInputPath(), output, options);
- } else {
- runMapReduce(getInputPath(), output);
- }
-
- return 0;
- }
-
- private int runSequential(Configuration conf, Path input, Path output, Map<String, String> options)
- throws IOException, InterruptedException, NoSuchMethodException {
- // Running sequentially
- Charset charset = Charset.forName(getOption(CHARSET_OPTION[0]));
- String keyPrefix = getOption(KEY_PREFIX_OPTION[0]);
- FileSystem fs = FileSystem.get(input.toUri(), conf);
-
- try (ChunkedWriter writer = new ChunkedWriter(conf, Integer.parseInt(options.get(CHUNK_SIZE_OPTION[0])), output)) {
- SequenceFilesFromDirectoryFilter pathFilter;
- String fileFilterClassName = options.get(FILE_FILTER_CLASS_OPTION[0]);
- if (PrefixAdditionFilter.class.getName().equals(fileFilterClassName)) {
- pathFilter = new PrefixAdditionFilter(conf, keyPrefix, options, writer, charset, fs);
- } else {
- pathFilter = ClassUtils.instantiateAs(fileFilterClassName, SequenceFilesFromDirectoryFilter.class,
- new Class[] {Configuration.class, String.class, Map.class, ChunkedWriter.class, Charset.class, FileSystem.class},
- new Object[] {conf, keyPrefix, options, writer, charset, fs});
- }
- fs.listStatus(input, pathFilter);
- }
- return 0;
- }
-
- private int runMapReduce(Path input, Path output) throws IOException, ClassNotFoundException, InterruptedException {
-
- int chunkSizeInMB = 64;
- if (hasOption(CHUNK_SIZE_OPTION[0])) {
- chunkSizeInMB = Integer.parseInt(getOption(CHUNK_SIZE_OPTION[0]));
- }
-
- String keyPrefix = null;
- if (hasOption(KEY_PREFIX_OPTION[0])) {
- keyPrefix = getOption(KEY_PREFIX_OPTION[0]);
- }
-
- String fileFilterClassName = null;
- if (hasOption(FILE_FILTER_CLASS_OPTION[0])) {
- fileFilterClassName = getOption(FILE_FILTER_CLASS_OPTION[0]);
- }
-
- PathFilter pathFilter = null;
- // Prefix addition is presently handled in the Mapper and, unlike in
- // runSequential(), need not be done via a PathFilter.
- if (!StringUtils.isBlank(fileFilterClassName) && !PrefixAdditionFilter.class.getName().equals(fileFilterClassName)) {
- try {
- pathFilter = (PathFilter) Class.forName(fileFilterClassName).newInstance();
- } catch (InstantiationException | IllegalAccessException e) {
- throw new IllegalStateException(e);
- }
- }
-
- // Prepare Job for submission.
- Job job = prepareJob(input, output, MultipleTextFileInputFormat.class,
- SequenceFilesFromDirectoryMapper.class, Text.class, Text.class,
- SequenceFileOutputFormat.class, "SequenceFilesFromDirectory");
-
- Configuration jobConfig = job.getConfiguration();
- jobConfig.set(KEY_PREFIX_OPTION[0], keyPrefix);
- jobConfig.set(FILE_FILTER_CLASS_OPTION[0], fileFilterClassName);
-
- FileSystem fs = FileSystem.get(jobConfig);
- FileStatus fsFileStatus = fs.getFileStatus(input);
-
- String inputDirList;
- if (pathFilter != null) {
- inputDirList = HadoopUtil.buildDirList(fs, fsFileStatus, pathFilter);
- } else {
- inputDirList = HadoopUtil.buildDirList(fs, fsFileStatus);
- }
-
- jobConfig.set(BASE_INPUT_PATH, input.toString());
-
- long chunkSizeInBytes = chunkSizeInMB * 1024L * 1024; // long arithmetic avoids int overflow
-
- // set the max split locations, otherwise we get nasty debug stuff
- jobConfig.set("mapreduce.job.max.split.locations", String.valueOf(MAX_JOB_SPLIT_LOCATIONS));
-
- FileInputFormat.setInputPaths(job, inputDirList);
- // need to set this to a multiple of the block size, or no split happens
- FileInputFormat.setMaxInputSplitSize(job, chunkSizeInBytes);
- FileOutputFormat.setCompressOutput(job, true);
-
- boolean succeeded = job.waitForCompletion(true);
- if (!succeeded) {
- return -1;
- }
- return 0;
- }
-
- /**
- * Override this method in order to add additional options to the command line of the SequenceFilesFromDirectory job.
- * Do not forget to call super.addOptions(), otherwise the standard options (input/output dirs etc.) will not be available.
- */
- protected void addOptions() {
- addInputOption();
- addOutputOption();
- addOption(DefaultOptionCreator.overwriteOption().create());
- addOption(DefaultOptionCreator.methodOption().create());
- addOption(CHUNK_SIZE_OPTION[0], CHUNK_SIZE_OPTION[1], "The chunkSize in megabytes. Defaults to 64", "64");
- addOption(FILE_FILTER_CLASS_OPTION[0], FILE_FILTER_CLASS_OPTION[1],
- "The name of the class to use for file parsing. Default: " + PREFIX_ADDITION_FILTER, PREFIX_ADDITION_FILTER);
- addOption(KEY_PREFIX_OPTION[0], KEY_PREFIX_OPTION[1], "The prefix to be prepended to the key", "");
- addOption(CHARSET_OPTION[0], CHARSET_OPTION[1],
- "The name of the character encoding of the input files. Defaults to UTF-8", "UTF-8");
- }
-
- /**
- * Override this method in order to parse your additional options from the command line. Do not forget to call
- * super.parseOptions(), otherwise the standard options (input/output dirs etc.) will not be available.
- *
- * @return Map of options
- */
- protected Map<String, String> parseOptions() {
- Map<String, String> options = new HashMap<>();
- options.put(CHUNK_SIZE_OPTION[0], getOption(CHUNK_SIZE_OPTION[0]));
- options.put(FILE_FILTER_CLASS_OPTION[0], getOption(FILE_FILTER_CLASS_OPTION[0]));
- options.put(CHARSET_OPTION[0], getOption(CHARSET_OPTION[0]));
- return options;
- }
-}
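
A sketch of invoking the job above through ToolRunner. The flag spellings follow the options registered in addOptions() plus AbstractJob's standard input/output and method flags; treat the exact argument strings as assumptions, and the paths as placeholders.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.text.SequenceFilesFromDirectory;

public class SeqDirSketch {
  public static void main(String[] args) throws Exception {
    ToolRunner.run(new Configuration(), new SequenceFilesFromDirectory(), new String[] {
        "-i", "input/docs",       // parent directory of text documents
        "-o", "output/seqfiles",  // SequenceFile output directory
        "-c", "UTF-8",            // input encoding
        "-chunk", "64",           // chunk size in MB
        "-xm", "sequential"       // run without a MapReduce cluster
    });
  }
}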

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectoryFilter.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectoryFilter.java b/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectoryFilter.java
deleted file mode 100644
index 6e4bd64..0000000
--- a/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectoryFilter.java
+++ /dev/null
@@ -1,99 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.text;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.fs.PathFilter;
-import org.apache.mahout.utils.io.ChunkedWriter;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.IOException;
-import java.nio.charset.Charset;
-import java.util.Map;
-
-/**
- * Extend this class if you wish to augment SequenceFilesFromDirectory with your own parsing logic.
- */
-public abstract class SequenceFilesFromDirectoryFilter implements PathFilter {
- private static final Logger log = LoggerFactory.getLogger(SequenceFilesFromDirectoryFilter.class);
-
- private final String prefix;
- private final ChunkedWriter writer;
- private final Charset charset;
- private final FileSystem fs;
- private final Map<String, String> options;
- private final Configuration conf;
-
- protected SequenceFilesFromDirectoryFilter(Configuration conf,
- String keyPrefix,
- Map<String, String> options,
- ChunkedWriter writer,
- Charset charset,
- FileSystem fs) {
- this.prefix = keyPrefix;
- this.writer = writer;
- this.charset = charset;
- this.fs = fs;
- this.options = options;
- this.conf = conf;
- }
-
- protected final String getPrefix() {
- return prefix;
- }
-
- protected final ChunkedWriter getWriter() {
- return writer;
- }
-
- protected final Charset getCharset() {
- return charset;
- }
-
- protected final FileSystem getFs() {
- return fs;
- }
-
- protected final Map<String, String> getOptions() {
- return options;
- }
-
- protected final Configuration getConf() {
- return conf;
- }
-
- @Override
- public final boolean accept(Path current) {
- log.debug("CURRENT: {}", current.getName());
- try {
- for (FileStatus fst : fs.listStatus(current)) {
- log.debug("CHILD: {}", fst.getPath().getName());
- process(fst, current);
- }
- } catch (IOException ioe) {
- throw new IllegalStateException(ioe);
- }
- return false;
- }
-
- protected abstract void process(FileStatus in, Path current) throws IOException;
-}

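Since SequenceFilesFromDirectoryFilter is an abstract class, a custom parser is written by extending it and implementing process(). A hedged sketch follows; it mirrors the recursion pattern of the default PrefixAdditionFilter and assumes FileLineIterable(InputStream, Charset, boolean) and ChunkedWriter.write(String, String), which that filter uses. The skip-empty-files rule is illustrative only.

package org.apache.mahout.text;

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.mahout.common.iterator.FileLineIterable;
import org.apache.mahout.utils.io.ChunkedWriter;

// Illustrative filter: recurses into subdirectories and skips empty files.
public class NonEmptyFileFilter extends SequenceFilesFromDirectoryFilter {

  public NonEmptyFileFilter(Configuration conf, String keyPrefix, Map<String, String> options,
                            ChunkedWriter writer, Charset charset, FileSystem fs) {
    super(conf, keyPrefix, options, writer, charset, fs);
  }

  @Override
  protected void process(FileStatus fst, Path current) throws IOException {
    if (fst.isDir()) {
      // recurse: listStatus() calls accept() on the new filter,
      // which in turn processes the subdirectory's children
      getFs().listStatus(fst.getPath(),
          new NonEmptyFileFilter(getConf(), getPrefix() + Path.SEPARATOR + fst.getPath().getName(),
              getOptions(), getWriter(), getCharset(), getFs()));
    } else if (fst.getLen() > 0) { // skip empty files (illustrative rule)
      StringBuilder contents = new StringBuilder();
      try (InputStream in = getFs().open(fst.getPath())) {
        for (String line : new FileLineIterable(in, getCharset(), false)) {
          contents.append(line).append('\n');
        }
      }
      getWriter().write(getPrefix() + Path.SEPARATOR + fst.getPath().getName(), contents.toString());
    }
  }
}
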
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectoryMapper.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectoryMapper.java b/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectoryMapper.java
deleted file mode 100644
index 40df3c2..0000000
--- a/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectoryMapper.java
+++ /dev/null
@@ -1,61 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.text;
-
-import java.io.IOException;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.BytesWritable;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Mapper;
-import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
-import org.apache.mahout.common.HadoopUtil;
-
-import static org.apache.mahout.text.SequenceFilesFromDirectory.KEY_PREFIX_OPTION;
-
-/**
- * Map class for SequenceFilesFromDirectory MR job
- */
-public class SequenceFilesFromDirectoryMapper extends Mapper<IntWritable, BytesWritable, Text, Text> {
-
- private String keyPrefix;
- private Text fileValue = new Text();
-
- @Override
- protected void setup(Context context) throws IOException, InterruptedException {
- super.setup(context);
- this.keyPrefix = context.getConfiguration().get(KEY_PREFIX_OPTION[0], "");
- }
-
- public void map(IntWritable key, BytesWritable value, Context context)
- throws IOException, InterruptedException {
-
- Configuration configuration = context.getConfiguration();
- Path filePath = ((CombineFileSplit) context.getInputSplit()).getPath(key.get());
- String relativeFilePath = HadoopUtil.calcRelativeFilePath(configuration, filePath);
-
- String filename = this.keyPrefix.length() > 0 ?
- this.keyPrefix + Path.SEPARATOR + relativeFilePath :
- Path.SEPARATOR + relativeFilePath;
-
- fileValue.set(value.getBytes(), 0, value.getBytes().length);
- context.write(new Text(filename), fileValue);
- }
-}

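The map() above derives each output key from the configured keyPrefix and the file's path relative to the input root. A tiny stand-alone rerun of that ternary, with illustrative values, shows the resulting key layout:

// Illustrative check of the key layout produced by the mapper above.
public class KeyLayoutDemo {
  public static void main(String[] args) {
    String sep = "/"; // Path.SEPARATOR on Hadoop
    String relativeFilePath = "a/b.txt"; // illustrative relative path
    for (String keyPrefix : new String[] {"docs", ""}) {
      String filename = keyPrefix.length() > 0
          ? keyPrefix + sep + relativeFilePath
          : sep + relativeFilePath;
      // prints: prefix='docs' -> key 'docs/a/b.txt'
      //         prefix=''     -> key '/a/b.txt'
      System.out.println("prefix='" + keyPrefix + "' -> key '" + filename + "'");
    }
  }
}
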
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromMailArchives.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromMailArchives.java b/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromMailArchives.java
deleted file mode 100644
index c17cc12..0000000
--- a/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromMailArchives.java
+++ /dev/null
@@ -1,369 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.mahout.text;
-
-import org.apache.commons.io.DirectoryWalker;
-import org.apache.commons.io.comparator.CompositeFileComparator;
-import org.apache.commons.io.comparator.DirectoryFileComparator;
-import org.apache.commons.io.comparator.PathFileComparator;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
-import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
-import org.apache.hadoop.util.ToolRunner;
-import org.apache.mahout.common.AbstractJob;
-import org.apache.mahout.common.HadoopUtil;
-import org.apache.mahout.common.commandline.DefaultOptionCreator;
-import org.apache.mahout.utils.email.MailOptions;
-import org.apache.mahout.utils.email.MailProcessor;
-import org.apache.mahout.utils.io.ChunkedWriter;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.File;
-import java.io.IOException;
-import java.nio.charset.Charset;
-import java.util.ArrayDeque;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.Comparator;
-import java.util.Deque;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.regex.Pattern;
-
-/**
- * Converts a directory of gzipped mail archives into SequenceFiles of specified
- * chunkSize. This class is similar to {@link SequenceFilesFromDirectory} except
- * it uses block-compressed {@link org.apache.hadoop.io.SequenceFile}s and parses out the subject and
- * body text of each mail message into a separate key/value pair.
- */
-public final class SequenceFilesFromMailArchives extends AbstractJob {
-
- private static final Logger log = LoggerFactory.getLogger(SequenceFilesFromMailArchives.class);
-
- public static final String[] CHUNK_SIZE_OPTION = {"chunkSize", "chunk"};
- public static final String[] KEY_PREFIX_OPTION = {"keyPrefix", "prefix"};
- public static final String[] CHARSET_OPTION = {"charset", "c"};
- public static final String[] SUBJECT_OPTION = {"subject", "s"};
- public static final String[] TO_OPTION = {"to", "to"};
- public static final String[] FROM_OPTION = {"from", "from"};
- public static final String[] REFERENCES_OPTION = {"references", "refs"};
- public static final String[] BODY_OPTION = {"body", "b"};
- public static final String[] STRIP_QUOTED_OPTION = {"stripQuoted", "q"};
- public static final String[] QUOTED_REGEX_OPTION = {"quotedRegex", "regex"};
- public static final String[] SEPARATOR_OPTION = {"separator", "sep"};
- public static final String[] BODY_SEPARATOR_OPTION = {"bodySeparator", "bodySep"};
- public static final String BASE_INPUT_PATH = "baseinputpath";
-
- private static final int MAX_JOB_SPLIT_LOCATIONS = 1000000;
-
- public void createSequenceFiles(MailOptions options) throws IOException {
- try (ChunkedWriter writer =
- new ChunkedWriter(getConf(), options.getChunkSize(), new Path(options.getOutputDir()))){
- MailProcessor processor = new MailProcessor(options, options.getPrefix(), writer);
- if (options.getInput().isDirectory()) {
- PrefixAdditionDirectoryWalker walker = new PrefixAdditionDirectoryWalker(processor, writer);
- walker.walk(options.getInput());
- log.info("Parsed {} messages from {}", walker.getMessageCount(), options.getInput().getAbsolutePath());
- } else {
- long start = System.currentTimeMillis();
- long cnt = processor.parseMboxLineByLine(options.getInput());
- long finish = System.currentTimeMillis();
- log.info("Parsed {} messages from {} in time: {}", cnt, options.getInput().getAbsolutePath(), finish - start);
- }
- }
- }
-
- private static class PrefixAdditionDirectoryWalker extends DirectoryWalker<Object> {
-
- @SuppressWarnings("unchecked")
- private static final Comparator<File> FILE_COMPARATOR = new CompositeFileComparator(
- DirectoryFileComparator.DIRECTORY_REVERSE, PathFileComparator.PATH_COMPARATOR);
-
- private final Deque<MailProcessor> processors = new ArrayDeque<>();
- private final ChunkedWriter writer;
- private final Deque<Long> messageCounts = new ArrayDeque<>();
-
- public PrefixAdditionDirectoryWalker(MailProcessor processor, ChunkedWriter writer) {
- processors.addFirst(processor);
- this.writer = writer;
- messageCounts.addFirst(0L);
- }
-
- public void walk(File startDirectory) throws IOException {
- super.walk(startDirectory, null);
- }
-
- public long getMessageCount() {
- return messageCounts.getFirst();
- }
-
- @Override
- protected void handleDirectoryStart(File current, int depth, Collection<Object> results) throws IOException {
- if (depth > 0) {
- log.info("At {}", current.getAbsolutePath());
- MailProcessor processor = processors.getFirst();
- MailProcessor subDirProcessor = new MailProcessor(processor.getOptions(), processor.getPrefix()
- + File.separator + current.getName(), writer);
- processors.push(subDirProcessor);
- messageCounts.push(0L);
- }
- }
-
- @Override
- protected File[] filterDirectoryContents(File directory, int depth, File[] files) throws IOException {
- Arrays.sort(files, FILE_COMPARATOR);
- return files;
- }
-
- @Override
- protected void handleFile(File current, int depth, Collection<Object> results) throws IOException {
- MailProcessor processor = processors.getFirst();
- long currentDirMessageCount = messageCounts.pop();
- try {
- currentDirMessageCount += processor.parseMboxLineByLine(current);
- } catch (IOException e) {
- throw new IllegalStateException("Error processing " + current, e);
- }
- messageCounts.push(currentDirMessageCount);
- }
-
- @Override
- protected void handleDirectoryEnd(File current, int depth, Collection<Object> results) throws IOException {
- if (depth > 0) {
- final long currentDirMessageCount = messageCounts.pop();
- log.info("Parsed {} messages from directory {}", currentDirMessageCount, current.getAbsolutePath());
-
- processors.pop();
-
- // aggregate message counts
- long parentDirMessageCount = messageCounts.pop();
- parentDirMessageCount += currentDirMessageCount;
- messageCounts.push(parentDirMessageCount);
- }
- }
- }
-
- public static void main(String[] args) throws Exception {
- ToolRunner.run(new Configuration(), new SequenceFilesFromMailArchives(), args);
- }
-
- @Override
- public int run(String[] args) throws Exception {
- addInputOption();
- addOutputOption();
- addOption(DefaultOptionCreator.methodOption().create());
-
- addOption(CHUNK_SIZE_OPTION[0], CHUNK_SIZE_OPTION[1], "The chunkSize in MegaBytes. Defaults to 64", "64");
- addOption(KEY_PREFIX_OPTION[0], KEY_PREFIX_OPTION[1], "The prefix to be prepended to the key", "");
- addOption(CHARSET_OPTION[0], CHARSET_OPTION[1],
- "The name of the character encoding of the input files. Default to UTF-8", "UTF-8");
- addFlag(SUBJECT_OPTION[0], SUBJECT_OPTION[1], "Include the Mail subject as part of the text. Default is false");
- addFlag(TO_OPTION[0], TO_OPTION[1], "Include the to field in the text. Default is false");
- addFlag(FROM_OPTION[0], FROM_OPTION[1], "Include the from field in the text. Default is false");
- addFlag(REFERENCES_OPTION[0], REFERENCES_OPTION[1],
- "Include the references field in the text. Default is false");
- addFlag(BODY_OPTION[0], BODY_OPTION[1], "Include the body in the output. Default is false");
- addFlag(STRIP_QUOTED_OPTION[0], STRIP_QUOTED_OPTION[1],
- "Strip (remove) quoted email text in the body. Default is false");
- addOption(QUOTED_REGEX_OPTION[0], QUOTED_REGEX_OPTION[1],
- "Specify the regex that identifies quoted text. "
- + "Default is to look for > or | at the beginning of the line.");
- addOption(SEPARATOR_OPTION[0], SEPARATOR_OPTION[1],
- "The separator to use between metadata items (to, from, etc.). Default is \\n", "\n");
- addOption(BODY_SEPARATOR_OPTION[0], BODY_SEPARATOR_OPTION[1],
- "The separator to use between lines in the body. Default is \\n. "
- + "Useful to change if you wish to have the message be on one line", "\n");
-
- addOption(DefaultOptionCreator.helpOption());
- Map<String, List<String>> parsedArgs = parseArguments(args);
- if (parsedArgs == null) {
- return -1;
- }
- File input = getInputFile();
- String outputDir = getOutputPath().toString();
-
- int chunkSize = 64;
- if (hasOption(CHUNK_SIZE_OPTION[0])) {
- chunkSize = Integer.parseInt(getOption(CHUNK_SIZE_OPTION[0]));
- }
-
- String prefix = "";
- if (hasOption(KEY_PREFIX_OPTION[0])) {
- prefix = getOption(KEY_PREFIX_OPTION[0]);
- }
-
- Charset charset = Charset.forName(getOption(CHARSET_OPTION[0]));
- MailOptions options = new MailOptions();
- options.setInput(input);
- options.setOutputDir(outputDir);
- options.setPrefix(prefix);
- options.setChunkSize(chunkSize);
- options.setCharset(charset);
-
- List<Pattern> patterns = new ArrayList<>(5);
- // patternOrder is used downstream so that we can know what order the text
- // is in, instead of encoding it in the string, which would require more
- // processing later to remove it prior to feature selection.
- Map<String, Integer> patternOrder = new HashMap<>();
- int order = 0;
- if (hasOption(FROM_OPTION[0])) {
- patterns.add(MailProcessor.FROM_PREFIX);
- patternOrder.put(MailOptions.FROM, order++);
- }
- if (hasOption(TO_OPTION[0])) {
- patterns.add(MailProcessor.TO_PREFIX);
- patternOrder.put(MailOptions.TO, order++);
- }
- if (hasOption(REFERENCES_OPTION[0])) {
- patterns.add(MailProcessor.REFS_PREFIX);
- patternOrder.put(MailOptions.REFS, order++);
- }
- if (hasOption(SUBJECT_OPTION[0])) {
- patterns.add(MailProcessor.SUBJECT_PREFIX);
- patternOrder.put(MailOptions.SUBJECT, order += 1);
- }
- options.setStripQuotedText(hasOption(STRIP_QUOTED_OPTION[0]));
-
- options.setPatternsToMatch(patterns.toArray(new Pattern[patterns.size()]));
- options.setPatternOrder(patternOrder);
- options.setIncludeBody(hasOption(BODY_OPTION[0]));
-
- if (hasOption(SEPARATOR_OPTION[0])) {
- options.setSeparator(getOption(SEPARATOR_OPTION[0]));
- } else {
- options.setSeparator("\n");
- }
-
- if (hasOption(BODY_SEPARATOR_OPTION[0])) {
- options.setBodySeparator(getOption(BODY_SEPARATOR_OPTION[0]));
- }
-
- if (hasOption(QUOTED_REGEX_OPTION[0])) {
- options.setQuotedTextPattern(Pattern.compile(getOption(QUOTED_REGEX_OPTION[0])));
- }
-
- if (getOption(DefaultOptionCreator.METHOD_OPTION,
- DefaultOptionCreator.MAPREDUCE_METHOD).equals(DefaultOptionCreator.SEQUENTIAL_METHOD)) {
- runSequential(options);
- } else {
- runMapReduce(getInputPath(), getOutputPath());
- }
-
- return 0;
- }
-
- private int runSequential(MailOptions options)
- throws IOException, InterruptedException, NoSuchMethodException {
-
- long start = System.currentTimeMillis();
- createSequenceFiles(options);
- long finish = System.currentTimeMillis();
- log.info("Conversion took {}ms", finish - start);
-
- return 0;
- }
-
- private int runMapReduce(Path input, Path output) throws IOException, InterruptedException, ClassNotFoundException {
-
- Job job = prepareJob(input, output, MultipleTextFileInputFormat.class, SequenceFilesFromMailArchivesMapper.class,
- Text.class, Text.class, SequenceFileOutputFormat.class, "SequenceFilesFromMailArchives");
-
- Configuration jobConfig = job.getConfiguration();
-
- if (hasOption(KEY_PREFIX_OPTION[0])) {
- jobConfig.set(KEY_PREFIX_OPTION[1], getOption(KEY_PREFIX_OPTION[0]));
- }
-
- int chunkSize = 0;
- if (hasOption(CHUNK_SIZE_OPTION[0])) {
- chunkSize = Integer.parseInt(getOption(CHUNK_SIZE_OPTION[0]));
- jobConfig.set(CHUNK_SIZE_OPTION[0], String.valueOf(chunkSize));
- }
-
- Charset charset;
- if (hasOption(CHARSET_OPTION[0])) {
- charset = Charset.forName(getOption(CHARSET_OPTION[0]));
- jobConfig.set(CHARSET_OPTION[0], charset.displayName());
- }
-
- if (hasOption(FROM_OPTION[0])) {
- jobConfig.set(FROM_OPTION[1], "true");
- }
-
- if (hasOption(TO_OPTION[0])) {
- jobConfig.set(TO_OPTION[1], "true");
- }
-
- if (hasOption(REFERENCES_OPTION[0])) {
- jobConfig.set(REFERENCES_OPTION[1], "true");
- }
-
- if (hasOption(SUBJECT_OPTION[0])) {
- jobConfig.set(SUBJECT_OPTION[1], "true");
- }
-
- if (hasOption(QUOTED_REGEX_OPTION[0])) {
- jobConfig.set(QUOTED_REGEX_OPTION[1], Pattern.compile(getOption(QUOTED_REGEX_OPTION[0])).toString());
- }
-
- if (hasOption(SEPARATOR_OPTION[0])) {
- jobConfig.set(SEPARATOR_OPTION[1], getOption(SEPARATOR_OPTION[0]));
- } else {
- jobConfig.set(SEPARATOR_OPTION[1], "\n");
- }
-
- if (hasOption(BODY_OPTION[0])) {
- jobConfig.set(BODY_OPTION[1], "true");
- } else {
- jobConfig.set(BODY_OPTION[1], "false");
- }
-
- if (hasOption(BODY_SEPARATOR_OPTION[0])) {
- jobConfig.set(BODY_SEPARATOR_OPTION[1], getOption(BODY_SEPARATOR_OPTION[0]));
- } else {
- jobConfig.set(BODY_SEPARATOR_OPTION[1], "\n");
- }
-
- FileSystem fs = FileSystem.get(jobConfig);
- FileStatus fsFileStatus = fs.getFileStatus(inputPath);
-
- jobConfig.set(BASE_INPUT_PATH, inputPath.toString());
- String inputDirList = HadoopUtil.buildDirList(fs, fsFileStatus);
- FileInputFormat.setInputPaths(job, inputDirList);
-
- long chunkSizeInBytes = chunkSize * 1024 * 1024;
- // need to set this to a multiple of the block size, or no split happens
- FileInputFormat.setMaxInputSplitSize(job, chunkSizeInBytes);
-
- // set the max split locations, otherwise we get nasty debug stuff
- jobConfig.set("mapreduce.job.max.split.locations", String.valueOf(MAX_JOB_SPLIT_LOCATIONS));
-
- boolean succeeded = job.waitForCompletion(true);
- if (!succeeded) {
- return -1;
- }
- return 0;
- }
-}

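For reference, the job above is typically driven through its main()/ToolRunner entry point using the options registered in run(). A hedged usage sketch, with illustrative paths; the option names are the long forms defined above:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.text.SequenceFilesFromMailArchives;

public class MailArchivesDemo {
  public static void main(String[] args) throws Exception {
    ToolRunner.run(new Configuration(), new SequenceFilesFromMailArchives(), new String[] {
        "--input", "/data/asf-mail",   // illustrative: directory of gzipped mbox archives
        "--output", "/data/mail-seq",  // illustrative output path
        "--subject", "--body",         // include the subject line and the message body
        "--chunkSize", "64",           // 64 MB chunks (the default)
        "--method", "sequential"       // "sequential" or "mapreduce"
    });
  }
}
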
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromMailArchivesMapper.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromMailArchivesMapper.java b/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromMailArchivesMapper.java
deleted file mode 100644
index 203e8fb..0000000
--- a/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromMailArchivesMapper.java
+++ /dev/null
@@ -1,244 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.text;
-
-import com.google.common.base.Joiner;
-import com.google.common.collect.Lists;
-import com.google.common.collect.Maps;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.BytesWritable;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Mapper;
-import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
-import org.apache.mahout.common.HadoopUtil;
-import org.apache.mahout.common.iterator.FileLineIterable;
-import org.apache.mahout.utils.email.MailOptions;
-import org.apache.mahout.utils.email.MailProcessor;
-
-import java.io.ByteArrayInputStream;
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.io.InputStream;
-import java.nio.charset.Charset;
-import java.util.Arrays;
-import java.util.List;
-import java.util.Map;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import static org.apache.mahout.text.SequenceFilesFromMailArchives.BODY_OPTION;
-import static org.apache.mahout.text.SequenceFilesFromMailArchives.BODY_SEPARATOR_OPTION;
-import static org.apache.mahout.text.SequenceFilesFromMailArchives.CHARSET_OPTION;
-import static org.apache.mahout.text.SequenceFilesFromMailArchives.CHUNK_SIZE_OPTION;
-import static org.apache.mahout.text.SequenceFilesFromMailArchives.FROM_OPTION;
-import static org.apache.mahout.text.SequenceFilesFromMailArchives.KEY_PREFIX_OPTION;
-import static org.apache.mahout.text.SequenceFilesFromMailArchives.QUOTED_REGEX_OPTION;
-import static org.apache.mahout.text.SequenceFilesFromMailArchives.REFERENCES_OPTION;
-import static org.apache.mahout.text.SequenceFilesFromMailArchives.SEPARATOR_OPTION;
-import static org.apache.mahout.text.SequenceFilesFromMailArchives.STRIP_QUOTED_OPTION;
-import static org.apache.mahout.text.SequenceFilesFromMailArchives.SUBJECT_OPTION;
-import static org.apache.mahout.text.SequenceFilesFromMailArchives.TO_OPTION;
-
-/**
- * Map Class for the SequenceFilesFromMailArchives job
- */
-public class SequenceFilesFromMailArchivesMapper extends Mapper<IntWritable, BytesWritable, Text, Text> {
-
- private Text outKey = new Text();
- private Text outValue = new Text();
-
- private static final Pattern MESSAGE_START = Pattern.compile(
- "^From \\S+@\\S.*\\d{4}$", Pattern.CASE_INSENSITIVE);
- private static final Pattern MESSAGE_ID_PREFIX = Pattern.compile(
- "^message-id: <(.*)>$", Pattern.CASE_INSENSITIVE);
-
- private MailOptions options;
-
- @Override
- public void setup(Context context) throws IOException, InterruptedException {
-
- Configuration configuration = context.getConfiguration();
-
- // absorb all of the options into the MailOptions object
- this.options = new MailOptions();
-
- options.setPrefix(configuration.get(KEY_PREFIX_OPTION[1], ""));
-
- if (!configuration.get(CHUNK_SIZE_OPTION[0], "").equals("")) {
- options.setChunkSize(configuration.getInt(CHUNK_SIZE_OPTION[0], 64));
- }
-
- if (!configuration.get(CHARSET_OPTION[0], "").equals("")) {
- Charset charset = Charset.forName(configuration.get(CHARSET_OPTION[0], "UTF-8"));
- options.setCharset(charset);
- } else {
- Charset charset = Charset.forName("UTF-8");
- options.setCharset(charset);
- }
-
- List<Pattern> patterns = Lists.newArrayListWithCapacity(5);
- // patternOrder is used downstream so that we can know what order the
- // text is in, instead of encoding it in the string, which would require
- // more processing later to remove it prior to feature selection.
- Map<String, Integer> patternOrder = Maps.newHashMap();
- int order = 0;
- if (!configuration.get(FROM_OPTION[1], "").equals("")) {
- patterns.add(MailProcessor.FROM_PREFIX);
- patternOrder.put(MailOptions.FROM, order++);
- }
-
- if (!configuration.get(TO_OPTION[1], "").equals("")) {
- patterns.add(MailProcessor.TO_PREFIX);
- patternOrder.put(MailOptions.TO, order++);
- }
-
- if (!configuration.get(REFERENCES_OPTION[1], "").equals("")) {
- patterns.add(MailProcessor.REFS_PREFIX);
- patternOrder.put(MailOptions.REFS, order++);
- }
-
- if (!configuration.get(SUBJECT_OPTION[1], "").equals("")) {
- patterns.add(MailProcessor.SUBJECT_PREFIX);
- patternOrder.put(MailOptions.SUBJECT, order += 1);
- }
-
- options.setStripQuotedText(configuration.getBoolean(STRIP_QUOTED_OPTION[1], false));
-
- options.setPatternsToMatch(patterns.toArray(new Pattern[patterns.size()]));
- options.setPatternOrder(patternOrder);
-
- options.setIncludeBody(configuration.getBoolean(BODY_OPTION[1], false));
-
- options.setSeparator("\n");
- if (!configuration.get(SEPARATOR_OPTION[1], "").equals("")) {
- options.setSeparator(configuration.get(SEPARATOR_OPTION[1], ""));
- }
- if (!configuration.get(BODY_SEPARATOR_OPTION[1], "").equals("")) {
- options.setBodySeparator(configuration.get(BODY_SEPARATOR_OPTION[1], ""));
- }
- if (!configuration.get(QUOTED_REGEX_OPTION[1], "").equals("")) {
- options.setQuotedTextPattern(Pattern.compile(configuration.get(QUOTED_REGEX_OPTION[1], "")));
- }
-
- }
-
- public long parseMailboxLineByLine(String filename, InputStream mailBoxInputStream, Context context)
- throws IOException, InterruptedException {
- long messageCount = 0;
- try {
- StringBuilder contents = new StringBuilder();
- StringBuilder body = new StringBuilder();
- Matcher messageIdMatcher = MESSAGE_ID_PREFIX.matcher("");
- Matcher messageBoundaryMatcher = MESSAGE_START.matcher("");
- String[] patternResults = new String[options.getPatternsToMatch().length];
- Matcher[] matches = new Matcher[options.getPatternsToMatch().length];
- for (int i = 0; i < matches.length; i++) {
- matches[i] = options.getPatternsToMatch()[i].matcher("");
- }
-
- String messageId = null;
- boolean inBody = false;
- Pattern quotedTextPattern = options.getQuotedTextPattern();
-
- for (String nextLine : new FileLineIterable(mailBoxInputStream, options.getCharset(), false, filename)) {
- if (!options.isStripQuotedText() || !quotedTextPattern.matcher(nextLine).find()) {
- for (int i = 0; i < matches.length; i++) {
- Matcher matcher = matches[i];
- matcher.reset(nextLine);
- if (matcher.matches()) {
- patternResults[i] = matcher.group(1);
- }
- }
-
- // only start appending body content after we've seen a message ID
- if (messageId != null) {
- // first, see if we hit the end of the message
- messageBoundaryMatcher.reset(nextLine);
- if (messageBoundaryMatcher.matches()) {
- // done parsing this message ... write it out
- String key = generateKey(filename, options.getPrefix(), messageId);
- // if this ordering changes, then also change
- // FromEmailToDictionaryMapper
- writeContent(options.getSeparator(), contents, body, patternResults);
-
- this.outKey.set(key);
- this.outValue.set(contents.toString());
- context.write(this.outKey, this.outValue);
- contents.setLength(0); // reset the buffer
- body.setLength(0);
- messageId = null;
- inBody = false;
- } else {
- if (inBody && options.isIncludeBody()) {
- if (!nextLine.isEmpty()) {
- body.append(nextLine).append(options.getBodySeparator());
- }
- } else {
- // first empty line we see after reading the message Id
- // indicates that we are in the body ...
- inBody = nextLine.isEmpty();
- }
- }
- } else {
- if (nextLine.length() > 14) {
- messageIdMatcher.reset(nextLine);
- if (messageIdMatcher.matches()) {
- messageId = messageIdMatcher.group(1);
- ++messageCount;
- }
- }
- }
- }
- }
- // write the last message in the file if available
- if (messageId != null) {
- String key = generateKey(filename, options.getPrefix(), messageId);
- writeContent(options.getSeparator(), contents, body, patternResults);
- this.outKey.set(key);
- this.outValue.set(contents.toString());
- context.write(this.outKey, this.outValue);
- contents.setLength(0); // reset the buffer
- }
- } catch (FileNotFoundException ignored) {
-
- }
- return messageCount;
- }
-
- protected static String generateKey(String mboxFilename, String prefix, String messageId) {
- return Joiner.on(Path.SEPARATOR).join(Lists.newArrayList(prefix, mboxFilename, messageId).iterator());
- }
-
- private static void writeContent(String separator, StringBuilder contents, CharSequence body, String[] matches) {
- String matchesString = Joiner.on(separator).useForNull("").join(Arrays.asList(matches).iterator());
- contents.append(matchesString).append(separator).append(body);
- }
-
- public void map(IntWritable key, BytesWritable value, Context context)
- throws IOException, InterruptedException {
- Configuration configuration = context.getConfiguration();
- Path filePath = ((CombineFileSplit) context.getInputSplit()).getPath(key.get());
- String relativeFilePath = HadoopUtil.calcRelativeFilePath(configuration, filePath);
- ByteArrayInputStream is = new ByteArrayInputStream(value.getBytes());
- parseMailboxLineByLine(relativeFilePath, is, context);
- }
-}

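The two patterns at the top of this mapper carry most of the mbox parsing logic: MESSAGE_START matches the "From " envelope line that begins each message, and MESSAGE_ID_PREFIX captures the Message-ID header. A self-contained check against made-up sample lines:

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class MboxPatternDemo {
  private static final Pattern MESSAGE_START =
      Pattern.compile("^From \\S+@\\S.*\\d{4}$", Pattern.CASE_INSENSITIVE);
  private static final Pattern MESSAGE_ID_PREFIX =
      Pattern.compile("^message-id: <(.*)>$", Pattern.CASE_INSENSITIVE);

  public static void main(String[] args) {
    // envelope line ending in a four-digit year: matches -> true
    System.out.println(MESSAGE_START.matcher(
        "From dev@mahout.apache.org Wed Jun 27 14:51:29 2018").matches());
    // header line: group(1) captures the bare message id
    Matcher m = MESSAGE_ID_PREFIX.matcher("Message-ID: <abc123@example.org>");
    if (m.matches()) {
      System.out.println(m.group(1)); // abc123@example.org
    }
  }
}
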
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/text/TextParagraphSplittingJob.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/text/TextParagraphSplittingJob.java b/integration/src/main/java/org/apache/mahout/text/TextParagraphSplittingJob.java
deleted file mode 100644
index cacfd22..0000000
--- a/integration/src/main/java/org/apache/mahout/text/TextParagraphSplittingJob.java
+++ /dev/null
@@ -1,73 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.text;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.Mapper;
-import org.apache.hadoop.mapreduce.Reducer;
-import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
-import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
-import org.apache.hadoop.util.ToolRunner;
-import org.apache.mahout.common.AbstractJob;
-
-import java.io.IOException;
-
-public class TextParagraphSplittingJob extends AbstractJob {
-
- @Override
- public int run(String[] strings) throws Exception {
- Configuration originalConf = getConf();
- Job job = prepareJob(new Path(originalConf.get("mapred.input.dir")),
- new Path(originalConf.get("mapred.output.dir")),
- SequenceFileInputFormat.class,
- SplitMap.class,
- Text.class,
- Text.class,
- Reducer.class,
- Text.class,
- Text.class,
- SequenceFileOutputFormat.class);
- job.setNumReduceTasks(0);
- boolean succeeded = job.waitForCompletion(true);
- return succeeded ? 0 : -1;
- }
-
- public static class SplitMap extends Mapper<Text,Text,Text,Text> {
-
- @Override
- protected void map(Text key, Text text, Context context) throws IOException, InterruptedException {
- Text outText = new Text();
- int loc = 0;
- while (loc >= 0 && loc < text.getLength()) {
- int nextLoc = text.find("\n\n", loc + 1);
- if (nextLoc > 0) {
- outText.set(text.getBytes(), loc, nextLoc - loc);
- context.write(key, outText);
- }
- loc = nextLoc;
- }
- }
- }
-
- public static void main(String[] args) throws Exception {
- ToolRunner.run(new TextParagraphSplittingJob(), args);
- }
-}

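A plain-Java rerun of the SplitMap loop above (String.indexOf standing in for Text.find) makes its behavior concrete. Two details are worth knowing before reusing the logic: paragraphs after the first keep their leading "\n\n", and any text after the final "\n\n" is never emitted, because the loop exits once nextLoc goes negative.

public class ParagraphSplitDemo {
  public static void main(String[] args) {
    String text = "first para\n\nsecond para\n\nlast para";
    int loc = 0;
    while (loc >= 0 && loc < text.length()) {
      int nextLoc = text.indexOf("\n\n", loc + 1);
      if (nextLoc > 0) {
        System.out.println("[" + text.substring(loc, nextLoc) + "]");
      }
      loc = nextLoc;
    }
    // prints: [first para] then [\n\nsecond para]; "last para" is dropped
  }
}
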
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/text/WholeFileRecordReader.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/text/WholeFileRecordReader.java b/integration/src/main/java/org/apache/mahout/text/WholeFileRecordReader.java
deleted file mode 100644
index b8441b7..0000000
--- a/integration/src/main/java/org/apache/mahout/text/WholeFileRecordReader.java
+++ /dev/null
@@ -1,125 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- * <p/>
- * http://www.apache.org/licenses/LICENSE-2.0
- * <p/>
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.text;
-
-import java.io.IOException;
-
-import org.apache.commons.lang3.StringUtils;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FSDataInputStream;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.fs.PathFilter;
-import org.apache.hadoop.io.BytesWritable;
-import org.apache.hadoop.io.IOUtils;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.mapreduce.InputSplit;
-import org.apache.hadoop.mapreduce.RecordReader;
-import org.apache.hadoop.mapreduce.TaskAttemptContext;
-import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
-import org.apache.hadoop.mapreduce.lib.input.FileSplit;
-
-import static org.apache.mahout.text.SequenceFilesFromDirectory.FILE_FILTER_CLASS_OPTION;
-
-/**
- * A RecordReader used with the MultipleTextFileInputFormat class to read full files as
- * k/v pairs, treating groups of files as single input splits.
- */
-public class WholeFileRecordReader extends RecordReader<IntWritable, BytesWritable> {
-
- private FileSplit fileSplit;
- private boolean processed = false;
- private Configuration configuration;
- private BytesWritable value = new BytesWritable();
- private IntWritable index;
- private String fileFilterClassName = null;
- private PathFilter pathFilter = null;
-
- public WholeFileRecordReader(CombineFileSplit fileSplit, TaskAttemptContext taskAttemptContext, Integer idx)
- throws IOException {
- this.fileSplit = new FileSplit(fileSplit.getPath(idx), fileSplit.getOffset(idx),
- fileSplit.getLength(idx), fileSplit.getLocations());
- this.configuration = taskAttemptContext.getConfiguration();
- this.index = new IntWritable(idx);
- this.fileFilterClassName = this.configuration.get(FILE_FILTER_CLASS_OPTION[0]);
- }
-
- @Override
- public IntWritable getCurrentKey() {
- return index;
- }
-
- @Override
- public BytesWritable getCurrentValue() {
- return value;
- }
-
- @Override
- public float getProgress() throws IOException {
- return processed ? 1.0f : 0.0f;
- }
-
- @Override
- public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext)
- throws IOException, InterruptedException {
- if (!StringUtils.isBlank(fileFilterClassName) &&
- !PrefixAdditionFilter.class.getName().equals(fileFilterClassName)) {
- try {
- pathFilter = (PathFilter) Class.forName(fileFilterClassName).newInstance();
- } catch (ClassNotFoundException | InstantiationException | IllegalAccessException e) {
- throw new IllegalStateException(e);
- }
- }
- }
-
- @Override
- public boolean nextKeyValue() throws IOException {
- if (!processed) {
- byte[] contents = new byte[(int) fileSplit.getLength()];
- Path file = fileSplit.getPath();
- FileSystem fs = file.getFileSystem(this.configuration);
-
- if (!fs.isFile(file)) {
- return false;
- }
-
- FileStatus[] fileStatuses;
- if (pathFilter != null) {
- fileStatuses = fs.listStatus(file, pathFilter);
- } else {
- fileStatuses = fs.listStatus(file);
- }
-
- if (fileStatuses.length == 1) {
- try (FSDataInputStream in = fs.open(fileStatuses[0].getPath())) {
- IOUtils.readFully(in, contents, 0, contents.length);
- value.setCapacity(contents.length);
- value.set(contents, 0, contents.length);
- }
- processed = true;
- return true;
- }
- }
- return false;
- }
-
- @Override
- public void close() throws IOException {
- }
-}
\ No newline at end of file
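
A hedged sketch of a filter usable via the fileFilterClass option read in initialize() above: it must implement org.apache.hadoop.fs.PathFilter and have a public no-arg constructor, since it is created reflectively via newInstance(). The ".txt" restriction is purely illustrative.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class TxtOnlyPathFilter implements PathFilter {
  @Override
  public boolean accept(Path path) {
    return path.getName().endsWith(".txt"); // keep only .txt files
  }
}
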
r***@apache.org
2018-06-27 14:51:57 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/SplitInput.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/SplitInput.java b/integration/src/main/java/org/apache/mahout/utils/SplitInput.java
deleted file mode 100644
index 6178f80..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/SplitInput.java
+++ /dev/null
@@ -1,673 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils;
-
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.io.OutputStreamWriter;
-import java.io.Writer;
-import java.nio.charset.Charset;
-import java.util.BitSet;
-
-import com.google.common.base.Preconditions;
-import org.apache.commons.cli2.OptionException;
-import org.apache.commons.io.Charsets;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.SequenceFile;
-import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.util.ToolRunner;
-import org.apache.mahout.common.AbstractJob;
-import org.apache.mahout.common.CommandLineUtil;
-import org.apache.mahout.common.HadoopUtil;
-import org.apache.mahout.common.Pair;
-import org.apache.mahout.common.RandomUtils;
-import org.apache.mahout.common.commandline.DefaultOptionCreator;
-import org.apache.mahout.common.iterator.sequencefile.PathFilters;
-import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterator;
-import org.apache.mahout.math.jet.random.sampling.RandomSampler;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * A utility for splitting input into training and test sets in order to perform
- * cross-validation. It accepts the one-item-per-line format used by the Bayes
- * classifiers (or anything else with one item per line) as well as SequenceFiles (key/value).
- * <p/>
- * <p/>
- * This class can be used to split directories of files or individual files into
- * training and test sets using a number of different methods.
- * <p/>
- * When executed via {@link #splitDirectory(Path)} or {@link #splitFile(Path)},
- * the lines read from one or more input files are written to files of the same
- * name into the directories specified by the
- * {@link #setTestOutputDirectory(Path)} and
- * {@link #setTrainingOutputDirectory(Path)} methods.
- * <p/>
- * The composition of the test set is determined using one of the following
- * approaches:
- * <ul>
- * <li>A contiguous set of items can be chosen from the input file(s) using the
- * {@link #setTestSplitSize(int)} or {@link #setTestSplitPct(int)} methods.
- * {@link #setTestSplitSize(int)} allocates a fixed number of items, while
- * {@link #setTestSplitPct(int)} allocates a percentage of the original input,
- * rounded up to the nearest integer. {@link #setSplitLocation(int)} is used to
- * control the position in the input from which the test data is extracted and
- * is described further below.</li>
- * <li>A random sampling of items can be chosen from the input files(s) using
- * the {@link #setTestRandomSelectionSize(int)} or
- * {@link #setTestRandomSelectionPct(int)} methods, each choosing a fixed test
- * set size or percentage of the input set size as described above. The
- * {@link RandomSampler} class from {@code mahout-math} is used to create a sample
- * of the appropriate size.</li>
- * </ul>
- * <p/>
- * Any one of the methods above can be used to control the size of the test set.
- * If multiple methods are called, a runtime exception will be thrown at
- * execution time.
- * <p/>
- * The {@link #setSplitLocation(int)} method is passed an integer from 0 to 100
- * (inclusive) which is translated into the position of the start of the test
- * data within the input file.
- * <p/>
- * Given:
- * <ul>
- * <li>an input file of 1500 lines</li>
- * <li>a desired test data size of 10 percent</li>
- * </ul>
- * <p/>
- * <ul>
- * <li>A split location of 0 will cause the first 150 items appearing in the
- * input set to be written to the test set.</li>
- * <li>A split location of 25 will cause items 375-525 to be written to the test
- * set.</li>
- * <li>A split location of 100 will cause the last 150 items in the input to be
- * written to the test set</li>
- * </ul>
- * The start of the split will always be adjusted forwards in order to ensure
- * that the desired test set size is allocated. Split location has no effect if
- * random sampling is employed.
- */
-public class SplitInput extends AbstractJob {
-
- private static final Logger log = LoggerFactory.getLogger(SplitInput.class);
-
- private int testSplitSize = -1;
- private int testSplitPct = -1;
- private int splitLocation = 100;
- private int testRandomSelectionSize = -1;
- private int testRandomSelectionPct = -1;
- private int keepPct = 100;
- private Charset charset = Charsets.UTF_8;
- private boolean useSequence;
- private boolean useMapRed;
-
- private Path inputDirectory;
- private Path trainingOutputDirectory;
- private Path testOutputDirectory;
- private Path mapRedOutputDirectory;
-
- private SplitCallback callback;
-
- @Override
- public int run(String[] args) throws Exception {
-
- if (parseArgs(args)) {
- splitDirectory();
- }
- return 0;
- }
-
- public static void main(String[] args) throws Exception {
- ToolRunner.run(new Configuration(), new SplitInput(), args);
- }
-
- /**
- * Configure this instance based on the command-line arguments contained within provided array.
- * Calls {@link #validate()} to ensure consistency of configuration.
- *
- * @return true if the arguments were parsed successfully and execution should proceed.
- * @throws Exception if there is a problem parsing the command-line arguments or the particular
- * combination would violate class invariants.
- */
- private boolean parseArgs(String[] args) throws Exception {
-
- addInputOption();
- addOption("trainingOutput", "tr", "The training data output directory", false);
- addOption("testOutput", "te", "The test data output directory", false);
- addOption("testSplitSize", "ss", "The number of documents held back as test data for each category", false);
- addOption("testSplitPct", "sp", "The % of documents held back as test data for each category", false);
- addOption("splitLocation", "sl", "Location for start of test data expressed as a percentage of the input file "
- + "size (0=start, 50=middle, 100=end", false);
- addOption("randomSelectionSize", "rs", "The number of items to be randomly selected as test data ", false);
- addOption("randomSelectionPct", "rp", "Percentage of items to be randomly selected as test data when using "
- + "mapreduce mode", false);
- addOption("charset", "c", "The name of the character encoding of the input files (not needed if using "
- + "SequenceFiles)", false);
- addOption(buildOption("sequenceFiles", "seq", "Set if the input files are sequence files. Default is false",
- false, false, "false"));
- addOption(DefaultOptionCreator.methodOption().create());
- addOption(DefaultOptionCreator.overwriteOption().create());
- //TODO: extend this to sequential mode
- addOption("keepPct", "k", "The percentage of total data to keep in map-reduce mode, the rest will be ignored. "
- + "Default is 100%", false);
- addOption("mapRedOutputDir", "mro", "Output directory for map reduce jobs", false);
-
- if (parseArguments(args) == null) {
- return false;
- }
-
- try {
- inputDirectory = getInputPath();
-
- useMapRed = getOption(DefaultOptionCreator.METHOD_OPTION).equalsIgnoreCase(DefaultOptionCreator.MAPREDUCE_METHOD);
-
- if (useMapRed) {
- if (!hasOption("randomSelectionPct")) {
- throw new OptionException(getCLIOption("randomSelectionPct"),
- "must set randomSelectionPct when mapRed option is used");
- }
- if (!hasOption("mapRedOutputDir")) {
- throw new OptionException(getCLIOption("mapRedOutputDir"),
- "mapRedOutputDir must be set when mapRed option is used");
- }
- mapRedOutputDirectory = new Path(getOption("mapRedOutputDir"));
- if (hasOption("keepPct")) {
- keepPct = Integer.parseInt(getOption("keepPct"));
- }
- if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
- HadoopUtil.delete(getConf(), mapRedOutputDirectory);
- }
- } else {
- if (!hasOption("trainingOutput")
- || !hasOption("testOutput")) {
- throw new OptionException(getCLIOption("trainingOutput"),
- "trainingOutput and testOutput must be set if mapRed option is not used");
- }
- if (!hasOption("testSplitSize")
- && !hasOption("testSplitPct")
- && !hasOption("randomSelectionPct")
- && !hasOption("randomSelectionSize")) {
- throw new OptionException(getCLIOption("testSplitSize"),
- "must set one of test split size/percentage or randomSelectionSize/percentage");
- }
-
- trainingOutputDirectory = new Path(getOption("trainingOutput"));
- testOutputDirectory = new Path(getOption("testOutput"));
- FileSystem fs = trainingOutputDirectory.getFileSystem(getConf());
- if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
- HadoopUtil.delete(fs.getConf(), trainingOutputDirectory);
- HadoopUtil.delete(fs.getConf(), testOutputDirectory);
- }
- fs.mkdirs(trainingOutputDirectory);
- fs.mkdirs(testOutputDirectory);
- }
-
- if (hasOption("charset")) {
- charset = Charset.forName(getOption("charset"));
- }
-
- if (hasOption("testSplitSize") && hasOption("testSplitPct")) {
- throw new OptionException(getCLIOption("testSplitPct"), "must have either split size or split percentage "
- + "option, not BOTH");
- }
-
- if (hasOption("testSplitSize")) {
- setTestSplitSize(Integer.parseInt(getOption("testSplitSize")));
- }
-
- if (hasOption("testSplitPct")) {
- setTestSplitPct(Integer.parseInt(getOption("testSplitPct")));
- }
-
- if (hasOption("splitLocation")) {
- setSplitLocation(Integer.parseInt(getOption("splitLocation")));
- }
-
- if (hasOption("randomSelectionSize")) {
- setTestRandomSelectionSize(Integer.parseInt(getOption("randomSelectionSize")));
- }
-
- if (hasOption("randomSelectionPct")) {
- setTestRandomSelectionPct(Integer.parseInt(getOption("randomSelectionPct")));
- }
-
- useSequence = hasOption("sequenceFiles");
-
- } catch (OptionException e) {
- log.error("Command-line option Exception", e);
- CommandLineUtil.printHelp(getGroup());
- return false;
- }
-
- validate();
- return true;
- }
-
- /**
- * Perform a split on the directory specified by {@link #setInputDirectory(Path)} by calling {@link #splitFile(Path)}
- * on each file found within that directory.
- */
- public void splitDirectory() throws IOException, ClassNotFoundException, InterruptedException {
- this.splitDirectory(inputDirectory);
- }
-
- /**
- * Perform a split on the specified directory by calling {@link #splitFile(Path)} on each file found within that
- * directory.
- */
- public void splitDirectory(Path inputDir) throws IOException, ClassNotFoundException, InterruptedException {
- Configuration conf = getConf();
- splitDirectory(conf, inputDir);
- }
-
- /**
- * See also {@link #splitDirectory(Path)}.
- */
- public void splitDirectory(Configuration conf, Path inputDir)
- throws IOException, ClassNotFoundException, InterruptedException {
- FileSystem fs = inputDir.getFileSystem(conf);
- if (fs.getFileStatus(inputDir) == null) {
- throw new IOException(inputDir + " does not exist");
- }
- if (!fs.getFileStatus(inputDir).isDir()) {
- throw new IOException(inputDir + " is not a directory");
- }
-
- if (useMapRed) {
- SplitInputJob.run(conf, inputDir, mapRedOutputDirectory,
- keepPct, testRandomSelectionPct);
- } else {
- // input dir contains one file per category.
- FileStatus[] fileStats = fs.listStatus(inputDir, PathFilters.logsCRCFilter());
- for (FileStatus inputFile : fileStats) {
- if (!inputFile.isDir()) {
- splitFile(inputFile.getPath());
- }
- }
- }
- }
-
- /**
- * Perform a split on the specified input file. Results will be written to files of the same name in the specified
- * training and test output directories. The {@link #validate()} method is called prior to executing the split.
- */
- public void splitFile(Path inputFile) throws IOException {
- Configuration conf = getConf();
- FileSystem fs = inputFile.getFileSystem(conf);
- if (fs.getFileStatus(inputFile) == null) {
- throw new IOException(inputFile + " does not exist");
- }
- if (fs.getFileStatus(inputFile).isDir()) {
- throw new IOException(inputFile + " is a directory");
- }
-
- validate();
-
- Path testOutputFile = new Path(testOutputDirectory, inputFile.getName());
- Path trainingOutputFile = new Path(trainingOutputDirectory, inputFile.getName());
-
- int lineCount = countLines(fs, inputFile, charset);
-
- log.info("{} has {} lines", inputFile.getName(), lineCount);
-
- int testSplitStart = 0;
- int testSplitSize = this.testSplitSize; // don't modify state
- BitSet randomSel = null;
-
- if (testRandomSelectionPct > 0 || testRandomSelectionSize > 0) {
- testSplitSize = this.testRandomSelectionSize;
-
- if (testRandomSelectionPct > 0) {
- testSplitSize = Math.round(lineCount * testRandomSelectionPct / 100.0f);
- }
- log.info("{} test split size is {} based on random selection percentage {}",
- inputFile.getName(), testSplitSize, testRandomSelectionPct);
- long[] ridx = new long[testSplitSize];
- RandomSampler.sample(testSplitSize, lineCount - 1, testSplitSize, 0, ridx, 0, RandomUtils.getRandom());
- randomSel = new BitSet(lineCount);
- for (long idx : ridx) {
- randomSel.set((int) idx + 1);
- }
- } else {
- if (testSplitPct > 0) { // calculate split size based on percentage
- testSplitSize = Math.round(lineCount * testSplitPct / 100.0f);
- log.info("{} test split size is {} based on percentage {}",
- inputFile.getName(), testSplitSize, testSplitPct);
- } else {
- log.info("{} test split size is {}", inputFile.getName(), testSplitSize);
- }
-
- if (splitLocation > 0) { // calculate start of split based on percentage
- testSplitStart = Math.round(lineCount * splitLocation / 100.0f);
- if (lineCount - testSplitStart < testSplitSize) {
- // adjust split start downwards based on split size.
- testSplitStart = lineCount - testSplitSize;
- }
- log.info("{} test split start is {} based on split location {}",
- inputFile.getName(), testSplitStart, splitLocation);
- }
-
- if (testSplitStart < 0) {
- throw new IllegalArgumentException("test split size for " + inputFile + " is too large, it would produce an "
- + "empty training set from the initial set of " + lineCount + " examples");
- } else if (lineCount - testSplitSize < testSplitSize) {
- log.warn("Test set size for {} may be too large, {} is larger than the number of "
- + "lines remaining in the training set: {}",
- inputFile, testSplitSize, lineCount - testSplitSize);
- }
- }
- int trainCount = 0;
- int testCount = 0;
- if (!useSequence) {
- try (BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(inputFile), charset));
- Writer trainingWriter = new OutputStreamWriter(fs.create(trainingOutputFile), charset);
- Writer testWriter = new OutputStreamWriter(fs.create(testOutputFile), charset)){
-
- String line;
- int pos = 0;
- while ((line = reader.readLine()) != null) {
- pos++;
-
- Writer writer;
- if (testRandomSelectionPct > 0) { // Randomly choose
- writer = randomSel.get(pos) ? testWriter : trainingWriter;
- } else { // Choose based on location
- writer = pos > testSplitStart ? testWriter : trainingWriter;
- }
-
- if (writer == testWriter) {
- if (testCount >= testSplitSize) {
- writer = trainingWriter;
- } else {
- testCount++;
- }
- }
- if (writer == trainingWriter) {
- trainCount++;
- }
- writer.write(line);
- writer.write('\n');
- }
-
- }
- } else {
- try (SequenceFileIterator<Writable, Writable> iterator =
- new SequenceFileIterator<>(inputFile, false, fs.getConf());
- SequenceFile.Writer trainingWriter = SequenceFile.createWriter(fs, fs.getConf(), trainingOutputFile,
- iterator.getKeyClass(), iterator.getValueClass());
- SequenceFile.Writer testWriter = SequenceFile.createWriter(fs, fs.getConf(), testOutputFile,
- iterator.getKeyClass(), iterator.getValueClass())) {
-
- int pos = 0;
- while (iterator.hasNext()) {
- pos++;
- SequenceFile.Writer writer;
- if (testRandomSelectionPct > 0) { // Randomly choose
- writer = randomSel.get(pos) ? testWriter : trainingWriter;
- } else { // Choose based on location
- writer = pos > testSplitStart ? testWriter : trainingWriter;
- }
-
- if (writer == testWriter) {
- if (testCount >= testSplitSize) {
- writer = trainingWriter;
- } else {
- testCount++;
- }
- }
- if (writer == trainingWriter) {
- trainCount++;
- }
- Pair<Writable, Writable> pair = iterator.next();
- writer.append(pair.getFirst(), pair.getSecond());
- }
-
- }
- }
- log.info("file: {}, input: {} train: {}, test: {} starting at {}",
- inputFile.getName(), lineCount, trainCount, testCount, testSplitStart);
-
- // testing;
- if (callback != null) {
- callback.splitComplete(inputFile, lineCount, trainCount, testCount, testSplitStart);
- }
- }
-
- public int getTestSplitSize() {
- return testSplitSize;
- }
-
- public void setTestSplitSize(int testSplitSize) {
- this.testSplitSize = testSplitSize;
- }
-
- public int getTestSplitPct() {
- return testSplitPct;
- }
-
- /**
- * Sets the percentage of the input data to allocate to the test split
- *
- * @param testSplitPct a value between 0 and 100 inclusive.
- */
- public void setTestSplitPct(int testSplitPct) {
- this.testSplitPct = testSplitPct;
- }
-
- /**
- * Sets the percentage of the input data to keep in a map reduce split input job
- *
- * @param keepPct a value between 0 and 100 inclusive.
- */
- public void setKeepPct(int keepPct) {
- this.keepPct = keepPct;
- }
-
- /**
- * Set to true to use map reduce to split the input
- *
- * @param useMapRed a boolean to indicate whether map reduce should be used
- */
- public void setUseMapRed(boolean useMapRed) {
- this.useMapRed = useMapRed;
- }
-
- public void setMapRedOutputDirectory(Path mapRedOutputDirectory) {
- this.mapRedOutputDirectory = mapRedOutputDirectory;
- }
-
- public int getSplitLocation() {
- return splitLocation;
- }
-
- /**
- * Set the location of the start of the test/training data split. Expressed as percentage of lines, for example
- * 0 indicates that the test data should be taken from the start of the file, 100 indicates that the test data
- * should be taken from the end of the input file, while 25 indicates that the test data should be taken from the
- * first quarter of the file.
- * <p/>
- * This option is only relevant in cases where random selection is not employed
- *
- * @param splitLocation a value between 0 and 100 inclusive.
- */
- public void setSplitLocation(int splitLocation) {
- this.splitLocation = splitLocation;
- }
-
- public Charset getCharset() {
- return charset;
- }
-
- /**
- * Set the charset used to read and write files
- */
- public void setCharset(Charset charset) {
- this.charset = charset;
- }
-
- public Path getInputDirectory() {
- return inputDirectory;
- }
-
- /**
- * Set the directory from which input data will be read when the {@link #splitDirectory()} method is invoked
- */
- public void setInputDirectory(Path inputDir) {
- this.inputDirectory = inputDir;
- }
-
- public Path getTrainingOutputDirectory() {
- return trainingOutputDirectory;
- }
-
- /**
- * Set the directory to which training data will be written.
- */
- public void setTrainingOutputDirectory(Path trainingOutputDir) {
- this.trainingOutputDirectory = trainingOutputDir;
- }
-
- public Path getTestOutputDirectory() {
- return testOutputDirectory;
- }
-
- /**
- * Set the directory to which test data will be written.
- */
- public void setTestOutputDirectory(Path testOutputDir) {
- this.testOutputDirectory = testOutputDir;
- }
-
- public SplitCallback getCallback() {
- return callback;
- }
-
- /**
- * Sets the callback used to inform the caller that an input file has been successfully split
- */
- public void setCallback(SplitCallback callback) {
- this.callback = callback;
- }
-
- public int getTestRandomSelectionSize() {
- return testRandomSelectionSize;
- }
-
- /**
- * Sets the number of random input samples that will be saved to the test set.
- */
- public void setTestRandomSelectionSize(int testRandomSelectionSize) {
- this.testRandomSelectionSize = testRandomSelectionSize;
- }
-
- public int getTestRandomSelectionPct() {
-
- return testRandomSelectionPct;
- }
-
- /**
- * Sets the number of random input samples that will be saved to the test set, expressed as a percentage of the
- * size of the input set.
- *
- * @param randomSelectionPct a value between 0 and 100 inclusive.
- */
- public void setTestRandomSelectionPct(int randomSelectionPct) {
- this.testRandomSelectionPct = randomSelectionPct;
- }
-
- /**
- * Validates that the current instance is in a consistent state
- *
- * @throws IllegalArgumentException if settings violate class invariants.
- * @throws IOException if output directories do not exist or are not directories.
- */
- public void validate() throws IOException {
- Preconditions.checkArgument(testSplitSize >= 1 || testSplitSize == -1,
- "Invalid testSplitSize: " + testSplitSize + ". Must be: testSplitSize >= 1 or testSplitSize = -1");
- Preconditions.checkArgument(splitLocation >= 0 && splitLocation <= 100 || splitLocation == -1,
- "Invalid splitLocation percentage: " + splitLocation + ". Must be: 0 <= splitLocation <= 100 or splitLocation = -1");
- Preconditions.checkArgument(testSplitPct >= 0 && testSplitPct <= 100 || testSplitPct == -1,
- "Invalid testSplitPct percentage: " + testSplitPct + ". Must be: 0 <= testSplitPct <= 100 or testSplitPct = -1");
- Preconditions.checkArgument(testRandomSelectionPct >= 0 && testRandomSelectionPct <= 100
- || testRandomSelectionPct == -1,"Invalid testRandomSelectionPct percentage: " + testRandomSelectionPct +
- ". Must be: 0 <= testRandomSelectionPct <= 100 or testRandomSelectionPct = -1");
-
- Preconditions.checkArgument(trainingOutputDirectory != null || useMapRed,
- "No training output directory was specified");
- Preconditions.checkArgument(testOutputDirectory != null || useMapRed, "No test output directory was specified");
-
- // exactly one of the following must be set
- int count = 0;
- if (testSplitSize > 0) {
- count++;
- }
- if (testSplitPct > 0) {
- count++;
- }
- if (testRandomSelectionSize > 0) {
- count++;
- }
- if (testRandomSelectionPct > 0) {
- count++;
- }
-
- Preconditions.checkArgument(count == 1, "Exactly one of testSplitSize, testSplitPct, testRandomSelectionSize, "
- + "testRandomSelectionPct should be set");
-
- if (!useMapRed) {
- Configuration conf = getConf();
- FileSystem fs = trainingOutputDirectory.getFileSystem(conf);
- FileStatus trainingOutputDirStatus = fs.getFileStatus(trainingOutputDirectory);
- Preconditions.checkArgument(trainingOutputDirStatus != null && trainingOutputDirStatus.isDir(),
- "%s is not a directory", trainingOutputDirectory);
- FileStatus testOutputDirStatus = fs.getFileStatus(testOutputDirectory);
- Preconditions.checkArgument(testOutputDirStatus != null && testOutputDirStatus.isDir(),
- "%s is not a directory", testOutputDirectory);
- }
- }
-
- /**
- * Count the lines in the specified file, as returned by {@code BufferedReader.readLine()}
- *
- * @param fs the file system on which the input file resides
- * @param inputFile the file whose lines will be counted
- * @param charset the charset of the file to read
- * @return the number of lines in the input file.
- * @throws IOException if there is a problem opening or reading the file.
- */
- public static int countLines(FileSystem fs, Path inputFile, Charset charset) throws IOException {
- int lineCount = 0;
- try (BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(inputFile), charset))){
- while (reader.readLine() != null) {
- lineCount++;
- }
- }
- return lineCount;
- }
-
- /**
- * Callback used to pass split results back to the caller once a file has been split, without requiring a separate data object
- */
- public interface SplitCallback {
- void splitComplete(Path inputFile, int lineCount, int trainCount, int testCount, int testSplitStart);
- }
-
-}
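
For context, the splitter deleted above is driven through its setters, followed by validate(). A minimal usage sketch, assuming the class is SplitInput (as referenced by SplitInputJob below); the paths and the 20% random split are illustrative placeholders, not values from this commit:

    SplitInput splitter = new SplitInput();
    splitter.setInputDirectory(new Path("/data/vectors"));        // hypothetical input
    splitter.setTrainingOutputDirectory(new Path("/data/train")); // hypothetical output
    splitter.setTestOutputDirectory(new Path("/data/test"));      // hypothetical output
    splitter.setTestRandomSelectionPct(20); // exactly one split option must be set
    splitter.validate();                    // enforces the invariants checked above
    splitter.splitDirectory();              // entry point named in the setInputDirectory() javadoc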

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/SplitInputJob.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/SplitInputJob.java b/integration/src/main/java/org/apache/mahout/utils/SplitInputJob.java
deleted file mode 100644
index 4a1ff86..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/SplitInputJob.java
+++ /dev/null
@@ -1,213 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils;
-
-import java.io.IOException;
-import java.io.Serializable;
-import java.util.Random;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.io.WritableComparable;
-import org.apache.hadoop.io.WritableComparator;
-import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.Mapper;
-import org.apache.hadoop.mapreduce.Reducer;
-import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
-import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
-import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
-import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
-import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
-import org.apache.mahout.common.Pair;
-import org.apache.mahout.common.RandomUtils;
-import org.apache.mahout.common.iterator.sequencefile.PathFilters;
-import org.apache.mahout.common.iterator.sequencefile.PathType;
-import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterator;
-
-/**
- * Class which implements a map reduce version of SplitInput.
- * This class takes a SequenceFile input, e.g. a set of training data
- * for a learning algorithm, downsamples it, applies a random
- * permutation and splits it into test and training sets
- */
-public final class SplitInputJob {
-
- private static final String DOWNSAMPLING_FACTOR = "SplitInputJob.downsamplingFactor";
- private static final String RANDOM_SELECTION_PCT = "SplitInputJob.randomSelectionPct";
- private static final String TRAINING_TAG = "training";
- private static final String TEST_TAG = "test";
-
- private SplitInputJob() {}
-
- /**
- * Run job to downsample, randomly permute and split data into test and
- * training sets. This job takes a SequenceFile as input and outputs two
- * SequenceFiles test-r-00000 and training-r-00000 which contain the test and
- * training sets respectively
- *
- * @param initialConf
- * Initial configuration
- * @param inputPath
- * path to input data SequenceFile
- * @param outputPath
- * path for output data SequenceFiles
- * @param keepPct
- * percentage of key value pairs in input to keep. The rest are
- * discarded
- * @param randomSelectionPercent
- * percentage of key value pairs to allocate to test set. Remainder
- * are allocated to training set
- */
- @SuppressWarnings("rawtypes")
- public static void run(Configuration initialConf, Path inputPath,
- Path outputPath, int keepPct, float randomSelectionPercent)
- throws IOException, ClassNotFoundException, InterruptedException {
-
- int downsamplingFactor = (int) (100.0 / keepPct);
- initialConf.setInt(DOWNSAMPLING_FACTOR, downsamplingFactor);
- initialConf.setFloat(RANDOM_SELECTION_PCT, randomSelectionPercent);
-
- // Determine class of keys and values
- FileSystem fs = FileSystem.get(initialConf);
-
- SequenceFileDirIterator<? extends WritableComparable, Writable> iterator =
- new SequenceFileDirIterator<>(inputPath,
- PathType.LIST, PathFilters.partFilter(), null, false, fs.getConf());
- Class<? extends WritableComparable> keyClass;
- Class<? extends Writable> valueClass;
- if (iterator.hasNext()) {
- Pair<? extends WritableComparable, Writable> pair = iterator.next();
- keyClass = pair.getFirst().getClass();
- valueClass = pair.getSecond().getClass();
- } else {
- throw new IllegalStateException("Couldn't determine class of the input values");
- }
-
- Job job = new Job(new Configuration(initialConf));
-
- MultipleOutputs.addNamedOutput(job, TRAINING_TAG, SequenceFileOutputFormat.class, keyClass, valueClass);
- MultipleOutputs.addNamedOutput(job, TEST_TAG, SequenceFileOutputFormat.class, keyClass, valueClass);
- job.setJarByClass(SplitInputJob.class);
- FileInputFormat.addInputPath(job, inputPath);
- FileOutputFormat.setOutputPath(job, outputPath);
- job.setNumReduceTasks(1);
- job.setInputFormatClass(SequenceFileInputFormat.class);
- job.setOutputFormatClass(SequenceFileOutputFormat.class);
- job.setMapperClass(SplitInputMapper.class);
- job.setReducerClass(SplitInputReducer.class);
- job.setSortComparatorClass(SplitInputComparator.class);
- job.setOutputKeyClass(keyClass);
- job.setOutputValueClass(valueClass);
- job.submit();
- boolean succeeded = job.waitForCompletion(true);
- if (!succeeded) {
- throw new IllegalStateException("Job failed!");
- }
- }
-
- /** Mapper which downsamples the input by downsamplingFactor */
- public static class SplitInputMapper extends
- Mapper<WritableComparable<?>, Writable, WritableComparable<?>, Writable> {
-
- private int downsamplingFactor;
-
- @Override
- public void setup(Context ctx) {
- downsamplingFactor = ctx.getConfiguration().getInt(DOWNSAMPLING_FACTOR, 1);
- }
-
- /** Only run map() for one out of every downsamplingFactor inputs */
- @Override
- public void run(Context context) throws IOException, InterruptedException {
- setup(context);
- int i = 0;
- while (context.nextKeyValue()) {
- if (i % downsamplingFactor == 0) {
- map(context.getCurrentKey(), context.getCurrentValue(), context);
- }
- i++;
- }
- cleanup(context);
- }
-
- }
-
- /** Reducer which uses MultipleOutputs to randomly allocate key value pairs between test and training outputs */
- public static class SplitInputReducer extends
- Reducer<WritableComparable<?>, Writable, WritableComparable<?>, Writable> {
-
- private MultipleOutputs multipleOutputs;
- private final Random rnd = RandomUtils.getRandom();
- private float randomSelectionPercent;
-
- @Override
- protected void setup(Context ctx) throws IOException {
- randomSelectionPercent = ctx.getConfiguration().getFloat(RANDOM_SELECTION_PCT, 0);
- multipleOutputs = new MultipleOutputs(ctx);
- }
-
- /**
- * Randomly allocate key value pairs between test and training sets.
- * randomSelectionPercent of the pairs will go to the test set.
- */
- @Override
- protected void reduce(WritableComparable<?> key, Iterable<Writable> values,
- Context context) throws IOException, InterruptedException {
- for (Writable value : values) {
- if (rnd.nextInt(100) < randomSelectionPercent) {
- multipleOutputs.write(TEST_TAG, key, value);
- } else {
- multipleOutputs.write(TRAINING_TAG, key, value);
- }
- }
-
- }
-
- @Override
- protected void cleanup(Context context) throws IOException {
- try {
- multipleOutputs.close();
- } catch (InterruptedException e) {
- throw new IOException(e);
- }
- }
-
- }
-
- /** Randomly permute key value pairs */
- public static class SplitInputComparator extends WritableComparator implements Serializable {
-
- private final Random rnd = RandomUtils.getRandom();
-
- protected SplitInputComparator() {
- super(WritableComparable.class);
- }
-
- @Override
- public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
- if (rnd.nextBoolean()) {
- return 1;
- } else {
- return -1;
- }
- }
- }
-
-}
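
The map reduce variant deleted above exposes a single static entry point. A hedged sketch of calling it, with placeholder paths:

    // Keep 10% of the input pairs, then route 25% of the kept pairs to the test set.
    Configuration conf = new Configuration();
    SplitInputJob.run(conf,
        new Path("/data/seq-input"),  // hypothetical SequenceFile input
        new Path("/data/split-out"),  // receives test-r-00000 and training-r-00000
        10,                           // keepPct
        25.0f);                       // randomSelectionPercent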

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/clustering/AbstractClusterWriter.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/clustering/AbstractClusterWriter.java b/integration/src/main/java/org/apache/mahout/utils/clustering/AbstractClusterWriter.java
deleted file mode 100644
index ac884d0..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/clustering/AbstractClusterWriter.java
+++ /dev/null
@@ -1,160 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.clustering;
-
-import java.io.IOException;
-import java.io.Writer;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.commons.lang3.StringUtils;
-import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable;
-import org.apache.mahout.clustering.iterator.ClusterWritable;
-import org.apache.mahout.common.Pair;
-import org.apache.mahout.common.distance.DistanceMeasure;
-import org.apache.mahout.math.Vector;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import com.google.common.collect.Lists;
-
-/**
- * Base class for implementing ClusterWriter
- */
-public abstract class AbstractClusterWriter implements ClusterWriter {
-
- private static final Logger log = LoggerFactory.getLogger(AbstractClusterWriter.class);
-
- protected final Writer writer;
- protected final Map<Integer, List<WeightedPropertyVectorWritable>> clusterIdToPoints;
- protected final DistanceMeasure measure;
-
- /**
- *
- * @param writer The underlying {@link java.io.Writer} to use
- * @param clusterIdToPoints The map between cluster ids {@link org.apache.mahout.clustering.Cluster#getId()} and the
- * points in the cluster
- * @param measure The {@link org.apache.mahout.common.distance.DistanceMeasure} used to calculate the distance.
- * Some writers may wish to use it for calculating weights for display. May be null.
- */
- protected AbstractClusterWriter(Writer writer, Map<Integer, List<WeightedPropertyVectorWritable>> clusterIdToPoints,
- DistanceMeasure measure) {
- this.writer = writer;
- this.clusterIdToPoints = clusterIdToPoints;
- this.measure = measure;
- }
-
- protected Writer getWriter() {
- return writer;
- }
-
- protected Map<Integer, List<WeightedPropertyVectorWritable>> getClusterIdToPoints() {
- return clusterIdToPoints;
- }
-
- public static String getTopFeatures(Vector vector, String[] dictionary, int numTerms) {
-
- StringBuilder sb = new StringBuilder(100);
-
- for (Pair<String, Double> item : getTopPairs(vector, dictionary, numTerms)) {
- String term = item.getFirst();
- sb.append("\n\t\t");
- sb.append(StringUtils.rightPad(term, 40));
- sb.append("=>");
- sb.append(StringUtils.leftPad(item.getSecond().toString(), 20));
- }
- return sb.toString();
- }
-
- public static String getTopTerms(Vector vector, String[] dictionary, int numTerms) {
-
- StringBuilder sb = new StringBuilder(100);
-
- for (Pair<String, Double> item : getTopPairs(vector, dictionary, numTerms)) {
- String term = item.getFirst();
- sb.append(term).append('_');
- }
- sb.deleteCharAt(sb.length() - 1);
- return sb.toString();
- }
-
- @Override
- public long write(Iterable<ClusterWritable> iterable) throws IOException {
- return write(iterable, Long.MAX_VALUE);
- }
-
- @Override
- public void close() throws IOException {
- writer.close();
- }
-
- @Override
- public long write(Iterable<ClusterWritable> iterable, long maxDocs) throws IOException {
- long result = 0;
- Iterator<ClusterWritable> iterator = iterable.iterator();
- while (result < maxDocs && iterator.hasNext()) {
- write(iterator.next());
- result++;
- }
- return result;
- }
-
- private static Collection<Pair<String, Double>> getTopPairs(Vector vector, String[] dictionary, int numTerms) {
- List<TermIndexWeight> vectorTerms = Lists.newArrayList();
-
- for (Vector.Element elt : vector.nonZeroes()) {
- vectorTerms.add(new TermIndexWeight(elt.index(), elt.get()));
- }
-
- // Sort results in reverse order (i.e. weight in descending order)
- Collections.sort(vectorTerms, new Comparator<TermIndexWeight>() {
- @Override
- public int compare(TermIndexWeight one, TermIndexWeight two) {
- return Double.compare(two.weight, one.weight);
- }
- });
-
- Collection<Pair<String, Double>> topTerms = Lists.newLinkedList();
-
- for (int i = 0; i < vectorTerms.size() && i < numTerms; i++) {
- int index = vectorTerms.get(i).index;
- String dictTerm = dictionary[index];
- if (dictTerm == null) {
- log.error("Dictionary entry missing for {}", index);
- continue;
- }
- topTerms.add(new Pair<>(dictTerm, vectorTerms.get(i).weight));
- }
-
- return topTerms;
- }
-
- private static class TermIndexWeight {
- private final int index;
- private final double weight;
-
- TermIndexWeight(int index, double weight) {
- this.index = index;
- this.weight = weight;
- }
- }
-}
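
The static helpers above can be used without a concrete writer. A small sketch, assuming a cluster object is in scope; the dictionary contents are illustrative only:

    Vector centroid = cluster.getCenter();                  // assumed available
    String[] dictionary = {"apache", "mahout", "cluster"};  // illustrative terms
    String report = AbstractClusterWriter.getTopFeatures(centroid, dictionary, 2);
    String label = AbstractClusterWriter.getTopTerms(centroid, dictionary, 2);
    // report holds one "term => weight" line per top term; label joins the terms with '_'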

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/clustering/CSVClusterWriter.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/clustering/CSVClusterWriter.java b/integration/src/main/java/org/apache/mahout/utils/clustering/CSVClusterWriter.java
deleted file mode 100644
index 7269016..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/clustering/CSVClusterWriter.java
+++ /dev/null
@@ -1,69 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.clustering;
-
-import org.apache.mahout.clustering.Cluster;
-import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable;
-import org.apache.mahout.clustering.iterator.ClusterWritable;
-import org.apache.mahout.common.distance.DistanceMeasure;
-import org.apache.mahout.math.NamedVector;
-import org.apache.mahout.math.Vector;
-
-import java.io.IOException;
-import java.io.Writer;
-import java.util.List;
-import java.util.Map;
-import java.util.regex.Pattern;
-
-/**
- * Format is the adjacency style described at http://gephi.org/users/supported-graph-formats/csv-format/: the centroid
- * is the first element of each row and the remaining elements are the points in that cluster
- *
- **/
-public class CSVClusterWriter extends AbstractClusterWriter {
-
- private static final Pattern VEC_PATTERN = Pattern.compile("\\{|\\:|\\,|\\}");
-
- public CSVClusterWriter(Writer writer, Map<Integer, List<WeightedPropertyVectorWritable>> clusterIdToPoints,
- DistanceMeasure measure) {
- super(writer, clusterIdToPoints, measure);
- }
-
- @Override
- public void write(ClusterWritable clusterWritable) throws IOException {
- StringBuilder line = new StringBuilder();
- Cluster cluster = clusterWritable.getValue();
- line.append(cluster.getId());
- List<WeightedPropertyVectorWritable> points = getClusterIdToPoints().get(cluster.getId());
- if (points != null) {
- for (WeightedPropertyVectorWritable point : points) {
- Vector theVec = point.getVector();
- line.append(',');
- if (theVec instanceof NamedVector) {
- line.append(((NamedVector)theVec).getName());
- } else {
- String vecStr = theVec.asFormatString();
- //do some basic manipulations for display
- vecStr = VEC_PATTERN.matcher(vecStr).replaceAll("_");
- line.append(vecStr);
- }
- }
- getWriter().append(line).append("\n");
- }
- }
-}
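
Given the write() method above, each cluster becomes one adjacency row: the cluster id first, then one comma-separated entry per point (the NamedVector name where available, otherwise the sanitized asFormatString()). An illustrative row for cluster 123 with three named points:

    123,docA,docB,docC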

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java b/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java
deleted file mode 100644
index 75b5ded..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java
+++ /dev/null
@@ -1,328 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.clustering;
-
-import java.io.File;
-import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.io.Writer;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-import java.util.Map;
-import java.util.TreeMap;
-
-import com.google.common.io.Closeables;
-import com.google.common.io.Files;
-import org.apache.commons.io.Charsets;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.mahout.clustering.cdbw.CDbwEvaluator;
-import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable;
-import org.apache.mahout.clustering.evaluation.ClusterEvaluator;
-import org.apache.mahout.clustering.evaluation.RepresentativePointsDriver;
-import org.apache.mahout.clustering.iterator.ClusterWritable;
-import org.apache.mahout.common.AbstractJob;
-import org.apache.mahout.common.ClassUtils;
-import org.apache.mahout.common.HadoopUtil;
-import org.apache.mahout.common.Pair;
-import org.apache.mahout.common.commandline.DefaultOptionCreator;
-import org.apache.mahout.common.distance.DistanceMeasure;
-import org.apache.mahout.common.iterator.sequencefile.PathFilters;
-import org.apache.mahout.common.iterator.sequencefile.PathType;
-import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
-import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable;
-import org.apache.mahout.utils.vectors.VectorHelper;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-public final class ClusterDumper extends AbstractJob {
-
- public static final String SAMPLE_POINTS = "samplePoints";
- DistanceMeasure measure;
-
- public enum OUTPUT_FORMAT {
- TEXT,
- CSV,
- GRAPH_ML,
- JSON,
- }
-
- public static final String DICTIONARY_TYPE_OPTION = "dictionaryType";
- public static final String DICTIONARY_OPTION = "dictionary";
- public static final String POINTS_DIR_OPTION = "pointsDir";
- public static final String NUM_WORDS_OPTION = "numWords";
- public static final String SUBSTRING_OPTION = "substring";
- public static final String EVALUATE_CLUSTERS = "evaluate";
-
- public static final String OUTPUT_FORMAT_OPT = "outputFormat";
-
- private static final Logger log = LoggerFactory.getLogger(ClusterDumper.class);
- private Path seqFileDir;
- private Path pointsDir;
- private long maxPointsPerCluster = Long.MAX_VALUE;
- private String termDictionary;
- private String dictionaryFormat;
- private int subString = Integer.MAX_VALUE;
- private int numTopFeatures = 10;
- private Map<Integer, List<WeightedPropertyVectorWritable>> clusterIdToPoints;
- private OUTPUT_FORMAT outputFormat = OUTPUT_FORMAT.TEXT;
- private boolean runEvaluation;
-
- public ClusterDumper(Path seqFileDir, Path pointsDir) {
- this.seqFileDir = seqFileDir;
- this.pointsDir = pointsDir;
- init();
- }
-
- public ClusterDumper() {
- setConf(new Configuration());
- }
-
- public static void main(String[] args) throws Exception {
- new ClusterDumper().run(args);
- }
-
- @Override
- public int run(String[] args) throws Exception {
- addInputOption();
- addOutputOption();
- addOption(OUTPUT_FORMAT_OPT, "of", "The optional output format for the results. Options: TEXT, CSV, JSON or GRAPH_ML",
- "TEXT");
- addOption(SUBSTRING_OPTION, "b", "The number of chars of the asFormatString() to print");
- addOption(NUM_WORDS_OPTION, "n", "The number of top terms to print");
- addOption(POINTS_DIR_OPTION, "p",
- "The directory containing points sequence files mapping input vectors to their cluster. "
- + "If specified, then the program will output the points associated with a cluster");
- addOption(SAMPLE_POINTS, "sp", "Specifies the maximum number of points to include _per_ cluster. The default "
- + "is to include all points");
- addOption(DICTIONARY_OPTION, "d", "The dictionary file");
- addOption(DICTIONARY_TYPE_OPTION, "dt", "The dictionary file type (text|sequencefile)", "text");
- addOption(buildOption(EVALUATE_CLUSTERS, "e", "Run ClusterEvaluator and CDbwEvaluator over the input. "
- + "The output will be appended to the rest of the output at the end.", false, false, null));
- addOption(DefaultOptionCreator.distanceMeasureOption().create());
-
- // output is optional, will print to System.out per default
- if (parseArguments(args, false, true) == null) {
- return -1;
- }
-
- seqFileDir = getInputPath();
- if (hasOption(POINTS_DIR_OPTION)) {
- pointsDir = new Path(getOption(POINTS_DIR_OPTION));
- }
- outputFile = getOutputFile();
- if (hasOption(SUBSTRING_OPTION)) {
- int sub = Integer.parseInt(getOption(SUBSTRING_OPTION));
- if (sub >= 0) {
- subString = sub;
- }
- }
- termDictionary = getOption(DICTIONARY_OPTION);
- dictionaryFormat = getOption(DICTIONARY_TYPE_OPTION);
- if (hasOption(NUM_WORDS_OPTION)) {
- numTopFeatures = Integer.parseInt(getOption(NUM_WORDS_OPTION));
- }
- if (hasOption(OUTPUT_FORMAT_OPT)) {
- outputFormat = OUTPUT_FORMAT.valueOf(getOption(OUTPUT_FORMAT_OPT));
- }
- if (hasOption(SAMPLE_POINTS)) {
- maxPointsPerCluster = Long.parseLong(getOption(SAMPLE_POINTS));
- } else {
- maxPointsPerCluster = Long.MAX_VALUE;
- }
- runEvaluation = hasOption(EVALUATE_CLUSTERS);
- String distanceMeasureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
- measure = ClassUtils.instantiateAs(distanceMeasureClass, DistanceMeasure.class);
-
- init();
- printClusters(null);
- return 0;
- }
-
- public void printClusters(String[] dictionary) throws Exception {
- Configuration conf = new Configuration();
-
- if (this.termDictionary != null) {
- if ("text".equals(dictionaryFormat)) {
- dictionary = VectorHelper.loadTermDictionary(new File(this.termDictionary));
- } else if ("sequencefile".equals(dictionaryFormat)) {
- dictionary = VectorHelper.loadTermDictionary(conf, this.termDictionary);
- } else {
- throw new IllegalArgumentException("Invalid dictionary format");
- }
- }
-
- Writer writer;
- boolean shouldClose;
- if (this.outputFile == null) {
- shouldClose = false;
- writer = new OutputStreamWriter(System.out, Charsets.UTF_8);
- } else {
- shouldClose = true;
- if (outputFile.getName().startsWith("s3n://")) {
- Path p = outputPath;
- FileSystem fs = FileSystem.get(p.toUri(), conf);
- writer = new OutputStreamWriter(fs.create(p), Charsets.UTF_8);
- } else {
- Files.createParentDirs(outputFile);
- writer = Files.newWriter(this.outputFile, Charsets.UTF_8);
- }
- }
- ClusterWriter clusterWriter = createClusterWriter(writer, dictionary);
- try {
- long numWritten = clusterWriter.write(new SequenceFileDirValueIterable<ClusterWritable>(new Path(seqFileDir,
- "part-*"), PathType.GLOB, conf));
-
- writer.flush();
- if (runEvaluation) {
- HadoopUtil.delete(conf, new Path("tmp/representative"));
- int numIters = 5;
- RepresentativePointsDriver.main(new String[]{
- "--input", seqFileDir.toString(),
- "--output", "tmp/representative",
- "--clusteredPoints", pointsDir.toString(),
- "--distanceMeasure", measure.getClass().getName(),
- "--maxIter", String.valueOf(numIters)
- });
- conf.set(RepresentativePointsDriver.DISTANCE_MEASURE_KEY, measure.getClass().getName());
- conf.set(RepresentativePointsDriver.STATE_IN_KEY, "tmp/representative/representativePoints-" + numIters);
- ClusterEvaluator ce = new ClusterEvaluator(conf, seqFileDir);
- writer.append("\n");
- writer.append("Inter-Cluster Density: ").append(String.valueOf(ce.interClusterDensity())).append("\n");
- writer.append("Intra-Cluster Density: ").append(String.valueOf(ce.intraClusterDensity())).append("\n");
- CDbwEvaluator cdbw = new CDbwEvaluator(conf, seqFileDir);
- writer.append("CDbw Inter-Cluster Density: ").append(String.valueOf(cdbw.interClusterDensity())).append("\n");
- writer.append("CDbw Intra-Cluster Density: ").append(String.valueOf(cdbw.intraClusterDensity())).append("\n");
- writer.append("CDbw Separation: ").append(String.valueOf(cdbw.separation())).append("\n");
- writer.flush();
- }
- log.info("Wrote {} clusters", numWritten);
- } finally {
- if (shouldClose) {
- Closeables.close(clusterWriter, false);
- } else {
- if (clusterWriter instanceof GraphMLClusterWriter) {
- clusterWriter.close();
- }
- }
- }
- }
-
- ClusterWriter createClusterWriter(Writer writer, String[] dictionary) throws IOException {
- ClusterWriter result;
-
- switch (outputFormat) {
- case TEXT:
- result = new ClusterDumperWriter(writer, clusterIdToPoints, measure, numTopFeatures, dictionary, subString);
- break;
- case CSV:
- result = new CSVClusterWriter(writer, clusterIdToPoints, measure);
- break;
- case GRAPH_ML:
- result = new GraphMLClusterWriter(writer, clusterIdToPoints, measure, numTopFeatures, dictionary, subString);
- break;
- case JSON:
- result = new JsonClusterWriter(writer, clusterIdToPoints, measure, numTopFeatures, dictionary);
- break;
- default:
- throw new IllegalStateException("Unknown outputformat: " + outputFormat);
- }
- return result;
- }
-
- /**
- * Convenience function to set the output format during testing.
- */
- public void setOutputFormat(OUTPUT_FORMAT of) {
- outputFormat = of;
- }
-
- private void init() {
- if (this.pointsDir != null) {
- Configuration conf = new Configuration();
- // read in the points
- clusterIdToPoints = readPoints(this.pointsDir, maxPointsPerCluster, conf);
- } else {
- clusterIdToPoints = Collections.emptyMap();
- }
- }
-
-
- public int getSubString() {
- return subString;
- }
-
- public void setSubString(int subString) {
- this.subString = subString;
- }
-
- public Map<Integer, List<WeightedPropertyVectorWritable>> getClusterIdToPoints() {
- return clusterIdToPoints;
- }
-
- public String getTermDictionary() {
- return termDictionary;
- }
-
- public void setTermDictionary(String termDictionary, String dictionaryType) {
- this.termDictionary = termDictionary;
- this.dictionaryFormat = dictionaryType;
- }
-
- public void setNumTopFeatures(int num) {
- this.numTopFeatures = num;
- }
-
- public int getNumTopFeatures() {
- return this.numTopFeatures;
- }
-
- public long getMaxPointsPerCluster() {
- return maxPointsPerCluster;
- }
-
- public void setMaxPointsPerCluster(long maxPointsPerCluster) {
- this.maxPointsPerCluster = maxPointsPerCluster;
- }
-
- public static Map<Integer, List<WeightedPropertyVectorWritable>> readPoints(Path pointsPathDir,
- long maxPointsPerCluster,
- Configuration conf) {
- Map<Integer, List<WeightedPropertyVectorWritable>> result = new TreeMap<>();
- for (Pair<IntWritable, WeightedPropertyVectorWritable> record
- : new SequenceFileDirIterable<IntWritable, WeightedPropertyVectorWritable>(pointsPathDir, PathType.LIST,
- PathFilters.logsCRCFilter(), conf)) {
- // key is the cluster id as an int; value is the weighted vector.
- // Group each point under the id of the cluster it belongs to.
- int keyValue = record.getFirst().get();
- List<WeightedPropertyVectorWritable> pointList = result.get(keyValue);
- if (pointList == null) {
- pointList = new ArrayList<>();
- result.put(keyValue, pointList);
- }
- if (pointList.size() < maxPointsPerCluster) {
- pointList.add(record.getSecond());
- }
- }
- return result;
- }
-}
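
The dumper deleted above is normally driven through its AbstractJob option parser, as in main() at the top of the class. A hedged invocation sketch; the flag spellings follow the option names registered in run(), and all paths are placeholders:

    new ClusterDumper().run(new String[] {
        "--input", "/output/clusters-10-final",    // hypothetical final clusters dir
        "--pointsDir", "/output/clusteredPoints",  // optional: also dump each cluster's points
        "--dictionary", "/output/dictionary.file-0",
        "--dictionaryType", "sequencefile",
        "--outputFormat", "CSV"
    });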

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumperWriter.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumperWriter.java b/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumperWriter.java
deleted file mode 100644
index 31858c4..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumperWriter.java
+++ /dev/null
@@ -1,100 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.clustering;
-
-import org.apache.hadoop.io.Text;
-import org.apache.mahout.clustering.AbstractCluster;
-import org.apache.mahout.clustering.Cluster;
-import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable;
-import org.apache.mahout.clustering.iterator.ClusterWritable;
-import org.apache.mahout.common.distance.DistanceMeasure;
-
-import java.io.IOException;
-import java.io.Writer;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-
-/**
- * Implements a {@link ClusterWriter} that outputs in the format used by ClusterDumper in Mahout 0.5
- */
-public class ClusterDumperWriter extends AbstractClusterWriter {
-
- private final int subString;
- private final String[] dictionary;
- private final int numTopFeatures;
-
- public ClusterDumperWriter(Writer writer, Map<Integer,List<WeightedPropertyVectorWritable>> clusterIdToPoints,
- DistanceMeasure measure, int numTopFeatures, String[] dictionary, int subString) {
- super(writer, clusterIdToPoints, measure);
- this.numTopFeatures = numTopFeatures;
- this.dictionary = dictionary;
- this.subString = subString;
- }
-
- @Override
- public void write(ClusterWritable clusterWritable) throws IOException {
- Cluster cluster = clusterWritable.getValue();
- String fmtStr = cluster.asFormatString(dictionary);
- Writer writer = getWriter();
- if (subString > 0 && fmtStr.length() > subString) {
- writer.write(':');
- writer.write(fmtStr, 0, Math.min(subString, fmtStr.length()));
- } else {
- writer.write(fmtStr);
- }
-
- writer.write('\n');
-
- if (dictionary != null) {
- String topTerms = getTopFeatures(clusterWritable.getValue().getCenter(), dictionary, numTopFeatures);
- writer.write("\tTop Terms: ");
- writer.write(topTerms);
- writer.write('\n');
- }
-
- Map<Integer,List<WeightedPropertyVectorWritable>> clusterIdToPoints = getClusterIdToPoints();
- List<WeightedPropertyVectorWritable> points = clusterIdToPoints.get(clusterWritable.getValue().getId());
- if (points != null) {
- writer.write("\tWeight : [props - optional]: Point:\n\t");
- for (Iterator<WeightedPropertyVectorWritable> iterator = points.iterator(); iterator.hasNext();) {
- WeightedPropertyVectorWritable point = iterator.next();
- writer.write(String.valueOf(point.getWeight()));
- Map<Text,Text> map = point.getProperties();
- // map can be null, since empty maps are returned as null when read back
- writer.write(" : [");
- if (map != null) {
- for (Map.Entry<Text,Text> entry : map.entrySet()) {
- writer.write(entry.getKey().toString());
- writer.write("=");
- writer.write(entry.getValue().toString());
- }
- }
- writer.write("]");
-
- writer.write(": ");
-
- writer.write(AbstractCluster.formatVector(point.getVector(), dictionary));
- if (iterator.hasNext()) {
- writer.write("\n\t");
- }
- }
- writer.write('\n');
- }
- }
-}
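
Piecing together the writes above, the per-cluster text output looks roughly as follows; all values are illustrative, and the first line comes from Cluster.asFormatString():

    VL-123{n=42 c=[0.100, 0.400] r=[0.010, 0.020]}
    	Top Terms: 
    		apache                                  =>                0.87
    	Weight : [props - optional]: Point:
    	1.0 : [distance=0.25]: [0.100, 0.400]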

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterWriter.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterWriter.java b/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterWriter.java
deleted file mode 100644
index 70f8f6f..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterWriter.java
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.clustering;
-
-import java.io.Closeable;
-import java.io.IOException;
-
-import org.apache.mahout.clustering.iterator.ClusterWritable;
-
-/**
- * Writes out clusters
- */
-public interface ClusterWriter extends Closeable {
-
- /**
- * Write all values in the Iterable to the output
- *
- * @param iterable The {@link Iterable} to loop over
- * @return the number of docs written
- * @throws java.io.IOException if there was a problem writing
- */
- long write(Iterable<ClusterWritable> iterable) throws IOException;
-
- /**
- * Write out a Cluster
- */
- void write(ClusterWritable clusterWritable) throws IOException;
-
- /**
- * Write the first {@code maxDocs} to the output.
- *
- * @param iterable The {@link Iterable} to loop over
- * @param maxDocs the maximum number of docs to write
- * @return The number of docs written
- * @throws IOException if there was a problem writing
- */
- long write(Iterable<ClusterWritable> iterable, long maxDocs) throws IOException;
-}
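
A minimal conforming implementation of the interface above, shown only to make the contract concrete; the class name is illustrative, and it counts clusters rather than formatting them:

    import java.io.IOException;
    import org.apache.mahout.clustering.iterator.ClusterWritable;

    public class CountingClusterWriter implements ClusterWriter {
      private long count;

      @Override
      public void write(ClusterWritable clusterWritable) {
        count++; // a real writer would format and emit the cluster here
      }

      @Override
      public long write(Iterable<ClusterWritable> iterable) throws IOException {
        return write(iterable, Long.MAX_VALUE);
      }

      @Override
      public long write(Iterable<ClusterWritable> iterable, long maxDocs) throws IOException {
        long written = 0;
        for (ClusterWritable cw : iterable) {
          if (written >= maxDocs) {
            break;
          }
          write(cw);
          written++;
        }
        return written;
      }

      @Override
      public void close() {
        // nothing to release
      }
    }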

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/clustering/GraphMLClusterWriter.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/clustering/GraphMLClusterWriter.java b/integration/src/main/java/org/apache/mahout/utils/clustering/GraphMLClusterWriter.java
deleted file mode 100644
index 25e8f3b..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/clustering/GraphMLClusterWriter.java
+++ /dev/null
@@ -1,216 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.clustering;
-
-import java.io.IOException;
-import java.io.Writer;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.Random;
-import java.util.regex.Pattern;
-
-import org.apache.mahout.clustering.Cluster;
-import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable;
-import org.apache.mahout.clustering.classify.WeightedVectorWritable;
-import org.apache.mahout.clustering.iterator.ClusterWritable;
-import org.apache.mahout.common.RandomUtils;
-import org.apache.mahout.common.StringUtils;
-import org.apache.mahout.common.distance.DistanceMeasure;
-import org.apache.mahout.math.NamedVector;
-import org.apache.mahout.math.Vector;
-
-/**
- * GraphML -- see http://gephi.org/users/supported-graph-formats/graphml-format/
- */
-public class GraphMLClusterWriter extends AbstractClusterWriter {
-
- private static final Pattern VEC_PATTERN = Pattern.compile("\\{|\\:|\\,|\\}");
- private final Map<Integer, Color> colors = new HashMap<>();
- private Color lastClusterColor;
- private float lastX;
- private float lastY;
- private Random random;
- private int posStep;
- private final String[] dictionary;
- private final int numTopFeatures;
- private final int subString;
-
- public GraphMLClusterWriter(Writer writer, Map<Integer, List<WeightedPropertyVectorWritable>> clusterIdToPoints,
- DistanceMeasure measure, int numTopFeatures, String[] dictionary, int subString)
- throws IOException {
- super(writer, clusterIdToPoints, measure);
- this.dictionary = dictionary;
- this.numTopFeatures = numTopFeatures;
- this.subString = subString;
- init(writer);
- }
-
- private void init(Writer writer) throws IOException {
- writer.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
- writer.append("<graphml xmlns=\"http://graphml.graphdrawing.org/xmlns\"\n"
- + "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"\n"
- + "xsi:schemaLocation=\"http://graphml.graphdrawing.org/xmlns\n"
- + "http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd\">");
- //support rgb
- writer.append("<key attr.name=\"r\" attr.type=\"int\" for=\"node\" id=\"r\"/>\n"
- + "<key attr.name=\"g\" attr.type=\"int\" for=\"node\" id=\"g\"/>\n"
- + "<key attr.name=\"b\" attr.type=\"int\" for=\"node\" id=\"b\"/>"
- + "<key attr.name=\"size\" attr.type=\"int\" for=\"node\" id=\"size\"/>"
- + "<key attr.name=\"weight\" attr.type=\"float\" for=\"edge\" id=\"weight\"/>"
- + "<key attr.name=\"x\" attr.type=\"float\" for=\"node\" id=\"x\"/>"
- + "<key attr.name=\"y\" attr.type=\"float\" for=\"node\" id=\"y\"/>");
- writer.append("<graph edgedefault=\"undirected\">");
- lastClusterColor = new Color();
- posStep = (int) (0.1 * clusterIdToPoints.size()) + 100;
- random = RandomUtils.getRandom();
- }
-
- /*
- <?xml version="1.0" encoding="UTF-8"?>
- <graphml xmlns="http://graphml.graphdrawing.org/xmlns"
- xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://graphml.graphdrawing.org/xmlns
- http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd">
- <graph id="G" edgedefault="undirected">
- <node id="n0"/>
- <node id="n1"/>
- <edge id="e1" source="n0" target="n1"/>
- </graph>
- </graphml>
- */
-
- @Override
- public void write(ClusterWritable clusterWritable) throws IOException {
- StringBuilder line = new StringBuilder();
- Cluster cluster = clusterWritable.getValue();
- Color rgb = getColor(cluster.getId());
-
- String topTerms = "";
- if (dictionary != null) {
- topTerms = getTopTerms(cluster.getCenter(), dictionary, numTopFeatures);
- }
- String clusterLabel = String.valueOf(cluster.getId()) + '_' + topTerms;
- //do some positioning so that items are visible and grouped together
- //TODO: put in a real layout algorithm
- float x = lastX + 1000;
- float y = lastY;
- if (x > (1000 + posStep)) {
- y = lastY + 1000;
- x = 0;
- }
-
- line.append(createNode(clusterLabel, rgb, x, y));
- List<WeightedPropertyVectorWritable> points = clusterIdToPoints.get(cluster.getId());
- if (points != null) {
- for (WeightedVectorWritable point : points) {
- Vector theVec = point.getVector();
- double distance = 1;
- if (measure != null) {
- //scale the distance
- distance = measure.distance(cluster.getCenter().getLengthSquared(), cluster.getCenter(), theVec) * 500;
- }
- String vecStr;
- int angle = random.nextInt(360); //pick an angle at random and then scale along that angle
- double angleRads = Math.toRadians(angle);
-
- float targetX = x + (float) (distance * Math.cos(angleRads));
- float targetY = y + (float) (distance * Math.sin(angleRads));
- if (theVec instanceof NamedVector) {
- vecStr = ((NamedVector) theVec).getName();
- } else {
- vecStr = theVec.asFormatString();
- //do some basic manipulations for display
- vecStr = VEC_PATTERN.matcher(vecStr).replaceAll("_");
- }
- if (subString > 0 && vecStr.length() > subString) {
- vecStr = vecStr.substring(0, subString);
- }
- line.append(createNode(vecStr, rgb, targetX, targetY));
- line.append(createEdge(clusterLabel, vecStr, distance));
- }
- }
- lastClusterColor = rgb;
- lastX = x;
- lastY = y;
- getWriter().append(line).append("\n");
- }
-
- private Color getColor(int clusterId) {
- Color result = colors.get(clusterId);
- if (result == null) {
- result = new Color();
- //there is probably some better way to color a graph
- int incR = 0;
- int incG = 0;
- int incB = 0;
- if (lastClusterColor.r + 20 < 256 && lastClusterColor.g + 20 < 256 && lastClusterColor.b + 20 < 256) {
- incR = 20;
- incG = 0;
- incB = 0;
- } else if (lastClusterColor.r + 20 >= 256 && lastClusterColor.g + 20 < 256 && lastClusterColor.b + 20 < 256) {
- incG = 20;
- incB = 0;
- } else if (lastClusterColor.r + 20 >= 256 && lastClusterColor.g + 20 >= 256 && lastClusterColor.b + 20 < 256) {
- incB = 20;
- } else {
- incR += 3;
- incG += 3;
- incB += 3;
- }
- result.r = (lastClusterColor.r + incR) % 256;
- result.g = (lastClusterColor.g + incG) % 256;
- result.b = (lastClusterColor.b + incB) % 256;
- colors.put(clusterId, result);
- }
- return result;
- }
-
- private static String createEdge(String left, String right, double distance) {
- left = StringUtils.escapeXML(left);
- right = StringUtils.escapeXML(right);
- return "<edge id=\"" + left + '_' + right + "\" source=\"" + left + "\" target=\"" + right + "\">"
- + "<data key=\"weight\">" + distance + "</data></edge>";
- }
-
- private static String createNode(String s, Color rgb, float x, float y) {
- return "<node id=\"" + StringUtils.escapeXML(s) + "\"><data key=\"r\">" + rgb.r
- + "</data>"
- + "<data key=\"g\">" + rgb.g
- + "</data>"
- + "<data key=\"b\">" + rgb.b
- + "</data>"
- + "<data key=\"x\">" + x
- + "</data>"
- + "<data key=\"y\">" + y
- + "</data>"
- + "</node>";
- }
-
- @Override
- public void close() throws IOException {
- getWriter().append("</graph>").append("</graphml>");
- super.close();
- }
-
- private static class Color {
- int r;
- int g;
- int b;
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/clustering/JsonClusterWriter.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/clustering/JsonClusterWriter.java b/integration/src/main/java/org/apache/mahout/utils/clustering/JsonClusterWriter.java
deleted file mode 100644
index d564a73..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/clustering/JsonClusterWriter.java
+++ /dev/null
@@ -1,188 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.mahout.utils.clustering;
-
-import java.io.IOException;
-import java.io.Writer;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.regex.Pattern;
-
-import org.apache.mahout.clustering.AbstractCluster;
-import org.apache.mahout.clustering.Cluster;
-import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable;
-import org.apache.mahout.clustering.iterator.ClusterWritable;
-import org.apache.mahout.common.distance.DistanceMeasure;
-import org.apache.mahout.math.NamedVector;
-import org.apache.mahout.math.Vector;
-import org.codehaus.jackson.map.ObjectMapper;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * Dump cluster info to JSON formatted lines. Heavily inspired by
- * ClusterDumperWriter.java and CSVClusterWriter.java
- *
- */
-public class JsonClusterWriter extends AbstractClusterWriter {
- private final String[] dictionary;
- private final int numTopFeatures;
- private final ObjectMapper jxn;
-
- private static final Logger log = LoggerFactory.getLogger(JsonClusterWriter.class);
- private static final Pattern VEC_PATTERN = Pattern.compile("\\{|\\:|\\,|\\}");
-
- public JsonClusterWriter(Writer writer,
- Map<Integer, List<WeightedPropertyVectorWritable>> clusterIdToPoints,
- DistanceMeasure measure, int numTopFeatures, String[] dictionary) {
- super(writer, clusterIdToPoints, measure);
- this.numTopFeatures = numTopFeatures;
- this.dictionary = dictionary;
- jxn = new ObjectMapper();
- }
-
- /**
- * Generate HashMap with cluster info and write as a single JSON formatted
- * line
- */
- @Override
- public void write(ClusterWritable clusterWritable) throws IOException {
- Map<String, Object> res = new HashMap<>();
-
- // get top terms
- if (dictionary != null) {
- List<Object> topTerms = getTopFeaturesList(clusterWritable.getValue()
- .getCenter(), dictionary, numTopFeatures);
- res.put("top_terms", topTerms);
- } else {
- res.put("top_terms", new ArrayList<>());
- }
-
- // get human-readable cluster representation
- Cluster cluster = clusterWritable.getValue();
- res.put("cluster_id", cluster.getId());
-
- if (dictionary != null) {
- Map<String,Object> fmtStr = cluster.asJson(dictionary);
- res.put("cluster", fmtStr);
-
- // get points
- List<Object> points = getPoints(cluster, dictionary);
- res.put("points", points);
- } else {
- res.put("cluster", new HashMap<>());
- res.put("points", new ArrayList<>());
- }
-
- // write JSON
- Writer writer = getWriter();
- writer.write(jxn.writeValueAsString(res) + "\n");
- }
-
- /**
- * Create a List of HashMaps containing top terms information
- *
- * @return List<Object>
- */
- public List<Object> getTopFeaturesList(Vector vector, String[] dictionary,
- int numTerms) {
-
- List<TermIndexWeight> vectorTerms = new ArrayList<>();
-
- for (Vector.Element elt : vector.nonZeroes()) {
- vectorTerms.add(new TermIndexWeight(elt.index(), elt.get()));
- }
-
- // Sort results in reverse order (i.e. weight in descending order)
- Collections.sort(vectorTerms, new Comparator<TermIndexWeight>() {
- @Override
- public int compare(TermIndexWeight one, TermIndexWeight two) {
- return Double.compare(two.weight, one.weight);
- }
- });
-
- List<Object> topTerms = new ArrayList<>();
-
- for (int i = 0; i < vectorTerms.size() && i < numTerms; i++) {
- int index = vectorTerms.get(i).index;
- String dictTerm = dictionary[index];
- if (dictTerm == null) {
- log.error("Dictionary entry missing for {}", index);
- continue;
- }
- Map<String, Object> termEntry = new HashMap<>();
- termEntry.put(dictTerm, vectorTerms.get(i).weight);
- topTerms.add(termEntry);
- }
-
- return topTerms;
- }
-
- /**
- * Create a List of HashMaps containing Vector point information
- *
- * @return List<Object>
- */
- public List<Object> getPoints(Cluster cluster, String[] dictionary) {
- List<Object> vectorObjs = new ArrayList<>();
- List<WeightedPropertyVectorWritable> points = getClusterIdToPoints().get(
- cluster.getId());
-
- if (points != null) {
- for (WeightedPropertyVectorWritable point : points) {
- Map<String, Object> entry = new HashMap<>();
- Vector theVec = point.getVector();
- if (theVec instanceof NamedVector) {
- entry.put("vector_name", ((NamedVector) theVec).getName());
- } else {
- String vecStr = theVec.asFormatString();
- // do some basic manipulations for display
- vecStr = VEC_PATTERN.matcher(vecStr).replaceAll("_");
- entry.put("vector_name", vecStr);
- }
- entry.put("weight", String.valueOf(point.getWeight()));
- try {
- entry.put("point",
- AbstractCluster.formatVectorAsJson(point.getVector(), dictionary));
- } catch (IOException e) {
- log.error("IOException: ", e);
- }
- vectorObjs.add(entry);
- }
- }
- return vectorObjs;
- }
-
- /**
- * Convenience class for sorting terms
- *
- */
- private static class TermIndexWeight {
- private final int index;
- private final double weight;
-
- TermIndexWeight(int index, double weight) {
- this.index = index;
- this.weight = weight;
- }
- }
-
-}
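
Each call to write() above emits one JSON object per line. An illustrative line for a cluster dumped with a dictionary present; the values are made up and field order depends on the backing HashMap:

    {"top_terms":[{"apache":0.87},{"mahout":0.54}],"cluster_id":123,"cluster":{...},"points":[{"vector_name":"docA","weight":"1.0","point":{...}}]}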

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/email/MailOptions.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/email/MailOptions.java b/integration/src/main/java/org/apache/mahout/utils/email/MailOptions.java
deleted file mode 100644
index 54ad43f..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/email/MailOptions.java
+++ /dev/null
@@ -1,186 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.email;
-
-import java.io.File;
-import java.nio.charset.Charset;
-import java.util.Map;
-import java.util.regex.Pattern;
-
-/**
- * Configuration options to be used by {@link MailProcessor}. Includes options controlling the exact output format
- * and which mail fields are included (body, to, from, subject, etc.)
- */
-public class MailOptions {
-
- public static final String FROM = "FROM";
- public static final String TO = "TO";
- public static final String REFS = "REFS";
- public static final String SUBJECT = "SUBJECT";
- public static final Pattern DEFAULT_QUOTED_TEXT = Pattern.compile("^(\\||>)");
-
- private boolean stripQuotedText;
- private File input;
- private String outputDir;
- private String prefix;
- private int chunkSize;
- private Charset charset;
- private String separator;
- private String bodySeparator = "\n";
- private boolean includeBody;
- private Pattern[] patternsToMatch;
- //maps FROM, TO, REFS, SUBJECT, etc. to the order they appear in patternsToMatch. See MailToRecMapper
- private Map<String, Integer> patternOrder;
-
- //the regular expression to use for identifying quoted text.
- private Pattern quotedTextPattern = DEFAULT_QUOTED_TEXT;
-
- public File getInput() {
- return input;
- }
-
- public void setInput(File input) {
- this.input = input;
- }
-
- public String getOutputDir() {
- return outputDir;
- }
-
- /**
- * Sets the output directory where sequence files will be written.
- */
- public void setOutputDir(String outputDir) {
- this.outputDir = outputDir;
- }
-
- public String getPrefix() {
- return prefix;
- }
-
- /**
- * Sets the prefix that is combined with the archive name and with message ids to create {@code SequenceFile} keys.
- * @param prefix the key prefix; the name of the directory containing the mail archive is commonly used.
- */
- public void setPrefix(String prefix) {
- this.prefix = prefix;
- }
-
- public int getChunkSize() {
- return chunkSize;
- }
-
- /**
- * Sets the size of each generated sequence file, in megabytes.
- */
- public void setChunkSize(int chunkSize) {
- this.chunkSize = chunkSize;
- }
-
- public Charset getCharset() {
- return charset;
- }
-
- /**
- * Sets the encoding of the input
- */
- public void setCharset(Charset charset) {
- this.charset = charset;
- }
-
- public String getSeparator() {
- return separator;
- }
-
- /**
- * Sets the separator to use in the output between metadata items (to, from, etc.).
- */
- public void setSeparator(String separator) {
- this.separator = separator;
- }
-
- public String getBodySeparator() {
- return bodySeparator;
- }
-
- /**
- * Sets the separator to use in the output between lines in the body, the default is "\n".
- */
- public void setBodySeparator(String bodySeparator) {
- this.bodySeparator = bodySeparator;
- }
-
- public boolean isIncludeBody() {
- return includeBody;
- }
-
- /**
- * Sets whether mail bodies are included in the output
- */
- public void setIncludeBody(boolean includeBody) {
- this.includeBody = includeBody;
- }
-
- public Pattern[] getPatternsToMatch() {
- return patternsToMatch;
- }
-
- /**
- * Sets the list of patterns to be applied in the given order to extract metadata fields (to, from, subject, etc.)
- * from the input
- */
- public void setPatternsToMatch(Pattern[] patternsToMatch) {
- this.patternsToMatch = patternsToMatch;
- }
-
- public Map<String, Integer> getPatternOrder() {
- return patternOrder;
- }
-
- public void setPatternOrder(Map<String, Integer> patternOrder) {
- this.patternOrder = patternOrder;
- }
-
- /**
- *
- * @return true if we should strip out quoted email text
- */
- public boolean isStripQuotedText() {
- return stripQuotedText;
- }
-
- /**
- *
- * Sets whether quoted text such as lines starting with | or > is stripped off.
- */
- public void setStripQuotedText(boolean stripQuotedText) {
- this.stripQuotedText = stripQuotedText;
- }
-
- public Pattern getQuotedTextPattern() {
- return quotedTextPattern;
- }
-
- /**
- * Sets the {@link java.util.regex.Pattern} to use to identify lines that are quoted text. Default is | and >
- * @see #setStripQuotedText(boolean)
- */
- public void setQuotedTextPattern(Pattern quotedTextPattern) {
- this.quotedTextPattern = quotedTextPattern;
- }
-}
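
Since MailOptions is a plain bean consumed by MailProcessor, configuring it was straightforward. A minimal sketch follows, assuming hypothetical input/output paths; the two header patterns are illustrative only (the real set and its order were dictated by MailToRecMapper via setPatternOrder).

import java.io.File;
import java.nio.charset.StandardCharsets;
import java.util.regex.Pattern;

import org.apache.mahout.utils.email.MailOptions;

public class MailOptionsSketch {
  public static void main(String[] args) {
    MailOptions options = new MailOptions();
    options.setInput(new File("/data/mail-archive"));  // hypothetical input directory
    options.setOutputDir("/tmp/mail-seqfiles");        // hypothetical output directory
    options.setPrefix("mail-archive");                 // combined with message ids for keys
    options.setChunkSize(64);                          // 64 MB per generated sequence file
    options.setCharset(StandardCharsets.UTF_8);
    options.setSeparator("\n");
    options.setIncludeBody(true);
    options.setStripQuotedText(true);                  // drop lines matching DEFAULT_QUOTED_TEXT
    options.setPatternsToMatch(new Pattern[] {         // illustrative patterns only
        Pattern.compile("^From: (.*)"),
        Pattern.compile("^Subject: (.*)")
    });
  }
}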
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/PostgreSQLJDBCDataModel.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/PostgreSQLJDBCDataModel.java b/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/PostgreSQLJDBCDataModel.java
deleted file mode 100644
index b838430..0000000
--- a/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/PostgreSQLJDBCDataModel.java
+++ /dev/null
@@ -1,172 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.impl.model.jdbc;
-
-import com.google.common.base.Preconditions;
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.common.IOUtils;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import javax.sql.DataSource;
-import java.sql.Connection;
-import java.sql.PreparedStatement;
-import java.sql.SQLException;
-
-/**
- * <p>
- * A {@link org.apache.mahout.cf.taste.model.JDBCDataModel} backed by a PostgreSQL database and
- * accessed via JDBC. It may work with other JDBC databases. By default, this class
- * assumes that there is a {@link javax.sql.DataSource} available under the JNDI name
- * "jdbc/taste", which gives access to a database with a "taste_preferences" table with the following schema:
- * </p>
- *
- * <p>
- *
- * <pre>
- * CREATE TABLE taste_preferences (
- * user_id BIGINT NOT NULL,
- * item_id BIGINT NOT NULL,
- * preference REAL NOT NULL,
- * PRIMARY KEY (user_id, item_id)
- * );
- * CREATE INDEX taste_preferences_user_id_index ON taste_preferences (user_id);
- * CREATE INDEX taste_preferences_item_id_index ON taste_preferences (item_id);
- * </pre>
- *
- * </p>
- *
- * @see SQL92JDBCDataModel
- */
-public class PostgreSQLJDBCDataModel extends SQL92JDBCDataModel {
-
- private static final Logger log = LoggerFactory.getLogger(PostgreSQLJDBCDataModel.class);
-
- private static final String POSTGRESQL_DUPLICATE_KEY_STATE = "23505"; // this is brittle...
-
- /**
- * <p>
- * Creates a PostgreSQLJDBCDataModel using the default {@link javax.sql.DataSource} (named
- * {@link #DEFAULT_DATASOURCE_NAME}) and default table/column names.
- * </p>
- *
- * @throws org.apache.mahout.cf.taste.common.TasteException
- * if {@link javax.sql.DataSource} can't be found
- */
- public PostgreSQLJDBCDataModel() throws TasteException {
- }
-
- /**
- * <p>
- * Creates a PostgreSQLJDBCDataModel using the {@link javax.sql.DataSource} found under the given name, and
- * using default table/column names.
- * </p>
- *
- * @param dataSourceName name of {@link javax.sql.DataSource} to look up
- * @throws org.apache.mahout.cf.taste.common.TasteException
- * if {@link javax.sql.DataSource} can't be found
- */
- public PostgreSQLJDBCDataModel(String dataSourceName) throws TasteException {
- super(dataSourceName);
- }
-
- /**
- * <p>
- * Creates a PostgreSQLJDBCDataModel using the given {@link javax.sql.DataSource} and default table/column names.
- * </p>
- *
- * @param dataSource {@link javax.sql.DataSource} to use
- */
- public PostgreSQLJDBCDataModel(DataSource dataSource) {
- super(dataSource);
- }
-
- /**
- * <p>
- * Creates a PostgreSQLJDBCDataModel using the given {@link javax.sql.DataSource} and the given table/column names.
- * </p>
- *
- * @param dataSource {@link javax.sql.DataSource} to use
- * @param preferenceTable name of table containing preference data
- * @param userIDColumn user ID column name
- * @param itemIDColumn item ID column name
- * @param preferenceColumn preference column name
- * @param timestampColumn timestamp column name (may be null)
- */
- public PostgreSQLJDBCDataModel(DataSource dataSource,
- String preferenceTable,
- String userIDColumn,
- String itemIDColumn,
- String preferenceColumn,
- String timestampColumn) {
- super(dataSource, preferenceTable, userIDColumn, itemIDColumn, preferenceColumn, timestampColumn);
- }
-
- /**
- * Override since PostgreSQL doesn't have the same non-standard capability that MySQL has, to optionally
- * insert or update in one statement.
- */
- @Override
- public void setPreference(long userID, long itemID, float value) throws TasteException {
- Preconditions.checkArgument(!Float.isNaN(value), "NaN value");
-
- log.debug("Setting preference for user {}, item {}", userID, itemID);
-
- String setPreferenceSQL = getSetPreferenceSQL();
-
- Connection conn = null;
- PreparedStatement stmt1 = null;
- PreparedStatement stmt2 = null;
- try {
- conn = getDataSource().getConnection();
-
- stmt1 = conn.prepareStatement(setPreferenceSQL);
- setLongParameter(stmt1, 1, userID);
- setLongParameter(stmt1, 2, itemID);
- stmt1.setDouble(3, value);
-
- log.debug("Executing SQL update: {}", setPreferenceSQL);
- try {
- stmt1.executeUpdate();
- } catch (SQLException sqle) {
- if (!POSTGRESQL_DUPLICATE_KEY_STATE.equals(sqle.getSQLState())) {
- throw sqle;
- }
- }
-
- // Now update the row; if the insert hit a duplicate key, the row already exists
-
- stmt2 = conn.prepareStatement(getUpdatePreferenceSQL());
- stmt2.setDouble(1, value);
- setLongParameter(stmt2, 2, userID);
- setLongParameter(stmt2, 3, itemID);
-
- log.debug("Executing SQL update: {}", getUpdatePreferenceSQL());
- stmt2.executeUpdate();
-
- } catch (SQLException sqle) {
- log.warn("Exception while setting preference", sqle);
- throw new TasteException(sqle);
- } finally {
- IOUtils.quietClose(null, stmt1, null);
- IOUtils.quietClose(null, stmt2, null);
- IOUtils.quietClose(null, null, conn);
- }
- }
-
-}
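
A minimal usage sketch of the class just removed, wiring it to an explicit DataSource instead of the default JNDI lookup. PGSimpleDataSource comes from the PostgreSQL JDBC driver, and all connection details below are assumptions.

import org.apache.mahout.cf.taste.common.TasteException;
import org.apache.mahout.cf.taste.impl.model.jdbc.PostgreSQLJDBCDataModel;
import org.postgresql.ds.PGSimpleDataSource;

public class PostgresModelSketch {
  public static void main(String[] args) throws TasteException {
    PGSimpleDataSource dataSource = new PGSimpleDataSource();
    dataSource.setServerName("localhost");  // assumed connection details
    dataSource.setDatabaseName("taste");
    dataSource.setUser("taste");
    dataSource.setPassword("secret");

    // Uses the default taste_preferences(user_id, item_id, preference) schema.
    PostgreSQLJDBCDataModel model = new PostgreSQLJDBCDataModel(dataSource);

    // Runs the two-statement path shown above: insert, swallow a duplicate-key
    // error if the row exists, then update.
    model.setPreference(1L, 42L, 4.5f);
  }
}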

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/ReloadFromJDBCDataModel.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/ReloadFromJDBCDataModel.java b/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/ReloadFromJDBCDataModel.java
deleted file mode 100644
index 0827416..0000000
--- a/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/ReloadFromJDBCDataModel.java
+++ /dev/null
@@ -1,178 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.impl.model.jdbc;
-
-import com.google.common.base.Preconditions;
-import org.apache.mahout.cf.taste.common.Refreshable;
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.impl.common.FastIDSet;
-import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
-import org.apache.mahout.cf.taste.impl.common.RefreshHelper;
-import org.apache.mahout.cf.taste.impl.model.GenericBooleanPrefDataModel;
-import org.apache.mahout.cf.taste.impl.model.GenericDataModel;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.apache.mahout.cf.taste.model.JDBCDataModel;
-import org.apache.mahout.cf.taste.model.PreferenceArray;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.util.Collection;
-import java.util.concurrent.Callable;
-
-/**
- * A {@link DataModel} which loads, and can re-load, data from a JDBC-backed {@link JDBCDataModel} into memory, as a
- * {@link GenericDataModel} or {@link GenericBooleanPrefDataModel}. It is intended to provide the speed
- * advantage of in-memory representation but be able to update periodically to pull in new data from a database source.
- */
-public final class ReloadFromJDBCDataModel implements DataModel {
-
- private static final Logger log = LoggerFactory.getLogger(ReloadFromJDBCDataModel.class);
-
- private DataModel delegateInMemory;
- private final JDBCDataModel delegate;
- private final RefreshHelper refreshHelper;
-
- public ReloadFromJDBCDataModel(JDBCDataModel delegate) throws TasteException {
- this.delegate = Preconditions.checkNotNull(delegate);
- refreshHelper = new RefreshHelper(new Callable<Void>() {
- @Override
- public Void call() {
- reload();
- return null; // nothing else to do; reload() has refreshed the in-memory delegate
- }
- });
- refreshHelper.addDependency(delegate);
- reload();
- if (delegateInMemory == null) {
- throw new TasteException("Failed to load data into memory");
- }
- }
-
- @Override
- public void refresh(Collection<Refreshable> alreadyRefreshed) {
- refreshHelper.refresh(alreadyRefreshed);
- }
-
- private void reload() {
- try {
- // Load new in-memory representation,
- log.info("Loading new JDBC delegate data...");
- DataModel newDelegateInMemory =
- delegate.hasPreferenceValues()
- ? new GenericDataModel(delegate.exportWithPrefs())
- : new GenericBooleanPrefDataModel(delegate.exportWithIDsOnly());
- // and then swap to it.
- log.info("New data loaded.");
- delegateInMemory = newDelegateInMemory;
- } catch (TasteException te) {
- log.warn("Error while reloading JDBC delegate data", te);
- // But continue with whatever is loaded
- }
- }
-
- public JDBCDataModel getDelegate() {
- return delegate;
- }
-
- public DataModel getDelegateInMemory() {
- return delegateInMemory;
- }
-
- // Delegated methods:
-
- @Override
- public LongPrimitiveIterator getUserIDs() throws TasteException {
- return delegateInMemory.getUserIDs();
- }
-
- @Override
- public PreferenceArray getPreferencesFromUser(long id) throws TasteException {
- return delegateInMemory.getPreferencesFromUser(id);
- }
-
- @Override
- public FastIDSet getItemIDsFromUser(long id) throws TasteException {
- return delegateInMemory.getItemIDsFromUser(id);
- }
-
- @Override
- public Float getPreferenceValue(long userID, long itemID) throws TasteException {
- return delegateInMemory.getPreferenceValue(userID, itemID);
- }
-
- @Override
- public Long getPreferenceTime(long userID, long itemID) throws TasteException {
- return delegateInMemory.getPreferenceTime(userID, itemID);
- }
-
- @Override
- public LongPrimitiveIterator getItemIDs() throws TasteException {
- return delegateInMemory.getItemIDs();
- }
-
- @Override
- public PreferenceArray getPreferencesForItem(long itemID) throws TasteException {
- return delegateInMemory.getPreferencesForItem(itemID);
- }
-
- @Override
- public int getNumItems() throws TasteException {
- return delegateInMemory.getNumItems();
- }
-
- @Override
- public int getNumUsers() throws TasteException {
- return delegateInMemory.getNumUsers();
- }
-
- @Override
- public int getNumUsersWithPreferenceFor(long itemID) throws TasteException {
- return delegateInMemory.getNumUsersWithPreferenceFor(itemID);
- }
-
- @Override
- public int getNumUsersWithPreferenceFor(long itemID1, long itemID2) throws TasteException {
- return delegateInMemory.getNumUsersWithPreferenceFor(itemID1, itemID2);
- }
-
- @Override
- public void setPreference(long userID, long itemID, float value) throws TasteException {
- delegateInMemory.setPreference(userID, itemID, value);
- }
-
- @Override
- public void removePreference(long userID, long itemID) throws TasteException {
- delegateInMemory.removePreference(userID, itemID);
- }
-
- @Override
- public boolean hasPreferenceValues() {
- return delegateInMemory.hasPreferenceValues();
- }
-
- @Override
- public float getMaxPreference() {
- return delegateInMemory.getMaxPreference();
- }
-
- @Override
- public float getMinPreference() {
- return delegateInMemory.getMinPreference();
- }
-
-}
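
A minimal sketch of the intended usage, assuming the JDBC delegate resolves via the default JNDI name "jdbc/taste": reads are served from the in-memory copy, and a periodic refresh pulls fresh data from the database.

import org.apache.mahout.cf.taste.common.TasteException;
import org.apache.mahout.cf.taste.impl.model.jdbc.PostgreSQLJDBCDataModel;
import org.apache.mahout.cf.taste.impl.model.jdbc.ReloadFromJDBCDataModel;
import org.apache.mahout.cf.taste.model.JDBCDataModel;

public class ReloadSketch {
  public static void main(String[] args) throws TasteException {
    JDBCDataModel jdbcModel = new PostgreSQLJDBCDataModel(); // JNDI "jdbc/taste" assumed
    ReloadFromJDBCDataModel model = new ReloadFromJDBCDataModel(jdbcModel);

    // All reads hit the in-memory copy, not the database.
    System.out.println(model.getNumUsers());

    // Called periodically (e.g. from a scheduled task) to pull in new data;
    // if the reload fails, the previously loaded data stays in place.
    model.refresh(null);
  }
}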

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/SQL92BooleanPrefJDBCDataModel.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/SQL92BooleanPrefJDBCDataModel.java b/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/SQL92BooleanPrefJDBCDataModel.java
deleted file mode 100644
index 19c575f..0000000
--- a/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/SQL92BooleanPrefJDBCDataModel.java
+++ /dev/null
@@ -1,221 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.impl.model.jdbc;
-
-import com.google.common.base.Preconditions;
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.impl.common.jdbc.AbstractJDBCComponent;
-import org.apache.mahout.common.IOUtils;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import javax.sql.DataSource;
-import java.sql.Connection;
-import java.sql.PreparedStatement;
-import java.sql.ResultSet;
-import java.sql.SQLException;
-
-/**
- * <p>
- * See also {@link org.apache.mahout.cf.taste.impl.model.jdbc.SQL92JDBCDataModel} --
- * the same, except that it deals with a table without preference info:
- * </p>
- *
- * <p>
- *
- * <pre>
- * CREATE TABLE taste_preferences (
- * user_id BIGINT NOT NULL,
- * item_id BIGINT NOT NULL,
- * PRIMARY KEY (user_id, item_id)
- * );
- * CREATE INDEX taste_preferences_user_id_index ON taste_preferences (user_id);
- * CREATE INDEX taste_preferences_item_id_index ON taste_preferences (item_id);
- * </pre>
- *
- * </p>
- *
- * @see SQL92JDBCDataModel
- */
-public class SQL92BooleanPrefJDBCDataModel extends AbstractBooleanPrefJDBCDataModel {
-
- private static final Logger log = LoggerFactory.getLogger(SQL92BooleanPrefJDBCDataModel.class);
-
- private final String verifyPreferenceSQL;
-
- /**
- * <p>
- * Creates a SQL92BooleanPrefJDBCDataModel using the default {@link javax.sql.DataSource} (named
- * {@link #DEFAULT_DATASOURCE_NAME}) and default table/column names.
- * </p>
- *
- * @throws org.apache.mahout.cf.taste.common.TasteException
- * if {@link javax.sql.DataSource} can't be found
- */
- public SQL92BooleanPrefJDBCDataModel() throws TasteException {
- this(DEFAULT_DATASOURCE_NAME);
- }
-
- /**
- * <p>
- * Creates a SQL92BooleanPrefJDBCDataModel using the {@link javax.sql.DataSource} found
- * under the given name, and using default table/column names.
- * </p>
- *
- * @param dataSourceName
- * name of {@link javax.sql.DataSource} to look up
- * @throws org.apache.mahout.cf.taste.common.TasteException
- * if {@link javax.sql.DataSource} can't be found
- */
- public SQL92BooleanPrefJDBCDataModel(String dataSourceName) throws TasteException {
- this(AbstractJDBCComponent.lookupDataSource(dataSourceName),
- DEFAULT_PREFERENCE_TABLE,
- DEFAULT_USER_ID_COLUMN,
- DEFAULT_ITEM_ID_COLUMN,
- DEFAULT_PREFERENCE_TIME_COLUMN);
- }
-
- /**
- * <p>
- * Creates a SQL92BooleanPrefJDBCDataModel using the given {@link javax.sql.DataSource} and default
- * table/column names.
- * </p>
- *
- * @param dataSource
- * {@link javax.sql.DataSource} to use
- */
- public SQL92BooleanPrefJDBCDataModel(DataSource dataSource) {
- this(dataSource,
- DEFAULT_PREFERENCE_TABLE,
- DEFAULT_USER_ID_COLUMN,
- DEFAULT_ITEM_ID_COLUMN,
- DEFAULT_PREFERENCE_TIME_COLUMN);
- }
-
- /**
- * <p>
- * Creates a SQL92BooleanPrefJDBCDataModel using the given {@link javax.sql.DataSource} and the given
- * table/column names.
- * </p>
- *
- * @param dataSource
- * {@link javax.sql.DataSource} to use
- * @param preferenceTable
- * name of table containing preference data
- * @param userIDColumn
- * user ID column name
- * @param itemIDColumn
- * item ID column name
- * @param timestampColumn timestamp column name (may be null)
- */
- public SQL92BooleanPrefJDBCDataModel(DataSource dataSource,
- String preferenceTable,
- String userIDColumn,
- String itemIDColumn,
- String timestampColumn) {
- super(dataSource, preferenceTable, userIDColumn, itemIDColumn,
- NO_SUCH_COLUMN,
- // getPreferenceSQL
- "SELECT 1 FROM " + preferenceTable + " WHERE " + userIDColumn + "=? AND " + itemIDColumn + "=?",
- // getPreferenceTimeSQL
- "SELECT " + timestampColumn + " FROM " + preferenceTable + " WHERE " + userIDColumn + "=? AND "
- + itemIDColumn + "=?",
- // getUserSQL
- "SELECT DISTINCT " + userIDColumn + ", " + itemIDColumn + " FROM " + preferenceTable + " WHERE "
- + userIDColumn + "=?",
- // getAllUsersSQL
- "SELECT DISTINCT " + userIDColumn + ", " + itemIDColumn + " FROM " + preferenceTable + " ORDER BY "
- + userIDColumn,
- // getNumItemsSQL
- "SELECT COUNT(DISTINCT " + itemIDColumn + ") FROM " + preferenceTable,
- // getNumUsersSQL
- "SELECT COUNT(DISTINCT " + userIDColumn + ") FROM " + preferenceTable,
- // setPreferenceSQL
- "INSERT INTO " + preferenceTable + '(' + userIDColumn + ',' + itemIDColumn + ") VALUES (?,?)",
- // removePreference SQL
- "DELETE FROM " + preferenceTable + " WHERE " + userIDColumn + "=? AND " + itemIDColumn + "=?",
- // getUsersSQL
- "SELECT DISTINCT " + userIDColumn + " FROM " + preferenceTable + " ORDER BY " + userIDColumn,
- // getItemsSQL
- "SELECT DISTINCT " + itemIDColumn + " FROM " + preferenceTable + " ORDER BY " + itemIDColumn,
- // getPrefsForItemSQL
- "SELECT DISTINCT " + userIDColumn + ", " + itemIDColumn + " FROM " + preferenceTable + " WHERE "
- + itemIDColumn + "=? ORDER BY " + userIDColumn,
- // getNumPreferenceForItemSQL
- "SELECT COUNT(1) FROM " + preferenceTable + " WHERE " + itemIDColumn + "=?",
- // getNumPreferenceForItemsSQL
- "SELECT COUNT(1) FROM " + preferenceTable + " tp1 JOIN " + preferenceTable + " tp2 " + "USING ("
- + userIDColumn + ") WHERE tp1." + itemIDColumn + "=? and tp2." + itemIDColumn + "=?",
- // getMaxPreferenceSQL
- "SELECT 1.0",
- // getMinPreferenceSQL
- "SELECT 1.0");
-
- verifyPreferenceSQL = "SELECT 1 FROM " + preferenceTable + " WHERE " + userIDColumn
- + "=? AND " + itemIDColumn + "=?";
- }
-
- protected String getVerifyPreferenceSQL() {
- return verifyPreferenceSQL;
- }
-
- /**
- * Override since SQL92 doesn't have the same non-standard capability that MySQL has, to optionally
- * ignore an insert that fails since the row exists already.
- */
- @Override
- public void setPreference(long userID, long itemID, float value) throws TasteException {
- Preconditions.checkArgument(!Float.isNaN(value), "NaN value");
- log.debug("Setting preference for user {}, item {}", userID, itemID);
-
- String setPreferenceSQL = getSetPreferenceSQL();
-
- Connection conn = null;
- PreparedStatement stmt1 = null;
- PreparedStatement stmt2 = null;
- ResultSet rs = null;
- try {
- conn = getDataSource().getConnection();
-
- stmt1 = conn.prepareStatement(verifyPreferenceSQL, ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY);
- setLongParameter(stmt1, 1, userID);
- setLongParameter(stmt1, 2, itemID);
- rs = stmt1.executeQuery();
-
- // test if the record exists already.
- if (!rs.first()) {
- stmt2 = conn.prepareStatement(setPreferenceSQL);
- setLongParameter(stmt2, 1, userID);
- setLongParameter(stmt2, 2, itemID);
- stmt2.setDouble(3, value);
-
- log.debug("Executing SQL update: {}", setPreferenceSQL);
- stmt2.executeUpdate();
- }
- } catch (SQLException sqle) {
- log.warn("Exception while setting preference", sqle);
- throw new TasteException(sqle);
- } finally {
- IOUtils.quietClose(rs);
- IOUtils.quietClose(stmt1);
- IOUtils.quietClose(stmt2);
- IOUtils.quietClose(conn);
- }
- }
-
-}
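
A short sketch of the boolean-preference contract implemented here, under the assumption that the DataSource points at the two-column schema above: only (user, item) pairs are stored, so the float passed to setPreference carries no information.

import javax.sql.DataSource;

import org.apache.mahout.cf.taste.common.TasteException;
import org.apache.mahout.cf.taste.impl.model.jdbc.SQL92BooleanPrefJDBCDataModel;

public class BooleanPrefSketch {
  static void recordAssociation(DataSource dataSource, long userID, long itemID)
      throws TasteException {
    SQL92BooleanPrefJDBCDataModel model = new SQL92BooleanPrefJDBCDataModel(dataSource);
    // Inserted only if the row does not already exist (the verify-then-insert
    // path above); the 1.0f is required by the DataModel interface but not stored.
    model.setPreference(userID, itemID, 1.0f);
  }
}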

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/SQL92JDBCDataModel.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/SQL92JDBCDataModel.java b/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/SQL92JDBCDataModel.java
deleted file mode 100644
index 39de620..0000000
--- a/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/SQL92JDBCDataModel.java
+++ /dev/null
@@ -1,248 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.impl.model.jdbc;
-
-import com.google.common.base.Preconditions;
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.impl.common.jdbc.AbstractJDBCComponent;
-import org.apache.mahout.common.IOUtils;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import javax.sql.DataSource;
-import java.sql.Connection;
-import java.sql.PreparedStatement;
-import java.sql.ResultSet;
-import java.sql.SQLException;
-
-/**
- * <p>
- * A {@link org.apache.mahout.cf.taste.model.JDBCDataModel} backed by a SQL92 compatible database and
- * accessed via JDBC. It should work with most JDBC databases, although it is not optimized for performance.
- * By default, this class assumes that there is a {@link javax.sql.DataSource} available under the JNDI name
- * "jdbc/taste", which gives access to a database with a "taste_preferences" table with the following schema:
- * </p>
- *
- * <p>
- *
- * <pre>
- * CREATE TABLE taste_preferences (
- * user_id BIGINT NOT NULL,
- * item_id BIGINT NOT NULL,
- * preference REAL NOT NULL,
- * PRIMARY KEY (user_id, item_id)
- * );
- * CREATE INDEX taste_preferences_user_id_index ON taste_preferences (user_id);
- * CREATE INDEX taste_preferences_item_id_index ON taste_preferences (item_id);
- * </pre>
- *
- * </p>
- *
- * @see SQL92BooleanPrefJDBCDataModel
- */
-public class SQL92JDBCDataModel extends AbstractJDBCDataModel {
-
- private static final Logger log = LoggerFactory.getLogger(SQL92JDBCDataModel.class);
-
- private final String updatePreferenceSQL;
- private final String verifyPreferenceSQL;
-
- /**
- * <p>
- * Creates a SQL92JDBCDataModel using the default {@link javax.sql.DataSource} (named
- * {@link #DEFAULT_DATASOURCE_NAME}) and default table/column names.
- * </p>
- *
- * @throws org.apache.mahout.cf.taste.common.TasteException
- * if {@link javax.sql.DataSource} can't be found
- */
- public SQL92JDBCDataModel() throws TasteException {
- this(DEFAULT_DATASOURCE_NAME);
- }
-
- /**
- * <p>
- * Creates a SQL92JDBCDataModel using the {@link javax.sql.DataSource} found under the given name, and
- * using default table/column names.
- * </p>
- *
- * @param dataSourceName
- * name of {@link javax.sql.DataSource} to look up
- * @throws org.apache.mahout.cf.taste.common.TasteException
- * if {@link javax.sql.DataSource} can't be found
- */
- public SQL92JDBCDataModel(String dataSourceName) throws TasteException {
- this(AbstractJDBCComponent.lookupDataSource(dataSourceName),
- DEFAULT_PREFERENCE_TABLE,
- DEFAULT_USER_ID_COLUMN,
- DEFAULT_ITEM_ID_COLUMN,
- DEFAULT_PREFERENCE_COLUMN,
- DEFAULT_PREFERENCE_TIME_COLUMN);
- }
-
- /**
- * <p>
- * Creates a SQL92JDBCDataModel using the given {@link javax.sql.DataSource} and default table/column names.
- * </p>
- *
- * @param dataSource
- * {@link javax.sql.DataSource} to use
- */
- public SQL92JDBCDataModel(DataSource dataSource) {
- this(dataSource,
- DEFAULT_PREFERENCE_TABLE,
- DEFAULT_USER_ID_COLUMN,
- DEFAULT_ITEM_ID_COLUMN,
- DEFAULT_PREFERENCE_COLUMN,
- DEFAULT_PREFERENCE_TIME_COLUMN);
- }
-
- /**
- * <p>
- * Creates a SQL92JDBCDataModel using the given {@link javax.sql.DataSource} and the given table/column names.
- * </p>
- *
- * @param dataSource
- * {@link javax.sql.DataSource} to use
- * @param preferenceTable
- * name of table containing preference data
- * @param userIDColumn
- * user ID column name
- * @param itemIDColumn
- * item ID column name
- * @param preferenceColumn
- * preference column name
- * @param timestampColumn timestamp column name (may be null)
- */
- public SQL92JDBCDataModel(DataSource dataSource,
- String preferenceTable,
- String userIDColumn,
- String itemIDColumn,
- String preferenceColumn,
- String timestampColumn) {
- super(dataSource, preferenceTable, userIDColumn, itemIDColumn, preferenceColumn,
- // getPreferenceSQL
- "SELECT " + preferenceColumn + " FROM " + preferenceTable + " WHERE " + userIDColumn + "=? AND "
- + itemIDColumn + "=?",
- // getPreferenceTimeSQL
- "SELECT " + timestampColumn + " FROM " + preferenceTable + " WHERE " + userIDColumn + "=? AND "
- + itemIDColumn + "=?",
- // getUserSQL
- "SELECT DISTINCT " + userIDColumn + ", " + itemIDColumn + ", " + preferenceColumn + " FROM " + preferenceTable
- + " WHERE " + userIDColumn + "=? ORDER BY " + itemIDColumn,
- // getAllUsersSQL
- "SELECT DISTINCT " + userIDColumn + ", " + itemIDColumn + ", " + preferenceColumn + " FROM " + preferenceTable
- + " ORDER BY " + userIDColumn + ", " + itemIDColumn,
- // getNumItemsSQL
- "SELECT COUNT(DISTINCT " + itemIDColumn + ") FROM " + preferenceTable,
- // getNumUsersSQL
- "SELECT COUNT(DISTINCT " + userIDColumn + ") FROM " + preferenceTable,
- // setPreferenceSQL
- "INSERT INTO " + preferenceTable + '(' + userIDColumn + ',' + itemIDColumn + ',' + preferenceColumn
- + ") VALUES (?,?,?)",
- // removePreference SQL
- "DELETE FROM " + preferenceTable + " WHERE " + userIDColumn + "=? AND " + itemIDColumn + "=?",
- // getUsersSQL
- "SELECT DISTINCT " + userIDColumn + " FROM " + preferenceTable + " ORDER BY " + userIDColumn,
- // getItemsSQL
- "SELECT DISTINCT " + itemIDColumn + " FROM " + preferenceTable + " ORDER BY " + itemIDColumn,
- // getPrefsForItemSQL
- "SELECT DISTINCT " + userIDColumn + ", " + itemIDColumn + ", " + preferenceColumn + " FROM " + preferenceTable
- + " WHERE " + itemIDColumn + "=? ORDER BY " + userIDColumn,
- // getNumPreferenceForItemSQL
- "SELECT COUNT(1) FROM " + preferenceTable + " WHERE " + itemIDColumn + "=?",
- // getNumPreferenceForItemsSQL
- "SELECT COUNT(1) FROM " + preferenceTable + " tp1 JOIN " + preferenceTable + " tp2 " + "USING ("
- + userIDColumn + ") WHERE tp1." + itemIDColumn + "=? and tp2." + itemIDColumn + "=?",
- // getMaxPreferenceSQL
- "SELECT MAX(" + preferenceColumn + ") FROM " + preferenceTable,
- // getMinPreferenceSQL
- "SELECT MIN(" + preferenceColumn + ") FROM " + preferenceTable);
-
- updatePreferenceSQL = "UPDATE " + preferenceTable + " SET " + preferenceColumn + "=? WHERE " + userIDColumn
- + "=? AND " + itemIDColumn + "=?";
- verifyPreferenceSQL = "SELECT " + preferenceColumn + " FROM " + preferenceTable + " WHERE " + userIDColumn
- + "=? AND " + itemIDColumn + "=?";
- }
-
- protected String getUpdatePreferenceSQL() {
- return updatePreferenceSQL;
- }
-
- protected String getVerifyPreferenceSQL() {
- return verifyPreferenceSQL;
- }
-
- /**
- * Override since SQL92 doesn't have the same non-standard capability that MySQL has, to optionally
- * insert or update in one statement.
- */
- @Override
- public void setPreference(long userID, long itemID, float value) throws TasteException {
- Preconditions.checkArgument(!Float.isNaN(value), "NaN value");
- log.debug("Setting preference for user {}, item {}", userID, itemID);
-
- String setPreferenceSQL = getSetPreferenceSQL();
-
- Connection conn = null;
- PreparedStatement stmt1 = null;
- PreparedStatement stmt2 = null;
- PreparedStatement stmt3 = null;
- ResultSet rs = null;
- try {
- conn = getDataSource().getConnection();
-
- stmt1 = conn.prepareStatement(verifyPreferenceSQL, ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY);
- setLongParameter(stmt1, 1, userID);
- setLongParameter(stmt1, 2, itemID);
- rs = stmt1.executeQuery();
-
- // test if the record exists already.
- if (rs.first()) {
- // then we update the record.
- stmt2 = conn.prepareStatement(updatePreferenceSQL);
- stmt2.setDouble(1, value);
- setLongParameter(stmt2, 2, userID);
- setLongParameter(stmt2, 3, itemID);
-
- log.debug("Executing SQL update: {}", updatePreferenceSQL);
- stmt2.executeUpdate();
-
- } else {
- // we'll insert the record
- stmt3 = conn.prepareStatement(setPreferenceSQL);
- setLongParameter(stmt3, 1, userID);
- setLongParameter(stmt3, 2, itemID);
- stmt3.setDouble(3, value);
-
- log.debug("Executing SQL update: {}", setPreferenceSQL);
- stmt3.executeUpdate();
- }
- } catch (SQLException sqle) {
- log.warn("Exception while setting preference", sqle);
- throw new TasteException(sqle);
- } finally {
- IOUtils.quietClose(rs);
- IOUtils.quietClose(stmt1);
- IOUtils.quietClose(stmt2);
- IOUtils.quietClose(stmt3);
- IOUtils.quietClose(conn);
- }
- }
-
-}
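
A minimal sketch of pointing the class at a non-default schema. The table and column names are illustrative assumptions, and the timestamp column is passed as null, which the constructor documents as permitted.

import javax.sql.DataSource;

import org.apache.mahout.cf.taste.impl.model.jdbc.SQL92JDBCDataModel;

public class CustomSchemaSketch {
  static SQL92JDBCDataModel buildModel(DataSource dataSource) {
    return new SQL92JDBCDataModel(
        dataSource,
        "ratings",      // preference table (assumed)
        "customer_id",  // user ID column (assumed)
        "product_id",   // item ID column (assumed)
        "stars",        // preference column (assumed)
        null);          // timestamp column (may be null)
  }
}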

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/mongodb/MongoDBDataModel.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/mongodb/MongoDBDataModel.java b/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/mongodb/MongoDBDataModel.java
deleted file mode 100644
index 92a4019..0000000
--- a/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/mongodb/MongoDBDataModel.java
+++ /dev/null
@@ -1,873 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.impl.model.mongodb;
-
-import com.google.common.base.Preconditions;
-import com.mongodb.BasicDBObject;
-import com.mongodb.DB;
-import com.mongodb.DBCollection;
-import com.mongodb.DBCursor;
-import com.mongodb.DBObject;
-import com.mongodb.Mongo;
-import org.apache.mahout.cf.taste.common.NoSuchItemException;
-import org.apache.mahout.cf.taste.common.NoSuchUserException;
-import org.apache.mahout.cf.taste.common.Refreshable;
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
-import org.apache.mahout.cf.taste.impl.common.FastIDSet;
-import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
-import org.apache.mahout.cf.taste.impl.model.GenericDataModel;
-import org.apache.mahout.cf.taste.impl.model.GenericPreference;
-import org.apache.mahout.cf.taste.impl.model.GenericUserPreferenceArray;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.apache.mahout.cf.taste.model.Preference;
-import org.apache.mahout.cf.taste.model.PreferenceArray;
-import org.bson.types.ObjectId;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.net.UnknownHostException;
-import java.text.DateFormat;
-import java.text.ParseException;
-import java.text.SimpleDateFormat;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Date;
-import java.util.List;
-import java.util.Locale;
-import java.util.Map;
-import java.util.concurrent.locks.ReentrantLock;
-import java.util.regex.Pattern;
-
-/**
- * <p>A {@link DataModel} backed by a MongoDB database. This class expects a
- * collection in the database which contains a user ID ({@code long} or
- * {@link ObjectId}), item ID ({@code long} or
- * {@link ObjectId}), preference value (optional) and timestamps
- * ("created_at", "deleted_at").</p>
- *
- * <p>An example of a document in MongoDB:</p>
- *
- * <p>{@code { "_id" : ObjectId("4d7627bf6c7d47ade9fc7780"),
- * "user_id" : ObjectId("4c2209fef3924d31102bd84b"),
- * "item_id" : ObjectId(4c2209fef3924d31202bd853),
- * "preference" : 0.5,
- * "created_at" : "Tue Mar 23 2010 20:48:43 GMT-0400 (EDT)" }
- * }</p>
- *
- * <p>Preference value is optional to accommodate applications that have no notion
- * of a preference value (that is, the user simply expresses a preference for
- * an item, but no degree of preference).</p>
- *
- * <p>The preference value is assumed to be parseable as a {@code double}.</p>
- *
- * <p>The user IDs and item IDs are assumed to be parseable as {@code long}s
- * or {@link ObjectId}s. In case of {@link ObjectId}s, the
- * model creates a {@code Map<ObjectId, Long>}
- * (collection "mongo_data_model_map") inside the MongoDB database. This
- * conversion is needed since Mahout uses the long datatype to feed the
- * recommender, and MongoDB uses 12 bytes to create its identifiers.</p>
- *
- * <p>The timestamps ("created_at", "deleted_at"), if present, are assumed to be
- * parseable as a {@code long} or {@link Date}. To express
- * timestamps as {@link Date}s, a {@link DateFormat}
- * must be provided in the class constructor. The default Date format is
- * {@code "EE MMM dd yyyy HH:mm:ss 'GMT'Z (zzz)"}. If this parameter
- * is set to null, timestamps are assumed to be parseable as {@code long}s.
- * </p>
- *
- * <p>It is also acceptable for the documents to contain additional fields.
- * Those fields will be ignored.</p>
- *
- * <p>This class will reload data from the MongoDB database when
- * {@link #refresh(Collection)} is called. MongoDBDataModel keeps the
- * timestamp of the last update. This variable and the fields "created_at"
- * and "deleted_at" help the model to determine if the triple
- * (user, item, preference) must be added or deleted.</p>
- */
-public final class MongoDBDataModel implements DataModel {
-
- private static final Logger log = LoggerFactory.getLogger(MongoDBDataModel.class);
-
- /** Default MongoDB host. Default: localhost */
- private static final String DEFAULT_MONGO_HOST = "localhost";
-
- /** Default MongoDB port. Default: 27017 */
- private static final int DEFAULT_MONGO_PORT = 27017;
-
- /** Default MongoDB database. Default: recommender */
- private static final String DEFAULT_MONGO_DB = "recommender";
-
- /**
- * Default MongoDB authentication flag.
- * Default: false (authentication is not required)
- */
- private static final boolean DEFAULT_MONGO_AUTH = false;
-
- /** Default MongoDB user. Default: recommender */
- private static final String DEFAULT_MONGO_USERNAME = "recommender";
-
- /** Default MongoDB password. Default: recommender */
- private static final String DEFAULT_MONGO_PASSWORD = "recommender";
-
- /** Default MongoDB table/collection. Default: items */
- private static final String DEFAULT_MONGO_COLLECTION = "items";
-
- /**
- * Default MongoDB update flag. When this flag is activated, the
- * DataModel updates both model and database. Default: true
- */
- private static final boolean DEFAULT_MONGO_MANAGE = true;
-
- /** Default MongoDB user ID field. Default: user_id */
- private static final String DEFAULT_MONGO_USER_ID = "user_id";
-
- /** Default MongoDB item ID field. Default: item_id */
- private static final String DEFAULT_MONGO_ITEM_ID = "item_id";
-
- /** Default MongoDB preference value field. Default: preference */
- private static final String DEFAULT_MONGO_PREFERENCE = "preference";
-
- /** Default MongoDB final remove flag. Default: false */
- private static final boolean DEFAULT_MONGO_FINAL_REMOVE = false;
-
- /**
- * Default MongoDB date format.
- * Default: "EE MMM dd yyyy HH:mm:ss 'GMT'Z (zzz)"
- */
- private static final DateFormat DEFAULT_DATE_FORMAT =
- new SimpleDateFormat("EE MMM dd yyyy HH:mm:ss 'GMT'Z (zzz)", Locale.ENGLISH);
-
- public static final String DEFAULT_MONGO_MAP_COLLECTION = "mongo_data_model_map";
-
- private static final Pattern ID_PATTERN = Pattern.compile("[a-f0-9]{24}");
-
- /** MongoDB host */
- private String mongoHost = DEFAULT_MONGO_HOST;
- /** MongoDB port */
- private int mongoPort = DEFAULT_MONGO_PORT;
- /** MongoDB database */
- private String mongoDB = DEFAULT_MONGO_DB;
- /**
- * MongoDB authentication flag. If this flag is set to false,
- * authentication is not required.
- */
- private boolean mongoAuth = DEFAULT_MONGO_AUTH;
- /** MongoDB user */
- private String mongoUsername = DEFAULT_MONGO_USERNAME;
- /** MongoDB pass */
- private String mongoPassword = DEFAULT_MONGO_PASSWORD;
- /** MongoDB table/collection */
- private String mongoCollection = DEFAULT_MONGO_COLLECTION;
- /** MongoDB mapping table/collection */
- private String mongoMapCollection = DEFAULT_MONGO_MAP_COLLECTION;
- /**
- * MongoDB update flag. When this flag is activated, the
- * DataModel updates both model and database
- */
- private boolean mongoManage = DEFAULT_MONGO_MANAGE;
- /** MongoDB user ID field */
- private String mongoUserID = DEFAULT_MONGO_USER_ID;
- /** MongoDB item ID field */
- private String mongoItemID = DEFAULT_MONGO_ITEM_ID;
- /** MongoDB preference value field */
- private String mongoPreference = DEFAULT_MONGO_PREFERENCE;
- /** MongoDB final remove flag. Default: false */
- private boolean mongoFinalRemove = DEFAULT_MONGO_FINAL_REMOVE;
- /** MongoDB date format */
- private DateFormat dateFormat = DEFAULT_DATE_FORMAT;
- private DBCollection collection;
- private DBCollection collectionMap;
- private Date mongoTimestamp;
- private final ReentrantLock reloadLock;
- private DataModel delegate;
- private boolean userIsObject;
- private boolean itemIsObject;
- private boolean preferenceIsString;
- private long idCounter;
-
- /**
- * Creates a new MongoDBDataModel
- */
- public MongoDBDataModel() throws UnknownHostException {
- this.reloadLock = new ReentrantLock();
- buildModel();
- }
-
- /**
- * Creates a new MongoDBDataModel with MongoDB basic configuration
- * (without authentication)
- *
- * @param host MongoDB host.
- * @param port MongoDB port. Default: 27017
- * @param database MongoDB database
- * @param collection MongoDB collection/table
- * @param manage If true, the model adds and removes users and items
- * from MongoDB database when the model is refreshed.
- * @param finalRemove If true, the model removes the user/item completely
- * from the MongoDB database. If false, the model adds the "deleted_at"
- * field with the current date to the "deleted" user/item.
- * @param format MongoDB date format. If null, the model uses timestamps.
- * @throws UnknownHostException if the database host cannot be resolved
- */
- public MongoDBDataModel(String host,
- int port,
- String database,
- String collection,
- boolean manage,
- boolean finalRemove,
- DateFormat format) throws UnknownHostException {
- mongoHost = host;
- mongoPort = port;
- mongoDB = database;
- mongoCollection = collection;
- mongoManage = manage;
- mongoFinalRemove = finalRemove;
- dateFormat = format;
- this.reloadLock = new ReentrantLock();
- buildModel();
- }
-
- /**
- * Creates a new MongoDBDataModel with MongoDB advanced configuration
- * (without authentication)
- *
- * @param userIDField Mongo user ID field
- * @param itemIDField Mongo item ID field
- * @param preferenceField Mongo preference value field
- * @param mappingCollection Mongo collection used to map ObjectIds to long ids
- * @throws UnknownHostException if the database host cannot be resolved
- * @see #MongoDBDataModel(String, int, String, String, boolean, boolean, DateFormat)
- */
- public MongoDBDataModel(String host,
- int port,
- String database,
- String collection,
- boolean manage,
- boolean finalRemove,
- DateFormat format,
- String userIDField,
- String itemIDField,
- String preferenceField,
- String mappingCollection) throws UnknownHostException {
- mongoHost = host;
- mongoPort = port;
- mongoDB = database;
- mongoCollection = collection;
- mongoManage = manage;
- mongoFinalRemove = finalRemove;
- dateFormat = format;
- mongoUserID = userIDField;
- mongoItemID = itemIDField;
- mongoPreference = preferenceField;
- mongoMapCollection = mappingCollection;
- this.reloadLock = new ReentrantLock();
- buildModel();
- }
-
- /**
- * Creates a new MongoDBDataModel with MongoDB basic configuration
- * (with authentication)
- *
- * @param user Mongo username (authentication)
- * @param password Mongo password (authentication)
- * @throws UnknownHostException if the database host cannot be resolved
- * @see #MongoDBDataModel(String, int, String, String, boolean, boolean, DateFormat)
- */
- public MongoDBDataModel(String host,
- int port,
- String database,
- String collection,
- boolean manage,
- boolean finalRemove,
- DateFormat format,
- String user,
- String password) throws UnknownHostException {
- mongoHost = host;
- mongoPort = port;
- mongoDB = database;
- mongoCollection = collection;
- mongoManage = manage;
- mongoFinalRemove = finalRemove;
- dateFormat = format;
- mongoAuth = true;
- mongoUsername = user;
- mongoPassword = password;
- this.reloadLock = new ReentrantLock();
- buildModel();
- }
-
- /**
- * Creates a new MongoDBDataModel with MongoDB advanced configuration
- * (with authentication)
- *
- * @throws UnknownHostException if the database host cannot be resolved
- * @see #MongoDBDataModel(String, int, String, String, boolean, boolean, DateFormat, String, String)
- */
- public MongoDBDataModel(String host,
- int port,
- String database,
- String collection,
- boolean manage,
- boolean finalRemove,
- DateFormat format,
- String user,
- String password,
- String userIDField,
- String itemIDField,
- String preferenceField,
- String mappingCollection) throws UnknownHostException {
- mongoHost = host;
- mongoPort = port;
- mongoDB = database;
- mongoCollection = collection;
- mongoManage = manage;
- mongoFinalRemove = finalRemove;
- dateFormat = format;
- mongoAuth = true;
- mongoUsername = user;
- mongoPassword = password;
- mongoUserID = userIDField;
- mongoItemID = itemIDField;
- mongoPreference = preferenceField;
- mongoMapCollection = mappingCollection;
- this.reloadLock = new ReentrantLock();
- buildModel();
- }
-
- /**
- * <p>
- * Adds/removes (user, item) pairs to/from the model.
- * </p>
- *
- * @param userID MongoDB user identifier
- * @param items List of pairs (item, preference) to be added or
- * deleted
- * @param add If true, this flag indicates that the pairs (user, item)
- * must be added to the model. If false, it indicates deletion.
- * @see #refresh(Collection)
- */
- public void refreshData(String userID,
- Iterable<List<String>> items,
- boolean add) throws NoSuchUserException, NoSuchItemException {
- checkData(userID, items, add);
- long id = Long.parseLong(fromIdToLong(userID, true));
- for (List<String> item : items) {
- item.set(0, fromIdToLong(item.get(0), false));
- }
- if (reloadLock.tryLock()) {
- try {
- if (add) {
- delegate = addUserItem(id, items);
- } else {
- delegate = removeUserItem(id, items);
- }
- } finally {
- reloadLock.unlock();
- }
- }
- }
-
-
- /**
- * <p>
- * Triggers "refresh" -- whatever that means -- of the implementation.
- * The general contract is that any implementation should always leave itself in a
- * consistent, operational state, and that the refresh atomically updates
- * internal state from old to new.
- * </p>
- *
- * @param alreadyRefreshed {@link Refreshable}s that are known to have already been refreshed as
- * a result of an initial call to a method on some object. This ensures
- * that objects in a refresh dependency graph aren't refreshed twice
- * needlessly.
- * @see #refreshData(String, Iterable, boolean)
- */
- @Override
- public void refresh(Collection<Refreshable> alreadyRefreshed) {
- BasicDBObject query = new BasicDBObject();
- query.put("deleted_at", new BasicDBObject("$gt", mongoTimestamp));
- DBCursor cursor = collection.find(query);
- Date ts = new Date(0);
- while (cursor.hasNext()) {
- Map<String,Object> user = (Map<String,Object>) cursor.next().toMap();
- String userID = getID(user.get(mongoUserID), true);
- Collection<List<String>> items = new ArrayList<>();
- List<String> item = new ArrayList<>();
- item.add(getID(user.get(mongoItemID), false));
- item.add(Float.toString(getPreference(user.get(mongoPreference))));
- items.add(item);
- try {
- refreshData(userID, items, false);
- } catch (NoSuchUserException e) {
- log.warn("No such user ID: {}", userID);
- } catch (NoSuchItemException e) {
- log.warn("No such items: {}", items);
- }
- if (ts.compareTo(getDate(user.get("created_at"))) < 0) {
- ts = getDate(user.get("created_at"));
- }
- }
- query = new BasicDBObject();
- query.put("created_at", new BasicDBObject("$gt", mongoTimestamp));
- cursor = collection.find(query);
- while (cursor.hasNext()) {
- Map<String,Object> user = (Map<String,Object>) cursor.next().toMap();
- if (!user.containsKey("deleted_at")) {
- String userID = getID(user.get(mongoUserID), true);
- Collection<List<String>> items = new ArrayList<>();
- List<String> item = new ArrayList<>();
- item.add(getID(user.get(mongoItemID), false));
- item.add(Float.toString(getPreference(user.get(mongoPreference))));
- items.add(item);
- try {
- refreshData(userID, items, true);
- } catch (NoSuchUserException e) {
- log.warn("No such user ID: {}", userID);
- } catch (NoSuchItemException e) {
- log.warn("No such items: {}", items);
- }
- if (ts.compareTo(getDate(user.get("created_at"))) < 0) {
- ts = getDate(user.get("created_at"));
- }
- }
- }
- if (mongoTimestamp.compareTo(ts) < 0) {
- mongoTimestamp = ts;
- }
- }
-
- /**
- * <p>
- * Translates the MongoDB identifier to Mahout/MongoDBDataModel's internal
- * identifier, if required.
- * </p>
- * <p>
- * If MongoDB identifiers are long datatypes, it returns the id.
- * </p>
- * <p>
- * This conversion is needed since Mahout uses the long datatype to feed the
- * recommender, and MongoDB uses 12 bytes to create its identifiers.
- * </p>
- *
- * @param id MongoDB identifier
- * @param isUser true if the ID identifies a user, false if it identifies an item
- * @return String containing the translation of the external MongoDB ID to
- * internal long ID (mapping).
- * @see #fromLongToId(long)
- * @see <a href="http://www.mongodb.org/display/DOCS/Object%20IDs">
- * Mongo Object IDs</a>
- */
- public String fromIdToLong(String id, boolean isUser) {
- DBObject objectIdLong = collectionMap.findOne(new BasicDBObject("element_id", id));
- if (objectIdLong != null) {
- Map<String,Object> idLong = (Map<String,Object>) objectIdLong.toMap();
- Object value = idLong.get("long_value");
- return value == null ? null : value.toString();
- } else {
- objectIdLong = new BasicDBObject();
- String longValue = Long.toString(idCounter++);
- objectIdLong.put("element_id", id);
- objectIdLong.put("long_value", longValue);
- collectionMap.insert(objectIdLong);
- log.info("Adding Translation {}: {} long_value: {}",
- isUser ? "User ID" : "Item ID", id, longValue);
- return longValue;
- }
- }
-
- /**
- * <p>
- * Translates the Mahout/MongoDBDataModel's internal identifier to MongoDB
- * identifier, if required.
- * </p>
- * <p>
- * If MongoDB identifiers are long datatypes, it returns the id in String
- * format.
- * </p>
- * <p>
- * This conversion is needed since Mahout uses the long datatype to feed the
- * recommender, and MongoDB uses 12 bytes to create its identifiers.
- * </p>
- *
- * @param id Mahout's internal identifier
- * @return String containing the translation of the internal long ID to
- * external MongoDB ID (mapping).
- * @see #fromIdToLong(String, boolean)
- * @see <a href="http://www.mongodb.org/display/DOCS/Object%20IDs">
- * Mongo Object IDs</a>
- */
- public String fromLongToId(long id) {
- DBObject objectIdLong = collectionMap.findOne(new BasicDBObject("long_value", Long.toString(id)));
- Map<String,Object> idLong = (Map<String,Object>) objectIdLong.toMap();
- Object value = idLong.get("element_id");
- return value == null ? null : value.toString();
- }
-
- /**
- * <p>
- * Checks if an ID is currently in the model.
- * </p>
- *
- * @param ID user or item ID
- * @return true if the ID is in the model; false otherwise.
- */
- public boolean isIDInModel(String ID) {
- DBObject objectIdLong = collectionMap.findOne(new BasicDBObject("element_id", ID));
- return objectIdLong != null;
- }
-
- /**
- * <p>
- * Date of the latest update of the model.
- * </p>
- *
- * @return Date with the latest update of the model.
- */
- public Date mongoUpdateDate() {
- return mongoTimestamp;
- }
-
- private void buildModel() throws UnknownHostException {
- userIsObject = false;
- itemIsObject = false;
- idCounter = 0;
- preferenceIsString = true;
- Mongo mongoDDBB = new Mongo(mongoHost, mongoPort);
- DB db = mongoDDBB.getDB(mongoDB);
- mongoTimestamp = new Date(0);
- FastByIDMap<Collection<Preference>> userIDPrefMap = new FastByIDMap<>();
- if (!mongoAuth || db.authenticate(mongoUsername, mongoPassword.toCharArray())) {
- collection = db.getCollection(mongoCollection);
- collectionMap = db.getCollection(mongoMapCollection);
- DBObject indexObj = new BasicDBObject();
- indexObj.put("element_id", 1);
- collectionMap.ensureIndex(indexObj);
- indexObj = new BasicDBObject();
- indexObj.put("long_value", 1);
- collectionMap.ensureIndex(indexObj);
- collectionMap.remove(new BasicDBObject());
- DBCursor cursor = collection.find();
- while (cursor.hasNext()) {
- Map<String,Object> user = (Map<String,Object>) cursor.next().toMap();
- if (!user.containsKey("deleted_at")) {
- long userID = Long.parseLong(fromIdToLong(getID(user.get(mongoUserID), true), true));
- long itemID = Long.parseLong(fromIdToLong(getID(user.get(mongoItemID), false), false));
- float ratingValue = getPreference(user.get(mongoPreference));
- Collection<Preference> userPrefs = userIDPrefMap.get(userID);
- if (userPrefs == null) {
- userPrefs = new ArrayList<>(2);
- userIDPrefMap.put(userID, userPrefs);
- }
- userPrefs.add(new GenericPreference(userID, itemID, ratingValue));
- if (user.containsKey("created_at")
- && mongoTimestamp.compareTo(getDate(user.get("created_at"))) < 0) {
- mongoTimestamp = getDate(user.get("created_at"));
- }
- }
- }
- }
- delegate = new GenericDataModel(GenericDataModel.toDataMap(userIDPrefMap, true));
- }
-
- private void removeMongoUserItem(String userID, String itemID) {
- String userId = fromLongToId(Long.parseLong(userID));
- String itemId = fromLongToId(Long.parseLong(itemID));
- if (isUserItemInDB(userId, itemId)) {
- mongoTimestamp = new Date();
- BasicDBObject query = new BasicDBObject();
- query.put(mongoUserID, userIsObject ? new ObjectId(userId) : userId);
- query.put(mongoItemID, itemIsObject ? new ObjectId(itemId) : itemId);
- if (mongoFinalRemove) {
- log.info(collection.remove(query).toString());
- } else {
- BasicDBObject update = new BasicDBObject();
- update.put("$set", new BasicDBObject("deleted_at", mongoTimestamp));
- log.info(collection.update(query, update).toString());
- }
-      log.info("Removing userID: {} itemID: {}", userID, itemID);
- }
- }
-
- private void addMongoUserItem(String userID, String itemID, String preferenceValue) {
- String userId = fromLongToId(Long.parseLong(userID));
- String itemId = fromLongToId(Long.parseLong(itemID));
- if (!isUserItemInDB(userId, itemId)) {
- mongoTimestamp = new Date();
- BasicDBObject user = new BasicDBObject();
- Object userIdObject = userIsObject ? new ObjectId(userId) : userId;
- Object itemIdObject = itemIsObject ? new ObjectId(itemId) : itemId;
- user.put(mongoUserID, userIdObject);
- user.put(mongoItemID, itemIdObject);
- user.put(mongoPreference, preferenceIsString ? preferenceValue : Double.parseDouble(preferenceValue));
- user.put("created_at", mongoTimestamp);
- collection.insert(user);
- log.info("Adding userID: {} itemID: {} preferenceValue: {}", userID, itemID, preferenceValue);
- }
- }
-
- private boolean isUserItemInDB(String userID, String itemID) {
- BasicDBObject query = new BasicDBObject();
- Object userId = userIsObject ? new ObjectId(userID) : userID;
- Object itemId = itemIsObject ? new ObjectId(itemID) : itemID;
- query.put(mongoUserID, userId);
- query.put(mongoItemID, itemId);
- return collection.findOne(query) != null;
- }
-
- private DataModel removeUserItem(long userID, Iterable<List<String>> items) {
- FastByIDMap<PreferenceArray> rawData = ((GenericDataModel) delegate).getRawUserData();
- for (List<String> item : items) {
- PreferenceArray prefs = rawData.get(userID);
- long itemID = Long.parseLong(item.get(0));
- if (prefs != null) {
- boolean exists = false;
- int length = prefs.length();
- for (int i = 0; i < length; i++) {
- if (prefs.getItemID(i) == itemID) {
- exists = true;
- break;
- }
- }
- if (exists) {
- rawData.remove(userID);
- if (length > 1) {
- PreferenceArray newPrefs = new GenericUserPreferenceArray(length - 1);
- for (int i = 0, j = 0; i < length; i++, j++) {
- if (prefs.getItemID(i) == itemID) {
- j--;
- } else {
- newPrefs.set(j, prefs.get(i));
- }
- }
- rawData.put(userID, newPrefs);
- }
- log.info("Removing userID: {} itemID: {}", userID, itemID);
- if (mongoManage) {
- removeMongoUserItem(Long.toString(userID), Long.toString(itemID));
- }
- }
- }
- }
- return new GenericDataModel(rawData);
- }
-
- private DataModel addUserItem(long userID, Iterable<List<String>> items) {
- FastByIDMap<PreferenceArray> rawData = ((GenericDataModel) delegate).getRawUserData();
- PreferenceArray prefs = rawData.get(userID);
- for (List<String> item : items) {
- long itemID = Long.parseLong(item.get(0));
- float preferenceValue = Float.parseFloat(item.get(1));
- boolean exists = false;
- if (prefs != null) {
- for (int i = 0; i < prefs.length(); i++) {
- if (prefs.getItemID(i) == itemID) {
- exists = true;
- prefs.setValue(i, preferenceValue);
- break;
- }
- }
- }
- if (!exists) {
- if (prefs == null) {
- prefs = new GenericUserPreferenceArray(1);
- } else {
- PreferenceArray newPrefs = new GenericUserPreferenceArray(prefs.length() + 1);
- for (int i = 0, j = 1; i < prefs.length(); i++, j++) {
- newPrefs.set(j, prefs.get(i));
- }
- prefs = newPrefs;
- }
- prefs.setUserID(0, userID);
- prefs.setItemID(0, itemID);
- prefs.setValue(0, preferenceValue);
- log.info("Adding userID: {} itemID: {} preferenceValue: {}", userID, itemID, preferenceValue);
- rawData.put(userID, prefs);
- if (mongoManage) {
- addMongoUserItem(Long.toString(userID),
- Long.toString(itemID),
- Float.toString(preferenceValue));
- }
- }
- }
- return new GenericDataModel(rawData);
- }
-
- private Date getDate(Object date) {
- if (date.getClass().getName().contains("Date")) {
- return (Date) date;
- }
- if (date.getClass().getName().contains("String")) {
- try {
- synchronized (dateFormat) {
- return dateFormat.parse(date.toString());
- }
- } catch (ParseException ioe) {
- log.warn("Error parsing timestamp", ioe);
- }
- }
- return new Date(0);
- }
-
- private float getPreference(Object value) {
- if (value != null) {
- if (value.getClass().getName().contains("String")) {
- preferenceIsString = true;
- return Float.parseFloat(value.toString());
- } else {
- preferenceIsString = false;
- return Double.valueOf(value.toString()).floatValue();
- }
- } else {
- return 0.5f;
- }
- }
-
- private String getID(Object id, boolean isUser) {
- if (id.getClass().getName().contains("ObjectId")) {
- if (isUser) {
- userIsObject = true;
- } else {
- itemIsObject = true;
- }
- return ((ObjectId) id).toStringMongod();
- } else {
- return id.toString();
- }
- }
-
- private void checkData(String userID,
- Iterable<List<String>> items,
- boolean add) throws NoSuchUserException, NoSuchItemException {
- Preconditions.checkNotNull(userID);
- Preconditions.checkNotNull(items);
- Preconditions.checkArgument(!userID.isEmpty(), "userID is empty");
- for (List<String> item : items) {
- Preconditions.checkNotNull(item.get(0));
- Preconditions.checkArgument(!item.get(0).isEmpty(), "item is empty");
- }
- if (userIsObject && !ID_PATTERN.matcher(userID).matches()) {
- throw new IllegalArgumentException();
- }
- for (List<String> item : items) {
- if (itemIsObject && !ID_PATTERN.matcher(item.get(0)).matches()) {
- throw new IllegalArgumentException();
- }
- }
- if (!add && !isIDInModel(userID)) {
- throw new NoSuchUserException();
- }
- for (List<String> item : items) {
- if (!add && !isIDInModel(item.get(0))) {
- throw new NoSuchItemException();
- }
- }
- }
-
- /**
- * Cleanup mapping collection.
- */
- public void cleanupMappingCollection() {
- collectionMap.drop();
- }
-
- @Override
- public LongPrimitiveIterator getUserIDs() throws TasteException {
- return delegate.getUserIDs();
- }
-
- @Override
- public PreferenceArray getPreferencesFromUser(long id) throws TasteException {
- return delegate.getPreferencesFromUser(id);
- }
-
- @Override
- public FastIDSet getItemIDsFromUser(long userID) throws TasteException {
- return delegate.getItemIDsFromUser(userID);
- }
-
- @Override
- public LongPrimitiveIterator getItemIDs() throws TasteException {
- return delegate.getItemIDs();
- }
-
- @Override
- public PreferenceArray getPreferencesForItem(long itemID) throws TasteException {
- return delegate.getPreferencesForItem(itemID);
- }
-
- @Override
- public Float getPreferenceValue(long userID, long itemID) throws TasteException {
- return delegate.getPreferenceValue(userID, itemID);
- }
-
- @Override
- public Long getPreferenceTime(long userID, long itemID) throws TasteException {
- return delegate.getPreferenceTime(userID, itemID);
- }
-
- @Override
- public int getNumItems() throws TasteException {
- return delegate.getNumItems();
- }
-
- @Override
- public int getNumUsers() throws TasteException {
- return delegate.getNumUsers();
- }
-
- @Override
- public int getNumUsersWithPreferenceFor(long itemID) throws TasteException {
- return delegate.getNumUsersWithPreferenceFor(itemID);
- }
-
- @Override
- public int getNumUsersWithPreferenceFor(long itemID1, long itemID2) throws TasteException {
- return delegate.getNumUsersWithPreferenceFor(itemID1, itemID2);
- }
-
- @Override
- public void setPreference(long userID, long itemID, float value) {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public void removePreference(long userID, long itemID) {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public boolean hasPreferenceValues() {
- return delegate.hasPreferenceValues();
- }
-
- @Override
- public float getMaxPreference() {
- return delegate.getMaxPreference();
- }
-
- @Override
- public float getMinPreference() {
- return delegate.getMinPreference();
- }
-
- @Override
- public String toString() {
- return "MongoDBDataModel";
- }
-
-}
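
A note on the design of the class removed above: the Taste API addresses users
and items exclusively by 64-bit longs, while MongoDB ObjectIds are 12 bytes, so
MongoDBDataModel kept a side collection that mapped each external ID to a
sequential long (the fromIdToLong/fromLongToId pair above). The following is a
minimal in-memory sketch of that bidirectional mapping; the class name and the
signatures are illustrative, not the Mahout API.

import java.util.HashMap;
import java.util.Map;

// Illustrative stand-in for the mapping collection that MongoDBDataModel
// kept in MongoDB ("element_id" <-> "long_value").
public class IdMapping {

  private final Map<String, Long> idToLong = new HashMap<>();
  private final Map<Long, String> longToId = new HashMap<>();
  private long idCounter = 0;

  // Mirrors fromIdToLong: return the existing mapping for an external ID,
  // or assign the next sequential long and record it in both directions.
  public synchronized long fromIdToLong(String externalId) {
    Long existing = idToLong.get(externalId);
    if (existing != null) {
      return existing;
    }
    long assigned = idCounter++;
    idToLong.put(externalId, assigned);
    longToId.put(assigned, externalId);
    return assigned;
  }

  // Mirrors fromLongToId: translate an internal long back to the external
  // ID, or null if that long was never assigned.
  public synchronized String fromLongToId(long internalId) {
    return longToId.get(internalId);
  }

  public static void main(String[] args) {
    IdMapping mapping = new IdMapping();
    long userId = mapping.fromIdToLong("5349b4ddd2781d08c09890f3"); // ObjectId hex
    System.out.println(userId);                       // 0
    System.out.println(mapping.fromLongToId(userId)); // 5349b4ddd2781d08c09890f3
  }
}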

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/AbstractJDBCInMemoryItemSimilarity.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/AbstractJDBCInMemoryItemSimilarity.java b/integration/src/main/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/AbstractJDBCInMemoryItemSimilarity.java
deleted file mode 100644
index 3ae9990..0000000
--- a/integration/src/main/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/AbstractJDBCInMemoryItemSimilarity.java
+++ /dev/null
@@ -1,132 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.impl.similarity.jdbc;
-
-import org.apache.mahout.cf.taste.common.Refreshable;
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.impl.common.jdbc.AbstractJDBCComponent;
-import org.apache.mahout.cf.taste.impl.common.jdbc.ResultSetIterator;
-import org.apache.mahout.cf.taste.impl.model.jdbc.ConnectionPoolDataSource;
-import org.apache.mahout.cf.taste.impl.similarity.GenericItemSimilarity;
-import org.apache.mahout.cf.taste.similarity.ItemSimilarity;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import javax.sql.DataSource;
-import java.sql.ResultSet;
-import java.sql.SQLException;
-import java.util.Collection;
-import java.util.Iterator;
-import java.util.concurrent.locks.ReentrantLock;
-
-/**
- * Loads all item-item similarities from the database into RAM.
- */
-abstract class AbstractJDBCInMemoryItemSimilarity extends AbstractJDBCComponent implements ItemSimilarity {
-
- private ItemSimilarity delegate;
-
- private final DataSource dataSource;
- private final String getAllItemSimilaritiesSQL;
- private final ReentrantLock reloadLock;
-
- private static final Logger log = LoggerFactory.getLogger(AbstractJDBCInMemoryItemSimilarity.class);
-
- AbstractJDBCInMemoryItemSimilarity(DataSource dataSource, String getAllItemSimilaritiesSQL) {
-
- AbstractJDBCComponent.checkNotNullAndLog("getAllItemSimilaritiesSQL", getAllItemSimilaritiesSQL);
-
- if (!(dataSource instanceof ConnectionPoolDataSource)) {
- log.warn("You are not using ConnectionPoolDataSource. Make sure your DataSource pools connections "
- + "to the database itself, or database performance will be severely reduced.");
- }
-
- this.dataSource = dataSource;
- this.getAllItemSimilaritiesSQL = getAllItemSimilaritiesSQL;
- this.reloadLock = new ReentrantLock();
-
- reload();
- }
-
- @Override
- public double itemSimilarity(long itemID1, long itemID2) throws TasteException {
- return delegate.itemSimilarity(itemID1, itemID2);
- }
-
- @Override
- public double[] itemSimilarities(long itemID1, long[] itemID2s) throws TasteException {
- return delegate.itemSimilarities(itemID1, itemID2s);
- }
-
- @Override
- public long[] allSimilarItemIDs(long itemID) throws TasteException {
- return delegate.allSimilarItemIDs(itemID);
- }
-
- @Override
- public void refresh(Collection<Refreshable> alreadyRefreshed) {
- log.debug("Reloading...");
- reload();
- }
-
- protected void reload() {
- if (reloadLock.tryLock()) {
- try {
- delegate = new GenericItemSimilarity(new JDBCSimilaritiesIterable(dataSource, getAllItemSimilaritiesSQL));
- } finally {
- reloadLock.unlock();
- }
- }
- }
-
- private static final class JDBCSimilaritiesIterable implements Iterable<GenericItemSimilarity.ItemItemSimilarity> {
-
- private final DataSource dataSource;
- private final String getAllItemSimilaritiesSQL;
-
- private JDBCSimilaritiesIterable(DataSource dataSource, String getAllItemSimilaritiesSQL) {
- this.dataSource = dataSource;
- this.getAllItemSimilaritiesSQL = getAllItemSimilaritiesSQL;
- }
-
- @Override
- public Iterator<GenericItemSimilarity.ItemItemSimilarity> iterator() {
- try {
- return new JDBCSimilaritiesIterator(dataSource, getAllItemSimilaritiesSQL);
- } catch (SQLException sqle) {
- throw new IllegalStateException(sqle);
- }
- }
- }
-
- private static final class JDBCSimilaritiesIterator
- extends ResultSetIterator<GenericItemSimilarity.ItemItemSimilarity> {
-
- private JDBCSimilaritiesIterator(DataSource dataSource, String getAllItemSimilaritiesSQL) throws SQLException {
- super(dataSource, getAllItemSimilaritiesSQL);
- }
-
- @Override
- protected GenericItemSimilarity.ItemItemSimilarity parseElement(ResultSet resultSet) throws SQLException {
- return new GenericItemSimilarity.ItemItemSimilarity(resultSet.getLong(1),
- resultSet.getLong(2),
- resultSet.getDouble(3));
- }
- }
-
-}
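
One detail of the class removed above worth spelling out: reload() takes the
lock with ReentrantLock.tryLock() rather than lock(), so a refresh() that
arrives while another thread is already rebuilding the delegate returns
immediately instead of queueing up a redundant rebuild. A standalone sketch of
that pattern follows; Reloadable and buildSnapshot() are illustrative names,
not Mahout API, and the volatile field is this sketch's choice for
cross-thread visibility.

import java.util.concurrent.locks.ReentrantLock;

// Illustrative sketch of the tryLock-guarded reload: concurrent refresh
// calls collapse into the single rebuild already in flight.
public class Reloadable {

  private volatile Object snapshot = buildSnapshot();
  private final ReentrantLock reloadLock = new ReentrantLock();

  public void refresh() {
    // If another thread holds the lock, a rebuild is already in progress;
    // skip instead of blocking, since its result will be just as fresh.
    if (reloadLock.tryLock()) {
      try {
        snapshot = buildSnapshot();
      } finally {
        reloadLock.unlock();
      }
    }
  }

  public Object current() {
    return snapshot;
  }

  private Object buildSnapshot() {
    // The real class reads every row over JDBC and builds a
    // GenericItemSimilarity; a placeholder stands in here.
    return new Object();
  }
}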

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/AbstractJDBCItemSimilarity.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/AbstractJDBCItemSimilarity.java b/integration/src/main/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/AbstractJDBCItemSimilarity.java
deleted file mode 100644
index 1b8d109..0000000
--- a/integration/src/main/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/AbstractJDBCItemSimilarity.java
+++ /dev/null
@@ -1,213 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.impl.similarity.jdbc;
-
-import java.sql.Connection;
-import java.sql.PreparedStatement;
-import java.sql.ResultSet;
-import java.sql.SQLException;
-import java.util.Collection;
-
-import javax.sql.DataSource;
-
-import org.apache.mahout.cf.taste.common.Refreshable;
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.impl.common.FastIDSet;
-import org.apache.mahout.cf.taste.impl.common.jdbc.AbstractJDBCComponent;
-import org.apache.mahout.cf.taste.impl.model.jdbc.ConnectionPoolDataSource;
-import org.apache.mahout.cf.taste.similarity.ItemSimilarity;
-import org.apache.mahout.common.IOUtils;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * An {@link ItemSimilarity} which draws pre-computed item-item similarities from a database table via JDBC.
- */
-public abstract class AbstractJDBCItemSimilarity extends AbstractJDBCComponent implements ItemSimilarity {
-
- private static final Logger log = LoggerFactory.getLogger(AbstractJDBCItemSimilarity.class);
-
- static final String DEFAULT_SIMILARITY_TABLE = "taste_item_similarity";
- static final String DEFAULT_ITEM_A_ID_COLUMN = "item_id_a";
- static final String DEFAULT_ITEM_B_ID_COLUMN = "item_id_b";
- static final String DEFAULT_SIMILARITY_COLUMN = "similarity";
-
- private final DataSource dataSource;
- private final String similarityTable;
- private final String itemAIDColumn;
- private final String itemBIDColumn;
- private final String similarityColumn;
- private final String getItemItemSimilaritySQL;
- private final String getAllSimilarItemIDsSQL;
-
- protected AbstractJDBCItemSimilarity(DataSource dataSource,
- String getItemItemSimilaritySQL,
- String getAllSimilarItemIDsSQL) {
- this(dataSource,
- DEFAULT_SIMILARITY_TABLE,
- DEFAULT_ITEM_A_ID_COLUMN,
- DEFAULT_ITEM_B_ID_COLUMN,
- DEFAULT_SIMILARITY_COLUMN,
- getItemItemSimilaritySQL,
- getAllSimilarItemIDsSQL);
- }
-
- protected AbstractJDBCItemSimilarity(DataSource dataSource,
- String similarityTable,
- String itemAIDColumn,
- String itemBIDColumn,
- String similarityColumn,
- String getItemItemSimilaritySQL,
- String getAllSimilarItemIDsSQL) {
- AbstractJDBCComponent.checkNotNullAndLog("similarityTable", similarityTable);
- AbstractJDBCComponent.checkNotNullAndLog("itemAIDColumn", itemAIDColumn);
- AbstractJDBCComponent.checkNotNullAndLog("itemBIDColumn", itemBIDColumn);
- AbstractJDBCComponent.checkNotNullAndLog("similarityColumn", similarityColumn);
-
- AbstractJDBCComponent.checkNotNullAndLog("getItemItemSimilaritySQL", getItemItemSimilaritySQL);
- AbstractJDBCComponent.checkNotNullAndLog("getAllSimilarItemIDsSQL", getAllSimilarItemIDsSQL);
-
- if (!(dataSource instanceof ConnectionPoolDataSource)) {
- log.warn("You are not using ConnectionPoolDataSource. Make sure your DataSource pools connections "
- + "to the database itself, or database performance will be severely reduced.");
- }
-
- this.dataSource = dataSource;
- this.similarityTable = similarityTable;
- this.itemAIDColumn = itemAIDColumn;
- this.itemBIDColumn = itemBIDColumn;
- this.similarityColumn = similarityColumn;
- this.getItemItemSimilaritySQL = getItemItemSimilaritySQL;
- this.getAllSimilarItemIDsSQL = getAllSimilarItemIDsSQL;
- }
-
- protected String getSimilarityTable() {
- return similarityTable;
- }
-
- protected String getItemAIDColumn() {
- return itemAIDColumn;
- }
-
- protected String getItemBIDColumn() {
- return itemBIDColumn;
- }
-
- protected String getSimilarityColumn() {
- return similarityColumn;
- }
-
- @Override
- public double itemSimilarity(long itemID1, long itemID2) throws TasteException {
- if (itemID1 == itemID2) {
- return 1.0;
- }
- Connection conn = null;
- PreparedStatement stmt = null;
- try {
- conn = dataSource.getConnection();
- stmt = conn.prepareStatement(getItemItemSimilaritySQL, ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY);
- stmt.setFetchDirection(ResultSet.FETCH_FORWARD);
- stmt.setFetchSize(getFetchSize());
- return doItemSimilarity(stmt, itemID1, itemID2);
- } catch (SQLException sqle) {
- log.warn("Exception while retrieving similarity", sqle);
- throw new TasteException(sqle);
- } finally {
- IOUtils.quietClose(null, stmt, conn);
- }
- }
-
- @Override
- public double[] itemSimilarities(long itemID1, long[] itemID2s) throws TasteException {
- double[] result = new double[itemID2s.length];
- Connection conn = null;
- PreparedStatement stmt = null;
- try {
- conn = dataSource.getConnection();
- stmt = conn.prepareStatement(getItemItemSimilaritySQL, ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY);
- stmt.setFetchDirection(ResultSet.FETCH_FORWARD);
- stmt.setFetchSize(getFetchSize());
- for (int i = 0; i < itemID2s.length; i++) {
- result[i] = doItemSimilarity(stmt, itemID1, itemID2s[i]);
- }
- } catch (SQLException sqle) {
- log.warn("Exception while retrieving item similarities", sqle);
- throw new TasteException(sqle);
- } finally {
- IOUtils.quietClose(null, stmt, conn);
- }
- return result;
- }
-
- @Override
- public long[] allSimilarItemIDs(long itemID) throws TasteException {
- FastIDSet allSimilarItemIDs = new FastIDSet();
- Connection conn = null;
- PreparedStatement stmt = null;
- ResultSet rs = null;
- try {
- conn = dataSource.getConnection();
- stmt = conn.prepareStatement(getAllSimilarItemIDsSQL, ResultSet.TYPE_FORWARD_ONLY,
- ResultSet.CONCUR_READ_ONLY);
- stmt.setFetchDirection(ResultSet.FETCH_FORWARD);
- stmt.setFetchSize(getFetchSize());
- stmt.setLong(1, itemID);
- stmt.setLong(2, itemID);
- rs = stmt.executeQuery();
- while (rs.next()) {
- allSimilarItemIDs.add(rs.getLong(1));
- allSimilarItemIDs.add(rs.getLong(2));
- }
- } catch (SQLException sqle) {
- log.warn("Exception while retrieving all similar itemIDs", sqle);
- throw new TasteException(sqle);
- } finally {
- IOUtils.quietClose(rs, stmt, conn);
- }
- allSimilarItemIDs.remove(itemID);
- return allSimilarItemIDs.toArray();
- }
-
- @Override
- public void refresh(Collection<Refreshable> alreadyRefreshed) {
- // do nothing
- }
-
- private double doItemSimilarity(PreparedStatement stmt, long itemID1, long itemID2) throws SQLException {
- // Order as smaller - larger
- if (itemID1 > itemID2) {
- long temp = itemID1;
- itemID1 = itemID2;
- itemID2 = temp;
- }
- stmt.setLong(1, itemID1);
- stmt.setLong(2, itemID2);
- log.debug("Executing SQL query: {}", getItemItemSimilaritySQL);
- ResultSet rs = null;
- try {
- rs = stmt.executeQuery();
- // If not found, perhaps the items exist but have no presence in the table,
- // so NaN is appropriate
- return rs.next() ? rs.getDouble(1) : Double.NaN;
- } finally {
- IOUtils.quietClose(rs);
- }
- }
-
-}
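
Two behaviors of the class removed above are easy to miss. First,
doItemSimilarity() swaps the IDs so the smaller one is always bound first,
which is why the table only needs one row per pair. Second, a pair with no row
yields Double.NaN rather than 0.0, keeping "no precomputed similarity"
distinct from "a similarity of zero". Below is a small sketch of a caller that
handles the NaN case explicitly; the helper and its fallback parameter are
illustrative, not Mahout API.

import org.apache.mahout.cf.taste.common.TasteException;
import org.apache.mahout.cf.taste.similarity.ItemSimilarity;

// Illustrative caller: map NaN ("no row for this pair") to an explicit
// fallback instead of letting it propagate through downstream arithmetic.
public final class SimilarityLookup {

  private SimilarityLookup() {}

  public static double similarityOrDefault(ItemSimilarity similarity,
                                           long itemA,
                                           long itemB,
                                           double fallback) throws TasteException {
    // Argument order does not matter here: the JDBC implementation
    // canonicalizes each pair as (smaller, larger) before querying.
    double value = similarity.itemSimilarity(itemA, itemB);
    return Double.isNaN(value) ? fallback : value;
  }
}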

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/MySQLJDBCInMemoryItemSimilarity.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/MySQLJDBCInMemoryItemSimilarity.java b/integration/src/main/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/MySQLJDBCInMemoryItemSimilarity.java
deleted file mode 100644
index cc831d9..0000000
--- a/integration/src/main/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/MySQLJDBCInMemoryItemSimilarity.java
+++ /dev/null
@@ -1,47 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.impl.similarity.jdbc;
-
-import org.apache.mahout.cf.taste.common.TasteException;
-
-import javax.sql.DataSource;
-
-public class MySQLJDBCInMemoryItemSimilarity extends SQL92JDBCInMemoryItemSimilarity {
-
- public MySQLJDBCInMemoryItemSimilarity() throws TasteException {
- }
-
- public MySQLJDBCInMemoryItemSimilarity(String dataSourceName) throws TasteException {
- super(dataSourceName);
- }
-
- public MySQLJDBCInMemoryItemSimilarity(DataSource dataSource) {
- super(dataSource);
- }
-
- public MySQLJDBCInMemoryItemSimilarity(DataSource dataSource, String getAllItemSimilaritiesSQL) {
- super(dataSource, getAllItemSimilaritiesSQL);
- }
-
- @Override
- protected int getFetchSize() {
-    // Returning Integer.MIN_VALUE makes MySQL Connector/J use streaming mode
- return Integer.MIN_VALUE;
- }
-
-}
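
The Integer.MIN_VALUE fetch size above is a MySQL Connector/J convention: on a
TYPE_FORWARD_ONLY, CONCUR_READ_ONLY statement it switches the driver to
row-by-row streaming instead of buffering the entire result set in client
memory, which matters when iterating over every row of a large similarity
table. A self-contained sketch of the same setup in plain JDBC; the URL,
credentials, and query are placeholders.

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;

// Illustrative: stream a large MySQL result set row by row.
public class StreamingQuery {

  public static void main(String[] args) throws SQLException {
    String url = "jdbc:mysql://localhost/taste"; // placeholder URL
    try (Connection conn = DriverManager.getConnection(url, "user", "password");
         PreparedStatement stmt = conn.prepareStatement(
             "SELECT item_id_a, item_id_b, similarity FROM taste_item_similarity",
             ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY)) {
      // Connector/J only streams when the fetch size is exactly
      // Integer.MIN_VALUE on a forward-only, read-only statement.
      stmt.setFetchSize(Integer.MIN_VALUE);
      try (ResultSet rs = stmt.executeQuery()) {
        while (rs.next()) {
          System.out.printf("%d %d %f%n",
              rs.getLong(1), rs.getLong(2), rs.getDouble(3));
        }
      }
    }
  }
}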

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/MySQLJDBCItemSimilarity.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/MySQLJDBCItemSimilarity.java b/integration/src/main/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/MySQLJDBCItemSimilarity.java
deleted file mode 100644
index af0742e..0000000
--- a/integration/src/main/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/MySQLJDBCItemSimilarity.java
+++ /dev/null
@@ -1,103 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.impl.similarity.jdbc;
-
-import javax.sql.DataSource;
-
-import org.apache.mahout.cf.taste.common.TasteException;
-
-/**
- * <p>
- * An {@link org.apache.mahout.cf.taste.similarity.ItemSimilarity} backed by a MySQL database
- * and accessed via JDBC. It may work with other JDBC
- * databases. By default, this class assumes that there is a {@link DataSource} available under the JNDI name
- * "jdbc/taste", which gives access to a database with a "taste_item_similarity" table with the following
- * schema:
- * </p>
- *
- * <table>
- * <tr>
- * <th>item_id_a</th>
- * <th>item_id_b</th>
- * <th>similarity</th>
- * </tr>
- * <tr>
- * <td>ABC</td>
- * <td>DEF</td>
- * <td>0.9</td>
- * </tr>
- * <tr>
- * <td>DEF</td>
- * <td>EFG</td>
- * <td>0.1</td>
- * </tr>
- * </table>
- *
- * <p>
- * For example, the following command sets up a suitable table in MySQL, complete with primary key and
- * indexes:
- * </p>
- *
- * <p>
- *
- * <pre>
- * CREATE TABLE taste_item_similarity (
- * item_id_a BIGINT NOT NULL,
- * item_id_b BIGINT NOT NULL,
- * similarity FLOAT NOT NULL,
- *   PRIMARY KEY (item_id_a, item_id_b)
- * )
- * </pre>
- *
- * </p>
- *
- * <p>
- * Note that for each row, item_id_a should be less than item_id_b. Storing each pair both ways would
- * be redundant, so every pair is stored once, with the lesser ID first.
- * </p>
- *
- * @see org.apache.mahout.cf.taste.impl.model.jdbc.MySQLJDBCDataModel
- */
-public class MySQLJDBCItemSimilarity extends SQL92JDBCItemSimilarity {
-
- public MySQLJDBCItemSimilarity() throws TasteException {
- }
-
- public MySQLJDBCItemSimilarity(String dataSourceName) throws TasteException {
- super(dataSourceName);
- }
-
- public MySQLJDBCItemSimilarity(DataSource dataSource) {
- super(dataSource);
- }
-
- public MySQLJDBCItemSimilarity(DataSource dataSource,
- String similarityTable,
- String itemAIDColumn,
- String itemBIDColumn,
- String similarityColumn) {
- super(dataSource, similarityTable, itemAIDColumn, itemBIDColumn, similarityColumn);
- }
-
- @Override
- protected int getFetchSize() {
-    // Returning Integer.MIN_VALUE makes MySQL Connector/J use streaming mode
- return Integer.MIN_VALUE;
- }
-
-}
-
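
Because the table stores each pair only once, with the lesser ID first (see
the Javadoc note above), anything that writes rows must enforce that ordering
itself. A minimal sketch of such an insert in plain JDBC; the helper name is
illustrative.

import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;

// Illustrative writer for taste_item_similarity: store each pair ordered
// (lesser, greater) so lookups that swap the IDs still find a single row.
public final class SimilarityWriter {

  private SimilarityWriter() {}

  public static void insert(Connection conn, long itemA, long itemB,
                            double similarity) throws SQLException {
    long lo = Math.min(itemA, itemB);
    long hi = Math.max(itemA, itemB);
    try (PreparedStatement stmt = conn.prepareStatement(
        "INSERT INTO taste_item_similarity (item_id_a, item_id_b, similarity) "
            + "VALUES (?, ?, ?)")) {
      stmt.setLong(1, lo);
      stmt.setLong(2, hi);
      stmt.setDouble(3, similarity);
      stmt.executeUpdate();
    }
  }
}
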
r***@apache.org
2018-06-27 14:52:05 UTC
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/src/images/logos/mahout-logo.svg
----------------------------------------------------------------------
diff --git a/community/mahout-mr/src/images/logos/mahout-logo.svg b/community/mahout-mr/src/images/logos/mahout-logo.svg
new file mode 100644
index 0000000..374c89d
--- /dev/null
+++ b/community/mahout-mr/src/images/logos/mahout-logo.svg
@@ -0,0 +1,627 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<svg version="1.1" id="Layer_1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" x="0px" y="0px"
+ width="956px" height="400px" viewBox="0 0 956 400" enable-background="new 0 0 956 400" xml:space="preserve">
+<g>
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#23A9FF" d="M709.799,389.6c-21.38,0-37.761-6.839-48.688-20.322
+ c-0.377-0.467-0.747-0.935-1.11-1.408V376c0,5.523-4.478,10.001-10.001,10.001h-28.6c-5.522,0-10-4.478-10-10.001v-64.87
+ c0-4.989-0.908-7.693-1.669-9.083c-0.053-0.096-0.104-0.194-0.154-0.292c-0.32-0.634-0.987-1.954-5.366-1.954
+ c-5.29,0-7.384,1.85-8.617,3.464c-2.353,3.07-3.593,8.255-3.593,15.005V376c0,5.523-4.477,10.001-10,10.001h-27.8
+ c-0.756,0-1.492-0.085-2.201-0.244c-0.708,0.159-1.444,0.244-2.2,0.244h-30.271c-3.453,0-6.61-1.776-8.425-4.61
+ c-0.791,0.505-1.595,0.995-2.412,1.471c-7.595,4.351-16.133,6.54-25.442,6.54c-11.384,0-21.145-3.183-29.042-9.469
+ c-1.529,3.569-5.072,6.068-9.198,6.068h-28.408c-5.523,0-10-4.478-10-10.001v-67.812c0-3.194-0.564-4.789-0.9-5.458
+ c-0.392-0.777-0.97-1.93-4.821-1.93c-4.724,0-5.983,1.728-6.896,3.676c-0.919,2.061-1.383,4.79-1.383,8.113V376
+ c0,5.523-4.477,10.001-10,10.001h-27.8c-5.523,0-10-4.478-10-10.001v-63.33c0-6.95-0.88-9.239-1.055-9.627
+ c-0.351-0.763-0.845-1.844-4.675-1.844c-5.691,0-6.793,1.673-7.148,2.329c-0.298,0.616-1.122,2.832-1.122,8.451V376
+ c0,5.523-4.477,10.001-10,10.001h-28.199c-5.523,0-10-4.478-10-10.001V269.8c0-5.522,4.477-10,10-10h26.999
+ c2.902,0,5.514,1.235,7.34,3.209c6.486-3.852,14.321-5.809,23.34-5.809c10.216,0,18.796,2.437,25.504,7.242
+ c0.185,0.133,0.368,0.272,0.545,0.419c1.322,1.091,2.566,2.261,3.73,3.505c2.438-2.188,5.07-4.048,7.884-5.57
+ c0.07-0.037,0.14-0.074,0.211-0.111c7.126-3.639,15.103-5.484,23.707-5.484c5.958,0,11.882,1.164,17.608,3.456
+ c6.131,2.448,11.667,6.673,16.449,12.554c1.573,1.945,2.946,4.052,4.116,6.312c0.939-1.602,1.974-3.131,3.1-4.586
+ C462.511,263.016,477.94,257,499.041,257c13.235,0,25.249,2.715,35.706,8.067c3.12,1.598,6.458,3.872,9.454,7.101v-39.569
+ c0-5.522,4.477-10,10-10h27.8c5.523,0,10,4.478,10,10v28.484c6.504-2.974,13.447-4.483,20.639-4.483
+ c7.865,0,15.192,1.418,21.774,4.218c7.009,3,12.832,7.628,17.329,13.761c2.014,2.758,3.63,5.599,4.846,8.499
+ c1.368-2.145,2.862-4.229,4.481-6.253c10.92-13.683,27.316-20.624,48.729-20.624c21.414,0,37.812,6.941,48.737,20.633
+ c0.225,0.278,0.444,0.562,0.665,0.843v-8.274c0-5.523,4.477-10,10-10h28.6c5.523,0,10,4.477,10,10v64.358
+ c0,6.407,0.92,8.881,1.203,9.484c0.409,0.88,1.098,2.354,5.816,2.354c6.393,0,8.763-2.237,10.312-5.607
+ c0.86-2.016,1.867-5.809,1.867-12.502v-58.088c0-5.523,4.477-10,10-10h28.201c1.719,0,3.338,0.434,4.749,1.198h2.85v-20.001
+ c0-5.522,4.478-10,10.001-10h27.6c5.522,0,10,4.478,10,10V260.6h7.198c5.523,0,10,4.477,10,10v19.602c0,5.523-4.477,10-10,10H920.4
+ v46.178c0.521,0.013,1.106,0.021,1.76,0.021c0.63,0,1.279-0.023,1.929-0.071c0.704-0.053,1.405-0.129,2.085-0.227
+ c0.475-0.067,0.952-0.103,1.427-0.103c2.388,0,4.717,0.856,6.547,2.442c2.192,1.899,3.451,4.658,3.451,7.558v20.8
+ c0,5.347-4.205,9.745-9.545,9.989l-13.179,0.602c-0.037,0.002-0.076,0.004-0.113,0.004c-1.198,0.042-2.364,0.062-3.501,0.062
+ c-14.403,0-24.539-3.26-30.987-9.963c-2.15-2.205-3.846-4.837-5.072-7.872V376c0,5.523-4.478,10.001-10,10.001H838.2
+ c-3.148,0-5.959-1.456-7.791-3.732c-2.405,1.436-4.804,2.577-7.188,3.416c-5.142,1.804-11.065,2.717-17.621,2.717
+ c-24.711,0-35.835-12.303-40.818-22.626c-0.51-1.045-0.984-2.142-1.422-3.292c-1.476,2.343-3.101,4.608-4.874,6.796
+ C747.562,382.761,731.181,389.6,709.799,389.6L709.799,389.6z M487.944,348.278c0.598,0.447,1.538,0.922,3.414,0.922
+ c4.033,0,7.665-1.15,11.099-3.517c1.935-1.333,2.882-4.174,3.318-7.126c-0.231,0.043-0.465,0.089-0.702,0.133l-6.347,1.172
+ c-6.723,1.191-9.018,2.316-9.562,2.634c-0.961,0.561-1.564,1.024-1.564,3.194C487.601,347.181,487.822,347.995,487.944,348.278
+ L487.944,348.278z M709.751,299.801c-6.414,0-9.15,2.51-10.819,4.697c-3.009,3.937-4.531,10.177-4.531,18.552
+ c0,8.386,1.529,14.651,4.544,18.623c1.671,2.205,4.405,4.728,10.807,4.728c6.375,0,9.085-2.51,10.732-4.697
+ c2.995-3.98,4.517-10.259,4.517-18.653c0-8.384-1.515-14.637-4.504-18.585C718.854,302.298,716.139,299.801,709.751,299.801
+ L709.751,299.801z M491.611,300.711c-0.264,0.336-0.564,0.824-0.854,1.53l7.135-0.876c3.8-0.479,5.996-0.97,7.181-1.303
+ c-1.357-0.336-3.556-0.663-6.974-0.663C493.944,299.399,492.062,300.24,491.611,300.711L491.611,300.711z"/>
+ <path fill="#1F1F1F" d="M582,232.6v50.641c4.02-6.2,8.67-10.52,13.96-12.971c5.28-2.449,10.851-3.67,16.681-3.67
+ c6.549,0,12.5,1.141,17.859,3.42c5.35,2.291,9.74,5.78,13.18,10.471c2.91,3.99,4.7,8.08,5.35,12.289
+ c0.65,4.201,0.971,11.07,0.971,20.601V376h-28.6v-64.87c0-5.739-0.971-10.37-2.9-13.89c-2.51-4.961-7.27-7.44-14.29-7.44
+ c-7.271,0-12.79,2.46-16.56,7.39c-3.771,4.92-5.65,11.951-5.65,21.08V376h-27.8V232.6H582 M910.4,240.6v30H927.6V290.2H910.4
+ v56.409c0,4.371,0.55,7.101,1.649,8.17c1.101,1.08,4.47,1.621,10.11,1.621c0.84,0,1.73-0.03,2.67-0.101
+ c0.939-0.069,1.859-0.17,2.77-0.3v20.8l-13.18,0.601c-1.082,0.037-2.135,0.056-3.16,0.056c-11.43,0-19.356-2.298-23.779-6.896
+ c-3.121-3.201-4.681-8.121-4.681-14.761v-65.6H868V270.6h14.8v-30H910.4 M709.8,266.2c18.3,0,31.94,5.62,40.92,16.87
+ c8.99,11.24,13.48,24.539,13.48,39.88c0,15.6-4.49,28.94-13.48,40.03c-8.979,11.08-22.62,16.619-40.92,16.619
+ s-31.94-5.539-40.92-16.619c-8.989-11.09-13.479-24.431-13.479-40.03c0-15.341,4.49-28.64,13.479-39.88
+ C677.859,271.82,691.5,266.2,709.8,266.2 M709.75,356.4c8.12,0,14.359-2.891,18.72-8.68c4.351-5.781,6.53-14.011,6.53-24.671
+ c0-10.659-2.18-18.87-6.53-24.62c-4.36-5.75-10.6-8.63-18.72-8.63c-8.13,0-14.38,2.88-18.77,8.63
+ c-4.391,5.75-6.58,13.961-6.58,24.62c0,10.66,2.189,18.89,6.58,24.671C695.37,353.51,701.62,356.4,709.75,356.4 M499.04,267
+ c11.69,0,22.069,2.32,31.149,6.971c9.07,4.639,13.61,13.369,13.61,26.18v48.76c0,3.38,0.07,7.48,0.2,12.29
+ c0.2,3.63,0.75,6.09,1.67,7.39c0.92,1.301,2.29,2.37,4.13,3.21v4.2h-30.271c-0.84-2.141-1.43-4.141-1.75-6.02
+ c-0.329-1.881-0.59-4.021-0.779-6.41c-3.859,4.17-8.311,7.72-13.34,10.65c-6.02,3.449-12.82,5.18-20.41,5.18
+ c-9.68,0-17.67-2.75-23.98-8.26c-6.31-5.5-9.47-13.301-9.47-23.4c0-13.1,5.08-22.57,15.23-28.44c5.56-3.19,13.75-5.47,24.55-6.84
+ l9.529-1.17c5.17-0.649,8.871-1.47,11.101-2.44c3.99-1.699,5.99-4.34,5.99-7.92c0-4.359-1.53-7.38-4.601-9.039
+ c-3.06-1.66-7.56-2.49-13.5-2.49c-6.66,0-11.379,1.619-14.14,4.869c-1.979,2.4-3.3,5.641-3.96,9.73h-26.8
+ c0.59-9.311,3.2-16.95,7.84-22.939C468.41,271.689,481.08,267,499.04,267 M491.359,359.2c6.07,0,11.66-1.761,16.771-5.28
+ c5.12-3.529,7.771-9.949,7.97-19.279V324.26c-1.779,1.11-3.58,2.01-5.39,2.69c-1.81,0.69-4.3,1.319-7.47,1.909l-6.33,1.17
+ c-5.93,1.051-10.189,2.32-12.77,3.82c-4.361,2.551-6.541,6.49-6.541,11.84c0,4.771,1.339,8.211,4.009,10.33
+ C484.279,358.141,487.529,359.2,491.359,359.2 M411.86,267.2c4.7,0,9.32,0.909,13.89,2.739c4.56,1.82,8.7,5.021,12.41,9.58
+ c3,3.711,5.02,8.271,6.06,13.67c0.65,3.58,0.98,8.82,0.98,15.73L445.01,376H416.6v-67.811c0-4.039-0.66-7.359-1.97-9.959
+ c-2.49-4.961-7.07-7.431-13.75-7.431c-7.73,0-13.07,3.19-16.02,9.58c-1.51,3.38-2.26,7.45-2.26,12.21V376h-27.8v-63.33
+ c0-6.311-0.65-10.9-1.95-13.76c-2.35-5.141-6.94-7.71-13.78-7.71c-7.95,0-13.29,2.569-16.02,7.71c-1.5,2.93-2.25,7.279-2.25,13.07
+ V376h-28.2V269.8h27v15.46c3.44-5.529,6.69-9.47,9.74-11.81c5.39-4.171,12.37-6.25,20.94-6.25c8.12,0,14.68,1.79,19.68,5.37
+ c4.02,3.32,7.08,7.58,9.15,12.779c3.65-6.24,8.18-10.83,13.59-13.76C398.44,268.66,404.82,267.2,411.86,267.2 M865.2,269.4V376h-27
+ v-14.96c-0.261,0.33-0.91,1.3-1.95,2.931c-1.04,1.619-2.28,3.049-3.71,4.289c-4.36,3.9-8.57,6.561-12.64,7.99
+ c-4.07,1.43-8.83,2.15-14.301,2.15c-15.74,0-26.35-5.66-31.81-16.971c-3.06-6.27-4.59-15.5-4.59-27.699V269.4h28.6v64.359
+ c0,6.07,0.71,10.641,2.14,13.711c2.53,5.42,7.49,8.129,14.881,8.129c9.47,0,15.959-3.85,19.459-11.56
+ c1.811-4.181,2.721-9.7,2.721-16.55V269.4H865.2 M582,212.6h-27.8c-11.046,0-20,8.954-20,20v21.182
+ C523.599,249.28,511.796,247,499.04,247c-20.979,0-37.309,5.431-48.668,16.161c-5.107-5.312-10.877-9.27-17.208-11.796
+ c-6.893-2.761-14.068-4.165-21.305-4.165c-10.198,0-19.703,2.213-28.252,6.576c-0.145,0.074-0.289,0.149-0.431,0.227
+ c-0.904,0.49-1.792,1.006-2.664,1.55c-8.252-5.543-18.415-8.353-30.233-8.353c-8.355,0-15.932,1.435-22.647,4.278
+ c-2.458-1.08-5.175-1.679-8.032-1.679h-27c-11.045,0-20,8.954-20,20V376c0,11.046,8.955,20,20,20h28.2
+ c7.177,0,13.472-3.781,17-9.459c3.528,5.678,9.823,9.459,17,9.459h27.8c7.177,0,13.471-3.781,17-9.459
+ c3.528,5.678,9.823,9.459,17,9.459h28.41c3.945,0,7.625-1.143,10.724-3.115c8.044,4.328,17.258,6.516,27.516,6.516
+ c9.591,0,18.534-1.975,26.644-5.875c2.891,1.591,6.19,2.475,9.636,2.475H549.8c0.743,0,1.478-0.04,2.2-0.119
+ c0.723,0.079,1.457,0.119,2.2,0.119H582c9.862,0,18.058-7.139,19.7-16.531c1.643,9.393,9.838,16.531,19.7,16.531H650
+ c6.725,0,12.675-3.318,16.3-8.408c11.611,7.979,26.173,12.008,43.5,12.008c22.084,0,39.678-6.547,52.395-19.475
+ c7.525,9.087,20.741,18.275,43.405,18.275c7.69,0,14.732-1.104,20.93-3.281c0.97-0.341,1.939-0.72,2.908-1.136
+ c2.646,1.292,5.62,2.017,8.763,2.017h27c5.679,0,10.805-2.367,14.445-6.168c7.948,5.119,18.378,7.624,31.614,7.624
+ c1.246,0,2.539-0.022,3.843-0.067c0.076-0.003,0.151-0.006,0.228-0.009l13.18-0.601c10.681-0.487,19.09-9.288,19.09-19.979V356
+ c0-5.798-2.516-11.311-6.896-15.108c-2.94-2.551-6.527-4.16-10.304-4.694v-26.191c9.72-1.362,17.199-9.711,17.199-19.806V270.6
+ c0-10.095-7.479-18.443-17.199-19.806V240.6c0-11.046-8.954-20-20-20H882.8c-11.046,0-20,8.954-20,20v8.801H837
+ c-9.677,0-17.747,6.871-19.601,16.001c-1.852-9.13-9.923-16.001-19.6-16.001h-28.6c-6.813,0-12.833,3.408-16.443,8.612
+ c-3.523-2.381-7.322-4.414-11.38-6.087c-9.217-3.799-19.841-5.726-31.577-5.726s-22.36,1.927-31.577,5.726
+ c-7.925,3.267-14.862,7.909-20.695,13.84c-5.208-6.167-11.636-10.911-19.153-14.131c-0.016-0.007-0.031-0.014-0.047-0.021
+ c-7.824-3.327-16.467-5.015-25.687-5.015c-3.604,0-7.156,0.315-10.641,0.943V232.6C602,221.554,593.046,212.6,582,212.6L582,212.6z
+ M709.75,336.4c-2.254,0-2.562-0.406-2.833-0.764c-0.598-0.787-2.517-3.982-2.517-12.587c0-8.573,1.895-11.722,2.476-12.482
+ c0.263-0.343,0.587-0.768,2.874-0.768c2.241,0,2.542,0.396,2.783,0.715c0.569,0.752,2.467,3.929,2.467,12.535
+ c0,8.638-1.922,11.862-2.511,12.645C712.255,336.006,711.958,336.4,709.75,336.4L709.75,336.4z"/>
+</g>
+<g>
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#FBE500" d="M293.499,388c-14.734,0-16.194-10.602-16.491-15.158
+ c-2.282,0.969-5.548,2.491-8.354,3.799C254.849,383.077,243.715,388,236.501,388c-25.962,0-44.167-21.608-49.721-41.42
+ c-0.496,1.273-1.104,2.537-1.848,3.777l-0.259,0.435l-0.316,0.395c-8.148,10.178-36.573,10.815-36.855,10.815
+ c-13.224,0-22.923-3.371-28.833-10.016c-3.175-3.571-6.704-9.897-5.67-19.862c-0.078-13.16,4.078-39.976,7.317-50.777l1.603-5.348
+ h5.582h11h3.107l2.196,2.198c2.883,2.884,2.607,6.303,2.405,8.801c-0.188,2.295-0.534,6.566-0.213,15.226
+ c0.097,2.288,2.599,9.209,5.632,13.571c2.909-2.997,8.484-10.194,18.782-27.42c1.031-1.728,1.504-2.515,1.852-3.035l4.313-6.47
+ c-2.459-5.739-5.026-12.353-5.562-21.952L171,256.709V256.5c0-1.622,0.274-3.164,0.536-4.655c0.063-0.361,0.141-0.804,0.208-1.224
+ c-1.643-1.129-3.839-2.151-6.13-3.219c-2.105-0.981-4.286-1.998-6.391-3.253c-0.369-0.209-0.732-0.424-1.084-0.646
+ c0.54,1.213,0.863,2.522,0.863,3.995c0,3.938-4.782,14.329-8.794,22.355l-1.475,2.951l-3.172,0.907
+ c-4.74,1.354-14.825,1.835-22.685,1.835c-3.458,0-7.982-0.087-12.876-0.411v1.362c0,1.262,0.243,3.584,0.437,5.449
+ c0.245,2.333,0.395,3.824,0.395,5.052c0,9.625-4.9,16.854-13.795,20.354c-5.909,2.326-12.401,2.647-18.535,2.647
+ c-14.37,0-22.193-2.224-27.005-7.674c-4.932-5.586-4.944-12.661-4.959-20.85c-0.002-1.473-0.004-3.027-0.036-4.666
+ c-0.019-0.987,0.051-4.084,0.19-9.929c0.137-5.841,0.308-13.11,0.308-16.382v-21.006c-4.691-11.945-6.906-23.596-7.927-30.968
+ c-1.042-7.547,0.479-14.028,4.519-19.263c2.712-3.514,6.315-6.115,10.41-8.083V167.5c0-4.225,0-8.547,0.348-12.964
+ c-0.274-0.088-0.551-0.179-0.829-0.27c-7.124-2.318-15.989-5.206-21.714-11.884c-9.206-10.842-14.806-37.737-14.806-40.882
+ c0-9.415,5.693-15.5,14.502-15.5c9.336,0,14.5,8.575,14.5,14.5c0,2.35-0.814,5.752-2.542,12.427
+ c-0.538,2.071-1.259,4.855-1.454,5.917c0.127,5.01,3.023,8.396,5.461,10.37c3.111,2.514,7.279,4.155,11.751,4.676
+ c17.654-45.552,69.792-61.89,110.282-61.89c50.339,0,81.613,26.563,86.226,73.025c15.616-5.543,33.031-11.026,46.774-11.026
+ c10.264,0,22.501,4.947,22.501,28.502c0,26.979-14.823,65.564-47.938,90.951l-5.499,4.217l-4.639-5.151
+ c-6.05-6.721-13.757-10.396-24.254-11.563l-1.745-0.194c0.874,3.85,2.272,7.381,3.797,11.229c1.422,3.59,2.945,7.434,4.069,11.783
+ l0.006-0.038l10.701,14.268c6.913,9.214,14.502,33.55,14.502,46.5c0,0.402-0.011,0.822-0.036,1.257
+ c3.445-4.229,8.915-6.759,15.534-6.759c13.399,0,19.501,8.554,19.501,16.5c0,3.264-1.628,6.606-4.312,11.725
+ c-0.299,0.573-0.668,1.275-1.004,1.937c0.4,0.484,0.85,1.01,1.234,1.457c3.217,3.753,8.081,9.421,8.081,16.884
+ C313,379.379,304.799,388,293.499,388L293.499,388z M246.438,356.085c-0.279,0.348-0.393,0.734-0.435,1.228
+ C246.151,356.929,246.297,356.518,246.438,356.085L246.438,356.085z M270.053,335.944c-1.209,1.354-2.772,2.58-4.778,3.571
+ c1.533-0.104,3.139-0.207,4.788-0.296c-0.04-0.548-0.065-1.122-0.065-1.719C269.998,336.974,270.017,336.455,270.053,335.944
+ L270.053,335.944z M219.022,317.98c0.091,0.007,0.192,0.013,0.299,0.017c0.586-0.088,1.987-0.419,2.938-0.646
+ c0.477-0.113,0.958-0.226,1.438-0.337c-1.721,0.031-3.757,0.146-4.62,0.546C219.061,317.656,219.037,317.793,219.022,317.98
+ L219.022,317.98z M172.535,125.259c8.01,5.611,15.055,13.589,20.567,20.67c2.555-14.029,4.93-23.667,8.843-29.008
+ c-5.7,1.628-9.896,5.062-12.694,7.354c-2.441,2-4.55,3.727-7.75,3.727c-2.044,0-3.801-0.7-6.71-1.858
+ C174.113,125.873,173.356,125.571,172.535,125.259L172.535,125.259z"/>
+ <path fill="#1F1F1F" d="M169.5,79.5c36,0,75,15,79,69h-3c-5-28-16-40-37-40c-16,0-25,12-27,12s-12.5-6-23-6c-21,0-43,12-42,42
+ l-55,11c0-6,0-12,1-18c-7-3-19-5-25-12c-7.5-8.83-13-34-13-36c0-6,3-8,7-8c5,0,7,5,7,7c0,3-4,16-4,18
+ c0,13.355,12.737,23.069,27.8,23.069c0.728,0,1.463-0.023,2.2-0.069C79.5,93.5,134.5,79.5,169.5,79.5 M213.537,119.277
+ c18.366,0.001,22.214,25.926,26.963,39.223c17-6,44-17,62-17c13,0,15,11,15,21c0,26-15,62-45,85c-9-10-20-13-29-14
+ c8.5-20.5,10.83-49,1-49c-6,0-3,11-4,14c-2,11-8,32-8,34c0,18,10.5,26.5,10.5,44.5c0,3-4.5,22.5-7.5,33.5c-3-1-8-1-10-1
+ c-6,0-14,0-14,9c0,6,5,7,8,7c2,0,8-2,11-2c2,0,6,1,6,5c0,10-20,4-20,16c0,6,3,7,6,7c2,0,18.01-9.73,21-10
+ c0.204-0.019,0.396-0.027,0.579-0.027c4.739,0,2.421,6.027,2.421,6.027c-8.83,3.5-8,9-8,12c0,5,3,8,6,8c10,0,11-19,11-20
+ c6,0,14-1,22-1c1-3,0-6,0-9c0-8,6-11,12-11c8,0,12,4,12,9c0,3-6,12-6,14c0,4,10,10,10,18s-5,13-12,13c-16,0-3-16-15-16
+ c-4,0-32,16-42,16c-27,0-44-28-44-46v-22c-1-7-2-16-4-23c-3-12-9.17-18.17-10-33c0-3,2-8,0-10c-4-4-10.5-5.83-15.5-8.83
+ c-9-5-11.5-16.17-13.5-21.17c-1-4-3-7-6-11c-7,3-6,9-6,13c0,18,14,25,14,29c0,2-5,13-8,19c-3.04,0.868-11.171,1.549-20.627,1.549
+ c-12.319,0-26.887-1.154-35.373-4.549c-29-10-38.26-46.189-41-66C43.67,177,65.83,174.17,84,172c12.6-1.5,31.5-4.5,45.5-6.5
+ c0,0,1,0,1-2c0-3-2-11-2-13v-6c0-10,12.5-19,24-19c20.17,0,40,33,45,39c3.5-20.17,6.83-43.83,13-45
+ C211.555,119.349,212.566,119.277,213.537,119.277 M54.5,250.5c10.601,13.491,30.487,26.055,46.237,26.055
+ c0.593,0,1.182-0.019,1.763-0.055c0,3,0.83,8.5,0.83,10.5c0,15-15.83,15.5-24.83,15.5c-27,0-24.17-8.17-24.5-25.83
+ C53.96,274.67,54.5,256.5,54.5,250.5 M253.5,282.5c6,8,13,31,13,42c0,8-6,10-14,10c-7,0-7-9-7-13
+ C245.5,318.5,251.5,295.5,253.5,282.5 M138.5,283.5c1,1-0.59,3.01,0,19c0.17,4.5,4.83,17.17,11,22
+ c0.394,0.309,0.843,0.454,1.342,0.454c7.473,0,25.783-32.642,27.658-35.454l3,41c0,5,0,11-3,16c-4,5-22,8-31,8c-15,0-29-5-27-22
+ c-0.17-12.17,4-39,7-49H138.5 M169.5,64.5c-22.887,0-47.102,5.267-66.436,14.451c-22.318,10.602-38.762,26.385-48.174,46.081
+ c-2.892-1.323-4.917-3.379-5.317-5.69c0.286-1.215,0.786-3.146,1.146-4.538c1.934-7.468,2.781-11.078,2.781-14.303
+ c0-10.625-8.84-22-22-22c-12.953,0-22,9.458-22,23c0,5.403,4.153,19.196,4.33,19.781c3.642,12.04,7.645,20.521,12.238,25.929
+ l0.022,0.026l0.021,0.025c5.737,6.693,13.633,10.188,20.458,12.587c-0.062,2.329-0.068,4.619-0.069,6.88
+ c-3.329,2.099-6.335,4.7-8.847,7.953c-3.655,4.735-7.666,12.894-6.012,24.87c1.152,8.331,3.418,19.827,7.859,31.553V250.5
+ c0,3.185-0.17,10.406-0.308,16.209c-0.158,6.708-0.211,9.153-0.189,10.261c0.029,1.536,0.031,3.052,0.034,4.518
+ c0.016,8.896,0.031,18.095,6.835,25.802C53.794,316.263,66.235,317.5,78.5,317.5c6.544,0,14.191-0.376,21.283-3.167
+ c2.781-1.094,5.281-2.484,7.479-4.137c-1.056,8.09-1.759,15.937-1.766,21.561c-1.177,12.446,3.429,20.561,7.567,25.214
+ c7.394,8.313,18.98,12.529,34.438,12.529c5.904,0,13.821-0.954,20.661-2.489c6.875-1.544,12.2-3.518,16.228-6.052
+ c2.301,4.51,5.13,8.851,8.412,12.832C204.34,387.79,219.86,395.5,236.5,395.5c8.772,0,20.174-4.999,35.324-12.061
+ c0.02-0.01,0.04-0.019,0.06-0.028c0.447,0.926,0.981,1.858,1.621,2.783c2.932,4.245,8.782,9.306,19.996,9.306
+ c7.6,0,14.536-2.912,19.53-8.2c4.817-5.101,7.47-12.132,7.47-19.8c0-8.514-4.28-14.937-7.848-19.338
+ c2.113-4.158,3.848-8.218,3.848-12.662c0-11.927-9.274-24-27-24c-3.298,0-6.405,0.485-9.255,1.394
+ c-2.485-13.582-8.349-30.865-14.745-39.394l-9.87-13.159c-0.968-3.414-2.118-6.49-3.218-9.3c3.468,1.514,6.374,3.645,8.938,6.493
+ l9.274,10.305l11.002-8.435C316.77,232.461,332.5,191.32,332.5,162.5c0-5.601-0.454-13.9-4.378-21.287
+ c-5.04-9.488-14.14-14.713-25.622-14.713c-12.294,0-26.813,3.88-40.602,8.463c-1.801-9.966-4.853-19.031-9.12-27.063
+ c-5.635-10.608-13.4-19.48-23.079-26.371C214.048,70.389,193.232,64.5,169.5,64.5L169.5,64.5z M153.054,279.371l0.912-0.261
+ l2.951-5.902c1.771-3.542,3.868-8.042,5.472-11.744c0.449-1.035,0.853-1.989,1.216-2.874c0.6,8.092,2.501,14.302,4.513,19.442
+ l-2.098,3.147c-0.447,0.67-0.922,1.462-2.049,3.348c-4.393,7.349-7.832,12.72-10.507,16.643c-0.255-7.689,0.052-11.492,0.22-13.565
+ C153.833,285.754,154.081,282.688,153.054,279.371L153.054,279.371z"/>
+</g>
+<g>
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#23A9FF" d="M445.01,377.502H416.6c-0.828,0-1.501-0.673-1.501-1.501v-67.812
+ c0-3.775-0.607-6.899-1.808-9.283c-2.233-4.446-6.292-6.605-12.412-6.605c-7.158,0-11.952,2.849-14.657,8.708
+ c-1.406,3.146-2.121,7.051-2.121,11.583v63.41c0,0.828-0.673,1.501-1.501,1.501h-27.8c-0.828,0-1.501-0.673-1.501-1.501v-63.33
+ c0-6.069-0.609-10.49-1.816-13.142c-2.1-4.593-6.162-6.828-12.414-6.828c-7.419,0-12.225,2.26-14.695,6.912
+ c-1.373,2.681-2.073,6.848-2.073,12.368v64.02c0,0.828-0.673,1.501-1.501,1.501h-28.202c-0.828,0-1.501-0.673-1.501-1.501V269.8
+ c0-0.828,0.673-1.501,1.501-1.501h27.001c0.828,0,1.501,0.673,1.501,1.501v10.492c2.533-3.545,4.988-6.237,7.326-8.03
+ c5.624-4.353,12.977-6.562,21.853-6.562c8.402,0,15.317,1.902,20.551,5.65c0.03,0.02,0.057,0.04,0.082,0.063
+ c3.509,2.895,6.334,6.504,8.422,10.749c3.508-5.25,7.753-9.242,12.649-11.891c5.95-3.04,12.626-4.572,19.875-4.572
+ c4.873,0,9.735,0.959,14.446,2.849c4.774,1.902,9.153,5.276,13.018,10.025c3.147,3.89,5.287,8.71,6.37,14.331
+ c0.668,3.688,1.007,9.069,1.007,16.015l-0.189,67.085C446.507,376.831,445.836,377.502,445.01,377.502L445.01,377.502z"/>
+ <path fill="#1F1F1F" d="M411.86,267.2c4.7,0,9.32,0.909,13.89,2.739c4.56,1.82,8.7,5.021,12.41,9.58c3,3.711,5.02,8.271,6.06,13.67
+ c0.65,3.58,0.98,8.82,0.98,15.73L445.01,376H416.6v-67.811c0-4.039-0.66-7.359-1.97-9.959c-2.49-4.961-7.07-7.431-13.75-7.431
+ c-7.73,0-13.07,3.19-16.02,9.58c-1.51,3.38-2.26,7.45-2.26,12.21V376h-27.8v-63.33c0-6.311-0.65-10.9-1.95-13.76
+ c-2.35-5.141-6.94-7.71-13.78-7.71c-7.95,0-13.29,2.569-16.02,7.71c-1.5,2.93-2.25,7.279-2.25,13.07V376h-28.2V269.8h27v15.46
+ c3.44-5.529,6.69-9.47,9.74-11.81c5.39-4.171,12.37-6.25,20.94-6.25c8.12,0,14.68,1.79,19.68,5.37c4.02,3.32,7.08,7.58,9.15,12.779
+ c3.65-6.24,8.18-10.83,13.59-13.76C398.44,268.66,404.82,267.2,411.86,267.2 M411.86,264.2c-7.485,0-14.391,1.587-20.523,4.718
+ c-0.022,0.011-0.043,0.022-0.065,0.034c-4.465,2.418-8.405,5.893-11.758,10.363c-2.029-3.501-4.587-6.534-7.643-9.058
+ c-0.053-0.045-0.108-0.087-0.164-0.127c-5.497-3.936-12.706-5.931-21.427-5.931c-9.215,0-16.878,2.313-22.776,6.877
+ c-1.614,1.238-3.242,2.832-4.904,4.808V269.8c0-1.657-1.343-3-3-3h-27c-1.657,0-3,1.343-3,3V376c0,1.657,1.343,3,3,3h28.2
+ c1.657,0,3-1.343,3-3v-64.02c0-5.276,0.646-9.214,1.92-11.703c2.165-4.076,6.539-6.077,13.35-6.077
+ c5.682,0,9.194,1.893,11.052,5.957c0.764,1.682,1.678,5.222,1.678,12.513V376c0,1.657,1.343,3,3,3h27.8c1.657,0,3-1.343,3-3v-63.41
+ c0-4.321,0.672-8.018,1.999-10.986c2.453-5.313,6.678-7.804,13.281-7.804c5.574,0,9.091,1.835,11.069,5.776
+ c1.097,2.176,1.651,5.072,1.651,8.613V376c0,1.657,1.343,3,3,3h28.41c1.653,0,2.996-1.338,3-2.991l0.19-67.08
+ c0-7.044-0.346-12.517-1.028-16.275c-1.136-5.897-3.381-10.94-6.679-15.02c-4.031-4.955-8.615-8.479-13.631-10.48
+ C421.97,265.194,416.922,264.2,411.86,264.2L411.86,264.2z"/>
+</g>
+<g>
+ <g>
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#23A9FF" d="M170,62c10.33,0,14-3.67,28.67-13
+ c22.66,14.21-2.84,34.11,28.33,54c-7.33-1.65-15.33-4.33-21,1.5c-8.5,0-8.83-6.97-14.5-15.5c-8.5,5.68,29.5,34.67-22.67,42.26
+ c-28.03,4.09-8.5-17.05-36.83-34.1c0,0-2.83,0-5.67,2.84c2.84,2.84,15.17,12,15.17,12c-15-3.67-25.67-2.89-28.5,17
+ c-2.83-5.68-5.67-12.04-5.67-20.56c0-14.21,6.67-59.11,29.34-59.11C142.33,49.33,159.67,62,170,62z"/>
+ <path fill="none" stroke="#000000" stroke-width="3" stroke-linecap="round" stroke-linejoin="round" d="M170,62
+ c10.33,0,14-3.67,28.67-13c22.66,14.21-2.84,34.11,28.33,54c-7.33-1.65-15.33-4.33-21,1.5c-8.5,0-8.83-6.97-14.5-15.5
+ c-8.5,5.68,29.5,34.67-22.67,42.26c-28.03,4.09-8.5-17.05-36.83-34.1c0,0-2.83,0-5.67,2.84c2.84,2.84,15.17,12,15.17,12
+ c-15-3.67-25.67-2.89-28.5,17c-2.83-5.68-5.67-12.04-5.67-20.56c0-14.21,6.67-59.11,29.34-59.11C142.33,49.33,159.67,62,170,62z"
+ />
+ </g>
+ <defs>
+ <filter id="MyOpacityMaskFilter" filterUnits="userSpaceOnUse" x="105.83" y="47.5" width="122.67" height="85.774">
+
+ <feColorMatrix type="matrix" values="-1 0 0 0 1 0 -1 0 0 1 0 0 -1 0 1 0 0 0 1 0" color-interpolation-filters="sRGB" result="source"/>
+ </filter>
+ </defs>
+ <mask maskUnits="userSpaceOnUse" x="105.83" y="47.5" width="122.67" height="85.774" id="SVGID_1_">
+  <g filter="url(#MyOpacityMaskFilter)">
+
+ <image overflow="visible" width="128" height="91" xlink:href="data:image/jpeg;base64,/9j/4AAQSkZJRgABAgEASABIAAD/7AARRHVja3kAAQAEAAAAHgAA/+4AIUFkb2JlAGTAAAAAAQMA
+EAMCAwYAAAItAAADjQAABP//2wCEABALCwsMCxAMDBAXDw0PFxsUEBAUGx8XFxcXFx8eFxoaGhoX
+Hh4jJSclIx4vLzMzLy9AQEBAQEBAQEBAQEBAQEABEQ8PERMRFRISFRQRFBEUGhQWFhQaJhoaHBoa
+JjAjHh4eHiMwKy4nJycuKzU1MDA1NUBAP0BAQEBAQEBAQEBAQP/CABEIAFsAgAMBIgACEQEDEQH/
+xACNAAEAAgMBAQAAAAAAAAAAAAAABQcBBAYCAwEBAAAAAAAAAAAAAAAAAAAAABAAAgICAQQCAwEB
+AAAAAAAAAwQBAgUGABAgERMwElAxFEAWEQABAwIEBAUEAwAAAAAAAAABABECIQMgMUESEFFhIjBx
+gTIEQJGhQlJiFBIBAAAAAAAAAAAAAAAAAAAAUP/aAAwDAQACEQMRAAAAr8GZad70qyHvKHKfdZzp
+qvewam91PYlQa1oVofICXiLCOv38ZGMj56MkITakR49hqVDclRECD6XBVlxm4AAAA8/M91ZavGlZ
+M4J+26rtU9cl0VaFjyNMWmSrGQDU4GxqyO7ia/1Dai/WCc7ist024jWHrrOR2y8fpEypljyZr7qq
+1IIAD15AAHV9PVosuF44b+gAAH//2gAIAQIAAQUA/If/2gAIAQMAAQUA/If/2gAIAQEAAQUA6Vra
+8p646zB9UdHVhRha3apiGmYcQOpbsiJmdX1z7wrjABpdIF4yWtLM1yulmFLGNdXn0m4tjHWbYXTJ
+mVsCAQ9hwI7hZBZc/XXcf/a5i0qLg6kCMkHwqpuf80n5BhVQ8oKlI5kBQRfZQ1Fkeuk42KirERHw
+sR5Dt8eMl0WH7T60rAVfiJHmm8LTRnpgQ+7JYwfrW+C1orA2wFn983LGwwC1ZpbmoBm761fqEl4H
+RzeFV3sdmAOVifPbkq2sshkzY3Jr5gVxZnJAJTKgHcn65pcxDILR6n2xUFsaYTFw+aYxjGGyg3Qd
+haxYe5qSIwNgbENjItsW9pOTMzzVmKhZYz1FlsptbbNyZBonLEtfml5a4yhJBB9bT4ru9qyLsRPI
+D5R+5R9cWzKzuEdqZfpctKRk80EI9izH9pe215t2RMxOC2iFqj3FX6s7utTju72vDuYccn/L/9oA
+CAECAgY/AEP/2gAIAQMCBj8AQ//aAAgBAQEGPwDgIxBJOQCEiNoK3Rr5hbb0DHrpi3CJjHRNcHbz
+wgDM5KN67F5SqgNoTGIR7AXRn8an9dE1y1KmoDr2S+xQFu0WOpDKNz5A3S6oR2gKXbop2pfqfxgB
+IeMD+VFg1MDSDqsQvYFSITRDcJPyUm/bP0wRuSFZVKAGnhS8l6Hjbt/ykAoUZh4ch0UbrasTxthn
+EaqI6eDukWATQkCeE2FRUIxkGILHgZaBgojojM6I/FJ7oljyHqgYyBfFIRzZXPjXpkwlIygZF8zU
+VKBJGSkDII3LWevCXmFGuilEkKV22wm+aEZyJtPXookF3GGQ6IfIt0lAu4Ww16omdwsdAm3FVUnN
+XBW4yZgpRslov7iu+bruX+acssn5ISGuAkqbYRJ2BoULYNDngt3HYOx9VGunF5FSAkEbcC4epxVw
+OMwo27p2kc1W4PumFwP5oi05KO+TROg+m//Z" transform="matrix(1 0 0 1 103 45)">
+ </image>
+ </g>
+ </mask>
+ <g mask="url(#SVGID_1_)">
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#043C96" d="M170,62c10.33,0,14-3.67,28.67-13
+ c22.66,14.21-2.84,34.11,28.33,54c-7.33-1.65-15.33-4.33-21,1.5c-8.5,0-8.83-6.97-14.5-15.5c-8.5,5.68,29.5,34.67-22.67,42.26
+ c-28.03,4.09-8.5-17.05-36.83-34.1c0,0-2.83,0-5.67,2.84c2.84,2.84,15.17,12,15.17,12c-15-3.67-25.67-2.89-28.5,17
+ c-2.83-5.68-5.67-12.04-5.67-20.56c0-14.21,6.67-59.11,29.34-59.11C142.33,49.33,159.67,62,170,62z"/>
+ <path fill="none" stroke="#043C96" stroke-width="3" stroke-linecap="round" stroke-linejoin="round" d="M170,62
+ c10.33,0,14-3.67,28.67-13c22.66,14.21-2.84,34.11,28.33,54c-7.33-1.65-15.33-4.33-21,1.5c-8.5,0-8.83-6.97-14.5-15.5
+ c-8.5,5.68,29.5,34.67-22.67,42.26c-28.03,4.09-8.5-17.05-36.83-34.1c0,0-2.83,0-5.67,2.84c2.84,2.84,15.17,12,15.17,12
+ c-15-3.67-25.67-2.89-28.5,17c-2.83-5.68-5.67-12.04-5.67-20.56c0-14.21,6.67-59.11,29.34-59.11C142.33,49.33,159.67,62,170,62z"
+ />
+ </g>
+</g>
+<g>
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#FBE500" d="M293.5,382c-9.998,0-10.315-5.942-10.546-10.279
+ c-0.217-4.07-0.465-5.721-4.453-5.721c-1.218,0-7.149,2.766-12.382,5.203C255.8,376.014,242.957,382,236.5,382
+ c-12.534,0-24.353-5.965-33.282-16.796C195.682,356.062,191,344.297,191,334.499v-21.89c-0.17-1.201-0.341-2.459-0.518-3.752
+ c-0.845-6.225-1.805-13.276-3.424-18.945c-1.138-4.55-2.757-8.294-4.324-11.914c-2.56-5.912-5.206-12.029-5.732-21.414
+ c-0.002-1.18,0.212-2.402,0.442-3.695c0.355-2.016,0.799-4.522-0.004-5.328c-2.376-2.377-5.892-4.014-9.292-5.598
+ c-1.994-0.93-4.056-1.889-5.919-3.005c-8.018-4.455-11.089-13.294-13.123-19.146c-0.37-1.066-0.69-1.987-0.997-2.755l-0.038-0.095
+ l-0.025-0.1c-0.816-3.267-2.352-5.857-5.008-9.474c-4.247,2.344-4.152,6.092-4.06,9.727c0.013,0.481,0.023,0.944,0.023,1.384
+ c0,11.657,6.152,18.462,10.225,22.965c2.191,2.423,3.775,4.175,3.775,6.034c0,3.166-8.077,19.509-8.159,19.671l-0.296,0.592
+ l-0.633,0.181c-3.363,0.961-11.819,1.606-21.042,1.606c-7.303,0-25.421-0.454-35.926-4.656
+ c-30.922-10.66-39.625-50.538-41.929-67.187c-0.814-5.892,0.305-10.864,3.325-14.776c6.96-9.015,22.775-10.902,35.482-12.418
+ c8.487-1.01,19.755-2.69,30.65-4.316c5.071-0.757,10.019-1.493,14.48-2.133c0.025-0.116,0.048-0.296,0.048-0.562
+ c0-1.51-0.598-4.632-1.125-7.385c-0.542-2.835-0.875-4.625-0.875-5.616v-6.001c0-11.356,13.95-20.5,25.5-20.5
+ c17.761,0,34.676,23.646,42.804,35.009c0.467,0.654,0.904,1.262,1.304,1.819c0.164-0.953,0.326-1.91,0.488-2.869
+ c4.085-24.071,7.006-38.771,13.125-39.933c1.174-0.168,2.268-0.248,3.317-0.248c16.308,0,21.873,18.76,25.937,32.459
+ c0.671,2.254,1.311,4.413,1.952,6.341c2.131-0.759,4.403-1.588,6.779-2.457C264.544,148.163,286.92,140,302.5,140
+ c16.501,0,16.501,16.934,16.501,22.5c0,25.503-14.097,62.045-45.589,86.19l-1.1,0.843l-0.928-1.03
+ c-6.994-7.771-16.168-12.191-28.05-13.513l-1.984-0.221l0.764-1.845c7.093-17.106,9.554-38.674,5.162-45.25
+ c-0.763-1.145-1.647-1.677-2.776-1.677c-0.789,0-1.146,0.278-1.346,0.486c-1.222,1.269-1.085,4.924-0.984,7.593
+ c0.074,1.938,0.139,3.62-0.208,4.779c-1.132,6.178-3.464,15.332-5.345,22.691c-1.271,4.979-2.585,10.13-2.617,10.963
+ c0,8.704,2.499,15.01,5.145,21.688c2.633,6.646,5.355,13.515,5.355,22.801c0,3.303-4.705,23.461-7.551,33.896l-0.417,1.529
+ l-1.504-0.501C232.255,311,227.348,311,225.499,311c-7.319,0-12.5,0.539-12.5,7.499c0,4.545,3.536,5.5,6.501,5.5
+ c0.724,0,2.461-0.41,4.142-0.808c2.474-0.585,5.031-1.19,6.857-1.19c3.014,0,7.5,1.731,7.5,6.5c0,5.946-5.555,7.321-10.456,8.535
+ c-5.938,1.47-9.543,2.707-9.543,7.465c0,5.075,2.224,5.5,4.5,5.5c0.845-0.146,5.368-2.56,8.67-4.322
+ c6.417-3.424,10.441-5.515,12.195-5.673c0.25-0.022,0.488-0.033,0.711-0.033c2.091,0,3.172,0.936,3.71,1.721
+ c1.59,2.315,0.269,5.939,0.114,6.346l-0.238,0.614l-0.61,0.241c-7.2,2.854-7.12,6.903-7.063,9.859
+ c0.006,0.263,0.011,0.511,0.011,0.746c0,4.068,2.289,6.5,4.499,6.5c8.643,0,9.501-18.314,9.501-18.5v-1.499h1.5
+ c2.734,0,5.946-0.217,9.348-0.444c3.719-0.248,7.553-0.507,11.48-0.551c0.231-1.382,0.072-2.827-0.097-4.339
+ c-0.113-1.024-0.231-2.083-0.231-3.166c0-9.228,7.274-12.5,13.502-12.5c9.963,0,13.5,5.655,13.5,10.5
+ c0,1.88-1.435,4.758-3.625,8.935c-0.976,1.864-2.313,4.413-2.376,5.091c0,1.074,1.71,3.068,3.363,4.997
+ c2.957,3.445,6.636,7.734,6.636,12.976C306.999,376.174,301.574,382,293.5,382L293.5,382z"/>
+ <g>
+ <path fill="#1F1F1F" d="M213.538,119.277c18.366,0.001,22.213,25.926,26.962,39.223c17-6,44-17,62-17c13,0,15,11,15,21
+ c0,26-15,62-45,85c-9-10-20-13-29-14c8.5-20.5,10.83-49,1-49c-6,0-3,11-4,14c-2,11-8,32-8,34c0,18,10.5,26.5,10.5,44.5
+ c0,3-4.5,22.5-7.5,33.5c-3-1-8-1-10-1c-6,0-14,0-14,9c0,6,5,7,8,7c2,0,8-2,11-2c2,0,6,1,6,5c0,10-20,4-20,16c0,6,3,7,6,7
+ c2,0,18.01-9.73,21-10c0.204-0.019,0.396-0.027,0.579-0.027c4.739,0,2.421,6.027,2.421,6.027c-8.83,3.5-8,9-8,12c0,5,3,8,6,8
+ c10,0,11-19,11-20c6,0,14-1,22-1c1-3,0-6,0-9c0-8,6-11,12-11c8,0,12,4,12,9c0,3-6,12-6,14c0,4,10,10,10,18s-5,13-12,13
+ c-16,0-3-16-15-16c-4,0-32,16-42,16c-27,0-44-28-44-46v-22c-1-7-2-16-4-23c-3-12-9.17-18.17-10-33c0-3,2-8,0-10
+ c-4-4-10.5-5.83-15.5-8.83c-9-5-11.5-16.17-13.5-21.17c-1-4-3-7-6-11c-7,3-6,9-6,13c0,18,14,25,14,29c0,2-5,13-8,19
+ c-3.04,0.868-11.171,1.549-20.627,1.549c-12.319,0-26.887-1.154-35.373-4.549c-29-10-38.26-46.189-41-66
+ C43.67,177,65.83,174.17,84,172c12.6-1.5,31.5-4.5,45.5-6.5c0,0,1,0,1-2c0-3-2-11-2-13v-6c0-10,12.5-19,24-19c20.17,0,40,33,45,39
+ c3.5-20.17,6.83-43.83,13-45C211.555,119.349,212.566,119.277,213.538,119.277 M213.538,116.277L213.538,116.277
+ c-1.121,0-2.285,0.085-3.462,0.253l-0.067,0.009l-0.067,0.013c-7.154,1.356-10.092,16.252-14.208,40.478
+ c-8.547-11.923-25.273-34.53-43.232-34.53c-6.25,0-12.861,2.322-18.139,6.37c-5.631,4.32-8.861,10.017-8.861,15.63v6
+ c0,1.128,0.326,2.887,0.902,5.898c0.415,2.168,0.916,4.785,1.058,6.364c-4.108,0.593-8.54,1.254-13.201,1.949
+ c-10.889,1.624-22.148,3.302-30.614,4.31c-12.988,1.551-29.15,3.481-36.493,12.993c-3.275,4.243-4.495,9.591-3.625,15.896
+ c1.349,9.753,4.34,24.19,10.932,37.593c7.76,15.777,18.523,26.143,31.994,30.81c10.756,4.273,29.043,4.736,36.418,4.736
+ c9.348,0,17.968-0.669,21.452-1.664l1.269-0.362l0.59-1.181c0.34-0.68,8.317-16.676,8.317-20.342c0-2.437-1.747-4.369-4.165-7.043
+ c-3.916-4.332-9.835-10.879-9.835-21.957c0-0.452-0.012-0.929-0.024-1.423c-0.087-3.454,0.041-5.904,2.188-7.644
+ c2.064,2.912,3.25,5.088,3.926,7.794l0.05,0.197l0.075,0.189c0.294,0.734,0.609,1.641,0.973,2.689
+ c1.976,5.687,5.281,15.197,13.81,19.963c1.919,1.147,4.002,2.118,6.018,3.057c3.399,1.584,6.611,3.08,8.799,5.234
+ c0.252,0.677-0.136,2.876-0.347,4.069c-0.23,1.3-0.467,2.645-0.467,3.873v0.084l0.005,0.084c0.54,9.651,3.24,15.891,5.851,21.924
+ c1.614,3.729,3.138,7.252,4.234,11.636l0.012,0.049l0.014,0.048c1.589,5.56,2.54,12.55,3.378,18.716
+ c0.172,1.267,0.34,2.497,0.507,3.673V334.5c0,10.129,4.813,22.26,12.56,31.658c9.218,11.183,21.45,17.342,34.44,17.342
+ c6.791,0,19.8-6.064,30.254-10.938c4.641-2.163,10.408-4.851,11.819-5.062c2.478,0.006,2.669,0.32,2.882,4.301
+ c0.219,4.089,0.626,11.699,12.044,11.699c8.832,0,15-6.579,15-16c0-5.797-3.88-10.319-6.997-13.953
+ c-1.082-1.262-2.686-3.131-2.97-3.964c0.292-0.864,1.411-2.999,2.171-4.449c2.362-4.507,3.796-7.404,3.796-9.634
+ c0-5.973-4.638-12-15-12c-9.112,0-15,5.495-15,14c0,1.166,0.123,2.267,0.241,3.331c0.107,0.968,0.207,1.864,0.204,2.7
+ c-3.537,0.083-7.038,0.317-10.199,0.529c-3.374,0.226-6.562,0.439-9.246,0.439h-2.961l-0.039,2.989
+ c-0.035,2.644-1.656,17.011-8,17.011c-1.21,0-3-1.589-3-5c0-0.244-0.005-0.503-0.01-0.775c-0.057-2.933-0.117-5.966,6.116-8.436
+ l1.223-0.484l0.472-1.228c0.302-0.785,1.707-4.846-0.276-7.733c-0.608-0.886-2.06-2.371-4.945-2.371
+ c-0.274,0-0.561,0.014-0.851,0.04c-1.974,0.178-5.405,1.917-12.763,5.842c-2.98,1.59-7.018,3.744-8.235,4.145
+ c-1.546-0.011-2.731-0.216-2.731-3.999c0-3.57,2.432-4.528,8.404-6.008c4.894-1.212,11.596-2.872,11.596-9.992
+ c0-5.252-4.527-8-9-8c-2.002,0-4.647,0.626-7.205,1.231c-1.293,0.307-3.246,0.769-3.795,0.769c-5,0-5-2.906-5-4
+ c0-5.094,2.882-6,11-6c1.611,0,6.513,0,9.051,0.846l3.009,1.003l0.834-3.06C240.998,301.743,246,280.698,246,277
+ c0-9.572-2.776-16.579-5.461-23.355c-2.583-6.521-5.024-12.68-5.039-21.068c0.119-1.052,1.42-6.151,2.57-10.657
+ c1.876-7.352,4.206-16.483,5.351-22.711c0.392-1.379,0.328-3.073,0.248-5.188c-0.054-1.437-0.219-5.81,0.57-6.5c0,0,0,0,0.001,0
+ c0.011,0,0.1-0.021,0.261-0.021c0.299,0,0.854,0,1.528,1.008c3.675,5.502,2.161,25.852-5.299,43.842l-1.53,3.69l3.97,0.44
+ c11.498,1.277,20.363,5.538,27.101,13.025l1.855,2.061l2.2-1.687c14.329-10.985,26.298-25.655,34.612-42.423
+ c7.457-15.037,11.562-31.003,11.562-44.958c0-5.936,0-24-18-24c-15.847,0-37.457,7.883-54.821,14.218
+ c-1.838,0.67-3.611,1.317-5.304,1.927c-0.479-1.517-0.963-3.148-1.464-4.836C236.714,135.658,230.964,116.277,213.538,116.277
+ L213.538,116.277z"/>
+ </g>
+</g>
+<g>
+ <g>
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#FBE500" d="M240.5,158.5c-5-14-9-42-30-39c-6.17,1.17-9.5,24.83-13,45
+ c-5-6-24.83-39-45-39c-11.5,0-24,9-24,19v6c0,2,2,10,2,13c0,2-1,2-1,2c-14,2-32.9,5-45.5,6.5c-18.17,2.17-40.33,5-37.5,25.5
+ c2.74,19.811,12,56,41,66c15,6,49,5,56,3c3-6,8-17,8-19c0-4-14-11-14-29c0-4-1-10,6-13c3,4,5,7,6,11c2,5,4.5,16.17,13.5,21.17
+ c5,3,11.5,4.83,15.5,8.83c2,2,0,7,0,10c0.83,14.83,7,21,10,33c2,7,3,16,4,23v22c0,18,17,46,44,46c10,0,38-16,42-16
+ c12,0-1,16,15,16c7,0,12-5,12-13s-10-14-10-18c0-2,6-11,6-14c0-5-4-9-12-9c-6,0-12,3-12,11c0,3,1,6,0,9c-8,0-16,1-22,1
+ c0,1-1,20-11,20c-3,0-6-3-6-8c0-3-0.83-8.5,8-12c0,0,2.5-6.5-3-6c-2.99,0.27-19,10-21,10c-3,0-6-1-6-7c0-12,20-6,20-16
+ c0-4-4-5-6-5c-3,0-9,2-11,2c-3,0-8-1-8-7c0-9,8-9,14-9c2,0,7,0,10,1c3-11,7.5-30.5,7.5-33.5c0-18-10.5-26.5-10.5-44.5
+ c0-2,6-23,8-34c1-3-2-14,4-14c9.83,0,7.5,28.5-1,49c9,1,20,4,29,14c30-23,45-59,45-85c0-10-2-21-15-21
+ C284.5,141.5,257.5,152.5,240.5,158.5z"/>
+ </g>
+ <defs>
+ <filter id="My_OpacityMaskFilter_1_" filterUnits="userSpaceOnUse" x="46.254" y="119.277" width="271.246" height="261.223">
+
+ <feColorMatrix type="matrix" values="-1 0 0 0 1 0 -1 0 0 1 0 0 -1 0 1 0 0 0 1 0" color-interpolation-filters="sRGB" result="source"/>
+ </filter>
+ </defs>
+ <mask maskUnits="userSpaceOnUse" x="46.254" y="119.277" width="271.246" height="261.223" id="SVGID_2_">
+ <g filter="url(#My_OpacityMaskFilter_1_)">
+
+ <image overflow="visible" width="278" height="268" xlink:href="data:image/jpeg;base64,/9j/4AAQSkZJRgABAgEASABIAAD/7AARRHVja3kAAQAEAAAAHgAA/+4AIUFkb2JlAGTAAAAAAQMA
+EAMCAwYAAARTAAAJlwAADlr/2wCEABALCwsMCxAMDBAXDw0PFxsUEBAUGx8XFxcXFx8eFxoaGhoX
+Hh4jJSclIx4vLzMzLy9AQEBAQEBAQEBAQEBAQEABEQ8PERMRFRISFRQRFBEUGhQWFhQaJhoaHBoa
+JjAjHh4eHiMwKy4nJycuKzU1MDA1NUBAP0BAQEBAQEBAQEBAQP/CABEIAQwBFgMBIgACEQEDEQH/
+xACaAAEAAgMBAQAAAAAAAAAAAAAABgcDBAUBAgEBAAAAAAAAAAAAAAAAAAAAABAAAgICAQMEAgEE
+AwEAAAAAAgMBBAUGACARExAwQBIxFBWAITM0IjI1FhEAAgIBAQYFAgUEAwEAAAAAAQIAESEDIDFB
+URIiEDBAYXGRE4GxMlIjocFCYuFyMwQSAQAAAAAAAAAAAAAAAAAAAID/2gAMAwEAAhEDEQAAAK/A
+AAAAPs+Hf7BCEqjprgAzdPrTsp7WtOtjVAAAAAAAAAAB7N4nbRubf16YI/J/kpblXDWJzPr52iy5
+VyeuYa5suOlRMuIAPreOekfSIUm8eOSAAAAADcuCmLhO0AD5i8qxlGb8v5pYG3jyDT3Pkprj27rF
+ed+fbpGOz0fTBk+xjjUp5RTzeHHMhjd7tEH+rK3yrNi19oqres3KQSbbHoAAB8fOUeegB4D0AADl
+dXglatIY7DidrDZ+x49AAAAAAAADz35OBwNWGl65+F3QADyGS2ryLvB3bZpi3zpAAAAeOEdfNT1j
+nbeegAADFl0yt4r1eYWzI+B3wB57iORU0qhQB92vUs4LH9+PsAAA8gU9hJW0yhvQLsycnqnoAAHD
+7cMK6y6fcLQ6mlug8Ee6FYHK1QAdLmi7OnXc/MwAAHG7OMo7Un0DJfP6Q7RcnsQlRlAB81xZFekC
+6vKFmyaju0XFqRThn3EffkAAA2LIq/aLxywKVnSYsh689Hjw5VU2PVZhBktyobWJQ89APIxKNApD
+563JAPv4AAAAAD66fKEw6tdC0c1Uelq6la+EhjwALKrWUlre4cwA+PvwraE2ZWYAAAAAAAAAAAAA
+2tUXP2YNOD0Dz34IdWc2hIAAAAAAAAAAAAABK7Rp23DaeaxtamnxiG8HZ1gAAAAAAAAAAAAADoXD
+TtwGSrrGp0+vnD6eAAAAAAAAAAAAAAA37gp63jfiMy4RCND65Bh8ABlxSYxa9p8Qq/zPgAAAAAAA
+AAAMtsVFNiya9n3GKd+5Z0iFa3Y4g++hPitpvKugZIHPa6IMAAAAAAAAAABt6gtuR0tY5IdfL9lP
+8KyYodGw4VjJxrVZoF687hSMqXky2JAAAAAAAAAAADb1BM+3WP0T+O8L5NrVADu9+B/Rv84AP//a
+AAgBAgABBQD+jL//2gAIAQMAAQUA/oy//9oACAEBAAEFAPiVqrLJ/wDzlmRtULFWfjqUxx0dWsP4
+GmB9bunmuLdGxULo1TF+QVYlfjzWBWasjSOnY+KAyZa1r49quOUoIUuONqKZGY15Tgy2EfRZ6LH7
+HqtSAREdosKhq9wxfaPi4oYO9gkCKfUhgozOHW9eZxTaL+YxXlu4JP0r+my0oaiyrw2PUFsZKMJf
+fyvp9lnE6SMcdpixHJ4N1L3MSUDfwhRNfoMYMdiwgWFX6TKT9ZT5chjl/RHpkUeVGz05rXhAjmrg
+r1maGlSXKOqIVCMPXXAVEhyFBHDSso2HHBKf14/kPaqlIWNdkpq9LlC0Nn1ybAahhLiXpD6L9CGC
+jL6xXyBVNQrJmviEJgErDqzYxKCGP5/phbJ4NG2fF4LIslWq3jlGlOKcfo6QZSqDWV1GsGQuupc+
+7my7VyKP5/ia7nlS1W0/lbSA7I02uMK1auPF6/WHgYmuPBooHgoUPIEY97v25BDPsbG6Ar+aP5Kn
+VK0/A68sARj0qGFhHO0fE2HPDjk4fdP2rFWwL1dMz2jb7sAj7T9tVUJ2scoQT8U57DvbJkaxkuxr
+b5ZW6bTIWrcL3kZzVGwFygX2R7JFAx+2n7RMFHsvL6q3V4kxX+TV/wDW6c9eFKcnZmzb5hH+G/h3
+Qyv7Ow5T9NC9rvxcwWVG2n2ck3xo2Sz5r6Bk360uRrdFhsKXt+W/t6JOVt1e3DEexP43k5/X5peR
+IeJODX7Gw2IXXut81rEpl1/CK+lf1mYiNgyoIVkbhW7PrpeQ/wCCjgw65/G61SOvzC3Jq3cNdFye
+ufxuVvx15mZnV0fa3jfrCfXKZAK6tkzJWndGDvTUuYe6L0+xnqUWK+TqFUtxMxOs7DAcpZNTwgoK
+Ok/+u9sKB5iMkunOJ2ZBRWySXRBhMXb60hs+fI5mZKeiJmJ1PN9xruFodblwwNswXkgwJZCZAWN2
+W1UnC7SmzCXC4Ogv7jvNeSV6Aw1ljdmtVSr7OJqzWzkcMYbD6qVtlR+vZ8HLS4Gj15pYSrOisbfo
+h7a7NXtm+r07VT8tdgStnqDmBEzMz7FDIOpMwm1LZFXLJbAvWfIKJ6CKBjYsgIJuPl9j0X/k1WYi
+v05WvDUbFTmtd94DMCp7BdrTU3SR5X3RBcHca3A22sUM22uPH7fXkc7nf2o9YntOn24NET3joaP2
+XulKIH4cEQ8kiLr06/421WQxXRP43Bcfr/LxtqatvA3IfX6J/G4tiK/zNLvSxET3j1YX1Dd7UyPz
+NKsyLUF9let90LTtVry2/mas2V36B/ZH44++hPGZ6vHMrnFmvIv89v5mDKRyOJnvXyVr9dGc2S06
+zN+5PJt2S5M95+Zhf/Qw/wDr7Aozq21GqzztPzsL/wChh/8AXekXBmdarNJmDrom3WSIlEQXRXrs
+sMRq7DC7r7a8EMjPxMPPa/hSia/M/fVWXkdg8putub1alUFxV8cEKzyFrXckZs/ErM8VjWrcMRP4
+302Qri1MZMUCGGiIl2meCppTFC4XNIxtha+31XueQ8ITMzPxdPyv9kMhi8/hAyCo0ZgtXra6q86f
+gZ+eYOn+zYx+upIVYGsPEVVIg47ju+Naz4+NulTs4DMLeoSEx8YcuVxJO2IJd/mp0pCKrVLW7K11
+cDYKpGl4OHMUQerP4/8AUs/GwuZOgzD59TwVYWyD+shs2GVchWBhTatlVQLm1Aobuw3LMjcsizVs
+wTq9myBK2wgkfj0sjZpljdwiIXtaTG9sKCG3nQmX5Cw7kzM+uCysVodsQeLLZGbjPkj5OF5OqO/e
+fJ29f//aAAgBAgIGPwAZf//aAAgBAwIGPwAZf//aAAgBAQEGPwD0nQg+TOoE/SfyLjn6gJpi2MB1
+Lo8BMpmE6dgzp1Vxz2RqMMtmCxG7Y2mR232+mCLvJoRXZbY5JMGJulERqUG4zAE6d/TxVeZAiY4C
+VCCI2qq5XPptMGKa4bFGN23cY1/GT9PDSX3uL8eL43iPp/tONikUsfYQUnSDzgLk+4EtgT8w0kLL
+ZUbx5mmTzqL8bJBjdt3G0mBr/EwGr6azF+PFh7QtVB5SgseQgpOkHnAdW2+YOwfSDtEws3SiIxrh
+PsVjrqvL02G8MIhPLaKkRm017t4qM/8A9Gn0d2PwgXxIPGXqIGo2IKQCvaDtEwNpviIP9v7HawhP
+4GDp0mz7QD7dA8Z3YHsJ3kmKzr1UQRed0CDgNumFy1WvOb4iHh1f2Ph06SljAdSwOQnepPzAPtjH
+tB2D6T9In6RP0iYWYHn4PkN8T7vD7n/EXSXjvikrBgTA9Kz3u4T7epaEnAPGBhtEx88DOrjdw3zE
+FDh6Yyv9h+c03XeGES+W0TPtA7znwKnjRi/HlWTQnT1C5Yz5TGBOJMT/ALD84nwNps1iO92AaHgh
+ug2Ivx5TMDVCfcZv4i27kIpu7HlN8Qi7CzTUbywiXy2SxjaaNlsDxRx/iQYmeA8kxxw8Bosf0moD
+5LZ4TUe7tjU0l5G4vxsWY3dVCNqE2t9uwumxyuICPJ1K5HwVrpWwYueHkvngZZ3mfcO4YEAHLYOa
+jaKHHE7K5pWOfmLnh5LCrsR9MigSSssbxF0tRqYc4O4Swb2jKB3nPgOrHvAvWPrBTCXcOYdLSbuM
+JJsnedmxvG6Lps3cuDAQfIKmNqIveMgwo4phvEDIaYbiIBqEso4iKOsXygZTsmM37Tf08epGKnmI
+q6p6l5wHq4RtPSa2MLubY7ztrqIaF9wijqgIPkNfKHp35vxGppMVYHhxiF95A2nxwMZDvUkbBCsQ
+DwlnJ8kOhPTxWBWajxBg7hMGYOxZMbPCPqHiceK/I/OIByG02OELcH/Pz+pCVPMTJ6hANQlT7yi4
++s/9B9Zhx9Zlx9YQNQfWFNNrvYsbxEzeBAdkiM4GVN+kwSPiZJPzt/ZY7jj4gO059j6xNQbrAMXO
+8bTj2PrUBOaowHYJhQcTXrTp8AfzinYOeECXus+tq8Govx4dzCYYRgrR3969bp1F+Ize0fT0WpVN
+EzOs07tQmWfW6cX4jheU1EcUwY/1Phu9dpxfiFWhcoLhpRCMQgbtkJpizxMtruFlvHAwqcEb/S6Z
+i/HgzMaqEaORz4TuOOW11EWbgxwjYj9O6/S6b8iImeHgQDQJAP18KQXL1Me0oTEpUJJ9pjRY/hOr
+WQoSTgz4EZQe44Es7z6ZdNjlcGAiMpF3MsxS90wtVPtJgnwyLAxASggtRKQVCJ91QT0G69OuoD23
+3Re67EsZE3RqHCAkdpsX4DUcUWNwXMsJ0dYuWpuNYuxCyilY59OFY/x3v5Re4G5YMIuHnvBEvUPU
+BwMAsCoQrWeQhCsUX+sGqNVuoG95iFzmsw54Rq3+oB02PT+2BdRuk+8/WPrCeoQ/byfaV1dI9pZy
+fEIxqp+rhKBtR6rsv8Lndde97WN8zde97H//2Q==" transform="matrix(1 0 0 1 43 116)">
+ </image>
+ </g>
+ </mask>
+ <g mask="url(#SVGID_2_)">
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#CEBC01" d="M240.5,158.5c-5-14-9-42-30-39c-6.17,1.17-9.5,24.83-13,45
+ c-5-6-24.83-39-45-39c-11.5,0-24,9-24,19v6c0,2,2,10,2,13c0,2-1,2-1,2c-14,2-32.9,5-45.5,6.5c-18.17,2.17-40.33,5-37.5,25.5
+ c2.74,19.811,12,56,41,66c15,6,49,5,56,3c3-6,8-17,8-19c0-4-14-11-14-29c0-4-1-10,6-13c3,4,5,7,6,11c2,5,4.5,16.17,13.5,21.17
+ c5,3,11.5,4.83,15.5,8.83c2,2,0,7,0,10c0.83,14.83,7,21,10,33c2,7,3,16,4,23v22c0,18,17,46,44,46c10,0,38-16,42-16
+ c12,0-1,16,15,16c7,0,12-5,12-13s-10-14-10-18c0-2,6-11,6-14c0-5-4-9-12-9c-6,0-12,3-12,11c0,3,1,6,0,9c-8,0-16,1-22,1
+ c0,1-1,20-11,20c-3,0-6-3-6-8c0-3-0.83-8.5,8-12c0,0,2.5-6.5-3-6c-2.99,0.27-19,10-21,10c-3,0-6-1-6-7c0-12,20-6,20-16
+ c0-4-4-5-6-5c-3,0-9,2-11,2c-3,0-8-1-8-7c0-9,8-9,14-9c2,0,7,0,10,1c3-11,7.5-30.5,7.5-33.5c0-18-10.5-26.5-10.5-44.5
+ c0-2,6-23,8-34c1-3-2-14,4-14c9.83,0,7.5,28.5-1,49c9,1,20,4,29,14c30-23,45-59,45-85c0-10-2-21-15-21
+ C284.5,141.5,257.5,152.5,240.5,158.5z"/>
+ </g>
+</g>
+<path fill-rule="evenodd" clip-rule="evenodd" fill="#FFFFCC" d="M168.67,263.33c-3.67-2-6.67-3.33-9-6.33
+ c-5,9-11.17,30.5-11.17,41.5c0,3,1,10,2,15C177.67,289,168.67,263.33,168.67,263.33z"/>
+<g>
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#FF6600" d="M193.772,206.837c-5.358,0-10.236-2.729-13.736-7.683l-0.198-0.28
+ l-0.093-0.33c-8.547-30.246-25.982-48.151-39.992-62.539c-2.949-3.03-5.736-5.89-8.24-8.667l-0.94-1.043l0.662-1.238
+ c3.588-6.719,10.431-10.272,19.783-10.272c5.169,0,10.029,1.066,13.196,1.96c2.665,0.75,5.5,1.129,8.429,1.129
+ c0.004,0,0.006,0,0.01,0c7.256,0,14.981-2.283,22.334-6.601c2.978-1.746,6.236-2.632,9.686-2.632
+ c6.564,0,11.543,3.219,11.753,3.357l1.181,0.775l-0.336,1.373c-4.887,19.923-7.7,46.495-8.604,81.235l-0.006,0.27l-0.078,0.255
+ C206.643,202.342,200.553,206.835,193.772,206.837L193.772,206.837z"/>
+ <path fill="#917013" d="M204.676,110.643c6.042,0,10.654,3.027,10.654,3.027c-4.33,17.66-7.66,43.26-8.66,81.66
+ c-1.729,5.729-7.115,9.506-12.899,9.506c-4.249,0-8.713-2.037-12.101-6.836c-10.51-37.2-34.41-56.19-48.67-72
+ c3.897-7.297,11.292-9.214,18.019-9.214c5.322,0,10.226,1.199,12.651,1.884c2.928,0.824,5.941,1.206,8.975,1.206
+ c8.011,0,16.174-2.662,23.355-6.876C198.988,111.248,201.975,110.643,204.676,110.643 M204.677,106.643
+ C204.677,106.643,204.677,106.643,204.677,106.643c-3.812,0-7.412,0.979-10.701,2.907c-7.053,4.139-14.428,6.327-21.332,6.327
+ c-2.745,0-5.4-0.355-7.892-1.057c-3.285-0.927-8.337-2.033-13.734-2.033c-10.138,0-17.589,3.917-21.547,11.33l-1.323,2.478
+ l1.881,2.086c2.528,2.803,5.326,5.676,8.289,8.718c13.853,14.225,31.094,31.929,39.502,61.69l0.187,0.659l0.396,0.561
+ c3.883,5.5,9.342,8.528,15.369,8.528c7.655,0,14.534-5.078,16.729-12.35l0.155-0.515l0.014-0.537
+ c0.889-34.117,3.764-61.306,8.546-80.812l0.673-2.746l-2.363-1.551C217.296,110.176,211.832,106.643,204.677,106.643
+ L204.677,106.643z"/>
+</g>
+<g>
+ <g>
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#FF6600" d="M215.33,113.67c-4.33,17.66-7.66,43.26-8.66,81.66
+ c-3,9.939-17,14-25,2.67c-10.51-37.2-34.41-56.19-48.67-72c6.98-13.07,25.18-8.88,30.67-7.33c10.66,3,22.43,0.14,32.33-5.67
+ C205.67,107.33,215.33,113.67,215.33,113.67z"/>
+ </g>
+ <defs>
+ <filter id="My_OpacityMaskFilter_2_" filterUnits="userSpaceOnUse" x="133" y="110.643" width="82.33" height="94.193">
+
+ <feColorMatrix type="matrix" values="-1 0 0 0 1 0 -1 0 0 1 0 0 -1 0 1 0 0 0 1 0" color-interpolation-filters="sRGB" result="source"/>
+ </filter>
+ </defs>
+ <mask maskUnits="userSpaceOnUse" x="133" y="110.643" width="82.33" height="94.193" id="SVGID_3_">
+ <g filter="url(#My_OpacityMaskFilter_2_)">
+
+ <image overflow="visible" width="87" height="99" xlink:href="data:image/jpeg;base64,/9j/4AAQSkZJRgABAgEASABIAAD/7AARRHVja3kAAQAEAAAAHgAA/+4AIUFkb2JlAGTAAAAAAQMA
+EAMCAwYAAAIPAAADBQAAA/v/2wCEABALCwsMCxAMDBAXDw0PFxsUEBAUGx8XFxcXFx8eFxoaGhoX
+Hh4jJSclIx4vLzMzLy9AQEBAQEBAQEBAQEBAQEABEQ8PERMRFRISFRQRFBEUGhQWFhQaJhoaHBoa
+JjAjHh4eHiMwKy4nJycuKzU1MDA1NUBAP0BAQEBAQEBAQEBAQP/CABEIAGMAVwMBIgACEQEDEQH/
+xACPAAEAAgMBAQAAAAAAAAAAAAAABgcCAwUBBAEBAAAAAAAAAAAAAAAAAAAAABAAAQQBAwMDBQEA
+AAAAAAAAAwECBAYFABAgETESUCETMDIjMxQ0EQACAQEGAwgDAQAAAAAAAAABAgARECAhMUEDcRIi
+MFFhgZGhMkJigrITEgEAAAAAAAAAAAAAAAAAAABQ/9oADAMBAAIRAxEAAACv2ySEXWJ8xBEowI1n
+MZGQLbaXOKmfaNVkVRIS3Ped0jW2jDL0OH24uVm+YYgk1lUhMSzffm+kA8hE2rwggAGeAsia0lbB
+2HnphWlk1YRcAACawr7i7tnJ6xpqi1anI+AAACxJvS0zJXU0ihhpAAAA2BjiAH//2gAIAQIAAQUA
+9K//2gAIAQMAAQUA9K//2gAIAQEAAQUA5iCUzolalGSTWXiaSK8ZwAed+Oq7TIyoBVkmkjVCUuQj
+kpkpVh0j3gVUAdCxYRtzEQYxS3IuZxUhgj4MgSNY1nirGLpY4l1/MLSDY3exERkd5PLJ6r+efGLi
+8kOSPlbDeEfz/JtWs+QBMdPZIHwXtdJHhH3RVatWsDmrEktOPd/23cifFwCV4SVTOIcY3o9uxPZl
+4d15YbIOhSsJkGyA7SF6CuhXKflTcu7QSIQepX6bj/q5YeUsWbhJaGBqYvQFtIjpnJFVFqOU8gjM
+x7clIY0Nkej5/PEZR0EsWzj+PKWZijlSHSDfQH2J32//2gAIAQICBj8AK//aAAgBAwIGPwAr/9oA
+CAEBAQY/AL/LtqWPhAz1A7hKioMXZObMFHmaQInmYC45ie+U5B6Q8q0PhDysaT5H0gO6C3GDoA8p
+QARjTSbQ0G4n9CAPqc4tKQUExE+M+MwFrcINyuH+qmvAixdrdbDQwY1rffgZz/lze9bRs7rYaEwY
+1umPwNwMpoRkYuzut1CAg3DGBOeF1dxDRlNYqserIiBhraZT8heU16GIBi41qLWgXQm+Nl26lwgY
+WNF4m+jaMaGLjpY0C61JvgjMZRAxxgNYwrpCR49gAT0EwdfvCA2cbcbXLsfv+s+37W//2Q==" transform="matrix(1 0 0 1 131 108)">
+ </image>
+ </g>
+ </mask>
+ <g opacity="0.6" mask="url(#SVGID_3_)">
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#7F3E03" d="M215.33,113.67c-4.33,17.66-7.66,43.26-8.66,81.66
+ c-3,9.939-17,14-25,2.67c-10.51-37.2-34.41-56.19-48.67-72c6.98-13.07,25.18-8.88,30.67-7.33c10.66,3,22.43,0.14,32.33-5.67
+ C205.67,107.33,215.33,113.67,215.33,113.67z"/>
+ </g>
+</g>
+<path opacity="0.25" fill-rule="evenodd" clip-rule="evenodd" fill="#FFFFFF" d="M210.936,113.796
+ c-11.983,64.227-22.738,60.791-73.726,11.721c0.148-11.045,22.734-5.193,27.431-4c9.14,2.331,19.844,0.864,27.954-4.462
+ C202.85,110.315,210.936,113.796,210.936,113.796z"/>
+<path fill-rule="evenodd" clip-rule="evenodd" fill="#FFFFCC" d="M281.5,290.5c-7.17-37.17-37.17-42.83-37.17-42.83
+ c3,10,6.34,19.33,9.17,27.83C261,282,273.5,289.5,281.5,290.5z"/>
+<path fill-rule="evenodd" clip-rule="evenodd" fill="#FFFFCC" d="M168.67,263.33c-3.67-2-6.67-3.33-9-6.33
+ c-5,9-11.17,30.5-11.17,41.5c0,3,1,10,2,15C177.67,289,168.67,263.33,168.67,263.33z"/>
+<path fill-rule="evenodd" clip-rule="evenodd" fill="#FFFFCC" d="M281.5,290.5c-7.17-37.17-37.17-42.83-37.17-42.83
+ c3,10,6.34,19.33,9.17,27.83C261,282,273.5,289.5,281.5,290.5z"/>
+<path fill-rule="evenodd" clip-rule="evenodd" d="M166.77,188.01c5.25,0.61,8.37,11.49,9.67,19.44c1.33,8.17,1.33,16.76-4.05,17.47
+ c-8.06,1.08-11.67-21.93-11.67-21.93C158.28,187.29,166.77,188.01,166.77,188.01z"/>
+<path fill-rule="evenodd" clip-rule="evenodd" d="M229.86,192.56c0.99,10.209-3.431,23.959-6.57,24.39
+ c-6.29,0.85-7.51-9.05-7.72-10.7c-0.41-3.3-3.061-24.76,7.939-26.25C228.33,182,229.45,189.26,229.86,192.56z"/>
+<path opacity="0.1" fill-rule="evenodd" clip-rule="evenodd" fill="#FFFFFF" d="M216.51,195.85c0.93-8.26,11.79-5.08,11.79,2.86
+ c0,7.95-2.1,14.261-4.34,16.21C217.75,220.32,215.58,204.12,216.51,195.85z"/>
+<path opacity="0.1" fill-rule="evenodd" clip-rule="evenodd" fill="#FFFFFF" d="M163.09,206.33c-1.19-8.13,9.59-8.43,11.57-0.891
+ c1.97,7.551,1.6,14.181,0.02,16.721C170.3,229.18,164.28,214.45,163.09,206.33z"/>
+<rect x="701" y="306" fill-rule="evenodd" clip-rule="evenodd" fill="#FBE500" stroke="#1F1F1F" stroke-width="20" stroke-linecap="round" stroke-linejoin="round" width="14" height="34"/>
+<circle fill-rule="evenodd" clip-rule="evenodd" fill="#FFFF33" cx="182.5" cy="139.5" r="11.5"/>
+<g>
+ <g>
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#23A9FF" d="M149.33,127.79c0,14.21-17,14.21-17,14.21
+ c-14.16,0-14.9-11.46-14.16-14.21c2.16-8.12,3.83-13.12,15.16-13.12C139,114.67,149.33,119.26,149.33,127.79z"/>
+ <path fill="none" stroke="#000000" stroke-width="3" stroke-linecap="round" stroke-linejoin="round" d="M149.33,127.79
+ c0,14.21-17,14.21-17,14.21c-14.16,0-14.9-11.46-14.16-14.21c2.16-8.12,3.83-13.12,15.16-13.12
+ C139,114.67,149.33,119.26,149.33,127.79z"/>
+ </g>
+ <defs>
+ <filter id="My_OpacityMaskFilter_3_" filterUnits="userSpaceOnUse" x="116.477" y="113.17" width="34.353" height="30.33">
+
+ <feColorMatrix type="matrix" values="-1 0 0 0 1 0 -1 0 0 1 0 0 -1 0 1 0 0 0 1 0" color-interpolation-filters="sRGB" result="source"/>
+ </filter>
+ </defs>
+ <mask maskUnits="userSpaceOnUse" x="116.477" y="113.17" width="34.353" height="30.33" id="SVGID_4_">
+ <g filter="url(#My_OpacityMaskFilter_3_)">
+
+ <image overflow="visible" width="39" height="35" xlink:href="data:image/jpeg;base64,/9j/4AAQSkZJRgABAgEASABIAAD/7AARRHVja3kAAQAEAAAAHgAA/+4AIUFkb2JlAGTAAAAAAQMA
+EAMCAwYAAAGnAAAB+QAAAmr/2wCEABALCwsMCxAMDBAXDw0PFxsUEBAUGx8XFxcXFx8eFxoaGhoX
+Hh4jJSclIx4vLzMzLy9AQEBAQEBAQEBAQEBAQEABEQ8PERMRFRISFRQRFBEUGhQWFhQaJhoaHBoa
+JjAjHh4eHiMwKy4nJycuKzU1MDA1NUBAP0BAQEBAQEBAQEBAQP/CABEIACMAJwMBIgACEQEDEQH/
+xAB9AAEAAgMBAAAAAAAAAAAAAAAABgcBBAUDAQEAAAAAAAAAAAAAAAAAAAAAEAACAwEAAwEBAAAA
+AAAAAAADBAECBQYQMBEAExEBAAIBAwMDBQAAAAAAAAAAAQACETFBAxBxEiGBkcEiMhMEEgEAAAAA
+AAAAAAAAAAAAAAAw/9oADAMBAAIRAxEAAACAdvxtYgHEurklMuyNm1aPm5YOlHo4aqPjzBnAAf/a
+AAgBAgABBQD0/wD/2gAIAQMAAQUA9P8A/9oACAEBAAEFAIibTncyy3BOKvFH8NxOfk/edThlzMzx
+CDIRzGvlhIJ7PgO1yJKUZSJW4f2kwMYdRql91Nu6h8rrhQMnYLRXY67+1bHJY/ifP//aAAgBAgIG
+PwAf/9oACAEDAgY/AB//2gAIAQEBBj8AAMroQtfIOxM1yMVq2qb7zG8GxkrKvjtMeJLPiaTg4g+3
+l5aVx3sER1zK4elhdp/JjSvPxq9rkOWm2pAvfCajPzPmWpwvks/eubli3uevU+vX/9k=" transform="matrix(1 0 0 1 114 111)">
+ </image>
+ </g>
+ </mask>
+ <g mask="url(#SVGID_4_)">
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#043C96" d="M149.33,127.79c0,14.21-17,14.21-17,14.21
+ c-14.16,0-14.9-11.46-14.16-14.21c2.16-8.12,3.83-13.12,15.16-13.12C139,114.67,149.33,119.26,149.33,127.79z"/>
+ <path fill="none" stroke="#043C96" stroke-width="3" stroke-linecap="round" stroke-linejoin="round" d="M149.33,127.79
+ c0,14.21-17,14.21-17,14.21c-14.16,0-14.9-11.46-14.16-14.21c2.16-8.12,3.83-13.12,15.16-13.12
+ C139,114.67,149.33,119.26,149.33,127.79z"/>
+ </g>
+</g>
+<g>
+ <g>
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#23A9FF" d="M230.33,111.33c3,4.84,4.68,17.12-15.33,16.17
+ c-7-0.33-11.35-13.81-7.33-17.83C211.67,103,226,104.33,230.33,111.33z"/>
+ <path fill="none" stroke="#000000" stroke-width="3" stroke-linecap="round" stroke-linejoin="round" d="M230.33,111.33
+ c3,4.84,4.68,17.12-15.33,16.17c-7-0.33-11.35-13.81-7.33-17.83C211.67,103,226,104.33,230.33,111.33z"/>
+ </g>
+ <defs>
+ <filter id="My_OpacityMaskFilter_4_" filterUnits="userSpaceOnUse" x="204.631" y="103.813" width="29.007" height="25.239">
+
+ <feColorMatrix type="matrix" values="-1 0 0 0 1 0 -1 0 0 1 0 0 -1 0 1 0 0 0 1 0" color-interpolation-filters="sRGB" result="source"/>
+ </filter>
+ </defs>
+ <mask maskUnits="userSpaceOnUse" x="204.631" y="103.813" width="29.007" height="25.239" id="SVGID_5_">
+ <g filter="url(#My_OpacityMaskFilter_4_)">
+
+ <image overflow="visible" width="34" height="31" xlink:href="data:image/jpeg;base64,/9j/4AAQSkZJRgABAgEASABIAAD/7AARRHVja3kAAQAEAAAAHgAA/+4AIUFkb2JlAGTAAAAAAQMA
+EAMCAwYAAAGWAAAB3QAAAkb/2wCEABALCwsMCxAMDBAXDw0PFxsUEBAUGx8XFxcXFx8eFxoaGhoX
+Hh4jJSclIx4vLzMzLy9AQEBAQEBAQEBAQEBAQEABEQ8PERMRFRISFRQRFBEUGhQWFhQaJhoaHBoa
+JjAjHh4eHiMwKy4nJycuKzU1MDA1NUBAP0BAQEBAQEBAQEBAQP/CABEIAB8AIgMBIgACEQEDEQH/
+xAB4AAADAQEAAAAAAAAAAAAAAAAABQcGAwEBAAAAAAAAAAAAAAAAAAAAABAAAgIDAQEAAAAAAAAA
+AAAAAgMEBQABBiASEQACAQMDAwUAAAAAAAAAAAABAgAREgMQITFRsQRBcdEiYhIBAAAAAAAAAAAA
+AAAAAAAAIP/aAAwDAQACEQMRAAAAwTkqRLU1vnZkQBrUoy5KrPV6Y5gH/9oACAECAAEFAPX/2gAI
+AQMAAQUA9f/aAAgBAQABBQBSjccbl5Tgk8tMSLksSecugGya+CnSpUBJr6ysBesoJuosystUkmVa
+IBfU2i2awfr6iTrxYSLC/MH7cR5//9oACAECAgY/AF//2gAIAQMCBj8AX//aAAgBAQEGPwAJjFWM
+DEkE9BLlNfcQpkFrDQ3DgiA0h2EbIg+y76C40Dd4tWHENGEZFNSdhoLa3elOYBi8fK46hGPYSj+P
+mQdTjf4hOe6/9Cmn/9k=" transform="matrix(1 0 0 1 202 101)">
+ </image>
+ </g>
+ </mask>
+ <g mask="url(#SVGID_5_)">
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#043C96" d="M230.33,111.33c3,4.84,4.68,17.12-15.33,16.17
+ c-7-0.33-11.35-13.81-7.33-17.83C211.67,103,226,104.33,230.33,111.33z"/>
+ <path fill="none" stroke="#043C96" stroke-width="3" stroke-linecap="round" stroke-linejoin="round" d="M230.33,111.33
+ c3,4.84,4.68,17.12-15.33,16.17c-7-0.33-11.35-13.81-7.33-17.83C211.67,103,226,104.33,230.33,111.33z"/>
+ </g>
+</g>
+<path opacity="0.25" fill-rule="evenodd" clip-rule="evenodd" fill="#FFFFFF" d="M116,85c4-22.67,16.33-29.33,23.67-27.67
+ c7.33,1.67,20,11,30,11c12.33,0,16.66-3,23.66-8.66c7-5.67,10.31,2.33,10,12.33C203,83,207,91.67,204,92s-10.67-18-19-11
+ c-5.33,10.67-2,25.67-12.33,27c-6.7,0.86-21.67-3.67-35-19c-3.07-3.52-12-6-15,1c-3.33,7.75-3.34,4.67-5,8
+ C116.61,100.11,114.86,91.45,116,85z"/>
+<g>
+ <g>
+ <circle fill-rule="evenodd" clip-rule="evenodd" fill="#23A9FF" cx="169" cy="29" r="26"/>
+ <circle fill="none" stroke="#000000" stroke-width="3" stroke-linecap="round" stroke-linejoin="round" cx="169" cy="29" r="26"/>
+ </g>
+ <defs>
+ <filter id="My_OpacityMaskFilter_5_" filterUnits="userSpaceOnUse" x="141.5" y="1.5" width="55" height="55">
+
+ <feColorMatrix type="matrix" values="-1 0 0 0 1 0 -1 0 0 1 0 0 -1 0 1 0 0 0 1 0" color-interpolation-filters="sRGB" result="source"/>
+ </filter>
+ </defs>
+ <mask maskUnits="userSpaceOnUse" x="141.5" y="1.5" width="55" height="55" id="SVGID_6_">
+ <g filter="url(#My_OpacityMaskFilter_5_)">
+
+ <image overflow="visible" width="60" height="60" xlink:href="data:image/jpeg;base64,/9j/4AAQSkZJRgABAgEASABIAAD/7AARRHVja3kAAQAEAAAAHgAA/+4AIUFkb2JlAGTAAAAAAQMA
+EAMCAwYAAAHLAAACZwAAAyD/2wCEABALCwsMCxAMDBAXDw0PFxsUEBAUGx8XFxcXFx8eFxoaGhoX
+Hh4jJSclIx4vLzMzLy9AQEBAQEBAQEBAQEBAQEABEQ8PERMRFRISFRQRFBEUGhQWFhQaJhoaHBoa
+JjAjHh4eHiMwKy4nJycuKzU1MDA1NUBAP0BAQEBAQEBAQEBAQP/CABEIADwAPAMBIgACEQEDEQH/
+xACFAAACAwEBAQAAAAAAAAAAAAAABwIFBgQBAwEBAAAAAAAAAAAAAAAAAAAAABAAAQQBBAMBAAAA
+AAAAAAAAAgEDBAYFABARFCBAExIRAAEDAgQFBAMAAAAAAAAAAAEAEQJBEiAhMQMQUXGRImGhwWKx
+MhMSAQAAAAAAAAAAAAAAAAAAAED/2gAMAwEAAhEDEQAAAF/6bAorJk9gpKZ5Z8UxYV5aNtbNU+no
+BGQYVdN9TFy2Ua0TUEZB4cpQqvS5cO7hBi3ag+w0chmYEogf/9oACAECAAEFAPQ//9oACAEDAAEF
+APQ//9oACAEBAAEFANIiksKvzpWhpcpUkVGY0MmFIilsiKS1qtfXUPFMMAjDSaciMuJmq4xIby+M
+PHyNV+F2p2KhgwxuYoQ3HFibPC80sUWUwnDXhZwRY34XuVGQLUyI4jjPha5YhH/afaFJKLIrmbbf
+ZAxNNps1thu15rsObY3KyIDmKuDJiNnjKMq2RwHM2w5GnDNw9055HucH9uN//9oACAECAgY/AAf/
+2gAIAQMCBj8AB//aAAgBAQEGPwBAAOToEDbbE909x7ImJJPqFbvQI9acQAHJ0Cjvb0Xkc86IC0L9
+QmMQpeALoxY2HQ8uEXDxj+VFhTAQaqcgMxmFbXRlJ+YUemGfRW/f5RiTmSCokcsMw9Cr6XXe7qG9
+Ghz6KHlqE8S/EknNS2ISd9enEGBeD5hASmx5FPeESJjujDYLvWiM5l5HU4PHWjI2/wBGrqvO5vs/
+zg//2Q==" transform="matrix(1 0 0 1 139 -1)">
+ </image>
+ </g>
+ </mask>
+ <g mask="url(#SVGID_6_)">
+ <circle fill-rule="evenodd" clip-rule="evenodd" fill="#043C96" cx="169" cy="29" r="26"/>
+ <circle fill="none" stroke="#043C96" stroke-width="3" stroke-linecap="round" stroke-linejoin="round" cx="169" cy="29" r="26"/>
+ </g>
+</g>
+<path opacity="0.25" fill-rule="evenodd" clip-rule="evenodd" fill="#FFFFFF" d="M149,22.33c13.33-26.66,39.67-9,40.67,3.34
+ C190.67,38,141.58,37.17,149,22.33z"/>
+</svg>

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/src/main/assembly/src.xml
----------------------------------------------------------------------
diff --git a/community/mahout-mr/src/main/assembly/src.xml b/community/mahout-mr/src/main/assembly/src.xml
new file mode 100644
index 0000000..0bb8e8b
--- /dev/null
+++ b/community/mahout-mr/src/main/assembly/src.xml
@@ -0,0 +1,64 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<assembly xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0 http://maven.apache.org/xsd/assembly-1.1.0.xsd">
+ <id>src</id>
+ <formats>
+ <format>dir</format>
+ <format>tar.gz</format>
+ </formats>
+ <fileSets>
+ <fileSet>
+ <directory>${project.basedir}/..</directory>
+ <outputDirectory/>
+ <useDefaultExcludes>true</useDefaultExcludes>
+ <includes>
+ <include>**/README*</include>
+ <include>**/LICENSE*</include>
+ <include>**/NOTICE*</include>
+ <include>**/pom.xml</include>
+ <include>**/src/**</include>
+ <include>src/conf/**</include>
+ <include>**/build.xml</include>
+ <include>**/*.properties</include>
+ </includes>
+ <excludes>
+ <exclude>**/target/**</exclude>
+ </excludes>
+ </fileSet>
+ <fileSet>
+ <directory>${project.basedir}/../bin</directory>
+ <outputDirectory>bin</outputDirectory>
+ <useDefaultExcludes>true</useDefaultExcludes>
+ <fileMode>0755</fileMode>
+ <directoryMode>0755</directoryMode>
+ </fileSet>
+ <fileSet>
+ <directory>${project.basedir}/../examples/bin</directory>
+ <outputDirectory>examples/bin</outputDirectory>
+ <useDefaultExcludes>true</useDefaultExcludes>
+ <fileMode>0755</fileMode>
+ <directoryMode>0755</directoryMode>
+ <excludes>
+ <exclude>work</exclude>
+ <exclude>work/**</exclude>
+ </excludes>
+ </fileSet>
+ </fileSets>
+</assembly>

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/distribution/pom.xml
----------------------------------------------------------------------
diff --git a/distribution/pom.xml b/distribution/pom.xml
deleted file mode 100644
index 1a84c28..0000000
--- a/distribution/pom.xml
+++ /dev/null
@@ -1,407 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
- <modelVersion>4.0.0</modelVersion>
- <parent>
- <groupId>org.apache.mahout</groupId>
- <artifactId>mahout</artifactId>
- <version>0.13.1-SNAPSHOT</version>
- <relativePath>../pom.xml</relativePath>
- </parent>
- <artifactId>apache-mahout-distribution</artifactId>
- <name>Mahout Release Package</name>
- <description>Distribution Package</description>
- <packaging>pom</packaging>
- <properties>
- <mahout.skip.distribution>true</mahout.skip.distribution>
- <scala.210.version>2.10.6</scala.210.version>
- <scala.211.version>2.11.8</scala.211.version>
- <lifecycle.target>package</lifecycle.target>
- </properties>
- <build>
- <defaultGoal>install</defaultGoal>
- <plugins>
-
- <plugin>
- <groupId>org.codehaus.mojo</groupId>
- <artifactId>exec-maven-plugin</artifactId>
- <version>1.1.1</version>
- <executions>
- <execution>
- <id>change-to-scala-2.11</id>
- <phase>compile</phase>
- <goals>
- <goal>exec</goal>
- </goals>
- <configuration>
- <workingDirectory>../buildtools</workingDirectory>
- <executable>./change-scala-version.sh</executable>
- <arguments>
- <argument>2.11</argument>
- </arguments>
- </configuration>
- </execution>
- <execution>
- <id>mahout-math-scala-2.11</id>
- <phase>compile</phase>
- <goals>
- <goal>exec</goal>
- </goals>
- <configuration>
- <workingDirectory>../math-scala</workingDirectory>
- <executable>mvn</executable>
- <arguments>
- <argument>${lifecycle.target}</argument>
- <argument>-Dscala.version=${scala.211.version}</argument>
- <argument>-Dscala.compat.version=2.11</argument>
- <argument>-DskipTests</argument>
- </arguments>
- </configuration>
- </execution>
- <execution>
- <id>viennacl-2.11</id>
- <phase>compile</phase>
- <goals>
- <goal>exec</goal>
- </goals>
- <configuration>
- <workingDirectory>../experimental/viennacl</workingDirectory>
- <executable>mvn</executable>
- <arguments>
- <argument>${lifecycle.target}</argument>
- <argument>-Dscala.version=${scala.211.version}</argument>
- <argument>-Dscala.compat.version=2.11</argument>
- <argument>-DskipTests</argument>
- </arguments>
- </configuration>
- </execution>
- <execution>
- <id>viennacl-omp-2.11</id>
- <phase>compile</phase>
- <goals>
- <goal>exec</goal>
- </goals>
- <configuration>
- <workingDirectory>../experimental/viennacl-omp</workingDirectory>
- <executable>mvn</executable>
- <arguments>
- <argument>${lifecycle.target}</argument>
- <argument>-Dscala.version=${scala.211.version}</argument>
- <argument>-Dscala.compat.version=2.11</argument>
- <argument>-DskipTests</argument>
- </arguments>
- </configuration>
- </execution>
- <execution>
- <id>spark-2.0</id>
- <phase>compile</phase>
- <goals>
- <goal>exec</goal>
- </goals>
- <configuration>
- <workingDirectory>../spark</workingDirectory>
- <executable>mvn</executable>
- <arguments>
- <argument>${lifecycle.target}</argument>
- <argument>-Dspark.version=2.0.2</argument>
- <argument>-Dspark.compat.version=2.0</argument>
- <argument>-Dscala.version=${scala.211.version}</argument>
- <argument>-Dscala.compat.version=2.11</argument>
- <argument>-DskipTests</argument>
- </arguments>
- </configuration>
- </execution>
- <execution>
- <id>spark-2.1</id>
- <phase>compile</phase>
- <goals>
- <goal>exec</goal>
- </goals>
- <configuration>
- <workingDirectory>../spark</workingDirectory>
- <executable>mvn</executable>
- <arguments>
- <argument>${lifecycle.target}</argument>
- <argument>-Dspark.version=2.1.1</argument>
- <argument>-Dspark.compat.version=2.1</argument>
- <argument>-Dscala.version=${scala.211.version}</argument>
- <argument>-Dscala.compat.version=2.11</argument>
- <argument>-DskipTests</argument>
- </arguments>
- </configuration>
- </execution>
- </executions>
- </plugin>
-
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-assembly-plugin</artifactId>
- <executions>
- <!-- Add more executions here to create more tarballs on release -->
- <execution>
- <id>scala_2.10_spark-1.6-assembly</id>
- <phase>package</phase>
- <goals>
- <goal>single</goal>
- </goals>
- <configuration>
- <skipAssembly>${mahout.skip.distribution}</skipAssembly>
- <descriptors>
- <descriptor>src/main/assembly/scala-2.10_spark-1.6.xml</descriptor>
- </descriptors>
- <tarLongFileMode>gnu</tarLongFileMode>
- <appendAssemblyId>true</appendAssemblyId>
- </configuration>
- </execution>
- <execution>
- <id>scala-2.11_spark-2.0-assembly</id>
- <phase>package</phase>
- <goals>
- <goal>single</goal>
- </goals>
- <configuration>
- <skipAssembly>${mahout.skip.distribution}</skipAssembly>
- <descriptors>
- <descriptor>src/main/assembly/scala-2.11_spark-2.0.xml</descriptor>
- </descriptors>
- <tarLongFileMode>gnu</tarLongFileMode>
- <appendAssemblyId>true</appendAssemblyId>
- </configuration>
- </execution>
- <execution>
- <id>scala-2.11_spark-2.1-assembly</id>
- <phase>package</phase>
- <goals>
- <goal>single</goal>
- </goals>
- <configuration>
- <skipAssembly>${mahout.skip.distribution}</skipAssembly>
- <descriptors>
- <descriptor>src/main/assembly/scala-2.11_spark-2.1.xml</descriptor>
- </descriptors>
- <tarLongFileMode>gnu</tarLongFileMode>
- <appendAssemblyId>true</appendAssemblyId>
- </configuration>
- </execution>
- <execution>
- <id>src-assembly</id>
- <phase>package</phase>
- <goals>
- <goal>single</goal>
- </goals>
- <configuration>
- <skipAssembly>${mahout.skip.distribution}</skipAssembly>
- <descriptors>
- <descriptor>src/main/assembly/src.xml</descriptor>
- </descriptors>
- <tarLongFileMode>gnu</tarLongFileMode>
- <appendAssemblyId>true</appendAssemblyId>
- </configuration>
- </execution>
- </executions>
- </plugin>
-
- </plugins>
- </build>
- <profiles>
-
- <profile>
- <id>mahout-release</id>
- <properties>
- <mahout.skip.distribution>false</mahout.skip.distribution>
- </properties>
- </profile>
-
-
- <profile>
- <id>viennacl</id>
- <dependencies>
- <dependency>
- <groupId>org.apache.mahout</groupId>
- <artifactId>mahout-math</artifactId>
- </dependency>
- <dependency>
- <groupId>org.apache.mahout</groupId>
- <artifactId>mahout-integration</artifactId>
- </dependency>
- <dependency>
- <groupId>org.apache.mahout</groupId>
- <artifactId>mahout-hdfs</artifactId>
- </dependency>
- <dependency>
- <groupId>org.apache.mahout</groupId>
- <artifactId>mahout-mr</artifactId>
- </dependency>
- <dependency>
- <groupId>org.apache.mahout</groupId>
- <artifactId>mahout-examples</artifactId>
- </dependency>
- <dependency>
- <groupId>org.apache.mahout</groupId>
- <artifactId>mahout-spark_${scala.compat.version}</artifactId>
- <classifier>spark_${spark.compat.version}</classifier>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.mahout</groupId>
- <artifactId>mahout-math-scala_${scala.compat.version}</artifactId>
- </dependency>
- <dependency>
- <groupId>org.apache.mahout</groupId>
- <artifactId>mahout-native-viennacl_${scala.compat.version}</artifactId>
- </dependency>
- <dependency>
- <groupId>org.apache.mahout</groupId>
- <artifactId>mahout-native-viennacl-omp_${scala.compat.version}</artifactId>
- <version>${project.version}</version>
- </dependency>
- </dependencies>
- <build>
- <plugins>
- <plugin>
- <groupId>org.codehaus.mojo</groupId>
- <artifactId>exec-maven-plugin</artifactId>
- <version>1.1.1</version>
- <executions>
- <execution>
- <id>viennacl-2.11</id>
- <phase>compile</phase>
- <goals>
- <goal>exec</goal>
- </goals>
- <configuration>
- <workingDirectory>../experimental/viennacl</workingDirectory>
- <executable>mvn</executable>
- <arguments>
- <argument>${lifecycle.target}</argument>
- <argument>-Dscala.version=${scala.211.version}</argument>
- <argument>-Dscala.compat.version=2.11</argument>
- <argument>-DskipTests</argument>
- </arguments>
- </configuration>
- </execution>
- </executions>
- </plugin>
- </plugins>
- </build>
- </profile>
-
- <profile>
- <id>viennacl-omp</id>
- <dependencies>
- <dependency>
- <groupId>org.apache.mahout</groupId>
- <artifactId>mahout-math</artifactId>
- </dependency>
- <dependency>
- <groupId>org.apache.mahout</groupId>
- <artifactId>mahout-integration</artifactId>
- </dependency>
- <dependency>
- <groupId>org.apache.mahout</groupId>
- <artifactId>mahout-hdfs</artifactId>
- </dependency>
- <dependency>
- <groupId>org.apache.mahout</groupId>
- <artifactId>mahout-mr</artifactId>
- </dependency>
- <dependency>
- <groupId>org.apache.mahout</groupId>
- <artifactId>mahout-examples</artifactId>
- </dependency>
- <dependency>
- <groupId>org.apache.mahout</groupId>
- <artifactId>mahout-spark_${scala.compat.version}</artifactId>
- <classifier>spark_${spark.compat.version}</classifier>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.mahout</groupId>
- <artifactId>mahout-math-scala_${scala.compat.version}</artifactId>
- </dependency>
- <dependency>
- <groupId>org.apache.mahout</groupId>
- <artifactId>mahout-native-viennacl-omp_${scala.compat.version}</artifactId>
- <version>${project.version}</version>
- </dependency>
- </dependencies>
- <build>
- <plugins>
- <plugin>
- <groupId>org.codehaus.mojo</groupId>
- <artifactId>exec-maven-plugin</artifactId>
- <version>1.1.1</version>
- <executions>
- <execution>
- <id>viennacl-omp-2.11</id>
- <phase>compile</phase>
- <goals>
- <goal>exec</goal>
- </goals>
- <configuration>
- <workingDirectory>../experimental/viennacl-omp</workingDirectory>
- <executable>mvn</executable>
- <arguments>
- <argument>${lifecycle.target}</argument>
- <argument>-Dscala.version=${scala.211.version}</argument>
- <argument>-Dscala.compat.version=2.11</argument>
- <argument>-DskipTests</argument>
- </arguments>
- </configuration>
- </execution>
-
- </executions>
- </plugin>
- </plugins>
- </build>
- </profile>
- </profiles>
-
- <dependencies>
- <dependency>
- <groupId>org.apache.mahout</groupId>
- <artifactId>mahout-math</artifactId>
- </dependency>
- <dependency>
- <groupId>org.apache.mahout</groupId>
- <artifactId>mahout-integration</artifactId>
- </dependency>
- <dependency>
- <groupId>org.apache.mahout</groupId>
- <artifactId>mahout-hdfs</artifactId>
- </dependency>
- <dependency>
- <groupId>org.apache.mahout</groupId>
- <artifactId>mahout-mr</artifactId>
- </dependency>
- <dependency>
- <groupId>org.apache.mahout</groupId>
- <artifactId>mahout-examples</artifactId>
- </dependency>
- <dependency>
- <groupId>org.apache.mahout</groupId>
- <artifactId>mahout-spark_${scala.compat.version}</artifactId>
- <classifier>spark_${spark.compat.version}</classifier>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.mahout</groupId>
- <artifactId>mahout-math-scala_${scala.compat.version}</artifactId>
- </dependency>
- </dependencies>
-</project>
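Note: every assembly execution above is gated on ${mahout.skip.distribution}, which defaults to true, and the mahout-release profile flips it to false. The release tarballs were therefore presumably built by activating that profile, e.g. mvn install -Pmahout-release from this module (its defaultGoal is install).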

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/distribution/src/main/assembly/scala-2.10_spark-1.6.xml
----------------------------------------------------------------------
diff --git a/distribution/src/main/assembly/scala-2.10_spark-1.6.xml b/distribution/src/main/assembly/scala-2.10_spark-1.6.xml
deleted file mode 100644
index 2cc2095..0000000
--- a/distribution/src/main/assembly/scala-2.10_spark-1.6.xml
+++ /dev/null
@@ -1,249 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<assembly xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0"
- xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0 http://maven.apache.org/xsd/assembly-1.1.0.xsd">
- <id>scala-2.10_spark-1.6</id>
- <formats>
- <format>dir</format>
- <format>tar.gz</format>
- </formats>
-
- <fileSets>
- <fileSet>
- <directory>${project.basedir}/../examples/target/dependency</directory>
- <includes>
- <include>*.jar</include>
- </includes>
- <excludes>
- <exclude>mahout-*</exclude>
- <exclude>hadoop-*</exclude>
- <exclude>junit-*</exclude>
- </excludes>
- <outputDirectory>lib</outputDirectory>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../examples/target/dependency</directory>
- <includes>
- <include>mahout-collections*.jar</include>
- </includes>
- <outputDirectory>lib</outputDirectory>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../examples/target/dependency</directory>
- <includes>
- <include>hadoop-*.jar</include>
- </includes>
- <outputDirectory>lib/hadoop</outputDirectory>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../math/target</directory>
- <includes>
- <include>mahout-*.jar</include>
- </includes>
- <excludes>
- <exclude>*sources.jar</exclude>
- <exclude>*javadoc.jar</exclude>
- <exclude>*tests.jar</exclude>
- </excludes>
- <outputDirectory/>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../hdfs/target</directory>
- <includes>
- <include>mahout-*.job</include>
- <include>mahout-*.jar</include>
- </includes>
- <excludes>
- <exclude>*sources.jar</exclude>
- <exclude>*javadoc.jar</exclude>
- <exclude>*tests.jar</exclude>
- </excludes>
- <outputDirectory/>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../mr/target</directory>
- <includes>
- <include>mahout-*.job</include>
- <include>mahout-*.jar</include>
- </includes>
- <excludes>
- <exclude>*sources.jar</exclude>
- <exclude>*javadoc.jar</exclude>
- <exclude>*tests.jar</exclude>
- </excludes>
- <outputDirectory/>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../integration/target</directory>
- <includes>
- <include>mahout-*.job</include>
- <include>mahout-*.jar</include>
- </includes>
- <excludes>
- <exclude>*sources.jar</exclude>
- <exclude>*javadoc.jar</exclude>
- <exclude>*tests.jar</exclude>
- </excludes>
- <outputDirectory/>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../examples/target</directory>
- <includes>
- <include>mahout-*.jar</include>
- <include>mahout-*.job</include>
- </includes>
- <excludes>
- <exclude>*sources.jar</exclude>
- <exclude>*javadoc.jar</exclude>
- <exclude>*tests.jar</exclude>
- </excludes>
- <outputDirectory/>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../math-scala/target</directory>
- <includes>
- <include>mahout-*2.10*.jar</include>
- </includes>
- <excludes>
- <exclude>*sources.jar</exclude>
- <exclude>*javadoc.jar</exclude>
- <exclude>*tests.jar</exclude>
- </excludes>
- <outputDirectory/>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../spark/target</directory>
- <includes>
- <include>mahout-*2.10*.jar</include>
- <include>mahout-*2.10*dependency-reduced.jar</include>
- </includes>
- <excludes>
- <exclude>*sources.jar</exclude>
- <exclude>*javadoc.jar</exclude>
- <exclude>*tests.jar</exclude>
- </excludes>
- <outputDirectory/>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../flink/target</directory>
- <includes>
- <include>mahout-*.jar</include>
- <include>mahout-*.job</include>
- </includes>
- <excludes>
- <exclude>*sources.jar</exclude>
- <exclude>*javadoc.jar</exclude>
- <exclude>*tests.jar</exclude>
- </excludes>
- <outputDirectory/>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../viennacl/target</directory>
- <includes>
- <include>mahout-*2.10*.jar</include>
- </includes>
- <excludes>
- <exclude>*sources.jar</exclude>
- <exclude>*javadoc.jar</exclude>
- <exclude>*tests.jar</exclude>
- </excludes>
- <outputDirectory/>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../viennacl-omp/target</directory>
- <includes>
- <include>mahout-*2.10*.jar</include>
- </includes>
- <excludes>
- <exclude>*sources.jar</exclude>
- <exclude>*javadoc.jar</exclude>
- <exclude>*tests.jar</exclude>
- </excludes>
- <outputDirectory/>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../collections/target/apidocs</directory>
- <outputDirectory>docs/mahout-collections</outputDirectory>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../math/target/apidocs</directory>
- <outputDirectory>docs/mahout-math</outputDirectory>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../hdfs/target/apidocs</directory>
- <outputDirectory>docs/mahout-hdfs</outputDirectory>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../mr/target/apidocs</directory>
- <outputDirectory>docs/mahout-mr</outputDirectory>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../integration/target/apidocs</directory>
- <outputDirectory>docs/mahout-integration</outputDirectory>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../examples/target/apidocs</directory>
- <outputDirectory>docs/mahout-examples</outputDirectory>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../math-scala/target/site/scaladocs</directory>
- <outputDirectory>docs/mahout-math-scala</outputDirectory>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../spark/target/site/scaladocs</directory>
- <outputDirectory>docs/mahout-spark</outputDirectory>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/..</directory>
- <outputDirectory/>
- <useDefaultExcludes>true</useDefaultExcludes>
- <includes>
- <include>**/README*</include>
- <include>**/LICENSE*</include>
- <include>**/NOTICE*</include>
- <include>**/*.properties</include>
- </includes>
- <excludes>
- <exclude>**/target/**</exclude>
- </excludes>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../bin</directory>
- <outputDirectory>bin</outputDirectory>
- <fileMode>0755</fileMode>
- <directoryMode>0755</directoryMode>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../src/conf</directory>
- <outputDirectory>conf</outputDirectory>
- <fileMode>0644</fileMode>
- <directoryMode>0755</directoryMode>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../examples/bin</directory>
- <outputDirectory>examples/bin</outputDirectory>
- <fileMode>0755</fileMode>
- <directoryMode>0755</directoryMode>
- <excludes>
- <exclude>work</exclude>
- <exclude>work/**</exclude>
- </excludes>
- </fileSet>
- </fileSets>
-</assembly>
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/hbase/HBaseDataModel.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/hbase/HBaseDataModel.java b/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/hbase/HBaseDataModel.java
deleted file mode 100644
index 9735ffe..0000000
--- a/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/hbase/HBaseDataModel.java
+++ /dev/null
@@ -1,497 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.impl.model.hbase;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.hbase.HBaseConfiguration;
-import org.apache.hadoop.hbase.HColumnDescriptor;
-import org.apache.hadoop.hbase.HTableDescriptor;
-import org.apache.hadoop.hbase.KeyValue;
-import org.apache.hadoop.hbase.client.Delete;
-import org.apache.hadoop.hbase.client.Get;
-import org.apache.hadoop.hbase.client.HBaseAdmin;
-import org.apache.hadoop.hbase.client.HTableFactory;
-import org.apache.hadoop.hbase.client.HTableInterface;
-import org.apache.hadoop.hbase.client.HTablePool;
-import org.apache.hadoop.hbase.client.Put;
-import org.apache.hadoop.hbase.client.Result;
-import org.apache.hadoop.hbase.client.ResultScanner;
-import org.apache.hadoop.hbase.client.Scan;
-import org.apache.hadoop.hbase.filter.FilterList;
-import org.apache.hadoop.hbase.filter.FirstKeyOnlyFilter;
-import org.apache.hadoop.hbase.filter.KeyOnlyFilter;
-import org.apache.hadoop.hbase.util.Bytes;
-import org.apache.mahout.cf.taste.common.NoSuchItemException;
-import org.apache.mahout.cf.taste.common.NoSuchUserException;
-import org.apache.mahout.cf.taste.common.Refreshable;
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.impl.common.FastIDSet;
-import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
-import org.apache.mahout.cf.taste.impl.model.GenericItemPreferenceArray;
-import org.apache.mahout.cf.taste.impl.model.GenericUserPreferenceArray;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.apache.mahout.cf.taste.model.PreferenceArray;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.Closeable;
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.Map;
-import java.util.SortedMap;
-
-/**
- * <p>Naive approach of storing one preference as one value in the table.
- * Preferences are indexed as (user, item) and (item, user) for O(1) lookups.</p>
- *
- * <p>The default table name is "taste"; this can be set through a constructor
- * argument. Each row key starts with "i" or "u" followed by the
- * actual id encoded as a big-endian long.</p>
- *
- * <p>E.g., "u\x00\x00\x00\x00\x00\x00\x04\xd2" is user 1234L</p>
- *
- * <p>There are two column families: "users" and "items".</p>
- *
- * <p>The "users" column family holds user->item preferences. Each userID is the
- * column qualifier and the value is the preference.</p>
- *
- * <p>The "items" column fmaily holds item->user preferences. Each itemID is the
- * column qualifier and the value is the preference.</p>
- *
- * <p>User IDs and item IDs are cached in a FastIDSet since it requires a full
- * table scan to build these sets. Preferences are not cached since they
- * are pretty cheap lookups in HBase (also caching the Preferences defeats
- * the purpose of a scalable storage engine like HBase).</p>
- */
-public final class HBaseDataModel implements DataModel, Closeable {
-
- private static final Logger log = LoggerFactory.getLogger(HBaseDataModel.class);
-
- private static final String DEFAULT_TABLE = "taste";
- private static final byte[] USERS_CF = Bytes.toBytes("users");
- private static final byte[] ITEMS_CF = Bytes.toBytes("items");
-
- private final HTablePool pool;
- private final String tableName;
-
- // Cache of user and item ids
- private volatile FastIDSet itemIDs;
- private volatile FastIDSet userIDs;
-
- public HBaseDataModel(String zkConnect) throws IOException {
- this(zkConnect, DEFAULT_TABLE);
- }
-
- public HBaseDataModel(String zkConnect, String tableName) throws IOException {
- log.info("Using HBase table {}", tableName);
- Configuration conf = HBaseConfiguration.create();
- conf.set("hbase.zookeeper.quorum", zkConnect);
- HTableFactory tableFactory = new HTableFactory();
- this.pool = new HTablePool(conf, 8, tableFactory);
- this.tableName = tableName;
-
- bootstrap(conf);
- // Warm the cache
- refresh(null);
- }
-
- public HBaseDataModel(HTablePool pool, String tableName, Configuration conf) throws IOException {
- log.info("Using HBase table {}", tableName);
- this.pool = pool;
- this.tableName = tableName;
-
- bootstrap(conf);
-
- // Warm the cache
- refresh(null);
- }
-
- public String getTableName() {
- return tableName;
- }
-
- /**
- * Create the table if it doesn't exist
- */
- private void bootstrap(Configuration conf) throws IOException {
- HTableDescriptor tDesc = new HTableDescriptor(Bytes.toBytes(tableName));
- tDesc.addFamily(new HColumnDescriptor(USERS_CF));
- tDesc.addFamily(new HColumnDescriptor(ITEMS_CF));
- try (HBaseAdmin admin = new HBaseAdmin(conf)) {
- admin.createTable(tDesc);
- log.info("Created table {}", tableName);
- }
- }
-
- /**
- * Prefix a user id with "u" and convert to byte[]
- */
- private static byte[] userToBytes(long userID) {
- ByteBuffer bb = ByteBuffer.allocate(9);
- bb.put((byte)0x75); // The letter "u"
- bb.putLong(userID);
- return bb.array();
- }
-
- /**
- * Prefix an item id with "i" and convert to byte[]
- */
- private static byte[] itemToBytes(long itemID) {
- ByteBuffer bb = ByteBuffer.allocate(9);
- bb.put((byte)0x69); // The letter "i"
- bb.putLong(itemID);
- return bb.array();
- }
-
- /**
- * Extract the id out of a prefix byte[] id
- */
- private static long bytesToUserOrItemID(byte[] ba) {
- ByteBuffer bb = ByteBuffer.wrap(ba);
- return bb.getLong(1);
- }
-
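For clarity, a minimal standalone sketch (an illustration added here, not part of the committed file) of the 9-byte row-key scheme implemented by the helpers above: a one-byte "u"/"i" type prefix followed by the id as a big-endian long, matching the javadoc example where user 1234L becomes "u" + 0x00000000000004D2.

import java.nio.ByteBuffer;

// Standalone sketch of the row-key scheme: one type byte ('u' or 'i')
// followed by the id as a big-endian long (ByteBuffer's default byte order).
public class RowKeySketch {
  static byte[] key(char type, long id) {
    ByteBuffer bb = ByteBuffer.allocate(9);
    bb.put((byte) type);
    bb.putLong(id);
    return bb.array();
  }

  static long id(byte[] key) {
    return ByteBuffer.wrap(key).getLong(1); // skip the one-byte prefix
  }

  public static void main(String[] args) {
    byte[] k = key('u', 1234L);
    for (byte b : k) {
      System.out.printf("%02x ", b); // prints: 75 00 00 00 00 00 00 04 d2
    }
    System.out.println("-> " + id(k)); // prints: -> 1234
  }
}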
- /* DataModel interface */
-
- @Override
- public LongPrimitiveIterator getUserIDs() {
- return userIDs.iterator();
- }
-
- @Override
- public PreferenceArray getPreferencesFromUser(long userID) throws TasteException {
- Result result;
- try {
- HTableInterface table = pool.getTable(tableName);
- Get get = new Get(userToBytes(userID));
- get.addFamily(ITEMS_CF);
- result = table.get(get);
- table.close();
- } catch (IOException e) {
- throw new TasteException("Failed to retrieve user preferences from HBase", e);
- }
-
- if (result.isEmpty()) {
- throw new NoSuchUserException(userID);
- }
-
- SortedMap<byte[], byte[]> families = result.getFamilyMap(ITEMS_CF);
- PreferenceArray prefs = new GenericUserPreferenceArray(families.size());
- prefs.setUserID(0, userID);
- int i = 0;
- for (Map.Entry<byte[], byte[]> entry : families.entrySet()) {
- prefs.setItemID(i, Bytes.toLong(entry.getKey()));
- prefs.setValue(i, Bytes.toFloat(entry.getValue()));
- i++;
- }
- return prefs;
- }
-
- @Override
- public FastIDSet getItemIDsFromUser(long userID) throws TasteException {
- Result result;
- try {
- HTableInterface table = pool.getTable(tableName);
- Get get = new Get(userToBytes(userID));
- get.addFamily(ITEMS_CF);
- result = table.get(get);
- table.close();
- } catch (IOException e) {
- throw new TasteException("Failed to retrieve item IDs from HBase", e);
- }
-
- if (result.isEmpty()) {
- throw new NoSuchUserException(userID);
- }
-
- SortedMap<byte[], byte[]> families = result.getFamilyMap(ITEMS_CF);
- FastIDSet ids = new FastIDSet(families.size());
- for (byte[] family : families.keySet()) {
- ids.add(Bytes.toLong(family));
- }
- return ids;
- }
-
- @Override
- public LongPrimitiveIterator getItemIDs() {
- return itemIDs.iterator();
- }
-
- @Override
- public PreferenceArray getPreferencesForItem(long itemID) throws TasteException {
- Result result;
- try {
- HTableInterface table = pool.getTable(tableName);
- Get get = new Get(itemToBytes(itemID));
- get.addFamily(USERS_CF);
- result = table.get(get);
- table.close();
- } catch (IOException e) {
- throw new TasteException("Failed to retrieve item preferences from HBase", e);
- }
-
- if (result.isEmpty()) {
- throw new NoSuchItemException(itemID);
- }
-
- SortedMap<byte[], byte[]> families = result.getFamilyMap(USERS_CF);
- PreferenceArray prefs = new GenericItemPreferenceArray(families.size());
- prefs.setItemID(0, itemID);
- int i = 0;
- for (Map.Entry<byte[], byte[]> entry : families.entrySet()) {
- prefs.setUserID(i, Bytes.toLong(entry.getKey()));
- prefs.setValue(i, Bytes.toFloat(entry.getValue()));
- i++;
- }
- return prefs;
- }
-
- @Override
- public Float getPreferenceValue(long userID, long itemID) throws TasteException {
- Result result;
- try {
- HTableInterface table = pool.getTable(tableName);
- Get get = new Get(userToBytes(userID));
- get.addColumn(ITEMS_CF, Bytes.toBytes(itemID));
- result = table.get(get);
- table.close();
- } catch (IOException e) {
- throw new TasteException("Failed to retrieve user preferences from HBase", e);
- }
-
- if (result.isEmpty()) {
- throw new NoSuchUserException(userID);
- }
-
- if (result.containsColumn(ITEMS_CF, Bytes.toBytes(itemID))) {
- return Bytes.toFloat(result.getValue(ITEMS_CF, Bytes.toBytes(itemID)));
- } else {
- return null;
- }
- }
-
- @Override
- public Long getPreferenceTime(long userID, long itemID) throws TasteException {
- Result result;
- try {
- HTableInterface table = pool.getTable(tableName);
- Get get = new Get(userToBytes(userID));
- get.addColumn(ITEMS_CF, Bytes.toBytes(itemID));
- result = table.get(get);
- table.close();
- } catch (IOException e) {
- throw new TasteException("Failed to retrieve user preferences from HBase", e);
- }
-
- if (result.isEmpty()) {
- throw new NoSuchUserException(userID);
- }
-
- if (result.containsColumn(ITEMS_CF, Bytes.toBytes(itemID))) {
- KeyValue kv = result.getColumnLatest(ITEMS_CF, Bytes.toBytes(itemID));
- return kv.getTimestamp();
- } else {
- return null;
- }
- }
-
- @Override
- public int getNumItems() {
- return itemIDs.size();
- }
-
- @Override
- public int getNumUsers() {
- return userIDs.size();
- }
-
- @Override
- public int getNumUsersWithPreferenceFor(long itemID) throws TasteException {
- PreferenceArray prefs = getPreferencesForItem(itemID);
- return prefs.length();
- }
-
- @Override
- public int getNumUsersWithPreferenceFor(long itemID1, long itemID2) throws TasteException {
- Result[] results;
- try {
- HTableInterface table = pool.getTable(tableName);
- List<Get> gets = new ArrayList<>(2);
- gets.add(new Get(itemToBytes(itemID1)));
- gets.add(new Get(itemToBytes(itemID2)));
- gets.get(0).addFamily(USERS_CF);
- gets.get(1).addFamily(USERS_CF);
- results = table.get(gets);
- table.close();
- } catch (IOException e) {
- throw new TasteException("Failed to retrieve item preferences from HBase", e);
- }
-
- if (results[0].isEmpty()) {
- throw new NoSuchItemException(itemID1);
- }
- if (results[1].isEmpty()) {
- throw new NoSuchItemException(itemID2);
- }
-
- // First item
- Result result = results[0];
- SortedMap<byte[], byte[]> families = result.getFamilyMap(USERS_CF);
- FastIDSet idSet1 = new FastIDSet(families.size());
- for (byte[] id : families.keySet()) {
- idSet1.add(Bytes.toLong(id));
- }
-
- // Second item
- result = results[1];
- families = result.getFamilyMap(USERS_CF);
- FastIDSet idSet2 = new FastIDSet(families.size());
- for (byte[] id : families.keySet()) {
- idSet2.add(Bytes.toLong(id));
- }
-
- return idSet1.intersectionSize(idSet2);
- }
-
- @Override
- public void setPreference(long userID, long itemID, float value) throws TasteException {
- try {
- HTableInterface table = pool.getTable(tableName);
- List<Put> puts = new ArrayList<>(2);
- puts.add(new Put(userToBytes(userID)));
- puts.add(new Put(itemToBytes(itemID)));
- puts.get(0).add(ITEMS_CF, Bytes.toBytes(itemID), Bytes.toBytes(value));
- puts.get(1).add(USERS_CF, Bytes.toBytes(userID), Bytes.toBytes(value));
- table.put(puts);
- table.close();
- } catch (IOException e) {
- throw new TasteException("Failed to store preference in HBase", e);
- }
- }
-
- @Override
- public void removePreference(long userID, long itemID) throws TasteException {
- try {
- HTableInterface table = pool.getTable(tableName);
- List<Delete> deletes = new ArrayList<>(2);
- deletes.add(new Delete(userToBytes(userID)));
- deletes.add(new Delete(itemToBytes(itemID)));
- deletes.get(0).deleteColumns(ITEMS_CF, Bytes.toBytes(itemID));
- deletes.get(1).deleteColumns(USERS_CF, Bytes.toBytes(userID));
- table.delete(deletes);
- table.close();
- } catch (IOException e) {
- throw new TasteException("Failed to remove preference from HBase", e);
- }
- }
-
- @Override
- public boolean hasPreferenceValues() {
- return true;
- }
-
- @Override
- public float getMaxPreference() {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public float getMinPreference() {
- throw new UnsupportedOperationException();
- }
-
- /* Closeable interface */
-
- @Override
- public void close() throws IOException {
- pool.close();
- }
-
- /* Refreshable interface */
-
- @Override
- public void refresh(Collection<Refreshable> alreadyRefreshed) {
- if (alreadyRefreshed == null || !alreadyRefreshed.contains(this)) {
- try {
- log.info("Refreshing item and user ID caches");
- long t1 = System.currentTimeMillis();
- refreshItemIDs();
- refreshUserIDs();
- long t2 = System.currentTimeMillis();
- log.info("Finished refreshing caches in {} ms", t2 - t1);
- } catch (IOException e) {
- throw new IllegalStateException("Could not reload DataModel", e);
- }
- }
- }
-
- /*
- * Refresh the item id cache. Warning: this does a large table scan
- */
- private synchronized void refreshItemIDs() throws IOException {
- // Get the list of item ids
- HTableInterface table = pool.getTable(tableName);
- Scan scan = new Scan(new byte[]{0x69}, new byte[]{0x70}); // start at 'i' (0x69); the exclusive stop row 0x70 keeps 'u' (0x75) user rows out
- scan.setFilter(new FilterList(FilterList.Operator.MUST_PASS_ALL, new KeyOnlyFilter(), new FirstKeyOnlyFilter()));
- ResultScanner scanner = table.getScanner(scan);
- Collection<Long> ids = new LinkedList<>();
- for (Result result : scanner) {
- ids.add(bytesToUserOrItemID(result.getRow()));
- }
- table.close();
-
- // Copy into FastIDSet
- FastIDSet itemIDs = new FastIDSet(ids.size());
- for (long l : ids) {
- itemIDs.add(l);
- }
-
- // Swap with the active
- this.itemIDs = itemIDs;
- }
-
- /*
- * Refresh the user id cache. Warning: this does a large table scan
- */
- private synchronized void refreshUserIDs() throws IOException {
- // Get the list of user ids
- HTableInterface table = pool.getTable(tableName);
- Scan scan = new Scan(new byte[]{0x75}, new byte[]{0x76}); // start at 'u' (0x75); stop row 0x76 is exclusive
- scan.setFilter(new FilterList(FilterList.Operator.MUST_PASS_ALL, new KeyOnlyFilter(), new FirstKeyOnlyFilter()));
- ResultScanner scanner = table.getScanner(scan);
- Collection<Long> ids = new LinkedList<>();
- for (Result result : scanner) {
- ids.add(bytesToUserOrItemID(result.getRow()));
- }
- table.close();
-
- // Copy into FastIDSet
- FastIDSet userIDs = new FastIDSet(ids.size());
- for (long l : ids) {
- userIDs.add(l);
- }
-
- // Swap with the active
- this.userIDs = userIDs;
- }
-
-}

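For reference, the row-key scheme the deleted HBaseDataModel relied on is worth pinning down: one prefix byte ('u' = 0x75 for users, 'i' = 0x69 for items) followed by the 8-byte big-endian ID, which keeps user and item rows in disjoint key ranges so refreshUserIDs()/refreshItemIDs() can scan [0x75, 0x76) and [0x69, 0x70). A minimal standalone sketch (plain java.nio, no HBase dependency; the class and method names are illustrative):

import java.nio.ByteBuffer;

public class RowKeySketch {

  // Mirrors userToBytes()/itemToBytes(): prefix byte, then big-endian ID.
  static byte[] encode(char prefix, long id) {
    ByteBuffer bb = ByteBuffer.allocate(9);
    bb.put((byte) prefix);
    bb.putLong(id);
    return bb.array();
  }

  // Mirrors bytesToUserOrItemID(): skip the prefix byte, read the long.
  static long decodeID(byte[] rowKey) {
    return ByteBuffer.wrap(rowKey).getLong(1);
  }

  public static void main(String[] args) {
    byte[] key = encode('u', 12345L);
    System.out.println(decodeID(key)); // prints 12345
  }
}
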
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/AbstractBooleanPrefJDBCDataModel.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/AbstractBooleanPrefJDBCDataModel.java b/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/AbstractBooleanPrefJDBCDataModel.java
deleted file mode 100644
index 79ca1ac..0000000
--- a/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/AbstractBooleanPrefJDBCDataModel.java
+++ /dev/null
@@ -1,137 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.impl.model.jdbc;
-
-import java.sql.Connection;
-import java.sql.PreparedStatement;
-import java.sql.ResultSet;
-import java.sql.SQLException;
-
-import javax.sql.DataSource;
-
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.impl.model.BooleanPreference;
-import org.apache.mahout.cf.taste.model.Preference;
-import org.apache.mahout.common.IOUtils;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import com.google.common.base.Preconditions;
-
-public abstract class AbstractBooleanPrefJDBCDataModel extends AbstractJDBCDataModel {
-
- private static final Logger log = LoggerFactory.getLogger(AbstractBooleanPrefJDBCDataModel.class);
-
- static final String NO_SUCH_COLUMN = "NO_SUCH_COLUMN";
-
- private final String setPreferenceSQL;
-
- protected AbstractBooleanPrefJDBCDataModel(DataSource dataSource,
- String preferenceTable,
- String userIDColumn,
- String itemIDColumn,
- String preferenceColumn,
- String getPreferenceSQL,
- String getPreferenceTimeSQL,
- String getUserSQL,
- String getAllUsersSQL,
- String getNumItemsSQL,
- String getNumUsersSQL,
- String setPreferenceSQL,
- String removePreferenceSQL,
- String getUsersSQL,
- String getItemsSQL,
- String getPrefsForItemSQL,
- String getNumPreferenceForItemSQL,
- String getNumPreferenceForItemsSQL,
- String getMaxPreferenceSQL,
- String getMinPreferenceSQL) {
- super(dataSource,
- preferenceTable,
- userIDColumn,
- itemIDColumn,
- preferenceColumn,
- getPreferenceSQL,
- getPreferenceTimeSQL,
- getUserSQL,
- getAllUsersSQL,
- getNumItemsSQL,
- getNumUsersSQL,
- setPreferenceSQL,
- removePreferenceSQL,
- getUsersSQL,
- getItemsSQL,
- getPrefsForItemSQL,
- getNumPreferenceForItemSQL,
- getNumPreferenceForItemsSQL,
- getMaxPreferenceSQL,
- getMinPreferenceSQL);
- this.setPreferenceSQL = setPreferenceSQL;
- }
-
- @Override
- protected Preference buildPreference(ResultSet rs) throws SQLException {
- return new BooleanPreference(getLongColumn(rs, 1), getLongColumn(rs, 2));
- }
-
- @Override
- String getSetPreferenceSQL() {
- return setPreferenceSQL;
- }
-
- @Override
- public void setPreference(long userID, long itemID, float value) throws TasteException {
- Preconditions.checkArgument(!Float.isNaN(value), "NaN value");
- log.debug("Setting preference for user {}, item {}", userID, itemID);
-
- Connection conn = null;
- PreparedStatement stmt = null;
-
- try {
- conn = getDataSource().getConnection();
- stmt = conn.prepareStatement(setPreferenceSQL);
- setLongParameter(stmt, 1, userID);
- setLongParameter(stmt, 2, itemID);
-
- log.debug("Executing SQL update: {}", setPreferenceSQL);
- stmt.executeUpdate();
-
- } catch (SQLException sqle) {
- log.warn("Exception while setting preference", sqle);
- throw new TasteException(sqle);
- } finally {
- IOUtils.quietClose(null, stmt, conn);
- }
- }
-
- @Override
- public boolean hasPreferenceValues() {
- return false;
- }
-
- @Override
- public float getMaxPreference() {
- return 1.0f;
- }
-
- @Override
- public float getMinPreference() {
- return 1.0f;
- }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/AbstractJDBCDataModel.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/AbstractJDBCDataModel.java b/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/AbstractJDBCDataModel.java
deleted file mode 100644
index 66f0a77..0000000
--- a/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/AbstractJDBCDataModel.java
+++ /dev/null
@@ -1,787 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.impl.model.jdbc;
-
-import com.google.common.base.Preconditions;
-import org.apache.mahout.cf.taste.common.NoSuchItemException;
-import org.apache.mahout.cf.taste.common.NoSuchUserException;
-import org.apache.mahout.cf.taste.common.Refreshable;
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.impl.common.Cache;
-import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
-import org.apache.mahout.cf.taste.impl.common.FastIDSet;
-import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
-import org.apache.mahout.cf.taste.impl.common.Retriever;
-import org.apache.mahout.cf.taste.impl.common.jdbc.AbstractJDBCComponent;
-import org.apache.mahout.cf.taste.impl.common.jdbc.ResultSetIterator;
-import org.apache.mahout.cf.taste.impl.model.GenericItemPreferenceArray;
-import org.apache.mahout.cf.taste.impl.model.GenericPreference;
-import org.apache.mahout.cf.taste.impl.model.GenericUserPreferenceArray;
-import org.apache.mahout.cf.taste.model.JDBCDataModel;
-import org.apache.mahout.cf.taste.model.Preference;
-import org.apache.mahout.cf.taste.model.PreferenceArray;
-import org.apache.mahout.common.IOUtils;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.sql.Connection;
-import java.sql.PreparedStatement;
-import java.sql.ResultSet;
-import java.sql.SQLException;
-import java.sql.Statement;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.List;
-import javax.sql.DataSource;
-
-/**
- * <p>
- * An abstract superclass for {@link JDBCDataModel} implementations, providing most of the common
- * functionality that any such implementation would need.
- * </p>
- *
- * <p>
- * Performance will be a concern with any {@link JDBCDataModel}. There are going to be lots of
- * simultaneous reads and some writes to one table. Make sure the table is set up optimally -- for example,
- * you'll want to establish indexes.
- * </p>
- *
- * <p>
- * You'll also want to use connection pooling of some kind. Most J2EE containers like Tomcat provide
- * connection pooling, so make sure the {@link DataSource} it exposes is using pooling. Outside a J2EE
- * container, you can use packages like Jakarta's <a href="http://jakarta.apache.org/commons/dbcp/">DBCP</a>
- * to create a {@link DataSource} on top of your database whose {@link Connection}s are pooled.
- * </p>
- */
-public abstract class AbstractJDBCDataModel extends AbstractJDBCComponent implements JDBCDataModel {
-
- private static final Logger log = LoggerFactory.getLogger(AbstractJDBCDataModel.class);
-
- public static final String DEFAULT_PREFERENCE_TABLE = "taste_preferences";
- public static final String DEFAULT_USER_ID_COLUMN = "user_id";
- public static final String DEFAULT_ITEM_ID_COLUMN = "item_id";
- public static final String DEFAULT_PREFERENCE_COLUMN = "preference";
- public static final String DEFAULT_PREFERENCE_TIME_COLUMN = "timestamp";
-
- private final DataSource dataSource;
- private final String preferenceTable;
- private final String userIDColumn;
- private final String itemIDColumn;
- private final String preferenceColumn;
- private final String getPreferenceSQL;
- private final String getPreferenceTimeSQL;
- private final String getUserSQL;
- private final String getAllUsersSQL;
- private final String getNumItemsSQL;
- private final String getNumUsersSQL;
- private final String setPreferenceSQL;
- private final String removePreferenceSQL;
- private final String getUsersSQL;
- private final String getItemsSQL;
- private final String getPrefsForItemSQL;
- private final String getNumPreferenceForItemsSQL;
- private final String getMaxPreferenceSQL;
- private final String getMinPreferenceSQL;
- private int cachedNumUsers;
- private int cachedNumItems;
- private final Cache<Long,Integer> itemPrefCounts;
- private float maxPreference;
- private float minPreference;
-
- protected AbstractJDBCDataModel(DataSource dataSource,
- String getPreferenceSQL,
- String getPreferenceTimeSQL,
- String getUserSQL,
- String getAllUsersSQL,
- String getNumItemsSQL,
- String getNumUsersSQL,
- String setPreferenceSQL,
- String removePreferenceSQL,
- String getUsersSQL,
- String getItemsSQL,
- String getPrefsForItemSQL,
- String getNumPreferenceForItemSQL,
- String getNumPreferenceForItemsSQL,
- String getMaxPreferenceSQL,
- String getMinPreferenceSQL) {
- this(dataSource,
- DEFAULT_PREFERENCE_TABLE,
- DEFAULT_USER_ID_COLUMN,
- DEFAULT_ITEM_ID_COLUMN,
- DEFAULT_PREFERENCE_COLUMN,
- getPreferenceSQL,
- getPreferenceTimeSQL,
- getUserSQL,
- getAllUsersSQL,
- getNumItemsSQL,
- getNumUsersSQL,
- setPreferenceSQL,
- removePreferenceSQL,
- getUsersSQL,
- getItemsSQL,
- getPrefsForItemSQL,
- getNumPreferenceForItemSQL,
- getNumPreferenceForItemsSQL,
- getMaxPreferenceSQL,
- getMinPreferenceSQL);
- }
-
- protected AbstractJDBCDataModel(DataSource dataSource,
- String preferenceTable,
- String userIDColumn,
- String itemIDColumn,
- String preferenceColumn,
- String getPreferenceSQL,
- String getPreferenceTimeSQL,
- String getUserSQL,
- String getAllUsersSQL,
- String getNumItemsSQL,
- String getNumUsersSQL,
- String setPreferenceSQL,
- String removePreferenceSQL,
- String getUsersSQL,
- String getItemsSQL,
- String getPrefsForItemSQL,
- String getNumPreferenceForItemSQL,
- String getNumPreferenceForItemsSQL,
- String getMaxPreferenceSQL,
- String getMinPreferenceSQL) {
-
- log.debug("Creating AbstractJDBCDataModel...");
-
- AbstractJDBCComponent.checkNotNullAndLog("preferenceTable", preferenceTable);
- AbstractJDBCComponent.checkNotNullAndLog("userIDColumn", userIDColumn);
- AbstractJDBCComponent.checkNotNullAndLog("itemIDColumn", itemIDColumn);
- AbstractJDBCComponent.checkNotNullAndLog("preferenceColumn", preferenceColumn);
-
- AbstractJDBCComponent.checkNotNullAndLog("dataSource", dataSource);
- AbstractJDBCComponent.checkNotNullAndLog("getUserSQL", getUserSQL);
- AbstractJDBCComponent.checkNotNullAndLog("getAllUsersSQL", getAllUsersSQL);
- AbstractJDBCComponent.checkNotNullAndLog("getPreferenceSQL", getPreferenceSQL);
- // getPreferenceTimeSQL can be null
- AbstractJDBCComponent.checkNotNullAndLog("getNumItemsSQL", getNumItemsSQL);
- AbstractJDBCComponent.checkNotNullAndLog("getNumUsersSQL", getNumUsersSQL);
- AbstractJDBCComponent.checkNotNullAndLog("setPreferenceSQL", setPreferenceSQL);
- AbstractJDBCComponent.checkNotNullAndLog("removePreferenceSQL", removePreferenceSQL);
- AbstractJDBCComponent.checkNotNullAndLog("getUsersSQL", getUsersSQL);
- AbstractJDBCComponent.checkNotNullAndLog("getItemsSQL", getItemsSQL);
- AbstractJDBCComponent.checkNotNullAndLog("getPrefsForItemSQL", getPrefsForItemSQL);
- AbstractJDBCComponent.checkNotNullAndLog("getNumPreferenceForItemSQL", getNumPreferenceForItemSQL);
- AbstractJDBCComponent.checkNotNullAndLog("getNumPreferenceForItemsSQL", getNumPreferenceForItemsSQL);
- AbstractJDBCComponent.checkNotNullAndLog("getMaxPreferenceSQL", getMaxPreferenceSQL);
- AbstractJDBCComponent.checkNotNullAndLog("getMinPreferenceSQL", getMinPreferenceSQL);
-
- if (!(dataSource instanceof ConnectionPoolDataSource)) {
- log.warn("You are not using ConnectionPoolDataSource. Make sure your DataSource pools connections "
- + "to the database itself, or database performance will be severely reduced.");
- }
-
- this.preferenceTable = preferenceTable;
- this.userIDColumn = userIDColumn;
- this.itemIDColumn = itemIDColumn;
- this.preferenceColumn = preferenceColumn;
-
- this.dataSource = dataSource;
- this.getPreferenceSQL = getPreferenceSQL;
- this.getPreferenceTimeSQL = getPreferenceTimeSQL;
- this.getUserSQL = getUserSQL;
- this.getAllUsersSQL = getAllUsersSQL;
- this.getNumItemsSQL = getNumItemsSQL;
- this.getNumUsersSQL = getNumUsersSQL;
- this.setPreferenceSQL = setPreferenceSQL;
- this.removePreferenceSQL = removePreferenceSQL;
- this.getUsersSQL = getUsersSQL;
- this.getItemsSQL = getItemsSQL;
- this.getPrefsForItemSQL = getPrefsForItemSQL;
- // getNumPreferenceForItemSQL is consumed only by the ItemPrefCountRetriever cache below, not stored as a field
- this.getNumPreferenceForItemsSQL = getNumPreferenceForItemsSQL;
- this.getMaxPreferenceSQL = getMaxPreferenceSQL;
- this.getMinPreferenceSQL = getMinPreferenceSQL;
-
- this.cachedNumUsers = -1;
- this.cachedNumItems = -1;
- this.itemPrefCounts = new Cache<>(new ItemPrefCountRetriever(getNumPreferenceForItemSQL));
-
- this.maxPreference = Float.NaN;
- this.minPreference = Float.NaN;
- }
-
- /** @return the {@link DataSource} that this instance is using */
- @Override
- public DataSource getDataSource() {
- return dataSource;
- }
-
- public String getPreferenceTable() {
- return preferenceTable;
- }
-
- public String getUserIDColumn() {
- return userIDColumn;
- }
-
- public String getItemIDColumn() {
- return itemIDColumn;
- }
-
- public String getPreferenceColumn() {
- return preferenceColumn;
- }
-
- String getSetPreferenceSQL() {
- return setPreferenceSQL;
- }
-
- @Override
- public LongPrimitiveIterator getUserIDs() throws TasteException {
- log.debug("Retrieving all users...");
- try {
- return new ResultSetIDIterator(getUsersSQL);
- } catch (SQLException sqle) {
- throw new TasteException(sqle);
- }
- }
-
- /**
- * @throws NoSuchUserException
- * if there is no such user
- */
- @Override
- public PreferenceArray getPreferencesFromUser(long userID) throws TasteException {
-
- log.debug("Retrieving user ID '{}'", userID);
-
- Connection conn = null;
- PreparedStatement stmt = null;
- ResultSet rs = null;
-
- try {
- conn = dataSource.getConnection();
- stmt = conn.prepareStatement(getUserSQL, ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY);
- stmt.setFetchDirection(ResultSet.FETCH_FORWARD);
- stmt.setFetchSize(getFetchSize());
- setLongParameter(stmt, 1, userID);
-
- log.debug("Executing SQL query: {}", getUserSQL);
- rs = stmt.executeQuery();
-
- List<Preference> prefs = new ArrayList<>();
- while (rs.next()) {
- prefs.add(buildPreference(rs));
- }
-
- if (prefs.isEmpty()) {
- throw new NoSuchUserException(userID);
- }
-
- return new GenericUserPreferenceArray(prefs);
-
- } catch (SQLException sqle) {
- log.warn("Exception while retrieving user", sqle);
- throw new TasteException(sqle);
- } finally {
- IOUtils.quietClose(rs, stmt, conn);
- }
-
- }
-
- @Override
- public FastByIDMap<PreferenceArray> exportWithPrefs() throws TasteException {
- log.debug("Exporting all data");
-
- Connection conn = null;
- Statement stmt = null;
- ResultSet rs = null;
-
- FastByIDMap<PreferenceArray> result = new FastByIDMap<>();
-
- try {
- conn = dataSource.getConnection();
- stmt = conn.createStatement(ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY);
- stmt.setFetchDirection(ResultSet.FETCH_FORWARD);
- stmt.setFetchSize(getFetchSize());
-
- log.debug("Executing SQL query: {}", getAllUsersSQL);
- rs = stmt.executeQuery(getAllUsersSQL);
-
- Long currentUserID = null;
- List<Preference> currentPrefs = new ArrayList<>();
- while (rs.next()) {
- long nextUserID = getLongColumn(rs, 1);
- if (currentUserID != null && !currentUserID.equals(nextUserID) && !currentPrefs.isEmpty()) {
- result.put(currentUserID, new GenericUserPreferenceArray(currentPrefs));
- currentPrefs.clear();
- }
- currentPrefs.add(buildPreference(rs));
- currentUserID = nextUserID;
- }
- if (!currentPrefs.isEmpty()) {
- result.put(currentUserID, new GenericUserPreferenceArray(currentPrefs));
- }
-
- return result;
-
- } catch (SQLException sqle) {
- log.warn("Exception while exporting all data", sqle);
- throw new TasteException(sqle);
- } finally {
- IOUtils.quietClose(rs, stmt, conn);
-
- }
- }
-
- @Override
- public FastByIDMap<FastIDSet> exportWithIDsOnly() throws TasteException {
- log.debug("Exporting all data");
-
- Connection conn = null;
- Statement stmt = null;
- ResultSet rs = null;
-
- FastByIDMap<FastIDSet> result = new FastByIDMap<>();
-
- try {
- conn = dataSource.getConnection();
- stmt = conn.createStatement(ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY);
- stmt.setFetchDirection(ResultSet.FETCH_FORWARD);
- stmt.setFetchSize(getFetchSize());
-
- log.debug("Executing SQL query: {}", getAllUsersSQL);
- rs = stmt.executeQuery(getAllUsersSQL);
-
- boolean currentUserIDSet = false;
- long currentUserID = 0L; // value isn't used
- FastIDSet currentItemIDs = new FastIDSet(2);
- while (rs.next()) {
- long nextUserID = getLongColumn(rs, 1);
- if (currentUserIDSet && currentUserID != nextUserID && !currentItemIDs.isEmpty()) {
- result.put(currentUserID, currentItemIDs);
- currentItemIDs = new FastIDSet(2);
- }
- currentItemIDs.add(getLongColumn(rs, 2));
- currentUserID = nextUserID;
- currentUserIDSet = true;
- }
- if (!currentItemIDs.isEmpty()) {
- result.put(currentUserID, currentItemIDs);
- }
-
- return result;
-
- } catch (SQLException sqle) {
- log.warn("Exception while exporting all data", sqle);
- throw new TasteException(sqle);
- } finally {
- IOUtils.quietClose(rs, stmt, conn);
-
- }
- }
-
- /**
- * @throws NoSuchUserException
- * if there is no such user
- */
- @Override
- public FastIDSet getItemIDsFromUser(long userID) throws TasteException {
-
- log.debug("Retrieving items for user ID '{}'", userID);
-
- Connection conn = null;
- PreparedStatement stmt = null;
- ResultSet rs = null;
-
- try {
- conn = dataSource.getConnection();
- stmt = conn.prepareStatement(getUserSQL, ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY);
- stmt.setFetchDirection(ResultSet.FETCH_FORWARD);
- stmt.setFetchSize(getFetchSize());
- setLongParameter(stmt, 1, userID);
-
- log.debug("Executing SQL query: {}", getUserSQL);
- rs = stmt.executeQuery();
-
- FastIDSet result = new FastIDSet();
- while (rs.next()) {
- result.add(getLongColumn(rs, 2));
- }
-
- if (result.isEmpty()) {
- throw new NoSuchUserException(userID);
- }
-
- return result;
-
- } catch (SQLException sqle) {
- log.warn("Exception while retrieving items", sqle);
- throw new TasteException(sqle);
- } finally {
- IOUtils.quietClose(rs, stmt, conn);
- }
-
- }
-
- @Override
- public Float getPreferenceValue(long userID, long itemID) throws TasteException {
- log.debug("Retrieving preference for user ID '{}', item ID '{}'", userID, itemID);
- Connection conn = null;
- PreparedStatement stmt = null;
- ResultSet rs = null;
- try {
- conn = dataSource.getConnection();
- stmt = conn.prepareStatement(getPreferenceSQL, ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY);
- stmt.setFetchDirection(ResultSet.FETCH_FORWARD);
- stmt.setFetchSize(1);
- setLongParameter(stmt, 1, userID);
- setLongParameter(stmt, 2, itemID);
-
- log.debug("Executing SQL query: {}", getPreferenceSQL);
- rs = stmt.executeQuery();
- if (rs.next()) {
- return rs.getFloat(1);
- } else {
- return null;
- }
- } catch (SQLException sqle) {
- log.warn("Exception while retrieving preference", sqle);
- throw new TasteException(sqle);
- } finally {
- IOUtils.quietClose(rs, stmt, conn);
- }
- }
-
- @Override
- public Long getPreferenceTime(long userID, long itemID) throws TasteException {
- if (getPreferenceTimeSQL == null) {
- return null;
- }
- log.debug("Retrieving preference time for user ID '{}', item ID '{}'", userID, itemID);
- Connection conn = null;
- PreparedStatement stmt = null;
- ResultSet rs = null;
- try {
- conn = dataSource.getConnection();
- stmt = conn.prepareStatement(getPreferenceTimeSQL, ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY);
- stmt.setFetchDirection(ResultSet.FETCH_FORWARD);
- stmt.setFetchSize(1);
- setLongParameter(stmt, 1, userID);
- setLongParameter(stmt, 2, itemID);
-
- log.debug("Executing SQL query: {}", getPreferenceTimeSQL);
- rs = stmt.executeQuery();
- if (rs.next()) {
- return rs.getLong(1);
- } else {
- return null;
- }
- } catch (SQLException sqle) {
- log.warn("Exception while retrieving time for item", sqle);
- throw new TasteException(sqle);
- } finally {
- IOUtils.quietClose(rs, stmt, conn);
- }
- }
-
- @Override
- public LongPrimitiveIterator getItemIDs() throws TasteException {
- log.debug("Retrieving all items...");
- try {
- return new ResultSetIDIterator(getItemsSQL);
- } catch (SQLException sqle) {
- throw new TasteException(sqle);
- }
- }
-
- @Override
- public PreferenceArray getPreferencesForItem(long itemID) throws TasteException {
- List<Preference> list = doGetPreferencesForItem(itemID);
- if (list.isEmpty()) {
- throw new NoSuchItemException(itemID);
- }
- return new GenericItemPreferenceArray(list);
- }
-
- protected List<Preference> doGetPreferencesForItem(long itemID) throws TasteException {
- log.debug("Retrieving preferences for item ID '{}'", itemID);
- Connection conn = null;
- PreparedStatement stmt = null;
- ResultSet rs = null;
- try {
- conn = dataSource.getConnection();
- stmt = conn.prepareStatement(getPrefsForItemSQL, ResultSet.TYPE_FORWARD_ONLY,
- ResultSet.CONCUR_READ_ONLY);
- stmt.setFetchDirection(ResultSet.FETCH_FORWARD);
- stmt.setFetchSize(getFetchSize());
- setLongParameter(stmt, 1, itemID);
-
- log.debug("Executing SQL query: {}", getPrefsForItemSQL);
- rs = stmt.executeQuery();
- List<Preference> prefs = new ArrayList<>();
- while (rs.next()) {
- prefs.add(buildPreference(rs));
- }
- return prefs;
- } catch (SQLException sqle) {
- log.warn("Exception while retrieving prefs for item", sqle);
- throw new TasteException(sqle);
- } finally {
- IOUtils.quietClose(rs, stmt, conn);
- }
- }
-
- @Override
- public int getNumItems() throws TasteException {
- if (cachedNumItems < 0) {
- cachedNumItems = getNumThings("items", getNumItemsSQL);
- }
- return cachedNumItems;
- }
-
- @Override
- public int getNumUsers() throws TasteException {
- if (cachedNumUsers < 0) {
- cachedNumUsers = getNumThings("users", getNumUsersSQL);
- }
- return cachedNumUsers;
- }
-
- @Override
- public int getNumUsersWithPreferenceFor(long itemID) throws TasteException {
- return itemPrefCounts.get(itemID);
- }
-
- @Override
- public int getNumUsersWithPreferenceFor(long itemID1, long itemID2) throws TasteException {
- return getNumThings("users preferring both items", getNumPreferenceForItemsSQL, itemID1, itemID2);
- }
-
- private int getNumThings(String name, String sql, long... args) throws TasteException {
- log.debug("Retrieving number of {} in model", name);
- Connection conn = null;
- PreparedStatement stmt = null;
- ResultSet rs = null;
- try {
- conn = dataSource.getConnection();
- stmt = conn.prepareStatement(sql, ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY);
- stmt.setFetchDirection(ResultSet.FETCH_FORWARD);
- stmt.setFetchSize(getFetchSize());
- if (args != null) {
- for (int i = 1; i <= args.length; i++) {
- setLongParameter(stmt, i, args[i - 1]);
- }
- }
- log.debug("Executing SQL query: {}", sql);
- rs = stmt.executeQuery();
- rs.next();
- return rs.getInt(1);
- } catch (SQLException sqle) {
- log.warn("Exception while retrieving number of {}", name, sqle);
- throw new TasteException(sqle);
- } finally {
- IOUtils.quietClose(rs, stmt, conn);
- }
- }
-
- @Override
- public void setPreference(long userID, long itemID, float value) throws TasteException {
- Preconditions.checkArgument(!Float.isNaN(value), "NaN value");
-
- log.debug("Setting preference for user {}, item {}", userID, itemID);
-
- Connection conn = null;
- PreparedStatement stmt = null;
-
- try {
- conn = dataSource.getConnection();
- stmt = conn.prepareStatement(setPreferenceSQL);
- setLongParameter(stmt, 1, userID);
- setLongParameter(stmt, 2, itemID);
- stmt.setDouble(3, value);
- stmt.setDouble(4, value);
-
- log.debug("Executing SQL update: {}", setPreferenceSQL);
- stmt.executeUpdate();
-
- } catch (SQLException sqle) {
- log.warn("Exception while setting preference", sqle);
- throw new TasteException(sqle);
- } finally {
- IOUtils.quietClose(null, stmt, conn);
- }
- }
-
- @Override
- public void removePreference(long userID, long itemID) throws TasteException {
-
- log.debug("Removing preference for user '{}', item '{}'", userID, itemID);
-
- Connection conn = null;
- PreparedStatement stmt = null;
-
- try {
- conn = dataSource.getConnection();
- stmt = conn.prepareStatement(removePreferenceSQL);
- setLongParameter(stmt, 1, userID);
- setLongParameter(stmt, 2, itemID);
-
- log.debug("Executing SQL update: {}", removePreferenceSQL);
- stmt.executeUpdate();
-
- } catch (SQLException sqle) {
- log.warn("Exception while removing preference", sqle);
- throw new TasteException(sqle);
- } finally {
- IOUtils.quietClose(null, stmt, conn);
- }
- }
-
- @Override
- public void refresh(Collection<Refreshable> alreadyRefreshed) {
- cachedNumUsers = -1;
- cachedNumItems = -1;
- minPreference = Float.NaN;
- maxPreference = Float.NaN;
- itemPrefCounts.clear();
- }
-
- @Override
- public boolean hasPreferenceValues() {
- return true;
- }
-
- @Override
- public float getMaxPreference() {
- if (Float.isNaN(maxPreference)) {
- Connection conn = null;
- PreparedStatement stmt = null;
- ResultSet rs = null;
- try {
- conn = dataSource.getConnection();
- stmt = conn.prepareStatement(getMaxPreferenceSQL);
-
- log.debug("Executing SQL query: {}", getMaxPreferenceSQL);
- rs = stmt.executeQuery();
- rs.next();
- maxPreference = rs.getFloat(1);
-
- } catch (SQLException sqle) {
- log.warn("Exception while retrieving max preference", sqle);
- // do nothing
- } finally {
- IOUtils.quietClose(rs, stmt, conn);
- }
- }
- return maxPreference;
- }
-
- @Override
- public float getMinPreference() {
- if (Float.isNaN(minPreference)) {
- Connection conn = null;
- PreparedStatement stmt = null;
- ResultSet rs = null;
- try {
- conn = dataSource.getConnection();
- stmt = conn.prepareStatement(getMinPreferenceSQL);
-
- log.debug("Executing SQL query: {}", getMinPreferenceSQL);
- rs = stmt.executeQuery();
- rs.next();
- minPreference = rs.getFloat(1);
-
- } catch (SQLException sqle) {
- log.warn("Exception while retrieving min preference", sqle);
- // do nothing
- } finally {
- IOUtils.quietClose(rs, stmt, conn);
- }
- }
- return minPreference;
- }
-
- // Some overrideable methods to customize the class behavior:
-
- protected Preference buildPreference(ResultSet rs) throws SQLException {
- return new GenericPreference(getLongColumn(rs, 1), getLongColumn(rs, 2), rs.getFloat(3));
- }
-
- /**
- * Subclasses may wish to override this if ID values in the database are not numeric. This provides a hook by
- * which subclasses can inject an {@link org.apache.mahout.cf.taste.model.IDMigrator} to perform
- * translation.
- */
- protected long getLongColumn(ResultSet rs, int position) throws SQLException {
- return rs.getLong(position);
- }
-
- /**
- * Subclasses may wish to override this if ID values in the database are not numeric. This provides a hook by
- * which subclasses can inject an {@link org.apache.mahout.cf.taste.model.IDMigrator} to perform
- * translation.
- */
- protected void setLongParameter(PreparedStatement stmt, int position, long value) throws SQLException {
- stmt.setLong(position, value);
- }
-
- /**
- * <p>
- * An {@link java.util.Iterator} which returns items from a {@link ResultSet}. This is a useful way to
- * iterate over all user data since it does not require all data to be read into memory at once. It does
- * however require that the DB connection be held open. Note that this class will only release database
- * resources after {@link #hasNext()} has been called and has returned {@code false}; callers should
- * make sure to "drain" the entire set of data to avoid tying up database resources.
- * </p>
- */
- private final class ResultSetIDIterator extends ResultSetIterator<Long> implements LongPrimitiveIterator {
-
- private ResultSetIDIterator(String sql) throws SQLException {
- super(dataSource, sql);
- }
-
- @Override
- protected Long parseElement(ResultSet resultSet) throws SQLException {
- return getLongColumn(resultSet, 1);
- }
-
- @Override
- public long nextLong() {
- return next();
- }
-
- /**
- * @throws UnsupportedOperationException
- */
- @Override
- public long peek() {
- // This could be supported; is it worth it?
- throw new UnsupportedOperationException();
- }
- }
-
- private final class ItemPrefCountRetriever implements Retriever<Long,Integer> {
- private final String getNumPreferenceForItemSQL;
-
- private ItemPrefCountRetriever(String getNumPreferenceForItemSQL) {
- this.getNumPreferenceForItemSQL = getNumPreferenceForItemSQL;
- }
-
- @Override
- public Integer get(Long key) throws TasteException {
- return getNumThings("users preferring item", getNumPreferenceForItemSQL, key);
- }
- }
-}

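One practical note on the ResultSetIDIterator caveat above: because the backing Connection, Statement, and ResultSet are only released after hasNext() has returned false, callers should drain the iterator to exhaustion. A minimal sketch of the safe pattern (the helper class name is illustrative):

import org.apache.mahout.cf.taste.common.TasteException;
import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
import org.apache.mahout.cf.taste.model.DataModel;

public class DrainIteratorSketch {

  // Iterate to exhaustion so the iterator's JDBC resources are released;
  // abandoning it mid-stream would leave the connection pinned open.
  static int countUsers(DataModel model) throws TasteException {
    LongPrimitiveIterator it = model.getUserIDs();
    int n = 0;
    while (it.hasNext()) {
      it.nextLong();
      n++;
    }
    return n;
  }
}
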
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/ConnectionPoolDataSource.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/ConnectionPoolDataSource.java b/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/ConnectionPoolDataSource.java
deleted file mode 100644
index ff7f661..0000000
--- a/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/ConnectionPoolDataSource.java
+++ /dev/null
@@ -1,122 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.impl.model.jdbc;
-
-import java.io.PrintWriter;
-import java.sql.Connection;
-import java.sql.ResultSet;
-import java.sql.SQLException;
-import java.sql.SQLFeatureNotSupportedException;
-import java.util.logging.Logger;
-
-import javax.sql.DataSource;
-
-import org.apache.commons.dbcp.ConnectionFactory;
-import org.apache.commons.dbcp.PoolableConnectionFactory;
-import org.apache.commons.dbcp.PoolingDataSource;
-import org.apache.commons.pool.impl.GenericObjectPool;
-
-import com.google.common.base.Preconditions;
-
-/**
- * <p>
- * A wrapper {@link DataSource} which pools connections.
- * </p>
- */
-public final class ConnectionPoolDataSource implements DataSource {
-
- private final DataSource delegate;
-
- public ConnectionPoolDataSource(DataSource underlyingDataSource) {
- Preconditions.checkNotNull(underlyingDataSource);
- ConnectionFactory connectionFactory = new ConfiguringConnectionFactory(underlyingDataSource);
- GenericObjectPool objectPool = new GenericObjectPool();
- objectPool.setTestOnBorrow(false);
- objectPool.setTestOnReturn(false);
- objectPool.setTestWhileIdle(true);
- objectPool.setTimeBetweenEvictionRunsMillis(60 * 1000L);
- // Constructor actually sets itself as factory on pool
- new PoolableConnectionFactory(connectionFactory, objectPool, null, "SELECT 1", false, false);
- delegate = new PoolingDataSource(objectPool);
- }
-
- @Override
- public Connection getConnection() throws SQLException {
- return delegate.getConnection();
- }
-
- @Override
- public Connection getConnection(String username, String password) throws SQLException {
- return delegate.getConnection(username, password);
- }
-
- @Override
- public PrintWriter getLogWriter() throws SQLException {
- return delegate.getLogWriter();
- }
-
- @Override
- public void setLogWriter(PrintWriter printWriter) throws SQLException {
- delegate.setLogWriter(printWriter);
- }
-
- @Override
- public void setLoginTimeout(int timeout) throws SQLException {
- delegate.setLoginTimeout(timeout);
- }
-
- @Override
- public int getLoginTimeout() throws SQLException {
- return delegate.getLoginTimeout();
- }
-
- @Override
- public <T> T unwrap(Class<T> iface) throws SQLException {
- return delegate.unwrap(iface);
- }
-
- @Override
- public boolean isWrapperFor(Class<?> iface) throws SQLException {
- return delegate.isWrapperFor(iface);
- }
-
- // This exists for compatibility with Java 7 / JDBC 4.1, but doesn't exist
- // in Java 6. In Java 7 it would @Override, but not in 6.
- // @Override
- public Logger getParentLogger() throws SQLFeatureNotSupportedException {
- throw new SQLFeatureNotSupportedException();
- }
-
- private static class ConfiguringConnectionFactory implements ConnectionFactory {
-
- private final DataSource underlyingDataSource;
-
- ConfiguringConnectionFactory(DataSource underlyingDataSource) {
- this.underlyingDataSource = underlyingDataSource;
- }
-
- @Override
- public Connection createConnection() throws SQLException {
- Connection connection = underlyingDataSource.getConnection();
- connection.setTransactionIsolation(Connection.TRANSACTION_READ_UNCOMMITTED);
- connection.setHoldability(ResultSet.CLOSE_CURSORS_AT_COMMIT);
- return connection;
- }
- }
-
-}

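A usage sketch for the pooling wrapper above. MysqlDataSource and its setURL/setUser/setPassword setters come from MySQL Connector/J's optional package; treat those names, the URL, and the credentials as placeholder assumptions:

import javax.sql.DataSource;

import com.mysql.jdbc.jdbc2.optional.MysqlDataSource;
import org.apache.mahout.cf.taste.impl.model.jdbc.ConnectionPoolDataSource;

public class PoolingSketch {

  public static DataSource pooledTasteSource() {
    // A plain MysqlDataSource opens a fresh connection per getConnection()
    // call -- exactly the unpooled case the wrapper exists to fix.
    MysqlDataSource raw = new MysqlDataSource();
    raw.setURL("jdbc:mysql://localhost:3306/taste"); // placeholder URL
    raw.setUser("taste");                            // placeholder credentials
    raw.setPassword("secret");
    // Wrapped, connections are borrowed from a commons-pool GenericObjectPool
    // and validated with "SELECT 1" while idle (see the constructor above).
    return new ConnectionPoolDataSource(raw);
  }
}
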
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/GenericJDBCDataModel.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/GenericJDBCDataModel.java b/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/GenericJDBCDataModel.java
deleted file mode 100644
index 5dd0be9..0000000
--- a/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/GenericJDBCDataModel.java
+++ /dev/null
@@ -1,146 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.impl.model.jdbc;
-
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Properties;
-
-import com.google.common.io.Closeables;
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.impl.common.jdbc.AbstractJDBCComponent;
-
-/**
- * <p>
- * A generic {@link org.apache.mahout.cf.taste.model.DataModel} designed for use with other JDBC data sources;
- * one just specifies all necessary SQL queries to the constructor here. Optionally, the queries can be
- * specified from a {@link Properties} object, {@link File}, or {@link InputStream}. This class is most
- * appropriate when other existing implementations of {@link AbstractJDBCDataModel} are not suitable. If you
- * are using this class to support a major database, consider contributing a specialized implementation of
- * {@link AbstractJDBCDataModel} to the project for this database.
- * </p>
- */
-public final class GenericJDBCDataModel extends AbstractJDBCDataModel {
-
- public static final String DATA_SOURCE_KEY = "dataSource";
- public static final String GET_PREFERENCE_SQL_KEY = "getPreferenceSQL";
- public static final String GET_PREFERENCE_TIME_SQL_KEY = "getPreferenceTimeSQL";
- public static final String GET_USER_SQL_KEY = "getUserSQL";
- public static final String GET_ALL_USERS_SQL_KEY = "getAllUsersSQL";
- public static final String GET_NUM_USERS_SQL_KEY = "getNumUsersSQL";
- public static final String GET_NUM_ITEMS_SQL_KEY = "getNumItemsSQL";
- public static final String SET_PREFERENCE_SQL_KEY = "setPreferenceSQL";
- public static final String REMOVE_PREFERENCE_SQL_KEY = "removePreferenceSQL";
- public static final String GET_USERS_SQL_KEY = "getUsersSQL";
- public static final String GET_ITEMS_SQL_KEY = "getItemsSQL";
- public static final String GET_PREFS_FOR_ITEM_SQL_KEY = "getPrefsForItemSQL";
- public static final String GET_NUM_PREFERENCE_FOR_ITEM_KEY = "getNumPreferenceForItemSQL";
- public static final String GET_NUM_PREFERENCE_FOR_ITEMS_KEY = "getNumPreferenceForItemsSQL";
- public static final String GET_MAX_PREFERENCE_KEY = "getMaxPreferenceSQL";
- public static final String GET_MIN_PREFERENCE_KEY = "getMinPreferenceSQL";
-
- /**
- * <p>
- * Specifies all SQL queries in a {@link Properties} object. See the {@code *_KEY} constants in this
- * class (e.g. {@link #GET_USER_SQL_KEY}) for a list of all keys which must map to a value in this object.
- * </p>
- *
- * @param props
- * {@link Properties} object containing values
- * @throws TasteException
- * if anything goes wrong during initialization
- */
- public GenericJDBCDataModel(Properties props) throws TasteException {
- super(AbstractJDBCComponent.lookupDataSource(props.getProperty(DATA_SOURCE_KEY)),
- props.getProperty(GET_PREFERENCE_SQL_KEY),
- props.getProperty(GET_PREFERENCE_TIME_SQL_KEY),
- props.getProperty(GET_USER_SQL_KEY),
- props.getProperty(GET_ALL_USERS_SQL_KEY),
- props.getProperty(GET_NUM_ITEMS_SQL_KEY),
- props.getProperty(GET_NUM_USERS_SQL_KEY),
- props.getProperty(SET_PREFERENCE_SQL_KEY),
- props.getProperty(REMOVE_PREFERENCE_SQL_KEY),
- props.getProperty(GET_USERS_SQL_KEY),
- props.getProperty(GET_ITEMS_SQL_KEY),
- props.getProperty(GET_PREFS_FOR_ITEM_SQL_KEY),
- props.getProperty(GET_NUM_PREFERENCE_FOR_ITEM_KEY),
- props.getProperty(GET_NUM_PREFERENCE_FOR_ITEMS_KEY),
- props.getProperty(GET_MAX_PREFERENCE_KEY),
- props.getProperty(GET_MIN_PREFERENCE_KEY));
- }
-
- /**
- * <p>
- * See {@link #GenericJDBCDataModel(Properties)}. This constructor reads values from a file
- * instead, as if with {@link Properties#load(InputStream)}. So, the file should be in standard Java
- * properties file format -- containing {@code key=value} pairs, one per line.
- * </p>
- *
- * @param propertiesFile
- * properties file
- * @throws TasteException
- * if anything goes wrong during initialization
- */
- public GenericJDBCDataModel(File propertiesFile) throws TasteException {
- this(getPropertiesFromFile(propertiesFile));
- }
-
- /**
- * <p>
- * See {@link #GenericJDBCDataModel(Properties)}. This constructor reads values from a resource available in
- * the classpath, as if with {@link Class#getResourceAsStream(String)} and
- * {@link Properties#load(InputStream)}. This is useful if your configuration file is, for example, packaged
- * in a JAR file that is in the classpath.
- * </p>
- *
- * @param resourcePath
- * path to resource in classpath (e.g. "/com/foo/TasteSQLQueries.properties")
- * @throws TasteException
- * if anything goes wrong during initialization
- */
- public GenericJDBCDataModel(String resourcePath) throws TasteException {
- this(getPropertiesFromStream(GenericJDBCDataModel.class
- .getResourceAsStream(resourcePath)));
- }
-
- private static Properties getPropertiesFromFile(File file) throws TasteException {
- try {
- return getPropertiesFromStream(new FileInputStream(file));
- } catch (FileNotFoundException fnfe) {
- throw new TasteException(fnfe);
- }
- }
-
- private static Properties getPropertiesFromStream(InputStream is) throws TasteException {
- try {
- try {
- Properties props = new Properties();
- props.load(is);
- return props;
- } finally {
- Closeables.close(is, true);
- }
- } catch (IOException ioe) {
- throw new TasteException(ioe);
- }
- }
-
-}

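To make the Properties-driven constructor above concrete, a partial sketch; the JNDI name and SQL strings are illustrative, and every *_KEY constant must be mapped or the superclass's null checks will reject the configuration:

import java.util.Properties;

import org.apache.mahout.cf.taste.common.TasteException;
import org.apache.mahout.cf.taste.impl.model.jdbc.GenericJDBCDataModel;

public class GenericModelSketch {

  public static GenericJDBCDataModel build() throws TasteException {
    Properties props = new Properties();
    // JNDI name, resolved via AbstractJDBCComponent.lookupDataSource(...)
    props.setProperty(GenericJDBCDataModel.DATA_SOURCE_KEY, "jdbc/taste");
    props.setProperty(GenericJDBCDataModel.GET_PREFERENCE_SQL_KEY,
        "SELECT preference FROM taste_preferences WHERE user_id=? AND item_id=?");
    props.setProperty(GenericJDBCDataModel.GET_USER_SQL_KEY,
        "SELECT user_id, item_id, preference FROM taste_preferences WHERE user_id=?");
    // ... map the remaining *_KEY constants the same way ...
    return new GenericJDBCDataModel(props);
  }
}
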
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/MySQLBooleanPrefJDBCDataModel.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/MySQLBooleanPrefJDBCDataModel.java b/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/MySQLBooleanPrefJDBCDataModel.java
deleted file mode 100644
index 3e9de2c..0000000
--- a/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/MySQLBooleanPrefJDBCDataModel.java
+++ /dev/null
@@ -1,161 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.impl.model.jdbc;
-
-import javax.sql.DataSource;
-
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.impl.common.jdbc.AbstractJDBCComponent;
-
-/**
- * <p>
- * See also {@link MySQLJDBCDataModel} -- the same, except that it deals with a table without preference info:
- * </p>
- *
- * <p>
- *
- * <pre>
- * CREATE TABLE taste_preferences (
- * user_id BIGINT NOT NULL,
- * item_id BIGINT NOT NULL,
- * PRIMARY KEY (user_id, item_id),
- * INDEX (user_id),
- * INDEX (item_id)
- * )
- * </pre>
- *
- * </p>
- */
-public class MySQLBooleanPrefJDBCDataModel extends AbstractBooleanPrefJDBCDataModel {
-
- /**
- * <p>
- * Creates a {@link MySQLBooleanPrefJDBCDataModel} using the default {@link javax.sql.DataSource} (named
- * {@link #DEFAULT_DATASOURCE_NAME}) and default table/column names.
- * </p>
- *
- * @throws org.apache.mahout.cf.taste.common.TasteException
- * if {@link javax.sql.DataSource} can't be found
- */
- public MySQLBooleanPrefJDBCDataModel() throws TasteException {
- this(DEFAULT_DATASOURCE_NAME);
- }
-
- /**
- * <p>
- * Creates a {@link MySQLBooleanPrefJDBCDataModel} using the {@link javax.sql.DataSource} found
- * under the given name, and using default table/column names.
- * </p>
- *
- * @param dataSourceName
- * name of {@link javax.sql.DataSource} to look up
- * @throws org.apache.mahout.cf.taste.common.TasteException
- * if {@link javax.sql.DataSource} can't be found
- */
- public MySQLBooleanPrefJDBCDataModel(String dataSourceName) throws TasteException {
- this(AbstractJDBCComponent.lookupDataSource(dataSourceName),
- DEFAULT_PREFERENCE_TABLE,
- DEFAULT_USER_ID_COLUMN,
- DEFAULT_ITEM_ID_COLUMN,
- DEFAULT_PREFERENCE_TIME_COLUMN);
- }
-
- /**
- * <p>
- * Creates a {@link MySQLBooleanPrefJDBCDataModel} using the given {@link javax.sql.DataSource} and default
- * table/column names.
- * </p>
- *
- * @param dataSource
- * {@link javax.sql.DataSource} to use
- */
- public MySQLBooleanPrefJDBCDataModel(DataSource dataSource) {
- this(dataSource,
- DEFAULT_PREFERENCE_TABLE,
- DEFAULT_USER_ID_COLUMN,
- DEFAULT_ITEM_ID_COLUMN,
- DEFAULT_PREFERENCE_TIME_COLUMN);
- }
-
- /**
- * <p>
- * Creates a {@link MySQLBooleanPrefJDBCDataModel} using the given {@link javax.sql.DataSource} and the given
- * table/column names.
- * </p>
- *
- * @param dataSource
- * {@link javax.sql.DataSource} to use
- * @param preferenceTable
- * name of table containing preference data
- * @param userIDColumn
- * user ID column name
- * @param itemIDColumn
- * item ID column name
- * @param timestampColumn timestamp column name (may be null)
- */
- public MySQLBooleanPrefJDBCDataModel(DataSource dataSource,
- String preferenceTable,
- String userIDColumn,
- String itemIDColumn,
- String timestampColumn) {
- super(dataSource, preferenceTable, userIDColumn, itemIDColumn,
- NO_SUCH_COLUMN,
- // getPreferenceSQL
- "SELECT 1 FROM " + preferenceTable + " WHERE " + userIDColumn + "=? AND " + itemIDColumn + "=?",
- // getPreferenceTimeSQL
- "SELECT " + timestampColumn + " FROM " + preferenceTable + " WHERE " + userIDColumn + "=? AND "
- + itemIDColumn + "=?",
- // getUserSQL
- "SELECT DISTINCT " + userIDColumn + ", " + itemIDColumn + " FROM " + preferenceTable + " WHERE "
- + userIDColumn + "=?",
- // getAllUsersSQL
- "SELECT DISTINCT " + userIDColumn + ", " + itemIDColumn + " FROM " + preferenceTable + " ORDER BY "
- + userIDColumn,
- // getNumItemsSQL
- "SELECT COUNT(DISTINCT " + itemIDColumn + ") FROM " + preferenceTable,
- // getNumUsersSQL
- "SELECT COUNT(DISTINCT " + userIDColumn + ") FROM " + preferenceTable,
- // setPreferenceSQL
- "INSERT IGNORE INTO " + preferenceTable + '(' + userIDColumn + ',' + itemIDColumn + ") VALUES (?,?)",
- // removePreferenceSQL
- "DELETE FROM " + preferenceTable + " WHERE " + userIDColumn + "=? AND " + itemIDColumn + "=?",
- // getUsersSQL
- "SELECT DISTINCT " + userIDColumn + " FROM " + preferenceTable + " ORDER BY " + userIDColumn,
- // getItemsSQL
- "SELECT DISTINCT " + itemIDColumn + " FROM " + preferenceTable + " ORDER BY " + itemIDColumn,
- // getPrefsForItemSQL
- "SELECT DISTINCT " + userIDColumn + ", " + itemIDColumn + " FROM " + preferenceTable + " WHERE "
- + itemIDColumn + "=? ORDER BY " + userIDColumn,
- // getNumPreferenceForItemSQL
- "SELECT COUNT(1) FROM " + preferenceTable + " WHERE " + itemIDColumn + "=?",
- // getNumPreferenceForItemsSQL
- "SELECT COUNT(1) FROM " + preferenceTable + " tp1 JOIN " + preferenceTable + " tp2 " + "USING ("
- + userIDColumn + ") WHERE tp1." + itemIDColumn + "=? and tp2." + itemIDColumn + "=?",
- // getMaxPreferenceSQL
- "SELECT 1.0",
- // getMinPreferenceSQL
- "SELECT 1.0");
- }
-
- @Override
- protected int getFetchSize() {
- // Need to return this for MySQL Connector/J to make it use streaming mode
- return Integer.MIN_VALUE;
- }
-
-}

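A construction sketch for the boolean-preference model above; the table and column names are illustrative, and per the Javadoc the timestamp column may be null:

import javax.sql.DataSource;

import org.apache.mahout.cf.taste.impl.model.jdbc.MySQLBooleanPrefJDBCDataModel;
import org.apache.mahout.cf.taste.model.DataModel;

public class BooleanModelSketch {

  // Boolean models record only that an interaction happened: the
  // setPreferenceSQL above is INSERT IGNORE ... VALUES (?,?) with no
  // preference value, and max/min preference are both fixed at 1.0.
  public static DataModel build(DataSource ds) {
    return new MySQLBooleanPrefJDBCDataModel(
        ds, "page_views", "visitor_id", "page_id", null);
  }
}
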
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/MySQLJDBCDataModel.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/MySQLJDBCDataModel.java b/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/MySQLJDBCDataModel.java
deleted file mode 100644
index 9904c7e..0000000
--- a/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/MySQLJDBCDataModel.java
+++ /dev/null
@@ -1,247 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.impl.model.jdbc;
-
-import javax.sql.DataSource;
-
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.impl.common.jdbc.AbstractJDBCComponent;
-
-/**
- * <p>
- * A {@link org.apache.mahout.cf.taste.model.JDBCDataModel} backed by a MySQL database and
- * accessed via JDBC. It may work with other JDBC databases. By default, this class
- * assumes that there is a {@link DataSource} available under the JNDI name
- * "jdbc/taste", which gives access to a database with a "taste_preferences" table with the following schema:
- * </p>
- *
- * <table>
- * <tr>
- * <th>user_id</th>
- * <th>item_id</th>
- * <th>preference</th>
- * </tr>
- * <tr>
- * <td>987</td>
- * <td>123</td>
- * <td>0.9</td>
- * </tr>
- * <tr>
- * <td>987</td>
- * <td>456</td>
- * <td>0.1</td>
- * </tr>
- * <tr>
- * <td>654</td>
- * <td>123</td>
- * <td>0.2</td>
- * </tr>
- * <tr>
- * <td>654</td>
- * <td>789</td>
- * <td>0.3</td>
- * </tr>
- * </table>
- *
- * <p>
- * {@code preference} must have a type compatible with the Java {@code float} type.
- * {@code user_id} and {@code item_id} should be compatible with the Java {@code long} type (BIGINT). For example,
- * the following command sets up a suitable table in MySQL, complete with primary key and indexes:
- * </p>
- *
- * <p>
- *
- * <pre>
- * CREATE TABLE taste_preferences (
- * user_id BIGINT NOT NULL,
- * item_id BIGINT NOT NULL,
- * preference FLOAT NOT NULL,
- * PRIMARY KEY (user_id, item_id),
- * INDEX (user_id),
- * INDEX (item_id)
- * )
- * </pre>
- *
- * </p>
- *
- * <p>The table may optionally have a {@code timestamp} column whose type is compatible with Java
- * {@code long}.</p>
- *
- * <h3>Performance Notes</h3>
- *
- * <p>
- * See the notes in {@link AbstractJDBCDataModel} regarding using connection pooling. It's pretty vital to
- * performance.
- * </p>
- *
- * <p>
- * Some experimentation suggests that MySQL's InnoDB engine is faster than MyISAM for these kinds of
- * applications. While MyISAM is the default and, I believe, generally considered the lighter-weight and
- * faster of the two engines, my guess is that the row-level locking of InnoDB helps here. Your mileage may vary.
- * </p>
- *
- * <p>
- * Here are some key settings that can be tuned for MySQL, and suggested size for a data set of around 1
- * million elements:
- * </p>
- *
- * <ul>
- * <li>innodb_buffer_pool_size=64M</li>
- * <li>myisam_sort_buffer_size=64M</li>
- * <li>query_cache_limit=64M</li>
- * <li>query_cache_min_res_unit=512K</li>
- * <li>query_cache_type=1</li>
- * <li>query_cache_size=64M</li>
- * </ul>
- *
- * <p>
- * Also consider setting some parameters on the MySQL Connector/J driver:
- * </p>
- *
- * <pre>
- * cachePreparedStatements = true
- * cachePrepStmts = true
- * cacheResultSetMetadata = true
- * alwaysSendSetIsolation = false
- * elideSetAutoCommits = true
- * </pre>
- *
- * <p>
- * Thanks to Amila Jayasooriya for contributing MySQL notes above as part of Google Summer of Code 2007.
- * </p>
- */
-public class MySQLJDBCDataModel extends AbstractJDBCDataModel {
-
- /**
- * <p>
- * Creates a {@link MySQLJDBCDataModel} using the default {@link DataSource} (named
- * {@link #DEFAULT_DATASOURCE_NAME}) and default table/column names.
- * </p>
- *
- * @throws TasteException
- * if {@link DataSource} can't be found
- */
- public MySQLJDBCDataModel() throws TasteException {
- this(DEFAULT_DATASOURCE_NAME);
- }
-
- /**
- * <p>
- * Creates a {@link MySQLJDBCDataModel} using the {@link DataSource} found under the given name, and
- * using default table/column names.
- * </p>
- *
- * @param dataSourceName
- * name of {@link DataSource} to look up
- * @throws TasteException
- * if {@link DataSource} can't be found
- */
- public MySQLJDBCDataModel(String dataSourceName) throws TasteException {
- this(AbstractJDBCComponent.lookupDataSource(dataSourceName),
- DEFAULT_PREFERENCE_TABLE,
- DEFAULT_USER_ID_COLUMN,
- DEFAULT_ITEM_ID_COLUMN,
- DEFAULT_PREFERENCE_COLUMN,
- DEFAULT_PREFERENCE_TIME_COLUMN);
- }
-
- /**
- * <p>
- * Creates a {@link MySQLJDBCDataModel} using the given {@link DataSource} and default table/column names.
- * </p>
- *
- * @param dataSource
- * {@link DataSource} to use
- */
- public MySQLJDBCDataModel(DataSource dataSource) {
- this(dataSource,
- DEFAULT_PREFERENCE_TABLE,
- DEFAULT_USER_ID_COLUMN,
- DEFAULT_ITEM_ID_COLUMN,
- DEFAULT_PREFERENCE_COLUMN,
- DEFAULT_PREFERENCE_TIME_COLUMN);
- }
-
- /**
- * <p>
- * Creates a {@link MySQLJDBCDataModel} using the given {@link DataSource} and the given table/column names.
- * </p>
- *
- * @param dataSource
- * {@link DataSource} to use
- * @param preferenceTable
- * name of table containing preference data
- * @param userIDColumn
- * user ID column name
- * @param itemIDColumn
- * item ID column name
- * @param preferenceColumn
- * preference column name
- * @param timestampColumn timestamp column name (may be null)
- */
- public MySQLJDBCDataModel(DataSource dataSource,
- String preferenceTable,
- String userIDColumn,
- String itemIDColumn,
- String preferenceColumn,
- String timestampColumn) {
- super(dataSource, preferenceTable, userIDColumn, itemIDColumn, preferenceColumn,
- // getPreferenceSQL
- "SELECT " + preferenceColumn + " FROM " + preferenceTable + " WHERE " + userIDColumn + "=? AND "
- + itemIDColumn + "=?",
- // getPreferenceTimeSQL
- "SELECT " + timestampColumn + " FROM " + preferenceTable + " WHERE " + userIDColumn + "=? AND "
- + itemIDColumn + "=?",
- // getUserSQL
- "SELECT DISTINCT " + userIDColumn + ", " + itemIDColumn + ", " + preferenceColumn + " FROM " + preferenceTable
- + " WHERE " + userIDColumn + "=? ORDER BY " + itemIDColumn,
- // getAllUsersSQL
- "SELECT DISTINCT " + userIDColumn + ", " + itemIDColumn + ", " + preferenceColumn + " FROM " + preferenceTable
- + " ORDER BY " + userIDColumn + ", " + itemIDColumn,
- // getNumItemsSQL
- "SELECT COUNT(DISTINCT " + itemIDColumn + ") FROM " + preferenceTable,
- // getNumUsersSQL
- "SELECT COUNT(DISTINCT " + userIDColumn + ") FROM " + preferenceTable,
- // setPreferenceSQL
- "INSERT INTO " + preferenceTable + '(' + userIDColumn + ',' + itemIDColumn + ',' + preferenceColumn
- + ") VALUES (?,?,?) ON DUPLICATE KEY UPDATE " + preferenceColumn + "=?",
- // removePreferenceSQL
- "DELETE FROM " + preferenceTable + " WHERE " + userIDColumn + "=? AND " + itemIDColumn + "=?",
- // getUsersSQL
- "SELECT DISTINCT " + userIDColumn + " FROM " + preferenceTable + " ORDER BY " + userIDColumn,
- // getItemsSQL
- "SELECT DISTINCT " + itemIDColumn + " FROM " + preferenceTable + " ORDER BY " + itemIDColumn,
- // getPrefsForItemSQL
- "SELECT DISTINCT " + userIDColumn + ", " + itemIDColumn + ", " + preferenceColumn + " FROM " + preferenceTable
- + " WHERE " + itemIDColumn + "=? ORDER BY " + userIDColumn,
- // getNumPreferenceForItemSQL
- "SELECT COUNT(1) FROM " + preferenceTable + " WHERE " + itemIDColumn + "=?",
- // getNumPreferenceForItemsSQL
- "SELECT COUNT(1) FROM " + preferenceTable + " tp1 JOIN " + preferenceTable + " tp2 " + "USING ("
- + userIDColumn + ") WHERE tp1." + itemIDColumn + "=? and tp2." + itemIDColumn + "=?",
- "SELECT MAX(" + preferenceColumn + ") FROM " + preferenceTable,
- "SELECT MIN(" + preferenceColumn + ") FROM " + preferenceTable);
- }
-
- @Override
- protected int getFetchSize() {
- // Need to return this for MySQL Connector/J to make it use streaming mode
- return Integer.MIN_VALUE;
- }
-
-}
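
Two details above are worth noting. The setPreferenceSQL relies on MySQL's non-standard INSERT ... ON DUPLICATE KEY UPDATE upsert, which is why the (user_id, item_id) primary key in the Javadoc schema matters, and getFetchSize() again returns the Connector/J streaming sentinel. For context, wiring this model into a Taste recommender follows the usual pattern; a hedged sketch, assuming Connector/J 8.x's com.mysql.cj.jdbc.MysqlDataSource is on the classpath (the URL, credentials, and neighborhood size of 10 are illustrative):

import javax.sql.DataSource;
import org.apache.mahout.cf.taste.impl.model.jdbc.MySQLJDBCDataModel;
import org.apache.mahout.cf.taste.impl.neighborhood.NearestNUserNeighborhood;
import org.apache.mahout.cf.taste.impl.recommender.GenericUserBasedRecommender;
import org.apache.mahout.cf.taste.impl.similarity.PearsonCorrelationSimilarity;
import org.apache.mahout.cf.taste.model.DataModel;
import org.apache.mahout.cf.taste.recommender.RecommendedItem;
import org.apache.mahout.cf.taste.recommender.Recommender;
import org.apache.mahout.cf.taste.similarity.UserSimilarity;

public final class MySQLRecommenderSketch {
  public static void main(String[] args) throws Exception {
    // Assumption: Connector/J 8.x class name; any DataSource works, and a
    // pooled one is strongly advised per the performance notes above.
    com.mysql.cj.jdbc.MysqlDataSource mysql = new com.mysql.cj.jdbc.MysqlDataSource();
    mysql.setURL("jdbc:mysql://localhost/taste"); // placeholder
    mysql.setUser("user");
    mysql.setPassword("password");
    DataSource dataSource = mysql;

    // Default "taste_preferences" table and column names, per the Javadoc.
    DataModel model = new MySQLJDBCDataModel(dataSource);
    UserSimilarity similarity = new PearsonCorrelationSimilarity(model);
    Recommender recommender = new GenericUserBasedRecommender(
        model, new NearestNUserNeighborhood(10, similarity, model), similarity);
    for (RecommendedItem item : recommender.recommend(987L, 3)) {
      System.out.println(item.getItemID() + " " + item.getValue());
    }
  }
}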

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/PostgreSQLBooleanPrefJDBCDataModel.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/PostgreSQLBooleanPrefJDBCDataModel.java b/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/PostgreSQLBooleanPrefJDBCDataModel.java
deleted file mode 100644
index 6dda281..0000000
--- a/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/PostgreSQLBooleanPrefJDBCDataModel.java
+++ /dev/null
@@ -1,146 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.impl.model.jdbc;
-
-import com.google.common.base.Preconditions;
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.common.IOUtils;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import javax.sql.DataSource;
-import java.sql.Connection;
-import java.sql.PreparedStatement;
-import java.sql.SQLException;
-
-/**
- * <p>
- * See also {@link org.apache.mahout.cf.taste.impl.model.jdbc.PostgreSQLJDBCDataModel} --
- * same except deals with a table without preference info:
- * </p>
- *
- * <p>
- *
- * <pre>
- * CREATE TABLE taste_preferences (
- * user_id BIGINT NOT NULL,
- * item_id BIGINT NOT NULL,
- * PRIMARY KEY (user_id, item_id)
- * );
- * CREATE INDEX taste_preferences_user_id_index ON taste_preferences (user_id);
- * CREATE INDEX taste_preferences_item_id_index ON taste_preferences (item_id);
- * </pre>
- *
- * </p>
- *
- * @see PostgreSQLJDBCDataModel
- */
-public class PostgreSQLBooleanPrefJDBCDataModel extends SQL92BooleanPrefJDBCDataModel {
-
- private static final Logger log = LoggerFactory.getLogger(PostgreSQLBooleanPrefJDBCDataModel.class);
-
- private static final String POSTGRESQL_DUPLICATE_KEY_STATE = "23505"; // this is brittle...
-
- /**
- * <p>
- * Creates a {@link PostgreSQLBooleanPrefJDBCDataModel} using the default {@link javax.sql.DataSource} (named
- * {@link #DEFAULT_DATASOURCE_NAME}) and default table/column names.
- * </p>
- *
- * @throws org.apache.mahout.cf.taste.common.TasteException
- * if {@link javax.sql.DataSource} can't be found
- */
- public PostgreSQLBooleanPrefJDBCDataModel() throws TasteException {
- }
-
- /**
- * <p>
- * Creates a {@link PostgreSQLBooleanPrefJDBCDataModel} using the {@link javax.sql.DataSource} found
- * under the given name, and using default table/column names.
- * </p>
- *
- * @param dataSourceName name of {@link javax.sql.DataSource} to look up
- * @throws org.apache.mahout.cf.taste.common.TasteException
- * if {@link javax.sql.DataSource} can't be found
- */
- public PostgreSQLBooleanPrefJDBCDataModel(String dataSourceName) throws TasteException {
- super(dataSourceName);
- }
-
- /**
- * <p>
- * Creates a {@link PostgreSQLBooleanPrefJDBCDataModel} using the given {@link javax.sql.DataSource} and default
- * table/column names.
- * </p>
- *
- * @param dataSource {@link javax.sql.DataSource} to use
- */
- public PostgreSQLBooleanPrefJDBCDataModel(DataSource dataSource) {
- super(dataSource);
- }
-
- /**
- * <p>
- * Creates a {@link PostgreSQLBooleanPrefJDBCDataModel} using the given {@link javax.sql.DataSource} and the
- * given table/column names.
- * </p>
- *
- * @param dataSource {@link javax.sql.DataSource} to use
- * @param preferenceTable name of table containing preference data
- * @param userIDColumn user ID column name
- * @param itemIDColumn item ID column name
- * @param timestampColumn timestamp column name (may be null)
- */
- public PostgreSQLBooleanPrefJDBCDataModel(DataSource dataSource,
- String preferenceTable,
- String userIDColumn,
- String itemIDColumn,
- String timestampColumn) {
- super(dataSource, preferenceTable, userIDColumn, itemIDColumn, timestampColumn);
- }
-
- /**
- * Override since PostgreSQL doesn't have the same non-standard capability that MySQL has, to optionally
- * ignore an insert that fails since the row exists already.
- */
- @Override
- public void setPreference(long userID, long itemID, float value) throws TasteException {
- Preconditions.checkArgument(!Float.isNaN(value), "NaN value");
- log.debug("Setting preference for user {}, item {}", userID, itemID);
-
- String setPreferenceSQL = getSetPreferenceSQL();
- Connection conn = null;
- PreparedStatement stmt = null;
- try {
- conn = getDataSource().getConnection();
- stmt = conn.prepareStatement(setPreferenceSQL);
- setLongParameter(stmt, 1, userID);
- setLongParameter(stmt, 2, itemID);
- log.debug("Executing SQL update: {}", setPreferenceSQL);
- stmt.executeUpdate();
- } catch (SQLException sqle) {
- if (!POSTGRESQL_DUPLICATE_KEY_STATE.equals(sqle.getSQLState())) {
- log.warn("Exception while setting preference", sqle);
- throw new TasteException(sqle);
- }
- } finally {
- IOUtils.quietClose(null, stmt, conn);
- }
- }
-
-}
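
The setPreference() override above emulates MySQL's INSERT IGNORE by issuing a plain INSERT and swallowing only SQLState 23505 (PostgreSQL's unique_violation); any other SQLException is rethrown as a TasteException. On PostgreSQL 9.5 and later the same effect is available server-side via ON CONFLICT DO NOTHING, which avoids the exception path entirely. A minimal sketch, assuming the boolean-preference schema from the Javadoc above:

import java.sql.Connection;
import java.sql.PreparedStatement;
import javax.sql.DataSource;

public final class PgBooleanUpsertSketch {
  // Sketch only: table and column names follow the Javadoc schema above.
  static void setBooleanPreference(DataSource ds, long userID, long itemID)
      throws Exception {
    String sql = "INSERT INTO taste_preferences (user_id, item_id) "
        + "VALUES (?,?) ON CONFLICT DO NOTHING"; // PostgreSQL 9.5+
    try (Connection conn = ds.getConnection();
         PreparedStatement stmt = conn.prepareStatement(sql)) {
      stmt.setLong(1, userID);
      stmt.setLong(2, itemID);
      stmt.executeUpdate(); // a duplicate row is skipped, not an error
    }
  }
}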
r***@apache.org
2018-06-27 14:52:06 UTC
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/src/appended-resources/supplemental-models.xml
----------------------------------------------------------------------
diff --git a/community/mahout-mr/src/appended-resources/supplemental-models.xml b/community/mahout-mr/src/appended-resources/supplemental-models.xml
new file mode 100644
index 0000000..971c72b
--- /dev/null
+++ b/community/mahout-mr/src/appended-resources/supplemental-models.xml
@@ -0,0 +1,279 @@
+<supplementalDataModels>
+ <!-- missing: Maven Profile Model -->
+ <supplement>
+ <project>
+ <groupId>org.apache.maven</groupId>
+ <artifactId>maven-profile</artifactId>
+ <name>Maven Profile Model</name>
+ <licenses>
+ <license>
+ <name>The Apache Software License, Version 2.0</name>
+ <url>http://maven.apache.org/ref/2.1.0/maven-profile/license.html</url>
+ </license>
+ </licenses>
+ </project>
+ </supplement>
+ <!-- missing: Maven Project Builder -->
+ <supplement>
+ <project>
+ <groupId>org.apache.maven</groupId>
+ <artifactId>maven-project</artifactId>
+ <name>Maven Project Builder</name>
+ <licenses>
+ <license>
+ <name>The Apache Software License, Version 2.0</name>
+ <url>http://maven.apache.org/ref/2.1.0/maven-project/license.html</url>
+ </license>
+ </licenses>
+ </project>
+ </supplement>
+ <!-- missing: Maven Local Settings -->
+ <supplement>
+ <project>
+ <groupId>org.apache.maven</groupId>
+ <artifactId>maven-settings</artifactId>
+ <name>Maven Local Settings</name>
+ <licenses>
+ <license>
+ <name>The Apache Software License, Version 2.0</name>
+ <url>http://maven.apache.org/ref/2.1.0/maven-settings/license.html</url>
+ </license>
+ </licenses>
+ </project>
+ </supplement>
+ <!-- Maven Repository Metadata Model -->
+ <supplement>
+ <project>
+ <groupId>org.apache.maven</groupId>
+ <artifactId>maven-repository-metadata</artifactId>
+ <name>Maven Repository Metadata Model</name>
+ <licenses>
+ <license>
+ <name>The Apache Software License, Version 2.0</name>
+ <url>http://maven.apache.org/ref/2.1.0/maven-repository-metadata/license.html</url>
+ </license>
+ </licenses>
+ </project>
+ </supplement>
+ <!-- Maven Model -->
+ <supplement>
+ <project>
+ <groupId>org.apache.maven</groupId>
+ <artifactId>maven-model</artifactId>
+ <name>Maven Model</name>
+ <licenses>
+ <license>
+ <name>The Apache Software License, Version 2.0</name>
+ <url>http://maven.apache.org/ref/2.0.8/maven-model/license.html</url>
+ </license>
+ </licenses>
+ </project>
+ </supplement>
+ <!-- Maven Artifact -->
+ <supplement>
+ <project>
+ <groupId>org.apache.maven</groupId>
+ <artifactId>maven-artifact</artifactId>
+ <name>Maven Artifact</name>
+ <licenses>
+ <license>
+ <name>The Apache Software License, Version 2.0</name>
+ <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
+ </license>
+ </licenses>
+ </project>
+ </supplement>
+ <!-- Maven Artifact Manager-->
+ <supplement>
+ <project>
+ <groupId>org.apache.maven</groupId>
+ <artifactId>maven-artifact-manager</artifactId>
+ <name>Maven Artifact Manager</name>
+ <licenses>
+ <license>
+ <name>The Apache Software License, Version 2.0</name>
+ <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
+ </license>
+ </licenses>
+ </project>
+ </supplement>
+ <!-- Maven Artifact Manager-->
+ <supplement>
+ <project>
+ <groupId>org.apache.maven</groupId>
+ <artifactId>maven-plugin-api</artifactId>
+ <name>Maven Plugin API</name>
+ <licenses>
+ <license>
+ <name>The Apache Software License, Version 2.0</name>
+ <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
+ </license>
+ </licenses>
+ </project>
+ </supplement>
+ <!-- Maven Wagon API-->
+ <supplement>
+ <project>
+ <groupId>org.apache.maven</groupId>
+ <artifactId>wagon-provider-api</artifactId>
+ <name>Maven Wagon API</name>
+ <licenses>
+ <license>
+ <name>The Apache Software License, Version 2.0</name>
+ <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
+ </license>
+ </licenses>
+ </project>
+ </supplement>
+ <!-- Shade Maven Plugin -->
+ <supplement>
+ <project>
+ <groupId>org.codehaus.mojo</groupId>
+ <artifactId>shade-maven-plugin</artifactId>
+ <name>Shade Maven Plugin</name>
+ <licenses>
+ <license>
+ <name>UNKNOWN</name>
+ <url>UNKNOWN</url>
+ </license>
+ </licenses>
+ </project>
+ </supplement>
+ <!-- junit -->
+ <supplement>
+ <project>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <name>JUnit unit testing library</name>
+ <licenses>
+ <license>
+ <name>Common Public License - v 1.0</name>
+ <url>http://junit.sourceforge.net/cpl-v10.html</url>
+ </license>
+ </licenses>
+ </project>
+ </supplement>
+ <!-- jdom -->
+ <supplement>
+ <project>
+ <groupId>jdom</groupId>
+ <artifactId>jdom</artifactId>
+ <name>JDom</name>
+ <licenses>
+ <license>
+ <name>UNKNOWN</name>
+ <url>UNKNOWN</url>
+ </license>
+ </licenses>
+ </project>
+ </supplement>
+ <!-- asm -->
+ <supplement>
+ <project>
+ <groupId>asm</groupId>
+ <artifactId>asm-all</artifactId>
+ <name>ASM ALL</name>
+ <licenses>
+ <license>
+ <name>UNKNOWN</name>
+ <url>http://asm.ow2.org/license.html</url>
+ </license>
+ </licenses>
+ </project>
+ </supplement>
+ <!-- Default Plexus Container -->
+ <supplement>
+ <project>
+ <groupId>org.codehaus.plexus</groupId>
+ <artifactId>plexus-container-default</artifactId>
+ <name>Default Plexus Container</name>
+ <licenses>
+ <license>
+ <name>UNKNOWN</name>
+ <url>UNKNOWN</url>
+ </license>
+ </licenses>
+ </project>
+ </supplement>
+ <!-- Classworlds -->
+ <supplement>
+ <project>
+ <groupId>org.codehaus.classworlds</groupId>
+ <artifactId>classworlds</artifactId>
+ <name>Classworlds</name>
+ <licenses>
+ <license>
+ <name></name>
+ <url>http://classworlds.codehaus.org/license.html</url>
+ </license>
+ </licenses>
+ </project>
+ </supplement>
+ <!-- Plexus Common Utilities -->
+ <supplement>
+ <project>
+ <groupId>org.codehaus.plexus</groupId>
+ <artifactId>plexus-utils</artifactId>
+ <name>Plexus Common Utilities</name>
+ <licenses>
+ <license>
+ <name>The Apache Software License, Version 2.0</name>
+ <url>http://plexus.codehaus.org/plexus-utils/license.html</url>
+ </license>
+ </licenses>
+ </project>
+ </supplement>
+ <!-- Commons Codec -->
+ <supplement>
+ <project>
+ <groupId>commons-codec</groupId>
+ <artifactId>commons-codec</artifactId>
+ <name>Commons Codec</name>
+ <url>http://commons.apache.org/codec/</url>
+ <organization>
+ <name>Apache Software Foundation</name>
+ <url>http://www.apache.org/</url>
+ </organization>
+ <licenses>
+ <license>
+ <name>The Apache Software License, Version 2.0</name>
+ <url>http://www.apache.org/licenses/LICENSE-2.0</url>
+ </license>
+ </licenses>
+ </project>
+ </supplement>
+ <!-- Commons CLI -->
+ <supplement>
+ <project>
+ <groupId>org.apache.mahout.commons</groupId>
+ <artifactId>commons-cli</artifactId>
+ <name>Commons CLI</name>
+ <url>http://commons.apache.org/cli/</url>
+ <organization>
+ <name>Apache Software Foundation</name>
+ <url>http://www.apache.org/</url>
+ </organization>
+ <licenses>
+ <license>
+ <name>The Apache Software License, Version 2.0</name>
+ <url>http://www.apache.org/licenses/LICENSE-2.0</url>
+ </license>
+ </licenses>
+ </project>
+ </supplement>
+ <!-- Xpp3 -->
+ <supplement>
+ <project>
+ <name>Xml Pull Parser 3rd Edition</name>
+ <groupId>xpp3</groupId>
+ <artifactId>xpp3_min</artifactId>
+ <url>http://www.extreme.indiana.edu/xgws/xsoap/xpp/mxp1/</url>
+ <licenses>
+ <license>
+ <name>Public Domain</name>
+ <url>http://www.xmlpull.org/</url>
+ </license>
+ </licenses>
+ </project>
+ </supplement>
+</supplementalDataModels>
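
A supplemental-models.xml file like the one added above is consumed by the maven-remote-resources-plugin, which merges these hand-written entries into the generated LICENSE/NOTICE material for dependencies whose own POMs omit license metadata. A hedged sketch of how such a file is typically wired into a POM; the bundle coordinates and relative path are illustrative rather than taken from this commit, and the plugin version is assumed to come from pluginManagement:

<plugin>
  <groupId>org.apache.maven.plugins</groupId>
  <artifactId>maven-remote-resources-plugin</artifactId>
  <executions>
    <execution>
      <goals>
        <goal>process</goal>
      </goals>
      <configuration>
        <resourceBundles>
          <resourceBundle>org.apache:apache-jar-resource-bundle:1.4</resourceBundle>
        </resourceBundles>
        <supplementalModels>
          <supplementalModel>src/appended-resources/supplemental-models.xml</supplementalModel>
        </supplementalModels>
      </configuration>
    </execution>
  </executions>
</plugin>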

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/src/images/logos/ mahout-powered.svg
----------------------------------------------------------------------
diff --git a/community/mahout-mr/src/images/logos/ mahout-powered.svg b/community/mahout-mr/src/images/logos/ mahout-powered.svg
new file mode 100644
index 0000000..ce3ea9f
--- /dev/null
+++ b/community/mahout-mr/src/images/logos/ mahout-powered.svg
@@ -0,0 +1,630 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!-- Generator: Adobe Illustrator 13.0.0, SVG Export Plug-In . SVG Version: 6.00 Build 14948) -->
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<svg version="1.1" id="Layer_1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" x="0px" y="0px"
+ width="956px" height="400px" viewBox="0 0 956 400" enable-background="new 0 0 956 400" xml:space="preserve">
+<g>
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#23A9FF" d="M709.799,389.6c-21.38,0-37.761-6.839-48.688-20.322
+ c-0.377-0.467-0.747-0.936-1.11-1.408V376c0,5.523-4.478,10.001-10.001,10.001h-28.6c-5.522,0-10-4.478-10-10.001v-64.87
+ c0-4.989-0.908-7.693-1.669-9.083c-0.053-0.096-0.104-0.194-0.154-0.292c-0.32-0.634-0.987-1.954-5.366-1.954
+ c-5.29,0-7.384,1.85-8.617,3.464c-2.353,3.069-3.593,8.255-3.593,15.005V376c0,5.523-4.477,10.001-10,10.001h-27.8
+ c-0.756,0-1.492-0.085-2.201-0.244c-0.708,0.159-1.444,0.244-2.2,0.244h-30.271c-3.453,0-6.61-1.776-8.425-4.61
+ c-0.791,0.505-1.595,0.995-2.412,1.471c-7.595,4.351-16.133,6.54-25.442,6.54c-11.384,0-21.145-3.183-29.042-9.469
+ c-1.529,3.569-5.072,6.068-9.198,6.068h-28.408c-5.523,0-10-4.478-10-10.001v-67.812c0-3.194-0.564-4.789-0.9-5.458
+ c-0.392-0.777-0.97-1.93-4.821-1.93c-4.724,0-5.983,1.728-6.896,3.675c-0.919,2.062-1.383,4.791-1.383,8.114V376
+ c0,5.523-4.477,10.001-10,10.001h-27.8c-5.523,0-10-4.478-10-10.001v-63.33c0-6.95-0.88-9.239-1.055-9.628
+ c-0.349-0.762-0.843-1.841-4.675-1.841c-5.697,0-6.798,1.676-7.151,2.329c-0.298,0.621-1.12,2.837-1.12,8.449V376
+ c0,5.523-4.477,10.001-10,10.001h-28.199c-5.523,0-10-4.478-10-10.001V269.8c0-5.522,4.477-10,10-10h26.999
+ c2.902,0,5.514,1.235,7.34,3.209c6.486-3.852,14.321-5.809,23.34-5.809c10.216,0,18.796,2.437,25.504,7.242
+ c0.185,0.133,0.368,0.272,0.545,0.418c1.322,1.092,2.566,2.262,3.73,3.506c2.438-2.188,5.07-4.048,7.884-5.571
+ c0.07-0.036,0.14-0.073,0.211-0.11c7.126-3.639,15.103-5.484,23.707-5.484c5.958,0,11.882,1.164,17.608,3.456
+ c6.131,2.448,11.667,6.673,16.449,12.554c1.573,1.945,2.946,4.052,4.116,6.313c0.941-1.602,1.974-3.131,3.103-4.586
+ C462.508,263.016,477.94,257,499.041,257c13.235,0,25.249,2.715,35.706,8.067c3.12,1.598,6.458,3.872,9.454,7.101v-39.569
+ c0-5.522,4.477-10,10-10h27.8c5.523,0,10,4.478,10,10v28.484c6.504-2.974,13.447-4.483,20.639-4.483
+ c7.865,0,15.192,1.418,21.774,4.218c7.009,3,12.832,7.627,17.329,13.761c2.014,2.758,3.63,5.599,4.846,8.499
+ c1.368-2.145,2.862-4.229,4.481-6.253c10.92-13.683,27.316-20.624,48.729-20.624c21.414,0,37.812,6.941,48.737,20.633
+ c0.225,0.278,0.444,0.562,0.665,0.843v-8.274c0-5.523,4.477-10,10-10h28.6c5.523,0,10,4.477,10,10v64.358
+ c0,6.407,0.92,8.881,1.203,9.484c0.409,0.88,1.098,2.354,5.816,2.354c6.371,0,8.746-2.222,10.299-5.57
+ c0.86-2.012,1.881-5.809,1.881-12.539v-58.088c0-5.523,4.477-10,10-10h28.201c1.719,0,3.338,0.434,4.749,1.198h2.85v-20.001
+ c0-5.522,4.478-10,10.001-10h27.6c5.522,0,10,4.478,10,10V260.6h7.198c5.523,0,10,4.477,10,10v19.602c0,5.523-4.477,10-10,10H920.4
+ v46.178c0.521,0.013,1.106,0.021,1.76,0.021c0.63,0,1.279-0.023,1.929-0.071c0.704-0.053,1.405-0.129,2.085-0.227
+ c0.475-0.068,0.952-0.103,1.427-0.103c2.388,0,4.717,0.856,6.547,2.442c2.192,1.899,3.451,4.658,3.451,7.558v20.8
+ c0,5.347-4.205,9.745-9.545,9.989l-13.179,0.602c-0.037,0.002-0.076,0.004-0.113,0.004c-1.198,0.042-2.364,0.062-3.501,0.062
+ c-14.403,0-24.539-3.26-30.987-9.963c-2.15-2.205-3.846-4.837-5.072-7.872V376c0,5.523-4.478,10.001-10,10.001H838.2
+ c-3.148,0-5.959-1.456-7.791-3.732c-2.405,1.436-4.804,2.577-7.188,3.416c-5.142,1.804-11.065,2.717-17.621,2.717
+ c-24.711,0-35.835-12.303-40.818-22.626c-0.51-1.045-0.984-2.142-1.422-3.292c-1.476,2.343-3.101,4.608-4.874,6.796
+ C747.562,382.761,731.181,389.6,709.799,389.6L709.799,389.6z M487.944,348.278c0.598,0.447,1.538,0.922,3.414,0.922
+ c4.033,0,7.665-1.15,11.099-3.517c1.935-1.333,2.882-4.174,3.318-7.126c-0.231,0.043-0.465,0.089-0.702,0.133l-6.347,1.172
+ c-6.723,1.191-9.018,2.316-9.562,2.634c-0.961,0.561-1.564,1.024-1.564,3.194C487.601,347.181,487.822,347.995,487.944,348.278
+ L487.944,348.278z M709.751,299.801c-6.414,0-9.15,2.51-10.819,4.697c-3.009,3.937-4.531,10.177-4.531,18.552
+ c0,8.386,1.529,14.651,4.544,18.623c1.671,2.205,4.405,4.728,10.807,4.728c6.375,0,9.085-2.51,10.732-4.697
+ c2.995-3.98,4.517-10.259,4.517-18.653c0-8.384-1.515-14.637-4.504-18.585C718.854,302.297,716.139,299.801,709.751,299.801
+ L709.751,299.801z M491.611,300.711c-0.264,0.336-0.562,0.826-0.854,1.529l7.135-0.875c3.8-0.479,5.996-0.97,7.181-1.304
+ c-1.357-0.335-3.556-0.662-6.974-0.662C493.944,299.399,492.062,300.24,491.611,300.711L491.611,300.711z"/>
+ <path fill="#1F1F1F" d="M582,232.6v50.641c4.02-6.2,8.67-10.52,13.96-12.971c5.28-2.449,10.851-3.67,16.681-3.67
+ c6.549,0,12.5,1.141,17.859,3.42c5.35,2.291,9.74,5.78,13.18,10.471c2.91,3.99,4.7,8.08,5.35,12.289
+ c0.65,4.201,0.971,11.07,0.971,20.601V376h-28.6v-64.87c0-5.739-0.971-10.37-2.9-13.89c-2.51-4.961-7.27-7.44-14.29-7.44
+ c-7.271,0-12.79,2.46-16.56,7.39c-3.771,4.92-5.65,11.951-5.65,21.08V376h-27.8V232.6H582 M910.4,240.6v30H927.6V290.2H910.4
+ v56.409c0,4.371,0.55,7.101,1.649,8.17c1.101,1.08,4.47,1.621,10.11,1.621c0.84,0,1.73-0.03,2.67-0.101
+ c0.939-0.069,1.859-0.17,2.77-0.3v20.8l-13.18,0.601c-1.083,0.037-2.135,0.056-3.161,0.056c-11.429,0-19.356-2.299-23.778-6.896
+ c-3.121-3.201-4.681-8.121-4.681-14.761v-65.6H868V270.6h14.8v-30H910.4 M709.8,266.2c18.3,0,31.94,5.62,40.92,16.87
+ c8.99,11.24,13.48,24.539,13.48,39.88c0,15.6-4.49,28.94-13.48,40.03c-8.979,11.08-22.62,16.619-40.92,16.619
+ s-31.94-5.539-40.92-16.619c-8.989-11.09-13.479-24.431-13.479-40.03c0-15.341,4.49-28.64,13.479-39.88
+ C677.859,271.82,691.5,266.2,709.8,266.2 M709.75,356.4c8.12,0,14.359-2.891,18.72-8.68c4.351-5.781,6.53-14.011,6.53-24.671
+ c0-10.659-2.18-18.87-6.53-24.62c-4.36-5.75-10.6-8.63-18.72-8.63c-8.13,0-14.38,2.88-18.77,8.63
+ c-4.391,5.75-6.58,13.961-6.58,24.62c0,10.66,2.189,18.89,6.58,24.671C695.37,353.51,701.62,356.4,709.75,356.4 M499.04,267
+ c11.69,0,22.069,2.32,31.149,6.971c9.07,4.639,13.61,13.369,13.61,26.18v48.76c0,3.38,0.07,7.48,0.2,12.29
+ c0.2,3.63,0.75,6.09,1.67,7.39c0.92,1.301,2.29,2.37,4.13,3.21v4.2h-30.271c-0.84-2.141-1.43-4.141-1.75-6.02
+ c-0.329-1.881-0.59-4.021-0.779-6.41c-3.859,4.17-8.311,7.72-13.34,10.65c-6.02,3.449-12.82,5.18-20.41,5.18
+ c-9.68,0-17.67-2.75-23.98-8.26c-6.31-5.5-9.47-13.301-9.47-23.4c0-13.1,5.08-22.57,15.23-28.44c5.56-3.19,13.75-5.47,24.55-6.84
+ l9.529-1.17c5.17-0.649,8.871-1.47,11.101-2.44c3.99-1.699,5.99-4.34,5.99-7.92c0-4.359-1.53-7.38-4.601-9.039
+ c-3.06-1.66-7.56-2.49-13.5-2.49c-6.66,0-11.379,1.619-14.14,4.869c-1.979,2.4-3.3,5.641-3.96,9.73h-26.8
+ c0.59-9.311,3.2-16.95,7.84-22.939C468.41,271.689,481.08,267,499.04,267 M491.359,359.2c6.07,0,11.66-1.761,16.771-5.28
+ c5.12-3.529,7.771-9.949,7.97-19.279V324.26c-1.779,1.11-3.58,2.01-5.39,2.69c-1.81,0.69-4.3,1.319-7.47,1.909l-6.33,1.17
+ c-5.93,1.051-10.189,2.32-12.77,3.82c-4.361,2.551-6.541,6.49-6.541,11.84c0,4.771,1.339,8.211,4.009,10.33
+ C484.279,358.141,487.529,359.2,491.359,359.2 M411.86,267.2c4.7,0,9.32,0.909,13.89,2.739c4.56,1.82,8.7,5.021,12.41,9.58
+ c3,3.711,5.02,8.271,6.06,13.67c0.65,3.58,0.98,8.82,0.98,15.73L445.01,376H416.6v-67.811c0-4.039-0.66-7.359-1.97-9.959
+ c-2.49-4.961-7.07-7.431-13.75-7.431c-7.73,0-13.07,3.19-16.02,9.58c-1.51,3.38-2.26,7.45-2.26,12.21V376h-27.8v-63.33
+ c0-6.311-0.65-10.9-1.95-13.76c-2.35-5.141-6.94-7.71-13.78-7.71c-7.95,0-13.29,2.569-16.02,7.71c-1.5,2.93-2.25,7.279-2.25,13.07
+ V376h-28.2V269.8h27v15.46c3.44-5.529,6.69-9.47,9.74-11.81c5.39-4.171,12.37-6.25,20.94-6.25c8.12,0,14.68,1.79,19.68,5.37
+ c4.02,3.32,7.08,7.58,9.15,12.779c3.65-6.24,8.18-10.83,13.59-13.76C398.44,268.66,404.82,267.2,411.86,267.2 M865.2,269.4V376h-27
+ v-14.96c-0.261,0.33-0.91,1.3-1.95,2.931c-1.04,1.619-2.28,3.049-3.71,4.289c-4.36,3.9-8.57,6.561-12.64,7.99
+ c-4.07,1.43-8.83,2.15-14.301,2.15c-15.74,0-26.35-5.66-31.81-16.971c-3.06-6.27-4.59-15.5-4.59-27.699V269.4h28.6v64.359
+ c0,6.07,0.71,10.641,2.14,13.711c2.53,5.42,7.49,8.129,14.881,8.129c9.47,0,15.959-3.85,19.459-11.56
+ c1.811-4.181,2.721-9.7,2.721-16.55V269.4H865.2 M582,212.6h-27.8c-11.046,0-20,8.954-20,20v21.182
+ C523.599,249.28,511.796,247,499.04,247c-20.979,0-37.309,5.431-48.668,16.161c-5.107-5.312-10.877-9.27-17.208-11.796
+ c-6.893-2.761-14.068-4.165-21.305-4.165c-10.198,0-19.703,2.213-28.252,6.576c-0.145,0.074-0.289,0.149-0.431,0.227
+ c-0.904,0.49-1.792,1.006-2.664,1.55c-8.252-5.543-18.415-8.353-30.233-8.353c-8.355,0-15.932,1.435-22.647,4.278
+ c-2.458-1.08-5.175-1.679-8.032-1.679h-27c-11.045,0-20,8.954-20,20V376c0,11.046,8.955,20,20,20h28.2
+ c7.177,0,13.472-3.781,17-9.459c3.528,5.678,9.823,9.459,17,9.459h27.8c7.177,0,13.471-3.781,17-9.459
+ c3.528,5.678,9.823,9.459,17,9.459h28.41c3.945,0,7.625-1.143,10.724-3.115c8.044,4.328,17.258,6.516,27.516,6.516
+ c9.591,0,18.534-1.975,26.644-5.875c2.891,1.591,6.19,2.475,9.636,2.475H549.8c0.743,0,1.478-0.04,2.2-0.119
+ c0.723,0.079,1.457,0.119,2.2,0.119H582c9.862,0,18.058-7.139,19.7-16.531c1.643,9.393,9.838,16.531,19.7,16.531H650
+ c6.725,0,12.675-3.318,16.3-8.408c11.611,7.979,26.173,12.008,43.5,12.008c22.084,0,39.678-6.547,52.395-19.475
+ c7.525,9.087,20.741,18.275,43.405,18.275c7.69,0,14.732-1.104,20.93-3.281c0.97-0.341,1.939-0.72,2.908-1.136
+ c2.646,1.292,5.62,2.017,8.763,2.017h27c5.679,0,10.805-2.367,14.445-6.168c7.947,5.119,18.379,7.624,31.613,7.624
+ c1.246,0,2.539-0.022,3.843-0.067c0.076-0.003,0.152-0.006,0.229-0.009l13.18-0.601c10.681-0.486,19.09-9.287,19.09-19.979V356
+ c0-5.798-2.516-11.311-6.896-15.108c-2.94-2.551-6.527-4.16-10.304-4.694v-26.191c9.72-1.362,17.199-9.711,17.199-19.806V270.6
+ c0-10.095-7.479-18.443-17.199-19.806V240.6c0-11.046-8.954-20-20-20H882.8c-11.046,0-20,8.954-20,20v8.801H837
+ c-9.677,0-17.747,6.871-19.601,16.001c-1.852-9.13-9.923-16.001-19.6-16.001h-28.6c-6.813,0-12.833,3.408-16.443,8.612
+ c-3.523-2.381-7.322-4.414-11.38-6.087c-9.217-3.799-19.841-5.726-31.577-5.726s-22.36,1.927-31.577,5.726
+ c-7.925,3.267-14.862,7.909-20.695,13.84c-5.208-6.167-11.636-10.911-19.153-14.131c-0.016-0.007-0.031-0.014-0.047-0.021
+ c-7.824-3.327-16.467-5.015-25.687-5.015c-3.604,0-7.156,0.315-10.641,0.943V232.6C602,221.554,593.046,212.6,582,212.6L582,212.6z
+ M709.75,336.4c-2.254,0-2.562-0.406-2.833-0.764c-0.598-0.787-2.517-3.982-2.517-12.587c0-8.573,1.895-11.722,2.476-12.482
+ c0.263-0.343,0.587-0.768,2.874-0.768c2.241,0,2.542,0.396,2.783,0.715c0.569,0.752,2.467,3.929,2.467,12.535
+ c0,8.638-1.922,11.862-2.511,12.645C712.255,336.006,711.958,336.4,709.75,336.4L709.75,336.4z"/>
+</g>
+<g>
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#FBE500" d="M293.5,388c-14.735,0-16.195-10.601-16.492-15.157
+ c-2.281,0.968-5.548,2.49-8.354,3.8C254.849,383.076,243.715,388,236.499,388c-25.961,0-44.166-21.61-49.72-41.423
+ c-0.496,1.275-1.103,2.539-1.847,3.778l-0.259,0.435l-0.314,0.393C176.217,361.363,147.782,362,147.5,362
+ c-13.223,0-22.925-3.37-28.833-10.014c-3.174-3.572-6.704-9.898-5.668-19.864c-0.076-13.164,4.078-39.976,7.319-50.778l1.604-5.345
+ h5.58H138.5h3.11l2.2,2.203c2.876,2.883,2.6,6.301,2.397,8.795c-0.186,2.297-0.532,6.568-0.213,15.227
+ c0.099,2.286,2.6,9.209,5.635,13.571c2.905-2.996,8.481-10.19,18.777-27.414c1.035-1.731,1.508-2.521,1.855-3.041l4.312-6.47
+ c-2.459-5.737-5.025-12.35-5.561-21.953L171,256.709V256.5c0-1.624,0.272-3.165,0.536-4.656c0.063-0.36,0.141-0.801,0.208-1.223
+ c-1.643-1.128-3.838-2.151-6.127-3.218c-2.111-0.98-4.292-1.997-6.398-3.256c-0.369-0.209-0.729-0.422-1.082-0.644
+ c0.54,1.213,0.862,2.522,0.862,3.996c0,3.947-4.782,14.335-8.793,22.354l-1.476,2.949l-3.169,0.907
+ c-4.74,1.354-14.83,1.837-22.691,1.837c-3.454,0-7.977-0.087-12.869-0.412v1.364c0,1.262,0.242,3.583,0.437,5.449
+ c0.242,2.332,0.392,3.825,0.392,5.05c0,9.626-4.898,16.854-13.795,20.355c-5.908,2.325-12.401,2.646-18.535,2.646
+ c-14.368,0-22.193-2.225-27.005-7.674c-4.93-5.588-4.942-12.66-4.958-20.851c-0.002-1.472-0.006-3.027-0.036-4.666
+ c-0.021-0.987,0.051-4.085,0.19-9.928c0.137-5.841,0.308-13.109,0.308-16.382v-21.002c-4.692-11.946-6.908-23.599-7.928-30.97
+ c-1.042-7.549,0.479-14.029,4.519-19.265c2.714-3.515,6.315-6.117,10.411-8.084v-3.68c0-4.226,0-8.548,0.348-12.964
+ c-0.274-0.091-0.552-0.181-0.833-0.272c-7.121-2.319-15.983-5.204-21.708-11.882C22.598,131.542,17,104.646,17,101.5
+ c0-9.415,5.693-15.501,14.501-15.501C40.835,85.999,46,94.573,46,100.5c0,2.351-0.814,5.752-2.543,12.424
+ c-0.538,2.081-1.261,4.873-1.453,5.927c0.13,5.004,3.026,8.388,5.463,10.36c3.112,2.516,7.279,4.158,11.751,4.679
+ C76.873,88.335,129.009,72,169.499,72c50.34,0,81.615,26.567,86.227,73.024C271.345,139.479,288.758,134,302.5,134
+ c10.265,0,22.501,4.945,22.501,28.5c0,26.976-14.824,65.562-47.938,90.953l-5.501,4.217l-4.637-5.153
+ c-6.05-6.723-13.757-10.396-24.253-11.562l-1.746-0.194c0.875,3.851,2.273,7.381,3.798,11.227
+ c1.421,3.591,2.943,7.431,4.067,11.781l0.006-0.036L259.498,278c6.913,9.213,14.501,33.549,14.501,46.5
+ c0,0.404-0.011,0.826-0.036,1.263c3.446-4.232,8.916-6.763,15.537-6.763c13.398,0,19.501,8.553,19.501,16.501
+ c0,3.262-1.63,6.604-4.312,11.722c-0.3,0.573-0.668,1.277-1.004,1.936c0.398,0.487,0.848,1.01,1.231,1.457
+ c3.22,3.751,8.084,9.422,8.084,16.884C313.001,379.377,304.8,388,293.5,388L293.5,388z M246.439,356.083
+ c-0.28,0.348-0.395,0.733-0.437,1.229C246.153,356.929,246.298,356.518,246.439,356.083L246.439,356.083z M270.056,335.941
+ c-1.21,1.355-2.773,2.583-4.78,3.574c1.535-0.104,3.14-0.207,4.789-0.296c-0.04-0.548-0.065-1.123-0.065-1.721
+ C270,336.973,270.019,336.451,270.056,335.941L270.056,335.941z M219.021,317.979c0.093,0.007,0.194,0.013,0.302,0.018
+ c0.586-0.089,1.986-0.42,2.938-0.646c0.477-0.114,0.957-0.226,1.438-0.338c-1.721,0.032-3.758,0.146-4.62,0.547
+ C219.059,317.655,219.036,317.792,219.021,317.979L219.021,317.979z M172.531,125.258c8.011,5.611,15.058,13.592,20.572,20.675
+ c2.554-14.033,4.928-23.67,8.842-29.011c-5.7,1.628-9.894,5.061-12.692,7.353c-2.444,1.999-4.553,3.726-7.753,3.726
+ c-2.045,0-3.8-0.7-6.71-1.858C174.111,125.874,173.352,125.572,172.531,125.258L172.531,125.258z"/>
+ <path fill="#1F1F1F" d="M169.5,79.5c36,0,75,15,79,69h-3c-5-28-16-40-37-40c-16,0-25,12-27,12s-12.5-6-23-6c-21,0-43,12-42,42
+ l-55,11c0-6,0-12,1-18c-7-3-19-5-25-12c-7.5-8.83-13-34-13-36c0-6,3-8,7-8c5,0,7,5,7,7c0,3-4,16-4,18
+ c0,13.355,12.737,23.069,27.8,23.069c0.728,0,1.463-0.023,2.2-0.069C79.5,93.5,134.5,79.5,169.5,79.5 M213.538,119.277
+ c18.366,0.001,22.213,25.926,26.962,39.223c17-6,44-17,62-17c13,0,15,11,15,21c0,26-15,62-45,85c-9-10-20-13-29-14
+ c8.5-20.5,10.83-49,1-49c-6,0-3,11-4,14c-2,11-8,32-8,34c0,18,10.5,26.5,10.5,44.5c0,3-4.5,22.5-7.5,33.5c-3-1-8-1-10-1
+ c-6,0-14,0-14,9c0,6,5,7,8,7c2,0,8-2,11-2c2,0,6,1,6,5c0,10-20,4-20,16c0,6,3,7,6,7c2,0,18.01-9.73,21-10
+ c0.204-0.019,0.396-0.027,0.579-0.027c4.739,0,2.421,6.027,2.421,6.027c-8.83,3.5-8,9-8,12c0,5,3,8,6,8c10,0,11-19,11-20
+ c6,0,14-1,22-1c1-3,0-6,0-9c0-8,6-11,12-11c8,0,12,4,12,9c0,3-6,12-6,14c0,4,10,10,10,18s-5,13-12,13c-16,0-3-16-15-16
+ c-4,0-32,16-42,16c-27,0-44-28-44-46v-22c-1-7-2-16-4-23c-3-12-9.17-18.17-10-33c0-3,2-8,0-10c-4-4-10.5-5.83-15.5-8.83
+ c-9-5-11.5-16.17-13.5-21.17c-1-4-3-7-6-11c-7,3-6,9-6,13c0,18,14,25,14,29c0,2-5,13-8,19c-3.04,0.868-11.171,1.549-20.627,1.549
+ c-12.319,0-26.887-1.154-35.373-4.549c-29-10-38.26-46.189-41-66C43.67,177,65.83,174.17,84,172c12.6-1.5,31.5-4.5,45.5-6.5
+ c0,0,1,0,1-2c0-3-2-11-2-13v-6c0-10,12.5-19,24-19c20.17,0,40,33,45,39c3.5-20.17,6.83-43.83,13-45
+ C211.555,119.349,212.566,119.277,213.538,119.277 M54.5,250.5c10.601,13.491,30.487,26.054,46.237,26.054
+ c0.594,0,1.182-0.018,1.763-0.054c0,3,0.83,8.5,0.83,10.5c0,15-15.83,15.5-24.83,15.5c-27,0-24.17-8.17-24.5-25.83
+ C53.96,274.67,54.5,256.5,54.5,250.5 M253.5,282.5c6,8,13,31,13,42c0,8-6,10-14,10c-7,0-7-9-7-13
+ C245.5,318.5,251.5,295.5,253.5,282.5 M138.5,283.5c1,1-0.59,3.01,0,19c0.17,4.5,4.83,17.17,11,22
+ c0.394,0.31,0.843,0.454,1.342,0.454c7.473,0,25.783-32.642,27.658-35.454l3,41c0,5,0,11-3,16c-4,5-22,8-31,8c-15,0-29-5-27-22
+ c-0.17-12.17,4-39,7-49H138.5 M169.5,64.5c-22.887,0-47.102,5.267-66.436,14.451c-22.318,10.602-38.762,26.385-48.174,46.081
+ c-2.892-1.323-4.917-3.379-5.317-5.69c0.286-1.215,0.786-3.146,1.146-4.539c1.934-7.468,2.781-11.077,2.781-14.302
+ c0-10.625-8.84-22-22-22c-12.953,0-22,9.458-22,23c0,5.403,4.153,19.196,4.33,19.781c3.642,12.041,7.645,20.522,12.238,25.93
+ l0.022,0.026l0.022,0.025c5.736,6.693,13.632,10.188,20.458,12.587c-0.062,2.329-0.068,4.619-0.069,6.88
+ c-3.33,2.099-6.335,4.699-8.847,7.953c-3.655,4.736-7.666,12.895-6.012,24.87c1.152,8.332,3.418,19.828,7.859,31.554V250.5
+ c0,3.184-0.17,10.403-0.307,16.204c-0.159,6.711-0.212,9.158-0.19,10.267c0.029,1.535,0.031,3.051,0.034,4.517
+ c0.015,8.896,0.031,18.094,6.835,25.802C53.794,316.263,66.235,317.5,78.5,317.5c6.544,0,14.191-0.376,21.283-3.167
+ c2.781-1.094,5.281-2.484,7.479-4.137c-1.056,8.09-1.759,15.938-1.766,21.561c-1.177,12.445,3.43,20.561,7.567,25.214
+ c7.394,8.313,18.98,12.529,34.438,12.529c5.904,0,13.821-0.954,20.661-2.489c6.875-1.543,12.2-3.518,16.228-6.052
+ c2.301,4.51,5.13,8.851,8.412,12.832C204.34,387.79,219.86,395.5,236.5,395.5c8.772,0,20.174-4.999,35.323-12.061
+ c0.02-0.009,0.04-0.019,0.06-0.028c0.447,0.926,0.981,1.858,1.621,2.783c2.932,4.245,8.782,9.306,19.996,9.306
+ c7.6,0,14.536-2.912,19.53-8.201c4.817-5.1,7.47-12.132,7.47-19.799c0-8.513-4.28-14.937-7.848-19.338
+ c2.113-4.158,3.848-8.218,3.848-12.662c0-11.927-9.274-24-27-24c-3.298,0-6.405,0.485-9.255,1.394
+ c-2.486-13.581-8.349-30.866-14.745-39.394l-9.87-13.16c-0.968-3.413-2.118-6.49-3.218-9.299c3.468,1.514,6.374,3.645,8.938,6.493
+ l9.274,10.305l11.002-8.435C316.77,232.461,332.5,191.32,332.5,162.5c0-5.601-0.454-13.9-4.378-21.287
+ c-5.04-9.488-14.14-14.713-25.622-14.713c-12.295,0-26.812,3.88-40.602,8.463c-1.801-9.966-4.853-19.031-9.12-27.063
+ c-5.635-10.608-13.4-19.48-23.079-26.371C214.048,70.389,193.232,64.5,169.5,64.5L169.5,64.5z M153.054,279.371l0.912-0.261
+ l2.951-5.902c1.771-3.542,3.868-8.042,5.472-11.744c0.449-1.035,0.853-1.989,1.216-2.875c0.6,8.093,2.501,14.303,4.513,19.443
+ l-2.098,3.147c-0.447,0.67-0.922,1.462-2.05,3.349c-4.393,7.349-7.831,12.719-10.507,16.642c-0.255-7.688,0.052-11.492,0.22-13.565
+ C153.833,285.754,154.081,282.688,153.054,279.371L153.054,279.371z"/>
+</g>
+<g>
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#23A9FF" d="M445.01,377.502H416.6c-0.828,0-1.501-0.673-1.501-1.501v-67.812
+ c0-3.775-0.607-6.899-1.808-9.283c-2.233-4.446-6.292-6.605-12.412-6.605c-7.158,0-11.952,2.849-14.657,8.708
+ c-1.406,3.146-2.121,7.051-2.121,11.583v63.41c0,0.828-0.673,1.501-1.501,1.501h-27.8c-0.828,0-1.501-0.673-1.501-1.501v-63.33
+ c0-6.069-0.609-10.49-1.816-13.142c-2.1-4.593-6.162-6.828-12.414-6.828c-7.419,0-12.225,2.26-14.695,6.912
+ c-1.373,2.681-2.073,6.848-2.073,12.368v64.02c0,0.828-0.673,1.501-1.501,1.501h-28.202c-0.828,0-1.501-0.673-1.501-1.501V269.8
+ c0-0.828,0.673-1.501,1.501-1.501h27.001c0.828,0,1.501,0.673,1.501,1.501v10.492c2.533-3.545,4.988-6.237,7.326-8.03
+ c5.624-4.353,12.977-6.562,21.853-6.562c8.402,0,15.317,1.902,20.551,5.65c0.03,0.02,0.057,0.04,0.082,0.063
+ c3.509,2.895,6.334,6.504,8.422,10.749c3.508-5.25,7.753-9.242,12.649-11.891c5.95-3.04,12.626-4.572,19.875-4.572
+ c4.873,0,9.735,0.959,14.446,2.849c4.774,1.902,9.153,5.276,13.018,10.025c3.147,3.89,5.287,8.71,6.37,14.331
+ c0.668,3.688,1.007,9.069,1.007,16.015l-0.189,67.085C446.507,376.831,445.836,377.502,445.01,377.502L445.01,377.502z"/>
+ <path fill="#1F1F1F" d="M411.86,267.2c4.7,0,9.32,0.909,13.89,2.739c4.56,1.82,8.7,5.021,12.41,9.58c3,3.711,5.02,8.271,6.06,13.67
+ c0.65,3.58,0.98,8.82,0.98,15.73L445.01,376H416.6v-67.811c0-4.039-0.66-7.359-1.97-9.959c-2.49-4.961-7.07-7.431-13.75-7.431
+ c-7.73,0-13.07,3.19-16.02,9.58c-1.51,3.38-2.26,7.45-2.26,12.21V376h-27.8v-63.33c0-6.311-0.65-10.9-1.95-13.76
+ c-2.35-5.141-6.94-7.71-13.78-7.71c-7.95,0-13.29,2.569-16.02,7.71c-1.5,2.93-2.25,7.279-2.25,13.07V376h-28.2V269.8h27v15.46
+ c3.44-5.529,6.69-9.47,9.74-11.81c5.39-4.171,12.37-6.25,20.94-6.25c8.12,0,14.68,1.79,19.68,5.37c4.02,3.32,7.08,7.58,9.15,12.779
+ c3.65-6.24,8.18-10.83,13.59-13.76C398.44,268.66,404.82,267.2,411.86,267.2 M411.86,264.2c-7.485,0-14.391,1.587-20.523,4.718
+ c-0.022,0.011-0.043,0.022-0.065,0.034c-4.465,2.418-8.405,5.893-11.758,10.363c-2.029-3.501-4.587-6.534-7.643-9.058
+ c-0.053-0.045-0.108-0.087-0.164-0.127c-5.497-3.936-12.706-5.931-21.427-5.931c-9.215,0-16.878,2.313-22.776,6.877
+ c-1.614,1.238-3.242,2.832-4.904,4.808V269.8c0-1.657-1.343-3-3-3h-27c-1.657,0-3,1.343-3,3V376c0,1.657,1.343,3,3,3h28.2
+ c1.657,0,3-1.343,3-3v-64.02c0-5.276,0.646-9.214,1.92-11.703c2.165-4.076,6.539-6.077,13.35-6.077
+ c5.682,0,9.194,1.893,11.052,5.957c0.764,1.682,1.678,5.222,1.678,12.513V376c0,1.657,1.343,3,3,3h27.8c1.657,0,3-1.343,3-3v-63.41
+ c0-4.321,0.672-8.018,1.999-10.986c2.453-5.313,6.678-7.804,13.281-7.804c5.574,0,9.091,1.835,11.069,5.776
+ c1.097,2.176,1.651,5.072,1.651,8.613V376c0,1.657,1.343,3,3,3h28.41c1.653,0,2.996-1.338,3-2.991l0.19-67.08
+ c0-7.044-0.346-12.517-1.028-16.275c-1.136-5.897-3.381-10.94-6.679-15.02c-4.031-4.955-8.615-8.479-13.631-10.48
+ C421.97,265.194,416.922,264.2,411.86,264.2L411.86,264.2z"/>
+</g>
+<g>
+ <g>
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#23A9FF" d="M170,62c10.33,0,14-3.67,28.67-13
+ c22.66,14.21-2.84,34.11,28.33,54c-7.33-1.65-15.33-4.33-21,1.5c-8.5,0-8.83-6.97-14.5-15.5c-8.5,5.68,29.5,34.67-22.67,42.26
+ c-28.03,4.09-8.5-17.05-36.83-34.1c0,0-2.83,0-5.67,2.84c2.84,2.84,15.17,12,15.17,12c-15-3.67-25.67-2.89-28.5,17
+ c-2.83-5.68-5.67-12.04-5.67-20.56c0-14.21,6.67-59.11,29.34-59.11C142.33,49.33,159.67,62,170,62z"/>
+ <path fill="none" stroke="#000000" stroke-width="3" stroke-linecap="round" stroke-linejoin="round" d="M170,62
+ c10.33,0,14-3.67,28.67-13c22.66,14.21-2.84,34.11,28.33,54c-7.33-1.65-15.33-4.33-21,1.5c-8.5,0-8.83-6.97-14.5-15.5
+ c-8.5,5.68,29.5,34.67-22.67,42.26c-28.03,4.09-8.5-17.05-36.83-34.1c0,0-2.83,0-5.67,2.84c2.84,2.84,15.17,12,15.17,12
+ c-15-3.67-25.67-2.89-28.5,17c-2.83-5.68-5.67-12.04-5.67-20.56c0-14.21,6.67-59.11,29.34-59.11C142.33,49.33,159.67,62,170,62z"
+ />
+ </g>
+ <defs>
+ <filter id="Adobe_OpacityMaskFilter" filterUnits="userSpaceOnUse" x="105.83" y="47.5" width="122.67" height="85.774">
+
+ <feColorMatrix type="matrix" values="-1 0 0 0 1 0 -1 0 0 1 0 0 -1 0 1 0 0 0 1 0" color-interpolation-filters="sRGB" result="source"/>
+ </filter>
+ </defs>
+ <mask maskUnits="userSpaceOnUse" x="105.83" y="47.5" width="122.67" height="85.774" id="SVGID_1_">
+ <g filter="url(#Adobe_OpacityMaskFilter)">
+
+ <image overflow="visible" width="128" height="91" xlink:href="data:image/jpeg;base64,/9j/4AAQSkZJRgABAgEASABIAAD/7AARRHVja3kAAQAEAAAAHgAA/+4AIUFkb2JlAGTAAAAAAQMA
+EAMCAwYAAAItAAADjQAABP//2wCEABALCwsMCxAMDBAXDw0PFxsUEBAUGx8XFxcXFx8eFxoaGhoX
+Hh4jJSclIx4vLzMzLy9AQEBAQEBAQEBAQEBAQEABEQ8PERMRFRISFRQRFBEUGhQWFhQaJhoaHBoa
+JjAjHh4eHiMwKy4nJycuKzU1MDA1NUBAP0BAQEBAQEBAQEBAQP/CABEIAFsAgAMBIgACEQEDEQH/
+xACNAAEAAgMBAQAAAAAAAAAAAAAABQcBBAYCAwEBAAAAAAAAAAAAAAAAAAAAABAAAgICAQQCAwEB
+AAAAAAAAAwQBAgUGABAgERMwElAxFEAWEQABAwIEBAUEAwAAAAAAAAABABECIQMgMUESEFFhIjBx
+gTIEQJGhQlJiFBIBAAAAAAAAAAAAAAAAAAAAUP/aAAwDAQACEQMRAAAAr8GZad70qyHvKHKfdZzp
+qvewam91PYlQa1oVofICXiLCOv38ZGMj56MkITakR49hqVDclRECD6XBVlxm4AAAA8/M91ZavGlZ
+M4J+26rtU9cl0VaFjyNMWmSrGQDU4GxqyO7ia/1Dai/WCc7ist024jWHrrOR2y8fpEypljyZr7qq
+1IIAD15AAHV9PVosuF44b+gAAH//2gAIAQIAAQUA/If/2gAIAQMAAQUA/If/2gAIAQEAAQUA6Vra
+8p646zB9UdHVhRha3apiGmYcQOpbsiJmdX1z7wrjABpdIF4yWtLM1yulmFLGNdXn0m4tjHWbYXTJ
+mVsCAQ9hwI7hZBZc/XXcf/a5i0qLg6kCMkHwqpuf80n5BhVQ8oKlI5kBQRfZQ1Fkeuk42KirERHw
+sR5Dt8eMl0WH7T60rAVfiJHmm8LTRnpgQ+7JYwfrW+C1orA2wFn983LGwwC1ZpbmoBm761fqEl4H
+RzeFV3sdmAOVifPbkq2sshkzY3Jr5gVxZnJAJTKgHcn65pcxDILR6n2xUFsaYTFw+aYxjGGyg3Qd
+haxYe5qSIwNgbENjItsW9pOTMzzVmKhZYz1FlsptbbNyZBonLEtfml5a4yhJBB9bT4ru9qyLsRPI
+D5R+5R9cWzKzuEdqZfpctKRk80EI9izH9pe215t2RMxOC2iFqj3FX6s7utTju72vDuYccn/L/9oA
+CAECAgY/AEP/2gAIAQMCBj8AQ//aAAgBAQEGPwDgIxBJOQCEiNoK3Rr5hbb0DHrpi3CJjHRNcHbz
+wgDM5KN67F5SqgNoTGIR7AXRn8an9dE1y1KmoDr2S+xQFu0WOpDKNz5A3S6oR2gKXbop2pfqfxgB
+IeMD+VFg1MDSDqsQvYFSITRDcJPyUm/bP0wRuSFZVKAGnhS8l6Hjbt/ykAoUZh4ch0UbrasTxthn
+EaqI6eDukWATQkCeE2FRUIxkGILHgZaBgojojM6I/FJ7oljyHqgYyBfFIRzZXPjXpkwlIygZF8zU
+VKBJGSkDII3LWevCXmFGuilEkKV22wm+aEZyJtPXookF3GGQ6IfIt0lAu4Ww16omdwsdAm3FVUnN
+XBW4yZgpRslov7iu+bruX+acssn5ISGuAkqbYRJ2BoULYNDngt3HYOx9VGunF5FSAkEbcC4epxVw
+OMwo27p2kc1W4PumFwP5oi05KO+TROg+m//Z" transform="matrix(1 0 0 1 103 45)">
+ </image>
+ </g>
+ </mask>
+ <g mask="url(#SVGID_1_)">
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#043C96" d="M170,62c10.33,0,14-3.67,28.67-13
+ c22.66,14.21-2.84,34.11,28.33,54c-7.33-1.65-15.33-4.33-21,1.5c-8.5,0-8.83-6.97-14.5-15.5c-8.5,5.68,29.5,34.67-22.67,42.26
+ c-28.03,4.09-8.5-17.05-36.83-34.1c0,0-2.83,0-5.67,2.84c2.84,2.84,15.17,12,15.17,12c-15-3.67-25.67-2.89-28.5,17
+ c-2.83-5.68-5.67-12.04-5.67-20.56c0-14.21,6.67-59.11,29.34-59.11C142.33,49.33,159.67,62,170,62z"/>
+ <path fill="none" stroke="#043C96" stroke-width="3" stroke-linecap="round" stroke-linejoin="round" d="M170,62
+ c10.33,0,14-3.67,28.67-13c22.66,14.21-2.84,34.11,28.33,54c-7.33-1.65-15.33-4.33-21,1.5c-8.5,0-8.83-6.97-14.5-15.5
+ c-8.5,5.68,29.5,34.67-22.67,42.26c-28.03,4.09-8.5-17.05-36.83-34.1c0,0-2.83,0-5.67,2.84c2.84,2.84,15.17,12,15.17,12
+ c-15-3.67-25.67-2.89-28.5,17c-2.83-5.68-5.67-12.04-5.67-20.56c0-14.21,6.67-59.11,29.34-59.11C142.33,49.33,159.67,62,170,62z"
+ />
+ </g>
+</g>
+<g>
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#FBE500" d="M293.5,382c-9.998,0-10.315-5.942-10.546-10.279
+ c-0.217-4.07-0.465-5.721-4.453-5.721c-1.218,0-7.149,2.766-12.382,5.203C255.8,376.014,242.957,382,236.5,382
+ c-12.534,0-24.353-5.965-33.282-16.796C195.682,356.062,191,344.297,191,334.499v-21.89c-0.17-1.201-0.341-2.459-0.518-3.752
+ c-0.845-6.225-1.805-13.276-3.424-18.945c-1.138-4.55-2.757-8.294-4.324-11.914c-2.56-5.912-5.206-12.029-5.732-21.414
+ c-0.002-1.18,0.212-2.402,0.442-3.695c0.355-2.016,0.799-4.522-0.004-5.328c-2.376-2.377-5.892-4.014-9.292-5.598
+ c-1.994-0.93-4.056-1.889-5.919-3.005c-8.018-4.455-11.089-13.294-13.123-19.146c-0.37-1.066-0.69-1.987-0.997-2.755l-0.038-0.095
+ l-0.025-0.1c-0.816-3.267-2.352-5.857-5.008-9.474c-4.247,2.344-4.152,6.092-4.06,9.727c0.013,0.481,0.023,0.944,0.023,1.384
+ c0,11.657,6.152,18.462,10.225,22.965c2.191,2.423,3.775,4.175,3.775,6.034c0,3.166-8.077,19.509-8.159,19.671l-0.296,0.592
+ l-0.633,0.181c-3.363,0.961-11.819,1.606-21.042,1.606c-7.303,0-25.421-0.454-35.926-4.656
+ c-30.922-10.66-39.625-50.538-41.929-67.187c-0.814-5.892,0.305-10.864,3.325-14.776c6.96-9.015,22.775-10.902,35.482-12.418
+ c8.487-1.01,19.755-2.69,30.65-4.316c5.071-0.757,10.019-1.493,14.48-2.133c0.025-0.116,0.048-0.296,0.048-0.562
+ c0-1.51-0.598-4.632-1.125-7.385c-0.542-2.835-0.875-4.625-0.875-5.616v-6.001c0-11.356,13.95-20.5,25.5-20.5
+ c17.761,0,34.676,23.646,42.804,35.009c0.467,0.654,0.904,1.262,1.304,1.819c0.164-0.953,0.326-1.91,0.488-2.869
+ c4.085-24.071,7.006-38.771,13.125-39.933c1.174-0.168,2.268-0.248,3.317-0.248c16.308,0,21.873,18.76,25.937,32.459
+ c0.671,2.254,1.311,4.413,1.952,6.341c2.131-0.759,4.403-1.588,6.779-2.457C264.544,148.163,286.92,140,302.5,140
+ c16.501,0,16.501,16.934,16.501,22.5c0,25.503-14.097,62.045-45.589,86.19l-1.1,0.843l-0.928-1.03
+ c-6.994-7.771-16.168-12.191-28.05-13.513l-1.984-0.221l0.764-1.845c7.093-17.106,9.554-38.674,5.162-45.25
+ c-0.763-1.145-1.647-1.677-2.776-1.677c-0.789,0-1.146,0.278-1.346,0.486c-1.222,1.269-1.085,4.924-0.984,7.593
+ c0.074,1.938,0.139,3.62-0.208,4.779c-1.132,6.178-3.464,15.332-5.345,22.691c-1.271,4.979-2.585,10.13-2.617,10.963
+ c0,8.704,2.499,15.01,5.145,21.688c2.633,6.646,5.355,13.515,5.355,22.801c0,3.303-4.705,23.461-7.551,33.896l-0.417,1.529
+ l-1.504-0.501C232.255,311,227.348,311,225.499,311c-7.319,0-12.5,0.539-12.5,7.499c0,4.545,3.536,5.5,6.501,5.5
+ c0.724,0,2.461-0.41,4.142-0.808c2.474-0.585,5.031-1.19,6.857-1.19c3.014,0,7.5,1.731,7.5,6.5c0,5.946-5.555,7.321-10.456,8.535
+ c-5.938,1.47-9.543,2.707-9.543,7.465c0,5.075,2.224,5.5,4.5,5.5c0.845-0.146,5.368-2.56,8.67-4.322
+ c6.417-3.424,10.441-5.515,12.195-5.673c0.25-0.022,0.488-0.033,0.711-0.033c2.091,0,3.172,0.936,3.71,1.721
+ c1.59,2.315,0.269,5.939,0.114,6.346l-0.238,0.614l-0.61,0.241c-7.2,2.854-7.12,6.903-7.063,9.859
+ c0.006,0.263,0.011,0.511,0.011,0.746c0,4.068,2.289,6.5,4.499,6.5c8.643,0,9.501-18.314,9.501-18.5v-1.499h1.5
+ c2.734,0,5.946-0.217,9.348-0.444c3.719-0.248,7.553-0.507,11.48-0.551c0.231-1.382,0.072-2.827-0.097-4.339
+ c-0.113-1.024-0.231-2.083-0.231-3.166c0-9.228,7.274-12.5,13.502-12.5c9.963,0,13.5,5.655,13.5,10.5
+ c0,1.88-1.435,4.758-3.625,8.935c-0.976,1.864-2.313,4.413-2.376,5.091c0,1.074,1.71,3.068,3.363,4.997
+ c2.957,3.445,6.636,7.734,6.636,12.976C306.999,376.174,301.574,382,293.5,382L293.5,382z"/>
+ <g>
+ <path fill="#1F1F1F" d="M213.538,119.277c18.366,0.001,22.213,25.926,26.962,39.223c17-6,44-17,62-17c13,0,15,11,15,21
+ c0,26-15,62-45,85c-9-10-20-13-29-14c8.5-20.5,10.83-49,1-49c-6,0-3,11-4,14c-2,11-8,32-8,34c0,18,10.5,26.5,10.5,44.5
+ c0,3-4.5,22.5-7.5,33.5c-3-1-8-1-10-1c-6,0-14,0-14,9c0,6,5,7,8,7c2,0,8-2,11-2c2,0,6,1,6,5c0,10-20,4-20,16c0,6,3,7,6,7
+ c2,0,18.01-9.73,21-10c0.204-0.019,0.396-0.027,0.579-0.027c4.739,0,2.421,6.027,2.421,6.027c-8.83,3.5-8,9-8,12c0,5,3,8,6,8
+ c10,0,11-19,11-20c6,0,14-1,22-1c1-3,0-6,0-9c0-8,6-11,12-11c8,0,12,4,12,9c0,3-6,12-6,14c0,4,10,10,10,18s-5,13-12,13
+ c-16,0-3-16-15-16c-4,0-32,16-42,16c-27,0-44-28-44-46v-22c-1-7-2-16-4-23c-3-12-9.17-18.17-10-33c0-3,2-8,0-10
+ c-4-4-10.5-5.83-15.5-8.83c-9-5-11.5-16.17-13.5-21.17c-1-4-3-7-6-11c-7,3-6,9-6,13c0,18,14,25,14,29c0,2-5,13-8,19
+ c-3.04,0.868-11.171,1.549-20.627,1.549c-12.319,0-26.887-1.154-35.373-4.549c-29-10-38.26-46.189-41-66
+ C43.67,177,65.83,174.17,84,172c12.6-1.5,31.5-4.5,45.5-6.5c0,0,1,0,1-2c0-3-2-11-2-13v-6c0-10,12.5-19,24-19c20.17,0,40,33,45,39
+ c3.5-20.17,6.83-43.83,13-45C211.555,119.349,212.566,119.277,213.538,119.277 M213.538,116.277L213.538,116.277
+ c-1.121,0-2.285,0.085-3.462,0.253l-0.067,0.009l-0.067,0.013c-7.154,1.356-10.092,16.252-14.208,40.478
+ c-8.547-11.923-25.273-34.53-43.232-34.53c-6.25,0-12.861,2.322-18.139,6.37c-5.631,4.32-8.861,10.017-8.861,15.63v6
+ c0,1.128,0.326,2.887,0.902,5.898c0.415,2.168,0.916,4.785,1.058,6.364c-4.108,0.593-8.54,1.254-13.201,1.949
+ c-10.889,1.624-22.148,3.302-30.614,4.31c-12.988,1.551-29.15,3.481-36.493,12.993c-3.275,4.243-4.495,9.591-3.625,15.896
+ c1.349,9.753,4.34,24.19,10.932,37.593c7.76,15.777,18.523,26.143,31.994,30.81c10.756,4.273,29.043,4.736,36.418,4.736
+ c9.348,0,17.968-0.669,21.452-1.664l1.269-0.362l0.59-1.181c0.34-0.68,8.317-16.676,8.317-20.342c0-2.437-1.747-4.369-4.165-7.043
+ c-3.916-4.332-9.835-10.879-9.835-21.957c0-0.452-0.012-0.929-0.024-1.423c-0.087-3.454,0.041-5.904,2.188-7.644
+ c2.064,2.912,3.25,5.088,3.926,7.794l0.05,0.197l0.075,0.189c0.294,0.734,0.609,1.641,0.973,2.689
+ c1.976,5.687,5.281,15.197,13.81,19.963c1.919,1.147,4.002,2.118,6.018,3.057c3.399,1.584,6.611,3.08,8.799,5.234
+ c0.252,0.677-0.136,2.876-0.347,4.069c-0.23,1.3-0.467,2.645-0.467,3.873v0.084l0.005,0.084c0.54,9.651,3.24,15.891,5.851,21.924
+ c1.614,3.729,3.138,7.252,4.234,11.636l0.012,0.049l0.014,0.048c1.589,5.56,2.54,12.55,3.378,18.716
+ c0.172,1.267,0.34,2.497,0.507,3.673V334.5c0,10.129,4.813,22.26,12.56,31.658c9.218,11.183,21.45,17.342,34.44,17.342
+ c6.791,0,19.8-6.064,30.254-10.938c4.641-2.163,10.408-4.851,11.819-5.062c2.478,0.006,2.669,0.32,2.882,4.301
+ c0.219,4.089,0.626,11.699,12.044,11.699c8.832,0,15-6.579,15-16c0-5.797-3.88-10.319-6.997-13.953
+ c-1.082-1.262-2.686-3.131-2.97-3.964c0.292-0.864,1.411-2.999,2.171-4.449c2.362-4.507,3.796-7.404,3.796-9.634
+ c0-5.973-4.638-12-15-12c-9.112,0-15,5.495-15,14c0,1.166,0.123,2.267,0.241,3.331c0.107,0.968,0.207,1.864,0.204,2.7
+ c-3.537,0.083-7.038,0.317-10.199,0.529c-3.374,0.226-6.562,0.439-9.246,0.439h-2.961l-0.039,2.989
+ c-0.035,2.644-1.656,17.011-8,17.011c-1.21,0-3-1.589-3-5c0-0.244-0.005-0.503-0.01-0.775c-0.057-2.933-0.117-5.966,6.116-8.436
+ l1.223-0.484l0.472-1.228c0.302-0.785,1.707-4.846-0.276-7.733c-0.608-0.886-2.06-2.371-4.945-2.371
+ c-0.274,0-0.561,0.014-0.851,0.04c-1.974,0.178-5.405,1.917-12.763,5.842c-2.98,1.59-7.018,3.744-8.235,4.145
+ c-1.546-0.011-2.731-0.216-2.731-3.999c0-3.57,2.432-4.528,8.404-6.008c4.894-1.212,11.596-2.872,11.596-9.992
+ c0-5.252-4.527-8-9-8c-2.002,0-4.647,0.626-7.205,1.231c-1.293,0.307-3.246,0.769-3.795,0.769c-5,0-5-2.906-5-4
+ c0-5.094,2.882-6,11-6c1.611,0,6.513,0,9.051,0.846l3.009,1.003l0.834-3.06C240.998,301.743,246,280.698,246,277
+ c0-9.572-2.776-16.579-5.461-23.355c-2.583-6.521-5.024-12.68-5.039-21.068c0.119-1.052,1.42-6.151,2.57-10.657
+ c1.876-7.352,4.206-16.483,5.351-22.711c0.392-1.379,0.328-3.073,0.248-5.188c-0.054-1.437-0.219-5.81,0.57-6.5c0,0,0,0,0.001,0
+ c0.011,0,0.1-0.021,0.261-0.021c0.299,0,0.854,0,1.528,1.008c3.675,5.502,2.161,25.852-5.299,43.842l-1.53,3.69l3.97,0.44
+ c11.498,1.277,20.363,5.538,27.101,13.025l1.855,2.061l2.2-1.687c14.329-10.985,26.298-25.655,34.612-42.423
+ c7.457-15.037,11.562-31.003,11.562-44.958c0-5.936,0-24-18-24c-15.847,0-37.457,7.883-54.821,14.218
+ c-1.838,0.67-3.611,1.317-5.304,1.927c-0.479-1.517-0.963-3.148-1.464-4.836C236.714,135.658,230.964,116.277,213.538,116.277
+ L213.538,116.277z"/>
+ </g>
+</g>
+<g>
+ <g>
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#FBE500" d="M240.5,158.5c-5-14-9-42-30-39c-6.17,1.17-9.5,24.83-13,45
+ c-5-6-24.83-39-45-39c-11.5,0-24,9-24,19v6c0,2,2,10,2,13c0,2-1,2-1,2c-14,2-32.9,5-45.5,6.5c-18.17,2.17-40.33,5-37.5,25.5
+ c2.74,19.811,12,56,41,66c15,6,49,5,56,3c3-6,8-17,8-19c0-4-14-11-14-29c0-4-1-10,6-13c3,4,5,7,6,11c2,5,4.5,16.17,13.5,21.17
+ c5,3,11.5,4.83,15.5,8.83c2,2,0,7,0,10c0.83,14.83,7,21,10,33c2,7,3,16,4,23v22c0,18,17,46,44,46c10,0,38-16,42-16
+ c12,0-1,16,15,16c7,0,12-5,12-13s-10-14-10-18c0-2,6-11,6-14c0-5-4-9-12-9c-6,0-12,3-12,11c0,3,1,6,0,9c-8,0-16,1-22,1
+ c0,1-1,20-11,20c-3,0-6-3-6-8c0-3-0.83-8.5,8-12c0,0,2.5-6.5-3-6c-2.99,0.27-19,10-21,10c-3,0-6-1-6-7c0-12,20-6,20-16
+ c0-4-4-5-6-5c-3,0-9,2-11,2c-3,0-8-1-8-7c0-9,8-9,14-9c2,0,7,0,10,1c3-11,7.5-30.5,7.5-33.5c0-18-10.5-26.5-10.5-44.5
+ c0-2,6-23,8-34c1-3-2-14,4-14c9.83,0,7.5,28.5-1,49c9,1,20,4,29,14c30-23,45-59,45-85c0-10-2-21-15-21
+ C284.5,141.5,257.5,152.5,240.5,158.5z"/>
+ </g>
+ <defs>
+ <filter id="Adobe_OpacityMaskFilter_1_" filterUnits="userSpaceOnUse" x="46.254" y="119.277" width="271.246" height="261.223">
+
+ <feColorMatrix type="matrix" values="-1 0 0 0 1 0 -1 0 0 1 0 0 -1 0 1 0 0 0 1 0" color-interpolation-filters="sRGB" result="source"/>
+ </filter>
+ </defs>
+ <mask maskUnits="userSpaceOnUse" x="46.254" y="119.277" width="271.246" height="261.223" id="SVGID_2_">
+ <g filter="url(#Adobe_OpacityMaskFilter_1_)">
+
+ <image overflow="visible" width="278" height="268" xlink:href="data:image/jpeg;base64,/9j/4AAQSkZJRgABAgEASABIAAD/7AARRHVja3kAAQAEAAAAHgAA/+4AIUFkb2JlAGTAAAAAAQMA
+EAMCAwYAAARTAAAJlwAADlr/2wCEABALCwsMCxAMDBAXDw0PFxsUEBAUGx8XFxcXFx8eFxoaGhoX
+Hh4jJSclIx4vLzMzLy9AQEBAQEBAQEBAQEBAQEABEQ8PERMRFRISFRQRFBEUGhQWFhQaJhoaHBoa
+JjAjHh4eHiMwKy4nJycuKzU1MDA1NUBAP0BAQEBAQEBAQEBAQP/CABEIAQwBFgMBIgACEQEDEQH/
+xACaAAEAAgMBAQAAAAAAAAAAAAAABgcDBAUBAgEBAAAAAAAAAAAAAAAAAAAAABAAAgICAQMEAgEE
+AwEAAAAAAgMBBAUGACARExAwQBIxFBWAITM0IjI1FhEAAgIBAQYFAgUEAwEAAAAAAQIAESEDIDFB
+URIiEDBAYXGRE4GxMlIjocFCYuFyMwQSAQAAAAAAAAAAAAAAAAAAAID/2gAMAwEAAhEDEQAAAK/A
+AAAAPs+Hf7BCEqjprgAzdPrTsp7WtOtjVAAAAAAAAAAB7N4nbRubf16YI/J/kpblXDWJzPr52iy5
+VyeuYa5suOlRMuIAPreOekfSIUm8eOSAAAAADcuCmLhO0AD5i8qxlGb8v5pYG3jyDT3Pkprj27rF
+ed+fbpGOz0fTBk+xjjUp5RTzeHHMhjd7tEH+rK3yrNi19oqres3KQSbbHoAAB8fOUeegB4D0AADl
+dXglatIY7DidrDZ+x49AAAAAAAADz35OBwNWGl65+F3QADyGS2ryLvB3bZpi3zpAAAAeOEdfNT1j
+nbeegAADFl0yt4r1eYWzI+B3wB57iORU0qhQB92vUs4LH9+PsAAA8gU9hJW0yhvQLsycnqnoAAHD
+7cMK6y6fcLQ6mlug8Ee6FYHK1QAdLmi7OnXc/MwAAHG7OMo7Un0DJfP6Q7RcnsQlRlAB81xZFekC
+6vKFmyaju0XFqRThn3EffkAAA2LIq/aLxywKVnSYsh689Hjw5VU2PVZhBktyobWJQ89APIxKNApD
+563JAPv4AAAAAD66fKEw6tdC0c1Uelq6la+EhjwALKrWUlre4cwA+PvwraE2ZWYAAAAAAAAAAAAA
+2tUXP2YNOD0Dz34IdWc2hIAAAAAAAAAAAAABK7Rp23DaeaxtamnxiG8HZ1gAAAAAAAAAAAAADoXD
+TtwGSrrGp0+vnD6eAAAAAAAAAAAAAAA37gp63jfiMy4RCND65Bh8ABlxSYxa9p8Qq/zPgAAAAAAA
+AAAMtsVFNiya9n3GKd+5Z0iFa3Y4g++hPitpvKugZIHPa6IMAAAAAAAAAABt6gtuR0tY5IdfL9lP
+8KyYodGw4VjJxrVZoF687hSMqXky2JAAAAAAAAAAADb1BM+3WP0T+O8L5NrVADu9+B/Rv84AP//a
+AAgBAgABBQD+jL//2gAIAQMAAQUA/oy//9oACAEBAAEFAPiVqrLJ/wDzlmRtULFWfjqUxx0dWsP4
+GmB9bunmuLdGxULo1TF+QVYlfjzWBWasjSOnY+KAyZa1r49quOUoIUuONqKZGY15Tgy2EfRZ6LH7
+HqtSAREdosKhq9wxfaPi4oYO9gkCKfUhgozOHW9eZxTaL+YxXlu4JP0r+my0oaiyrw2PUFsZKMJf
+fyvp9lnE6SMcdpixHJ4N1L3MSUDfwhRNfoMYMdiwgWFX6TKT9ZT5chjl/RHpkUeVGz05rXhAjmrg
+r1maGlSXKOqIVCMPXXAVEhyFBHDSso2HHBKf14/kPaqlIWNdkpq9LlC0Nn1ybAahhLiXpD6L9CGC
+jL6xXyBVNQrJmviEJgErDqzYxKCGP5/phbJ4NG2fF4LIslWq3jlGlOKcfo6QZSqDWV1GsGQuupc+
+7my7VyKP5/ia7nlS1W0/lbSA7I02uMK1auPF6/WHgYmuPBooHgoUPIEY97v25BDPsbG6Ar+aP5Kn
+VK0/A68sARj0qGFhHO0fE2HPDjk4fdP2rFWwL1dMz2jb7sAj7T9tVUJ2scoQT8U57DvbJkaxkuxr
+b5ZW6bTIWrcL3kZzVGwFygX2R7JFAx+2n7RMFHsvL6q3V4kxX+TV/wDW6c9eFKcnZmzb5hH+G/h3
+Qyv7Ow5T9NC9rvxcwWVG2n2ck3xo2Sz5r6Bk360uRrdFhsKXt+W/t6JOVt1e3DEexP43k5/X5peR
+IeJODX7Gw2IXXut81rEpl1/CK+lf1mYiNgyoIVkbhW7PrpeQ/wCCjgw65/G61SOvzC3Jq3cNdFye
+ufxuVvx15mZnV0fa3jfrCfXKZAK6tkzJWndGDvTUuYe6L0+xnqUWK+TqFUtxMxOs7DAcpZNTwgoK
+Ok/+u9sKB5iMkunOJ2ZBRWySXRBhMXb60hs+fI5mZKeiJmJ1PN9xruFodblwwNswXkgwJZCZAWN2
+W1UnC7SmzCXC4Ogv7jvNeSV6Aw1ljdmtVSr7OJqzWzkcMYbD6qVtlR+vZ8HLS4Gj15pYSrOisbfo
+h7a7NXtm+r07VT8tdgStnqDmBEzMz7FDIOpMwm1LZFXLJbAvWfIKJ6CKBjYsgIJuPl9j0X/k1WYi
+v05WvDUbFTmtd94DMCp7BdrTU3SR5X3RBcHca3A22sUM22uPH7fXkc7nf2o9YntOn24NET3joaP2
+XulKIH4cEQ8kiLr06/421WQxXRP43Bcfr/LxtqatvA3IfX6J/G4tiK/zNLvSxET3j1YX1Dd7UyPz
+NKsyLUF9let90LTtVry2/mas2V36B/ZH44++hPGZ6vHMrnFmvIv89v5mDKRyOJnvXyVr9dGc2S06
+zN+5PJt2S5M95+Zhf/Qw/wDr7Aozq21GqzztPzsL/wChh/8AXekXBmdarNJmDrom3WSIlEQXRXrs
+sMRq7DC7r7a8EMjPxMPPa/hSia/M/fVWXkdg8putub1alUFxV8cEKzyFrXckZs/ErM8VjWrcMRP4
+302Qri1MZMUCGGiIl2meCppTFC4XNIxtha+31XueQ8ITMzPxdPyv9kMhi8/hAyCo0ZgtXra6q86f
+gZ+eYOn+zYx+upIVYGsPEVVIg47ju+Naz4+NulTs4DMLeoSEx8YcuVxJO2IJd/mp0pCKrVLW7K11
+cDYKpGl4OHMUQerP4/8AUs/GwuZOgzD59TwVYWyD+shs2GVchWBhTatlVQLm1Aobuw3LMjcsizVs
+wTq9myBK2wgkfj0sjZpljdwiIXtaTG9sKCG3nQmX5Cw7kzM+uCysVodsQeLLZGbjPkj5OF5OqO/e
+fJ29f//aAAgBAgIGPwAZf//aAAgBAwIGPwAZf//aAAgBAQEGPwD0nQg+TOoE/SfyLjn6gJpi2MB1
+Lo8BMpmE6dgzp1Vxz2RqMMtmCxG7Y2mR232+mCLvJoRXZbY5JMGJulERqUG4zAE6d/TxVeZAiY4C
+VCCI2qq5XPptMGKa4bFGN23cY1/GT9PDSX3uL8eL43iPp/tONikUsfYQUnSDzgLk+4EtgT8w0kLL
+ZUbx5mmTzqL8bJBjdt3G0mBr/EwGr6azF+PFh7QtVB5SgseQgpOkHnAdW2+YOwfSDtEws3SiIxrh
+PsVjrqvL02G8MIhPLaKkRm017t4qM/8A9Gn0d2PwgXxIPGXqIGo2IKQCvaDtEwNpviIP9v7HawhP
+4GDp0mz7QD7dA8Z3YHsJ3kmKzr1UQRed0CDgNumFy1WvOb4iHh1f2Ph06SljAdSwOQnepPzAPtjH
+tB2D6T9In6RP0iYWYHn4PkN8T7vD7n/EXSXjvikrBgTA9Kz3u4T7epaEnAPGBhtEx88DOrjdw3zE
+FDh6Yyv9h+c03XeGES+W0TPtA7znwKnjRi/HlWTQnT1C5Yz5TGBOJMT/ALD84nwNps1iO92AaHgh
+ug2Ivx5TMDVCfcZv4i27kIpu7HlN8Qi7CzTUbywiXy2SxjaaNlsDxRx/iQYmeA8kxxw8Bosf0moD
+5LZ4TUe7tjU0l5G4vxsWY3dVCNqE2t9uwumxyuICPJ1K5HwVrpWwYueHkvngZZ3mfcO4YEAHLYOa
+jaKHHE7K5pWOfmLnh5LCrsR9MigSSssbxF0tRqYc4O4Swb2jKB3nPgOrHvAvWPrBTCXcOYdLSbuM
+JJsnedmxvG6Lps3cuDAQfIKmNqIveMgwo4phvEDIaYbiIBqEso4iKOsXygZTsmM37Tf08epGKnmI
+q6p6l5wHq4RtPSa2MLubY7ztrqIaF9wijqgIPkNfKHp35vxGppMVYHhxiF95A2nxwMZDvUkbBCsQ
+DwlnJ8kOhPTxWBWajxBg7hMGYOxZMbPCPqHiceK/I/OIByG02OELcH/Pz+pCVPMTJ6hANQlT7yi4
++s/9B9Zhx9Zlx9YQNQfWFNNrvYsbxEzeBAdkiM4GVN+kwSPiZJPzt/ZY7jj4gO059j6xNQbrAMXO
+8bTj2PrUBOaowHYJhQcTXrTp8AfzinYOeECXus+tq8Govx4dzCYYRgrR3969bp1F+Ize0fT0WpVN
+EzOs07tQmWfW6cX4jheU1EcUwY/1Phu9dpxfiFWhcoLhpRCMQgbtkJpizxMtruFlvHAwqcEb/S6Z
+i/HgzMaqEaORz4TuOOW11EWbgxwjYj9O6/S6b8iImeHgQDQJAP18KQXL1Me0oTEpUJJ9pjRY/hOr
+WQoSTgz4EZQe44Es7z6ZdNjlcGAiMpF3MsxS90wtVPtJgnwyLAxASggtRKQVCJ91QT0G69OuoD23
+3Re67EsZE3RqHCAkdpsX4DUcUWNwXMsJ0dYuWpuNYuxCyilY59OFY/x3v5Re4G5YMIuHnvBEvUPU
+BwMAsCoQrWeQhCsUX+sGqNVuoG95iFzmsw54Rq3+oB02PT+2BdRuk+8/WPrCeoQ/byfaV1dI9pZy
+fEIxqp+rhKBtR6rsv8Lndde97WN8zde97H//2Q==" transform="matrix(1 0 0 1 43 116)">
+ </image>
+ </g>
+ </mask>
+ <g mask="url(#SVGID_2_)">
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#CEBC01" d="M240.5,158.5c-5-14-9-42-30-39c-6.17,1.17-9.5,24.83-13,45
+ c-5-6-24.83-39-45-39c-11.5,0-24,9-24,19v6c0,2,2,10,2,13c0,2-1,2-1,2c-14,2-32.9,5-45.5,6.5c-18.17,2.17-40.33,5-37.5,25.5
+ c2.74,19.811,12,56,41,66c15,6,49,5,56,3c3-6,8-17,8-19c0-4-14-11-14-29c0-4-1-10,6-13c3,4,5,7,6,11c2,5,4.5,16.17,13.5,21.17
+ c5,3,11.5,4.83,15.5,8.83c2,2,0,7,0,10c0.83,14.83,7,21,10,33c2,7,3,16,4,23v22c0,18,17,46,44,46c10,0,38-16,42-16
+ c12,0-1,16,15,16c7,0,12-5,12-13s-10-14-10-18c0-2,6-11,6-14c0-5-4-9-12-9c-6,0-12,3-12,11c0,3,1,6,0,9c-8,0-16,1-22,1
+ c0,1-1,20-11,20c-3,0-6-3-6-8c0-3-0.83-8.5,8-12c0,0,2.5-6.5-3-6c-2.99,0.27-19,10-21,10c-3,0-6-1-6-7c0-12,20-6,20-16
+ c0-4-4-5-6-5c-3,0-9,2-11,2c-3,0-8-1-8-7c0-9,8-9,14-9c2,0,7,0,10,1c3-11,7.5-30.5,7.5-33.5c0-18-10.5-26.5-10.5-44.5
+ c0-2,6-23,8-34c1-3-2-14,4-14c9.83,0,7.5,28.5-1,49c9,1,20,4,29,14c30-23,45-59,45-85c0-10-2-21-15-21
+ C284.5,141.5,257.5,152.5,240.5,158.5z"/>
+ </g>
+</g>
+<path fill-rule="evenodd" clip-rule="evenodd" fill="#FFFFCC" d="M168.67,263.33c-3.67-2-6.67-3.33-9-6.33
+ c-5,9-11.17,30.5-11.17,41.5c0,3,1,10,2,15C177.67,289,168.67,263.33,168.67,263.33z"/>
+<g>
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#FF6600" d="M193.772,206.837c-5.358,0-10.236-2.729-13.736-7.683l-0.198-0.28
+ l-0.093-0.33c-8.547-30.246-25.982-48.151-39.992-62.539c-2.949-3.03-5.736-5.89-8.24-8.667l-0.94-1.043l0.662-1.238
+ c3.588-6.719,10.431-10.272,19.783-10.272c5.169,0,10.029,1.066,13.196,1.96c2.665,0.75,5.5,1.129,8.429,1.129
+ c0.004,0,0.006,0,0.01,0c7.256,0,14.981-2.283,22.334-6.601c2.978-1.746,6.236-2.632,9.686-2.632
+ c6.564,0,11.543,3.219,11.753,3.357l1.181,0.775l-0.336,1.373c-4.887,19.923-7.7,46.495-8.604,81.235l-0.006,0.27l-0.078,0.255
+ C206.643,202.342,200.553,206.835,193.772,206.837L193.772,206.837z"/>
+ <path fill="#917013" d="M204.676,110.643c6.042,0,10.654,3.027,10.654,3.027c-4.33,17.66-7.66,43.26-8.66,81.66
+ c-1.729,5.729-7.115,9.506-12.899,9.506c-4.249,0-8.713-2.037-12.101-6.836c-10.51-37.2-34.41-56.19-48.67-72
+ c3.897-7.297,11.292-9.214,18.019-9.214c5.322,0,10.226,1.199,12.651,1.884c2.928,0.824,5.941,1.206,8.975,1.206
+ c8.011,0,16.174-2.662,23.355-6.876C198.988,111.248,201.975,110.643,204.676,110.643 M204.677,106.643L204.677,106.643
+ c-3.812,0-7.412,0.979-10.701,2.907c-7.053,4.139-14.428,6.327-21.332,6.327c-2.745,0-5.4-0.355-7.892-1.057
+ c-3.285-0.927-8.337-2.033-13.734-2.033c-10.138,0-17.589,3.917-21.547,11.33l-1.323,2.478l1.881,2.086
+ c2.528,2.803,5.326,5.676,8.289,8.718c13.853,14.225,31.094,31.929,39.502,61.69l0.187,0.659l0.396,0.561
+ c3.883,5.5,9.342,8.528,15.369,8.528c7.655,0,14.534-5.078,16.729-12.35l0.155-0.515l0.014-0.537
+ c0.889-34.117,3.764-61.306,8.546-80.812l0.673-2.746l-2.363-1.551C217.296,110.176,211.832,106.643,204.677,106.643
+ L204.677,106.643z"/>
+</g>
+<g>
+ <g>
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#FF6600" d="M215.33,113.67c-4.33,17.66-7.66,43.26-8.66,81.66
+ c-3,9.939-17,14-25,2.67c-10.51-37.2-34.41-56.19-48.67-72c6.98-13.07,25.18-8.88,30.67-7.33c10.66,3,22.43,0.14,32.33-5.67
+ C205.67,107.33,215.33,113.67,215.33,113.67z"/>
+ </g>
+ <defs>
+ <filter id="Adobe_OpacityMaskFilter_2_" filterUnits="userSpaceOnUse" x="133" y="110.643" width="82.33" height="94.193">
+
+ <feColorMatrix type="matrix" values="-1 0 0 0 1 0 -1 0 0 1 0 0 -1 0 1 0 0 0 1 0" color-interpolation-filters="sRGB" result="source"/>
+ </filter>
+ </defs>
+ <mask maskUnits="userSpaceOnUse" x="133" y="110.643" width="82.33" height="94.193" id="SVGID_3_">
+ <g filter="url(#Adobe_OpacityMaskFilter_2_)">
+
+ <image overflow="visible" width="87" height="99" xlink:href="data:image/jpeg;base64,/9j/4AAQSkZJRgABAgEASABIAAD/7AARRHVja3kAAQAEAAAAHgAA/+4AIUFkb2JlAGTAAAAAAQMA
+EAMCAwYAAAIPAAADBQAAA/v/2wCEABALCwsMCxAMDBAXDw0PFxsUEBAUGx8XFxcXFx8eFxoaGhoX
+Hh4jJSclIx4vLzMzLy9AQEBAQEBAQEBAQEBAQEABEQ8PERMRFRISFRQRFBEUGhQWFhQaJhoaHBoa
+JjAjHh4eHiMwKy4nJycuKzU1MDA1NUBAP0BAQEBAQEBAQEBAQP/CABEIAGMAVwMBIgACEQEDEQH/
+xACPAAEAAgMBAQAAAAAAAAAAAAAABgcCAwUBBAEBAAAAAAAAAAAAAAAAAAAAABAAAQQBAwMDBQEA
+AAAAAAAAAwECBAYFABAgETESUCETMDIjMxQ0EQACAQEGAwgDAQAAAAAAAAABAgARECAhMUEDcRIi
+MFFhgZGhMkJigrITEgEAAAAAAAAAAAAAAAAAAABQ/9oADAMBAAIRAxEAAACv2ySEXWJ8xBEowI1n
+MZGQLbaXOKmfaNVkVRIS3Ped0jW2jDL0OH24uVm+YYgk1lUhMSzffm+kA8hE2rwggAGeAsia0lbB
+2HnphWlk1YRcAACawr7i7tnJ6xpqi1anI+AAACxJvS0zJXU0ihhpAAAA2BjiAH//2gAIAQIAAQUA
+9K//2gAIAQMAAQUA9K//2gAIAQEAAQUA5iCUzolalGSTWXiaSK8ZwAed+Oq7TIyoBVkmkjVCUuQj
+kpkpVh0j3gVUAdCxYRtzEQYxS3IuZxUhgj4MgSNY1nirGLpY4l1/MLSDY3exERkd5PLJ6r+efGLi
+8kOSPlbDeEfz/JtWs+QBMdPZIHwXtdJHhH3RVatWsDmrEktOPd/23cifFwCV4SVTOIcY3o9uxPZl
+4d15YbIOhSsJkGyA7SF6CuhXKflTcu7QSIQepX6bj/q5YeUsWbhJaGBqYvQFtIjpnJFVFqOU8gjM
+x7clIY0Nkej5/PEZR0EsWzj+PKWZijlSHSDfQH2J32//2gAIAQICBj8AK//aAAgBAwIGPwAr/9oA
+CAEBAQY/AL/LtqWPhAz1A7hKioMXZObMFHmaQInmYC45ie+U5B6Q8q0PhDysaT5H0gO6C3GDoA8p
+QARjTSbQ0G4n9CAPqc4tKQUExE+M+MwFrcINyuH+qmvAixdrdbDQwY1rffgZz/lze9bRs7rYaEwY
+1umPwNwMpoRkYuzut1CAg3DGBOeF1dxDRlNYqserIiBhraZT8heU16GIBi41qLWgXQm+Nl26lwgY
+WNF4m+jaMaGLjpY0C61JvgjMZRAxxgNYwrpCR49gAT0EwdfvCA2cbcbXLsfv+s+37W//2Q==" transform="matrix(1 0 0 1 131 108)">
+ </image>
+ </g>
+ </mask>
+ <g opacity="0.6" mask="url(#SVGID_3_)">
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#7F3E03" d="M215.33,113.67c-4.33,17.66-7.66,43.26-8.66,81.66
+ c-3,9.939-17,14-25,2.67c-10.51-37.2-34.41-56.19-48.67-72c6.98-13.07,25.18-8.88,30.67-7.33c10.66,3,22.43,0.14,32.33-5.67
+ C205.67,107.33,215.33,113.67,215.33,113.67z"/>
+ </g>
+</g>
+<path opacity="0.25" fill-rule="evenodd" clip-rule="evenodd" fill="#FFFFFF" d="M210.936,113.796
+ c-11.983,64.227-22.738,60.791-73.726,11.721c0.148-11.045,22.734-5.193,27.431-4c9.14,2.331,19.844,0.864,27.954-4.462
+ C202.85,110.315,210.936,113.796,210.936,113.796z"/>
+<path fill-rule="evenodd" clip-rule="evenodd" fill="#FFFFCC" d="M281.5,290.5c-7.17-37.17-37.17-42.83-37.17-42.83
+ c3,10,6.34,19.33,9.17,27.83C261,282,273.5,289.5,281.5,290.5z"/>
+<path fill-rule="evenodd" clip-rule="evenodd" fill="#FFFFCC" d="M168.67,263.33c-3.67-2-6.67-3.33-9-6.33
+ c-5,9-11.17,30.5-11.17,41.5c0,3,1,10,2,15C177.67,289,168.67,263.33,168.67,263.33z"/>
+<path fill-rule="evenodd" clip-rule="evenodd" fill="#FFFFCC" d="M281.5,290.5c-7.17-37.17-37.17-42.83-37.17-42.83
+ c3,10,6.34,19.33,9.17,27.83C261,282,273.5,289.5,281.5,290.5z"/>
+<path fill-rule="evenodd" clip-rule="evenodd" d="M166.77,188.01c5.25,0.61,8.37,11.49,9.67,19.44c1.33,8.17,1.33,16.76-4.05,17.47
+ c-8.06,1.08-11.67-21.93-11.67-21.93C158.28,187.29,166.77,188.01,166.77,188.01z"/>
+<path fill-rule="evenodd" clip-rule="evenodd" d="M229.86,192.56c0.99,10.209-3.431,23.959-6.57,24.39
+ c-6.29,0.85-7.51-9.05-7.72-10.7c-0.41-3.3-3.061-24.76,7.939-26.25C228.33,182,229.45,189.26,229.86,192.56z"/>
+<path opacity="0.1" fill-rule="evenodd" clip-rule="evenodd" fill="#FFFFFF" d="M216.51,195.85c0.93-8.26,11.79-5.08,11.79,2.86
+ c0,7.95-2.1,14.261-4.34,16.21C217.75,220.32,215.58,204.12,216.51,195.85z"/>
+<path opacity="0.1" fill-rule="evenodd" clip-rule="evenodd" fill="#FFFFFF" d="M163.09,206.33c-1.19-8.13,9.59-8.43,11.57-0.891
+ c1.97,7.551,1.6,14.181,0.02,16.721C170.3,229.18,164.28,214.45,163.09,206.33z"/>
+<rect x="701" y="306" fill-rule="evenodd" clip-rule="evenodd" fill="#FBE500" stroke="#1F1F1F" stroke-width="20" stroke-linecap="round" stroke-linejoin="round" width="14" height="34"/>
+<circle fill-rule="evenodd" clip-rule="evenodd" fill="#FFFF33" cx="182.5" cy="139.5" r="11.5"/>
+<g>
+ <g>
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#23A9FF" d="M149.33,127.79c0,14.21-17,14.21-17,14.21
+ c-14.16,0-14.9-11.46-14.16-14.21c2.16-8.12,3.83-13.12,15.16-13.12C139,114.67,149.33,119.26,149.33,127.79z"/>
+ <path fill="none" stroke="#000000" stroke-width="3" stroke-linecap="round" stroke-linejoin="round" d="M149.33,127.79
+ c0,14.21-17,14.21-17,14.21c-14.16,0-14.9-11.46-14.16-14.21c2.16-8.12,3.83-13.12,15.16-13.12
+ C139,114.67,149.33,119.26,149.33,127.79z"/>
+ </g>
+ <defs>
+ <filter id="Adobe_OpacityMaskFilter_3_" filterUnits="userSpaceOnUse" x="116.477" y="113.17" width="34.353" height="30.33">
+
+ <feColorMatrix type="matrix" values="-1 0 0 0 1 0 -1 0 0 1 0 0 -1 0 1 0 0 0 1 0" color-interpolation-filters="sRGB" result="source"/>
+ </filter>
+ </defs>
+ <mask maskUnits="userSpaceOnUse" x="116.477" y="113.17" width="34.353" height="30.33" id="SVGID_4_">
+ <g filter="url(#Adobe_OpacityMaskFilter_3_)">
+
+ <image overflow="visible" width="39" height="35" xlink:href="data:image/jpeg;base64,/9j/4AAQSkZJRgABAgEASABIAAD/7AARRHVja3kAAQAEAAAAHgAA/+4AIUFkb2JlAGTAAAAAAQMA
+EAMCAwYAAAGnAAAB+QAAAmr/2wCEABALCwsMCxAMDBAXDw0PFxsUEBAUGx8XFxcXFx8eFxoaGhoX
+Hh4jJSclIx4vLzMzLy9AQEBAQEBAQEBAQEBAQEABEQ8PERMRFRISFRQRFBEUGhQWFhQaJhoaHBoa
+JjAjHh4eHiMwKy4nJycuKzU1MDA1NUBAP0BAQEBAQEBAQEBAQP/CABEIACMAJwMBIgACEQEDEQH/
+xAB9AAEAAgMBAAAAAAAAAAAAAAAABgcBBAUDAQEAAAAAAAAAAAAAAAAAAAAAEAACAwEAAwEBAAAA
+AAAAAAADBAECBQYQMBEAExEBAAIBAwMDBQAAAAAAAAAAAQACETFBAxBxEiGBkcEiMhMEEgEAAAAA
+AAAAAAAAAAAAAAAw/9oADAMBAAIRAxEAAACAdvxtYgHEurklMuyNm1aPm5YOlHo4aqPjzBnAAf/a
+AAgBAgABBQD0/wD/2gAIAQMAAQUA9P8A/9oACAEBAAEFAIibTncyy3BOKvFH8NxOfk/edThlzMzx
+CDIRzGvlhIJ7PgO1yJKUZSJW4f2kwMYdRql91Nu6h8rrhQMnYLRXY67+1bHJY/ifP//aAAgBAgIG
+PwAf/9oACAEDAgY/AB//2gAIAQEBBj8AAMroQtfIOxM1yMVq2qb7zG8GxkrKvjtMeJLPiaTg4g+3
+l5aVx3sER1zK4elhdp/JjSvPxq9rkOWm2pAvfCajPzPmWpwvks/eubli3uevU+vX/9k=" transform="matrix(1 0 0 1 114 111)">
+ </image>
+ </g>
+ </mask>
+ <g mask="url(#SVGID_4_)">
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#043C96" d="M149.33,127.79c0,14.21-17,14.21-17,14.21
+ c-14.16,0-14.9-11.46-14.16-14.21c2.16-8.12,3.83-13.12,15.16-13.12C139,114.67,149.33,119.26,149.33,127.79z"/>
+ <path fill="none" stroke="#043C96" stroke-width="3" stroke-linecap="round" stroke-linejoin="round" d="M149.33,127.79
+ c0,14.21-17,14.21-17,14.21c-14.16,0-14.9-11.46-14.16-14.21c2.16-8.12,3.83-13.12,15.16-13.12
+ C139,114.67,149.33,119.26,149.33,127.79z"/>
+ </g>
+</g>
+<g>
+ <g>
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#23A9FF" d="M230.33,111.33c3,4.84,4.68,17.12-15.33,16.17
+ c-7-0.33-11.35-13.81-7.33-17.83C211.67,103,226,104.33,230.33,111.33z"/>
+ <path fill="none" stroke="#000000" stroke-width="3" stroke-linecap="round" stroke-linejoin="round" d="M230.33,111.33
+ c3,4.84,4.68,17.12-15.33,16.17c-7-0.33-11.35-13.81-7.33-17.83C211.67,103,226,104.33,230.33,111.33z"/>
+ </g>
+ <defs>
+ <filter id="Adobe_OpacityMaskFilter_4_" filterUnits="userSpaceOnUse" x="204.631" y="103.813" width="29.007" height="25.239">
+
+ <feColorMatrix type="matrix" values="-1 0 0 0 1 0 -1 0 0 1 0 0 -1 0 1 0 0 0 1 0" color-interpolation-filters="sRGB" result="source"/>
+ </filter>
+ </defs>
+ <mask maskUnits="userSpaceOnUse" x="204.631" y="103.813" width="29.007" height="25.239" id="SVGID_5_">
+ <g filter="url(#Adobe_OpacityMaskFilter_4_)">
+
+ <image overflow="visible" width="34" height="31" xlink:href="data:image/jpeg;base64,/9j/4AAQSkZJRgABAgEASABIAAD/7AARRHVja3kAAQAEAAAAHgAA/+4AIUFkb2JlAGTAAAAAAQMA
+EAMCAwYAAAGWAAAB3QAAAkb/2wCEABALCwsMCxAMDBAXDw0PFxsUEBAUGx8XFxcXFx8eFxoaGhoX
+Hh4jJSclIx4vLzMzLy9AQEBAQEBAQEBAQEBAQEABEQ8PERMRFRISFRQRFBEUGhQWFhQaJhoaHBoa
+JjAjHh4eHiMwKy4nJycuKzU1MDA1NUBAP0BAQEBAQEBAQEBAQP/CABEIAB8AIgMBIgACEQEDEQH/
+xAB4AAADAQEAAAAAAAAAAAAAAAAABQcGAwEBAAAAAAAAAAAAAAAAAAAAABAAAgIDAQEAAAAAAAAA
+AAAAAgMEBQABBiASEQACAQMDAwUAAAAAAAAAAAABAgAREgMQITFRsQRBcdEiYhIBAAAAAAAAAAAA
+AAAAAAAAIP/aAAwDAQACEQMRAAAAwTkqRLU1vnZkQBrUoy5KrPV6Y5gH/9oACAECAAEFAPX/2gAI
+AQMAAQUA9f/aAAgBAQABBQBSjccbl5Tgk8tMSLksSecugGya+CnSpUBJr6ysBesoJuosystUkmVa
+IBfU2i2awfr6iTrxYSLC/MH7cR5//9oACAECAgY/AF//2gAIAQMCBj8AX//aAAgBAQEGPwAJjFWM
+DEkE9BLlNfcQpkFrDQ3DgiA0h2EbIg+y76C40Dd4tWHENGEZFNSdhoLa3elOYBi8fK46hGPYSj+P
+mQdTjf4hOe6/9Cmn/9k=" transform="matrix(1 0 0 1 202 101)">
+ </image>
+ </g>
+ </mask>
+ <g mask="url(#SVGID_5_)">
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#043C96" d="M230.33,111.33c3,4.84,4.68,17.12-15.33,16.17
+ c-7-0.33-11.35-13.81-7.33-17.83C211.67,103,226,104.33,230.33,111.33z"/>
+ <path fill="none" stroke="#043C96" stroke-width="3" stroke-linecap="round" stroke-linejoin="round" d="M230.33,111.33
+ c3,4.84,4.68,17.12-15.33,16.17c-7-0.33-11.35-13.81-7.33-17.83C211.67,103,226,104.33,230.33,111.33z"/>
+ </g>
+</g>
+<path opacity="0.25" fill-rule="evenodd" clip-rule="evenodd" fill="#FFFFFF" d="M116,85c4-22.67,16.33-29.33,23.67-27.67
+ c7.33,1.67,20,11,30,11c12.33,0,16.66-3,23.66-8.66c7-5.67,10.31,2.33,10,12.33C203,83,207,91.67,204,92s-10.67-18-19-11
+ c-5.33,10.67-2,25.67-12.33,27c-6.7,0.86-21.67-3.67-35-19c-3.07-3.52-12-6-15,1c-3.33,7.75-3.34,4.67-5,8
+ C116.61,100.11,114.86,91.45,116,85z"/>
+<g>
+ <g>
+ <circle fill-rule="evenodd" clip-rule="evenodd" fill="#23A9FF" cx="169" cy="29" r="26"/>
+ <circle fill="none" stroke="#000000" stroke-width="3" stroke-linecap="round" stroke-linejoin="round" cx="169" cy="29" r="26"/>
+ </g>
+ <defs>
+ <filter id="Adobe_OpacityMaskFilter_5_" filterUnits="userSpaceOnUse" x="141.5" y="1.5" width="55" height="55">
+
+ <feColorMatrix type="matrix" values="-1 0 0 0 1 0 -1 0 0 1 0 0 -1 0 1 0 0 0 1 0" color-interpolation-filters="sRGB" result="source"/>
+ </filter>
+ </defs>
+ <mask maskUnits="userSpaceOnUse" x="141.5" y="1.5" width="55" height="55" id="SVGID_6_">
+ <g filter="url(#Adobe_OpacityMaskFilter_5_)">
+
+ <image overflow="visible" width="60" height="60" xlink:href="data:image/jpeg;base64,/9j/4AAQSkZJRgABAgEASABIAAD/7AARRHVja3kAAQAEAAAAHgAA/+4AIUFkb2JlAGTAAAAAAQMA
+EAMCAwYAAAHLAAACZwAAAyD/2wCEABALCwsMCxAMDBAXDw0PFxsUEBAUGx8XFxcXFx8eFxoaGhoX
+Hh4jJSclIx4vLzMzLy9AQEBAQEBAQEBAQEBAQEABEQ8PERMRFRISFRQRFBEUGhQWFhQaJhoaHBoa
+JjAjHh4eHiMwKy4nJycuKzU1MDA1NUBAP0BAQEBAQEBAQEBAQP/CABEIADwAPAMBIgACEQEDEQH/
+xACFAAACAwEBAQAAAAAAAAAAAAAABwIFBgQBAwEBAAAAAAAAAAAAAAAAAAAAABAAAQQBBAMBAAAA
+AAAAAAAAAgEDBAYFABARFCBAExIRAAEDAgQFBAMAAAAAAAAAAAEAEQJBEiAhMQMQUXGRImGhwWKx
+MhMSAQAAAAAAAAAAAAAAAAAAAED/2gAMAwEAAhEDEQAAAF/6bAorJk9gpKZ5Z8UxYV5aNtbNU+no
+BGQYVdN9TFy2Ua0TUEZB4cpQqvS5cO7hBi3ag+w0chmYEogf/9oACAECAAEFAPQ//9oACAEDAAEF
+APQ//9oACAEBAAEFANIiksKvzpWhpcpUkVGY0MmFIilsiKS1qtfXUPFMMAjDSaciMuJmq4xIby+M
+PHyNV+F2p2KhgwxuYoQ3HFibPC80sUWUwnDXhZwRY34XuVGQLUyI4jjPha5YhH/afaFJKLIrmbbf
+ZAxNNps1thu15rsObY3KyIDmKuDJiNnjKMq2RwHM2w5GnDNw9055HucH9uN//9oACAECAgY/AAf/
+2gAIAQMCBj8AB//aAAgBAQEGPwBAAOToEDbbE909x7ImJJPqFbvQI9acQAHJ0Cjvb0Xkc86IC0L9
+QmMQpeALoxY2HQ8uEXDxj+VFhTAQaqcgMxmFbXRlJ+YUemGfRW/f5RiTmSCokcsMw9Cr6XXe7qG9
+Ghz6KHlqE8S/EknNS2ISd9enEGBeD5hASmx5FPeESJjujDYLvWiM5l5HU4PHWjI2/wBGrqvO5vs/
+zg//2Q==" transform="matrix(1 0 0 1 139 -1)">
+ </image>
+ </g>
+ </mask>
+ <g mask="url(#SVGID_6_)">
+ <circle fill-rule="evenodd" clip-rule="evenodd" fill="#043C96" cx="169" cy="29" r="26"/>
+ <circle fill="none" stroke="#043C96" stroke-width="3" stroke-linecap="round" stroke-linejoin="round" cx="169" cy="29" r="26"/>
+ </g>
+</g>
+<path opacity="0.25" fill-rule="evenodd" clip-rule="evenodd" fill="#FFFFFF" d="M149,22.33c13.33-26.66,39.67-9,40.67,3.34
+ C190.67,38,141.58,37.17,149,22.33z"/>
+<rect x="337.5" y="105.5" fill-rule="evenodd" clip-rule="evenodd" fill="none" width="764" height="167"/>
+<text transform="matrix(1 0 0 1 337.5 191.7793)" fill="#1F1F1F" font-family="'Helvetica-Bold'" font-size="120" letter-spacing="-6">Powered by</text>
+</svg>
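
The masks in the SVG above (SVGID_2_ through SVGID_6_) all follow the same Illustrator-export pattern: a grayscale JPEG is embedded as a data URI and run through an feColorMatrix whose values (-1 0 0 0 1 | 0 -1 0 0 1 | 0 0 -1 0 1 | 0 0 0 1 0) invert each color channel while passing alpha through, so dark pixels in the embedded image become the opaque regions of the mask. A minimal Java sketch of what that matrix does to one pixel (the class and method names here are ours, not part of this commit):

/**
 * Sketch only: applies the 5x4 feColorMatrix used by the masks above
 * to a single RGBA pixel. Each color channel becomes (1 - channel);
 * alpha passes through unchanged.
 */
public final class ColorMatrixDemo {

  // Row-major 5x4 matrix: out_i = sum_j m[i][j] * in_j + m[i][4]
  private static final double[][] INVERT = {
      {-1, 0, 0, 0, 1},
      { 0,-1, 0, 0, 1},
      { 0, 0,-1, 0, 1},
      { 0, 0, 0, 1, 0},
  };

  /** in is {r, g, b, a} with components in [0, 1]. */
  static double[] apply(double[] in) {
    double[] out = new double[4];
    for (int i = 0; i < 4; i++) {
      double v = INVERT[i][4];            // constant offset column
      for (int j = 0; j < 4; j++) {
        v += INVERT[i][j] * in[j];
      }
      out[i] = Math.min(1.0, Math.max(0.0, v));
    }
    return out;
  }

  public static void main(String[] args) {
    // An opaque white pixel inverts to opaque black: [0.0, 0.0, 0.0, 1.0]
    System.out.println(java.util.Arrays.toString(apply(new double[] {1, 1, 1, 1})));
  }
}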

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/src/images/logos/favicon.ico
----------------------------------------------------------------------
diff --git a/community/mahout-mr/src/images/logos/favicon.ico b/community/mahout-mr/src/images/logos/favicon.ico
new file mode 100644
index 0000000..4f5878d
Binary files /dev/null and b/community/mahout-mr/src/images/logos/favicon.ico differ

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/src/images/logos/favicon128.png
----------------------------------------------------------------------
diff --git a/community/mahout-mr/src/images/logos/favicon128.png b/community/mahout-mr/src/images/logos/favicon128.png
new file mode 100644
index 0000000..a477d15
Binary files /dev/null and b/community/mahout-mr/src/images/logos/favicon128.png differ

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/src/images/logos/favicon16.png
----------------------------------------------------------------------
diff --git a/community/mahout-mr/src/images/logos/favicon16.png b/community/mahout-mr/src/images/logos/favicon16.png
new file mode 100644
index 0000000..595b237
Binary files /dev/null and b/community/mahout-mr/src/images/logos/favicon16.png differ

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/src/images/logos/favicon32.png
----------------------------------------------------------------------
diff --git a/community/mahout-mr/src/images/logos/favicon32.png b/community/mahout-mr/src/images/logos/favicon32.png
new file mode 100644
index 0000000..39668fd
Binary files /dev/null and b/community/mahout-mr/src/images/logos/favicon32.png differ

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/src/images/logos/favicon64.png
----------------------------------------------------------------------
diff --git a/community/mahout-mr/src/images/logos/favicon64.png b/community/mahout-mr/src/images/logos/favicon64.png
new file mode 100644
index 0000000..5032b12
Binary files /dev/null and b/community/mahout-mr/src/images/logos/favicon64.png differ

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/src/images/logos/mahout-logo-100.png
----------------------------------------------------------------------
diff --git a/community/mahout-mr/src/images/logos/mahout-logo-100.png b/community/mahout-mr/src/images/logos/mahout-logo-100.png
new file mode 100644
index 0000000..9868200
Binary files /dev/null and b/community/mahout-mr/src/images/logos/mahout-logo-100.png differ

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/src/images/logos/mahout-logo-200.png
----------------------------------------------------------------------
diff --git a/community/mahout-mr/src/images/logos/mahout-logo-200.png b/community/mahout-mr/src/images/logos/mahout-logo-200.png
new file mode 100644
index 0000000..4ef5bdd
Binary files /dev/null and b/community/mahout-mr/src/images/logos/mahout-logo-200.png differ

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/src/images/logos/mahout-logo-300.png
----------------------------------------------------------------------
diff --git a/community/mahout-mr/src/images/logos/mahout-logo-300.png b/community/mahout-mr/src/images/logos/mahout-logo-300.png
new file mode 100644
index 0000000..2fbd589
Binary files /dev/null and b/community/mahout-mr/src/images/logos/mahout-logo-300.png differ

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/src/images/logos/mahout-logo-400.png
----------------------------------------------------------------------
diff --git a/community/mahout-mr/src/images/logos/mahout-logo-400.png b/community/mahout-mr/src/images/logos/mahout-logo-400.png
new file mode 100644
index 0000000..d9ac832
Binary files /dev/null and b/community/mahout-mr/src/images/logos/mahout-logo-400.png differ

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/src/images/logos/mahout-logo-poweredby-100.png
----------------------------------------------------------------------
diff --git a/community/mahout-mr/src/images/logos/mahout-logo-poweredby-100.png b/community/mahout-mr/src/images/logos/mahout-logo-poweredby-100.png
new file mode 100644
index 0000000..8f8af00
Binary files /dev/null and b/community/mahout-mr/src/images/logos/mahout-logo-poweredby-100.png differ

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/src/images/logos/mahout-logo-poweredby-55.png
----------------------------------------------------------------------
diff --git a/community/mahout-mr/src/images/logos/mahout-logo-poweredby-55.png b/community/mahout-mr/src/images/logos/mahout-logo-poweredby-55.png
new file mode 100644
index 0000000..9814d31
Binary files /dev/null and b/community/mahout-mr/src/images/logos/mahout-logo-poweredby-55.png differ

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/src/images/logos/mahout-logo-transparent-400.png
----------------------------------------------------------------------
diff --git a/community/mahout-mr/src/images/logos/mahout-logo-transparent-400.png b/community/mahout-mr/src/images/logos/mahout-logo-transparent-400.png
new file mode 100644
index 0000000..583436b
Binary files /dev/null and b/community/mahout-mr/src/images/logos/mahout-logo-transparent-400.png differ
r***@apache.org
2018-06-27 14:52:04 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/distribution/src/main/assembly/scala-2.11_spark-2.0.xml
----------------------------------------------------------------------
diff --git a/distribution/src/main/assembly/scala-2.11_spark-2.0.xml b/distribution/src/main/assembly/scala-2.11_spark-2.0.xml
deleted file mode 100644
index 5c6b646..0000000
--- a/distribution/src/main/assembly/scala-2.11_spark-2.0.xml
+++ /dev/null
@@ -1,249 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<assembly xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0"
- xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0 http://maven.apache.org/xsd/assembly-1.1.0.xsd">
- <id>scala-2.11_spark-2.0</id>
- <formats>
- <format>dir</format>
- <format>tar.gz</format>
- </formats>
-
- <fileSets>
- <fileSet>
- <directory>${project.basedir}/../examples/target/dependency</directory>
- <includes>
- <include>*.jar</include>
- </includes>
- <excludes>
- <exclude>mahout-*</exclude>
- <exclude>hadoop-*</exclude>
- <exclude>junit-*</exclude>
- </excludes>
- <outputDirectory>lib</outputDirectory>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../examples/target/dependency</directory>
- <includes>
- <include>mahout-collections*.jar</include>
- </includes>
- <outputDirectory>lib</outputDirectory>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../examples/target/dependency</directory>
- <includes>
- <include>hadoop-*.jar</include>
- </includes>
- <outputDirectory>lib/hadoop</outputDirectory>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../math/target</directory>
- <includes>
- <include>mahout-*.jar</include>
- </includes>
- <excludes>
- <exclude>*sources.jar</exclude>
- <exclude>*javadoc.jar</exclude>
- <exclude>*tests.jar</exclude>
- </excludes>
- <outputDirectory/>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../hdfs/target</directory>
- <includes>
- <include>mahout-*.job</include>
- <include>mahout-*.jar</include>
- </includes>
- <excludes>
- <exclude>*sources.jar</exclude>
- <exclude>*javadoc.jar</exclude>
- <exclude>*tests.jar</exclude>
- </excludes>
- <outputDirectory/>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../mr/target</directory>
- <includes>
- <include>mahout-*.job</include>
- <include>mahout-*.jar</include>
- </includes>
- <excludes>
- <exclude>*sources.jar</exclude>
- <exclude>*javadoc.jar</exclude>
- <exclude>*tests.jar</exclude>
- </excludes>
- <outputDirectory/>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../integration/target</directory>
- <includes>
- <include>mahout-*.job</include>
- <include>mahout-*.jar</include>
- </includes>
- <excludes>
- <exclude>*sources.jar</exclude>
- <exclude>*javadoc.jar</exclude>
- <exclude>*tests.jar</exclude>
- </excludes>
- <outputDirectory/>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../examples/target</directory>
- <includes>
- <include>mahout-*.jar</include>
- <include>mahout-*.job</include>
- </includes>
- <excludes>
- <exclude>*sources.jar</exclude>
- <exclude>*javadoc.jar</exclude>
- <exclude>*tests.jar</exclude>
- </excludes>
- <outputDirectory/>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../math-scala/target</directory>
- <includes>
- <include>mahout-*2.11*.jar</include>
- </includes>
- <excludes>
- <exclude>*sources.jar</exclude>
- <exclude>*javadoc.jar</exclude>
- <exclude>*tests.jar</exclude>
- </excludes>
- <outputDirectory/>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../spark/target</directory>
- <includes>
- <include>mahout-*2.11*spark_2.0.jar</include>
- <include>mahout-*2.11*dependency-reduced.jar</include>
- </includes>
- <excludes>
- <exclude>*sources.jar</exclude>
- <exclude>*javadoc.jar</exclude>
- <exclude>*tests.jar</exclude>
- </excludes>
- <outputDirectory/>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../flink/target</directory>
- <includes>
- <include>mahout-*.jar</include>
- <include>mahout-*.job</include>
- </includes>
- <excludes>
- <exclude>*sources.jar</exclude>
- <exclude>*javadoc.jar</exclude>
- <exclude>*tests.jar</exclude>
- </excludes>
- <outputDirectory/>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../viennacl/target</directory>
- <includes>
- <include>mahout-*2.11*.jar</include>
- </includes>
- <excludes>
- <exclude>*sources.jar</exclude>
- <exclude>*javadoc.jar</exclude>
- <exclude>*tests.jar</exclude>
- </excludes>
- <outputDirectory/>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../viennacl-omp/target</directory>
- <includes>
- <include>mahout-*2.11*.jar</include>
- </includes>
- <excludes>
- <exclude>*sources.jar</exclude>
- <exclude>*javadoc.jar</exclude>
- <exclude>*tests.jar</exclude>
- </excludes>
- <outputDirectory/>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../collections/target/apidocs</directory>
- <outputDirectory>docs/mahout-collections</outputDirectory>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../math/target/apidocs</directory>
- <outputDirectory>docs/mahout-math</outputDirectory>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../hdfs/target/apidocs</directory>
- <outputDirectory>docs/mahout-hdfs</outputDirectory>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../mr/target/apidocs</directory>
- <outputDirectory>docs/mahout-mr</outputDirectory>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../integration/target/apidocs</directory>
- <outputDirectory>docs/mahout-integration</outputDirectory>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../examples/target/apidocs</directory>
- <outputDirectory>docs/mahout-examples</outputDirectory>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../math-scala/target/site/scaladocs</directory>
- <outputDirectory>docs/mahout-math-scala</outputDirectory>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../spark/target/site/scaladocs</directory>
- <outputDirectory>docs/mahout-spark</outputDirectory>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/..</directory>
- <outputDirectory/>
- <useDefaultExcludes>true</useDefaultExcludes>
- <includes>
- <include>**/README*</include>
- <include>**/LICENSE*</include>
- <include>**/NOTICE*</include>
- <include>**/*.properties</include>
- </includes>
- <excludes>
- <exclude>**/target/**</exclude>
- </excludes>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../bin</directory>
- <outputDirectory>bin</outputDirectory>
- <fileMode>0755</fileMode>
- <directoryMode>0755</directoryMode>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../src/conf</directory>
- <outputDirectory>conf</outputDirectory>
- <fileMode>0644</fileMode>
- <directoryMode>0755</directoryMode>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../examples/bin</directory>
- <outputDirectory>examples/bin</outputDirectory>
- <fileMode>0755</fileMode>
- <directoryMode>0755</directoryMode>
- <excludes>
- <exclude>work</exclude>
- <exclude>work/**</exclude>
- </excludes>
- </fileSet>
- </fileSets>
-</assembly>

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/distribution/src/main/assembly/scala-2.11_spark-2.1.xml
----------------------------------------------------------------------
diff --git a/distribution/src/main/assembly/scala-2.11_spark-2.1.xml b/distribution/src/main/assembly/scala-2.11_spark-2.1.xml
deleted file mode 100644
index 7c614ec..0000000
--- a/distribution/src/main/assembly/scala-2.11_spark-2.1.xml
+++ /dev/null
@@ -1,249 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<assembly xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0"
- xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0 http://maven.apache.org/xsd/assembly-1.1.0.xsd">
- <id>scala-2.11_spark-2.1</id>
- <formats>
- <format>dir</format>
- <format>tar.gz</format>
- </formats>
-
- <fileSets>
- <fileSet>
- <directory>${project.basedir}/../examples/target/dependency</directory>
- <includes>
- <include>*.jar</include>
- </includes>
- <excludes>
- <exclude>mahout-*</exclude>
- <exclude>hadoop-*</exclude>
- <exclude>junit-*</exclude>
- </excludes>
- <outputDirectory>lib</outputDirectory>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../examples/target/dependency</directory>
- <includes>
- <include>mahout-collections*.jar</include>
- </includes>
- <outputDirectory>lib</outputDirectory>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../examples/target/dependency</directory>
- <includes>
- <include>hadoop-*.jar</include>
- </includes>
- <outputDirectory>lib/hadoop</outputDirectory>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../math/target</directory>
- <includes>
- <include>mahout-*.jar</include>
- </includes>
- <excludes>
- <exclude>*sources.jar</exclude>
- <exclude>*javadoc.jar</exclude>
- <exclude>*tests.jar</exclude>
- </excludes>
- <outputDirectory/>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../hdfs/target</directory>
- <includes>
- <include>mahout-*.job</include>
- <include>mahout-*.jar</include>
- </includes>
- <excludes>
- <exclude>*sources.jar</exclude>
- <exclude>*javadoc.jar</exclude>
- <exclude>*tests.jar</exclude>
- </excludes>
- <outputDirectory/>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../mr/target</directory>
- <includes>
- <include>mahout-*.job</include>
- <include>mahout-*.jar</include>
- </includes>
- <excludes>
- <exclude>*sources.jar</exclude>
- <exclude>*javadoc.jar</exclude>
- <exclude>*tests.jar</exclude>
- </excludes>
- <outputDirectory/>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../integration/target</directory>
- <includes>
- <include>mahout-*.job</include>
- <include>mahout-*.jar</include>
- </includes>
- <excludes>
- <exclude>*sources.jar</exclude>
- <exclude>*javadoc.jar</exclude>
- <exclude>*tests.jar</exclude>
- </excludes>
- <outputDirectory/>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../examples/target</directory>
- <includes>
- <include>mahout-*.jar</include>
- <include>mahout-*.job</include>
- </includes>
- <excludes>
- <exclude>*sources.jar</exclude>
- <exclude>*javadoc.jar</exclude>
- <exclude>*tests.jar</exclude>
- </excludes>
- <outputDirectory/>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../math-scala/target</directory>
- <includes>
- <include>mahout-*2.11*.jar</include>
- </includes>
- <excludes>
- <exclude>*sources.jar</exclude>
- <exclude>*javadoc.jar</exclude>
- <exclude>*tests.jar</exclude>
- </excludes>
- <outputDirectory/>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../spark/target</directory>
- <includes>
- <include>mahout-*2.11*spark_2.1.jar</include>
- <include>mahout-*2.11*dependency-reduced.jar</include>
- </includes>
- <excludes>
- <exclude>*sources.jar</exclude>
- <exclude>*javadoc.jar</exclude>
- <exclude>*tests.jar</exclude>
- </excludes>
- <outputDirectory/>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../flink/target</directory>
- <includes>
- <include>mahout-*.jar</include>
- <include>mahout-*.job</include>
- </includes>
- <excludes>
- <exclude>*sources.jar</exclude>
- <exclude>*javadoc.jar</exclude>
- <exclude>*tests.jar</exclude>
- </excludes>
- <outputDirectory/>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../viennacl/target</directory>
- <includes>
- <include>mahout-*2.11*.jar</include>
- </includes>
- <excludes>
- <exclude>*sources.jar</exclude>
- <exclude>*javadoc.jar</exclude>
- <exclude>*tests.jar</exclude>
- </excludes>
- <outputDirectory/>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../viennacl-omp/target</directory>
- <includes>
- <include>mahout-*2.11*.jar</include>
- </includes>
- <excludes>
- <exclude>*sources.jar</exclude>
- <exclude>*javadoc.jar</exclude>
- <exclude>*tests.jar</exclude>
- </excludes>
- <outputDirectory/>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../collections/target/apidocs</directory>
- <outputDirectory>docs/mahout-collections</outputDirectory>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../math/target/apidocs</directory>
- <outputDirectory>docs/mahout-math</outputDirectory>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../hdfs/target/apidocs</directory>
- <outputDirectory>docs/mahout-hdfs</outputDirectory>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../mr/target/apidocs</directory>
- <outputDirectory>docs/mahout-mr</outputDirectory>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../integration/target/apidocs</directory>
- <outputDirectory>docs/mahout-integration</outputDirectory>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../examples/target/apidocs</directory>
- <outputDirectory>docs/mahout-examples</outputDirectory>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../math-scala/target/site/scaladocs</directory>
- <outputDirectory>docs/mahout-math-scala</outputDirectory>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../spark/target/site/scaladocs</directory>
- <outputDirectory>docs/mahout-spark</outputDirectory>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/..</directory>
- <outputDirectory/>
- <useDefaultExcludes>true</useDefaultExcludes>
- <includes>
- <include>**/README*</include>
- <include>**/LICENSE*</include>
- <include>**/NOTICE*</include>
- <include>**/*.properties</include>
- </includes>
- <excludes>
- <exclude>**/target/**</exclude>
- </excludes>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../bin</directory>
- <outputDirectory>bin</outputDirectory>
- <fileMode>0755</fileMode>
- <directoryMode>0755</directoryMode>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../src/conf</directory>
- <outputDirectory>conf</outputDirectory>
- <fileMode>0644</fileMode>
- <directoryMode>0755</directoryMode>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../examples/bin</directory>
- <outputDirectory>examples/bin</outputDirectory>
- <fileMode>0755</fileMode>
- <directoryMode>0755</directoryMode>
- <excludes>
- <exclude>work</exclude>
- <exclude>work/**</exclude>
- </excludes>
- </fileSet>
- </fileSets>
-</assembly>
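
Both deleted assembly descriptors select artifacts with Ant-style wildcard patterns: a jar is bundled when it matches at least one <include> and none of the <exclude>s, so mahout-*2.11*spark_2.1.jar is kept while *sources.jar, *javadoc.jar and *tests.jar are dropped. The matching itself happens inside the maven-assembly-plugin; the sketch below only illustrates those semantics, and the jar names in it are invented:

import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;

/** Illustration of include/exclude glob selection, not the plugin's code. */
public final class GlobSelectDemo {

  // Translate a '*'-wildcard pattern into an anchored regex.
  static Pattern glob(String pattern) {
    StringBuilder re = new StringBuilder("^");
    for (char c : pattern.toCharArray()) {
      re.append(c == '*' ? ".*" : Pattern.quote(String.valueOf(c)));
    }
    return Pattern.compile(re.append('$').toString());
  }

  // An artifact is bundled if it matches some include and no exclude.
  static boolean selected(String name, List<String> includes, List<String> excludes) {
    return includes.stream().anyMatch(p -> glob(p).matcher(name).matches())
        && excludes.stream().noneMatch(p -> glob(p).matcher(name).matches());
  }

  public static void main(String[] args) {
    List<String> includes = Arrays.asList("mahout-*2.11*spark_2.1.jar");
    List<String> excludes = Arrays.asList("*sources.jar", "*javadoc.jar", "*tests.jar");
    System.out.println(selected("mahout-spark_2.11-0.13.1-spark_2.1.jar", includes, excludes)); // true
    System.out.println(selected("mahout-spark_2.11-0.13.1-sources.jar", includes, excludes));   // false
  }
}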

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/distribution/src/main/assembly/src.xml
----------------------------------------------------------------------
diff --git a/distribution/src/main/assembly/src.xml b/distribution/src/main/assembly/src.xml
deleted file mode 100644
index 0bb8e8b..0000000
--- a/distribution/src/main/assembly/src.xml
+++ /dev/null
@@ -1,64 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<assembly xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0"
- xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0 http://maven.apache.org/xsd/assembly-1.1.0.xsd">
- <id>src</id>
- <formats>
- <format>dir</format>
- <format>tar.gz</format>
- </formats>
- <fileSets>
- <fileSet>
- <directory>${project.basedir}/..</directory>
- <outputDirectory/>
- <useDefaultExcludes>true</useDefaultExcludes>
- <includes>
- <include>**/README*</include>
- <include>**/LICENSE*</include>
- <include>**/NOTICE*</include>
- <include>**/pom.xml</include>
- <include>**/src/**</include>
- <include>src/conf/**</include>
- <include>**/build.xml</include>
- <include>**/*.properties</include>
- </includes>
- <excludes>
- <exclude>**/target/**</exclude>
- </excludes>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../bin</directory>
- <outputDirectory>bin</outputDirectory>
- <useDefaultExcludes>true</useDefaultExcludes>
- <fileMode>0755</fileMode>
- <directoryMode>0755</directoryMode>
- </fileSet>
- <fileSet>
- <directory>${project.basedir}/../examples/bin</directory>
- <outputDirectory>examples/bin</outputDirectory>
- <useDefaultExcludes>true</useDefaultExcludes>
- <fileMode>0755</fileMode>
- <directoryMode>0755</directoryMode>
- <excludes>
- <exclude>work</exclude>
- <exclude>work/**</exclude>
- </excludes>
- </fileSet>
- </fileSets>
-</assembly>

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/hdfs/pom.xml
----------------------------------------------------------------------
diff --git a/hdfs/pom.xml b/hdfs/pom.xml
deleted file mode 100644
index 2d909a2..0000000
--- a/hdfs/pom.xml
+++ /dev/null
@@ -1,246 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
- <modelVersion>4.0.0</modelVersion>
-
- <parent>
- <groupId>org.apache.mahout</groupId>
- <artifactId>mahout</artifactId>
- <version>0.13.1-SNAPSHOT</version>
- <relativePath>../pom.xml</relativePath>
- </parent>
-
- <!-- modules inherit parent's group id and version. -->
- <artifactId>mahout-hdfs</artifactId>
- <name>Mahout HDFS</name>
- <description>Scalable machine learning libraries</description>
-
- <packaging>jar</packaging>
-
- <build>
- <resources>
- <resource>
- <directory>src/main/resources</directory>
- </resource>
- <resource>
- <directory>../src/conf</directory>
- <includes>
- <include>driver.classes.default.props</include>
- </includes>
- </resource>
- </resources>
- <plugins>
- <!-- copy jars to top directory, which is MAHOUT_HOME -->
- <plugin>
- <artifactId>maven-antrun-plugin</artifactId>
- <version>1.4</version>
- <executions>
- <execution>
- <id>copy</id>
- <phase>package</phase>
- <configuration>
- <tasks>
- <copy file="target/mahout-hdfs-${version}.jar" tofile="../mahout-hdfs-${version}.jar" />
- </tasks>
- </configuration>
- <goals>
- <goal>run</goal>
- </goals>
- </execution>
- </executions>
- </plugin>
- <!-- create test jar so other modules can reuse the core test utility classes. -->
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-jar-plugin</artifactId>
- <executions>
- <execution>
- <goals>
- <goal>test-jar</goal>
- </goals>
- </execution>
- </executions>
- </plugin>
-
- <plugin>
- <artifactId>maven-javadoc-plugin</artifactId>
- </plugin>
-
- <plugin>
- <artifactId>maven-source-plugin</artifactId>
- </plugin>
-
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-remote-resources-plugin</artifactId>
- <configuration>
- <appendedResourcesDirectory>../src/main/appended-resources</appendedResourcesDirectory>
- <resourceBundles>
- <resourceBundle>org.apache:apache-jar-resource-bundle:1.4</resourceBundle>
- </resourceBundles>
- <supplementalModels>
- <supplementalModel>supplemental-models.xml</supplementalModel>
- </supplementalModels>
- </configuration>
- </plugin>
- <!-- remove jars from top directory on clean -->
- <plugin>
- <artifactId>maven-clean-plugin</artifactId>
- <version>3.0.0</version>
- <configuration>
- <filesets>
- <fileset>
- <directory>../</directory>
- <includes>
- <include>mahout-hdfs*.jar</include>
- </includes>
- <followSymlinks>false</followSymlinks>
- </fileset>
- </filesets>
- </configuration>
- </plugin>
- </plugins>
- </build>
-
- <dependencies>
-
- <!-- our modules -->
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>mahout-math</artifactId>
- </dependency>
-
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>mahout-math</artifactId>
- <type>test-jar</type>
- <scope>test</scope>
- </dependency>
-
- <!-- Third Party -->
- <dependency>
- <groupId>org.apache.hadoop</groupId>
- <artifactId>hadoop-client</artifactId>
- </dependency>
-
- <dependency>
- <groupId>com.fasterxml.jackson.core</groupId>
- <artifactId>jackson-core</artifactId>
- </dependency>
-
- <dependency>
- <groupId>org.slf4j</groupId>
- <artifactId>slf4j-api</artifactId>
- </dependency>
-
- <dependency>
- <groupId>org.slf4j</groupId>
- <artifactId>slf4j-jcl</artifactId>
- <scope>test</scope>
- </dependency>
-
- <dependency>
- <groupId>org.apache.commons</groupId>
- <artifactId>commons-lang3</artifactId>
- </dependency>
-
- <dependency>
- <groupId>commons-cli</groupId>
- <artifactId>commons-cli</artifactId>
- </dependency>
-
- <dependency>
- <groupId>com.thoughtworks.xstream</groupId>
- <artifactId>xstream</artifactId>
- </dependency>
-
- <dependency>
- <groupId>org.apache.lucene</groupId>
- <artifactId>lucene-core</artifactId>
- </dependency>
-
- <dependency>
- <groupId>org.apache.lucene</groupId>
- <artifactId>lucene-analyzers-common</artifactId>
- </dependency>
-
- <dependency>
- <groupId>org.apache.mahout.commons</groupId>
- <artifactId>commons-cli</artifactId>
- </dependency>
-
- <dependency>
- <groupId>org.apache.commons</groupId>
- <artifactId>commons-math3</artifactId>
- </dependency>
-
- <dependency>
- <groupId>com.google.guava</groupId>
- <artifactId>guava</artifactId>
- </dependency>
-
- <dependency>
- <groupId>junit</groupId>
- <artifactId>junit</artifactId>
- <scope>test</scope>
- </dependency>
-
- <dependency>
- <groupId>org.hamcrest</groupId>
- <artifactId>hamcrest-all</artifactId>
- <scope>test</scope>
- </dependency>
-
- <dependency>
- <groupId>com.carrotsearch.randomizedtesting</groupId>
- <artifactId>randomizedtesting-runner</artifactId>
- <scope>test</scope>
- </dependency>
-
- <dependency>
- <groupId>org.easymock</groupId>
- <artifactId>easymock</artifactId>
- <scope>test</scope>
- </dependency>
-
- <dependency>
- <groupId>org.apache.mrunit</groupId>
- <artifactId>mrunit</artifactId>
- <version>1.0.0</version>
- <classifier>${hadoop.classifier}</classifier>
- <scope>test</scope>
- </dependency>
-
- <dependency>
- <groupId>commons-httpclient</groupId>
- <artifactId>commons-httpclient</artifactId>
- <version>3.0.1</version>
- <scope>test</scope>
- </dependency>
-
- <dependency>
- <groupId>org.apache.solr</groupId>
- <artifactId>solr-commons-csv</artifactId>
- <version>3.5.0</version>
- </dependency>
-
- </dependencies>
-
-</project>
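
The maven-antrun execution in this pom copies the freshly built jar from target/ up to the checkout root (MAHOUT_HOME) at package time, and the maven-clean-plugin configuration removes it again on clean. A rough standalone equivalent of the copy step, with the version string as a placeholder:

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;

/**
 * Rough equivalent of the maven-antrun <copy> task above: after packaging,
 * the module jar moves from target/ to the parent directory (MAHOUT_HOME).
 */
public final class CopyJarDemo {
  public static void main(String[] args) throws IOException {
    String version = "0.13.1-SNAPSHOT";  // stands in for ${version}
    Path built = Paths.get("target", "mahout-hdfs-" + version + ".jar");
    Path home = Paths.get("..", "mahout-hdfs-" + version + ".jar");
    Files.copy(built, home, StandardCopyOption.REPLACE_EXISTING);
  }
}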

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/hdfs/src/main/java/org/apache/mahout/common/IOUtils.java
----------------------------------------------------------------------
diff --git a/hdfs/src/main/java/org/apache/mahout/common/IOUtils.java b/hdfs/src/main/java/org/apache/mahout/common/IOUtils.java
deleted file mode 100644
index 0372ed4..0000000
--- a/hdfs/src/main/java/org/apache/mahout/common/IOUtils.java
+++ /dev/null
@@ -1,194 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.common;
-
-import java.io.Closeable;
-import java.io.File;
-import java.io.IOException;
-import java.sql.Connection;
-import java.sql.ResultSet;
-import java.sql.SQLException;
-import java.sql.Statement;
-import java.util.Collection;
-
-import org.apache.hadoop.mapred.lib.MultipleOutputs;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * <p>
- * I/O-related utility methods that don't have a better home.
- * </p>
- */
-public final class IOUtils {
-
- private static final Logger log = LoggerFactory.getLogger(IOUtils.class);
-
- private IOUtils() { }
-
- // Sheez, why can't ResultSet, Statement and Connection implement Closeable?
-
- public static void quietClose(ResultSet closeable) {
- if (closeable != null) {
- try {
- closeable.close();
- } catch (SQLException sqle) {
- log.warn("Unexpected exception while closing; continuing", sqle);
- }
- }
- }
-
- public static void quietClose(Statement closeable) {
- if (closeable != null) {
- try {
- closeable.close();
- } catch (SQLException sqle) {
- log.warn("Unexpected exception while closing; continuing", sqle);
- }
- }
- }
-
- public static void quietClose(Connection closeable) {
- if (closeable != null) {
- try {
- closeable.close();
- } catch (SQLException sqle) {
- log.warn("Unexpected exception while closing; continuing", sqle);
- }
- }
- }
-
- /**
- * Closes a {@link ResultSet}, {@link Statement} and {@link Connection} (if not null) and logs (but does not
- * rethrow) any resulting {@link SQLException}. This is useful for cleaning up after a database query.
- *
- * @param resultSet
- * {@link ResultSet} to close
- * @param statement
- * {@link Statement} to close
- * @param connection
- * {@link Connection} to close
- */
- public static void quietClose(ResultSet resultSet, Statement statement, Connection connection) {
- quietClose(resultSet);
- quietClose(statement);
- quietClose(connection);
- }
-
- /**
- * make sure to close all sources, log all of the problems occurred, clear
- * {@code closeables} (to prevent repeating close attempts), re-throw the
- * last one at the end. Helps resource scope management (e.g. compositions of
- * {@link Closeable}s objects)
- * <P>
- * <p/>
- * Typical pattern:
- * <p/>
- *
- * <pre>
- * LinkedList<Closeable> closeables = new LinkedList<Closeable>();
- * try {
- * InputStream stream1 = new FileInputStream(...);
- * closeables.addFirst(stream1);
- * ...
- * InputStream streamN = new FileInputStream(...);
- * closeables.addFirst(streamN);
- * ...
- * } finally {
- * IOUtils.close(closeables);
- * }
- * </pre>
- *
- * @param closeables
- * must be a modifiable collection of {@link Closeable}s
- * @throws IOException
- * the last exception (if any) of all closed resources
- */
- public static void close(Collection<? extends Closeable> closeables)
- throws IOException {
- Throwable lastThr = null;
-
- for (Closeable closeable : closeables) {
- try {
- closeable.close();
- } catch (Throwable thr) {
- log.error(thr.getMessage(), thr);
- lastThr = thr;
- }
- }
-
- // make sure we don't double-close
- // but that has to be modifiable collection
- closeables.clear();
-
- if (lastThr != null) {
- if (lastThr instanceof IOException) {
- throw (IOException) lastThr;
- } else if (lastThr instanceof RuntimeException) {
- throw (RuntimeException) lastThr;
- } else {
- throw (Error) lastThr;
- }
- }
-
- }
-
-
- /**
- * for temporary files, a file may be considered as a {@link Closeable} too,
- * where file is wiped on close and thus the disk resource is released
- * ('closed').
- *
- *
- */
- public static class DeleteFileOnClose implements Closeable {
-
- private final File file;
-
- public DeleteFileOnClose(File file) {
- this.file = file;
- }
-
- @Override
- public void close() throws IOException {
- if (file.isFile()) {
- file.delete();
- }
- }
- }
-
- /**
- * MultipleOutputs to closeable adapter.
- *
- */
- public static class MultipleOutputsCloseableAdapter implements Closeable {
- private final MultipleOutputs mo;
-
- public MultipleOutputsCloseableAdapter(MultipleOutputs mo) {
- this.mo = mo;
- }
-
- @Override
- public void close() throws IOException {
- if (mo != null) {
- mo.close();
- }
- }
- }
-
-}
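
The javadoc of IOUtils.close(...) above gives its "typical pattern" only in outline; a compilable version looks like the sketch below (file names are placeholders). Streams are pushed onto the front of the list, so close(...) releases them in reverse order of acquisition, logs every failure, rethrows only the last one, and clears the list to prevent double-closing:

import java.io.Closeable;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.LinkedList;

/** Compilable rendering of the usage pattern from the IOUtils javadoc. */
public final class CloseablesDemo {
  public static void readBoth() throws IOException {
    LinkedList<Closeable> closeables = new LinkedList<>();
    try {
      InputStream first = new FileInputStream("first.dat");
      closeables.addFirst(first);
      InputStream second = new FileInputStream("second.dat");
      closeables.addFirst(second);
      // ... consume the streams ...
    } finally {
      // Must be a modifiable collection: close(...) clears it afterwards.
      org.apache.mahout.common.IOUtils.close(closeables);
    }
  }
}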

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/hdfs/src/main/java/org/apache/mahout/math/MatrixWritable.java
----------------------------------------------------------------------
diff --git a/hdfs/src/main/java/org/apache/mahout/math/MatrixWritable.java b/hdfs/src/main/java/org/apache/mahout/math/MatrixWritable.java
deleted file mode 100644
index b8fc461..0000000
--- a/hdfs/src/main/java/org/apache/mahout/math/MatrixWritable.java
+++ /dev/null
@@ -1,202 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math;
-
-import com.google.common.base.Preconditions;
-import org.apache.hadoop.io.Writable;
-import org.apache.mahout.math.list.IntArrayList;
-
-import java.io.DataInput;
-import java.io.DataOutput;
-import java.io.IOException;
-import java.util.HashMap;
-import java.util.Map;
-
-public class MatrixWritable implements Writable {
-
- private static final int FLAG_DENSE = 0x01;
- private static final int FLAG_SEQUENTIAL = 0x02;
- private static final int FLAG_LABELS = 0x04;
- private static final int FLAG_SPARSE_ROW = 0x08;
- private static final int NUM_FLAGS = 4;
-
- private Matrix matrix;
-
- public MatrixWritable() {}
-
- public MatrixWritable(Matrix m) {
- this.matrix = m;
- }
-
- public Matrix get() {
- return matrix;
- }
-
- public void set(Matrix matrix) {
- this.matrix = matrix;
- }
-
- @Override
- public void write(DataOutput out) throws IOException {
- writeMatrix(out, matrix);
- }
-
- @Override
- public void readFields(DataInput in) throws IOException {
- matrix = readMatrix(in);
- }
-
- public static void readLabels(DataInput in,
- Map<String, Integer> columnLabelBindings,
- Map<String, Integer> rowLabelBindings) throws IOException {
- int colSize = in.readInt();
- if (colSize > 0) {
- for (int i = 0; i < colSize; i++) {
- columnLabelBindings.put(in.readUTF(), in.readInt());
- }
- }
- int rowSize = in.readInt();
- if (rowSize > 0) {
- for (int i = 0; i < rowSize; i++) {
- rowLabelBindings.put(in.readUTF(), in.readInt());
- }
- }
- }
-
- public static void writeLabelBindings(DataOutput out,
- Map<String, Integer> columnLabelBindings,
- Map<String, Integer> rowLabelBindings) throws IOException {
- if (columnLabelBindings == null) {
- out.writeInt(0);
- } else {
- out.writeInt(columnLabelBindings.size());
- for (Map.Entry<String, Integer> stringIntegerEntry : columnLabelBindings.entrySet()) {
- out.writeUTF(stringIntegerEntry.getKey());
- out.writeInt(stringIntegerEntry.getValue());
- }
- }
- if (rowLabelBindings == null) {
- out.writeInt(0);
- } else {
- out.writeInt(rowLabelBindings.size());
- for (Map.Entry<String, Integer> stringIntegerEntry : rowLabelBindings.entrySet()) {
- out.writeUTF(stringIntegerEntry.getKey());
- out.writeInt(stringIntegerEntry.getValue());
- }
- }
- }
-
- /** Reads a typed Matrix instance from the input stream */
- public static Matrix readMatrix(DataInput in) throws IOException {
- int flags = in.readInt();
- Preconditions.checkArgument(flags >> NUM_FLAGS == 0, "Unknown flags set: %s", Integer.toString(flags, 2));
- boolean dense = (flags & FLAG_DENSE) != 0;
- boolean sequential = (flags & FLAG_SEQUENTIAL) != 0;
- boolean hasLabels = (flags & FLAG_LABELS) != 0;
- boolean isSparseRowMatrix = (flags & FLAG_SPARSE_ROW) != 0;
-
- int rows = in.readInt();
- int columns = in.readInt();
-
- byte vectorFlags = in.readByte();
-
- Matrix matrix;
-
- if (dense) {
- matrix = new DenseMatrix(rows, columns);
- for (int row = 0; row < rows; row++) {
- matrix.assignRow(row, VectorWritable.readVector(in, vectorFlags, columns));
- }
- } else if (isSparseRowMatrix) {
- Vector[] rowVectors = new Vector[rows];
- for (int row = 0; row < rows; row++) {
- rowVectors[row] = VectorWritable.readVector(in, vectorFlags, columns);
- }
- matrix = new SparseRowMatrix(rows, columns, rowVectors, true, !sequential);
- } else {
- matrix = new SparseMatrix(rows, columns);
- int numNonZeroRows = in.readInt();
- int rowsRead = 0;
- while (rowsRead++ < numNonZeroRows) {
- int rowIndex = in.readInt();
- matrix.assignRow(rowIndex, VectorWritable.readVector(in, vectorFlags, columns));
- }
- }
-
- if (hasLabels) {
- Map<String,Integer> columnLabelBindings = new HashMap<>();
- Map<String,Integer> rowLabelBindings = new HashMap<>();
- readLabels(in, columnLabelBindings, rowLabelBindings);
- if (!columnLabelBindings.isEmpty()) {
- matrix.setColumnLabelBindings(columnLabelBindings);
- }
- if (!rowLabelBindings.isEmpty()) {
- matrix.setRowLabelBindings(rowLabelBindings);
- }
- }
-
- return matrix;
- }
-
- /** Writes a typed Matrix instance to the output stream */
- public static void writeMatrix(final DataOutput out, Matrix matrix) throws IOException {
- int flags = 0;
- Vector row = matrix.viewRow(0);
- boolean isDense = row.isDense();
- if (isDense) {
- flags |= FLAG_DENSE;
- }
- if (row.isSequentialAccess()) {
- flags |= FLAG_SEQUENTIAL;
- }
- if (matrix.getRowLabelBindings() != null || matrix.getColumnLabelBindings() != null) {
- flags |= FLAG_LABELS;
- }
- boolean isSparseRowMatrix = matrix instanceof SparseRowMatrix;
- if (isSparseRowMatrix) {
- flags |= FLAG_SPARSE_ROW;
- }
-
- out.writeInt(flags);
- out.writeInt(matrix.rowSize());
- out.writeInt(matrix.columnSize());
-
- // We only use vectors of the same type, so we write out the type information only once!
- byte vectorFlags = VectorWritable.flags(row, false);
- out.writeByte(vectorFlags);
-
- if (isDense || isSparseRowMatrix) {
- for (int i = 0; i < matrix.rowSize(); i++) {
- VectorWritable.writeVectorContents(out, matrix.viewRow(i), vectorFlags);
- }
- } else {
- IntArrayList rowIndices = ((SparseMatrix) matrix).nonZeroRowIndices();
- int numNonZeroRows = rowIndices.size();
- out.writeInt(numNonZeroRows);
- for (int i = 0; i < numNonZeroRows; i++) {
- int rowIndex = rowIndices.getQuick(i);
- out.writeInt(rowIndex);
- VectorWritable.writeVectorContents(out, matrix.viewRow(rowIndex), vectorFlags);
- }
- }
-
- if ((flags & FLAG_LABELS) != 0) {
- writeLabelBindings(out, matrix.getColumnLabelBindings(), matrix.getRowLabelBindings());
- }
- }
-}

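The wire format written above is: an int of type flags, the row and column counts, a single shared vector-flags byte, then the per-row vector contents (plus label bindings when present). A minimal round-trip sketch (hypothetical demo class), assuming the Mahout math classes above are on the classpath:

    import java.io.ByteArrayInputStream;
    import java.io.ByteArrayOutputStream;
    import java.io.DataInputStream;
    import java.io.DataOutputStream;
    import java.io.IOException;

    import org.apache.mahout.math.DenseMatrix;
    import org.apache.mahout.math.Matrix;
    import org.apache.mahout.math.MatrixWritable;

    public class MatrixRoundTripDemo {
      public static void main(String[] args) throws IOException {
        Matrix m = new DenseMatrix(2, 2);
        m.set(0, 1, 3.0);

        // Write: flags int, rows, columns, shared vector-flags byte, row contents.
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        try (DataOutputStream out = new DataOutputStream(baos)) {
          MatrixWritable.writeMatrix(out, m);
        }

        // Read it back and check one entry survived the round trip.
        try (DataInputStream in = new DataInputStream(new ByteArrayInputStream(baos.toByteArray()))) {
          Matrix copy = MatrixWritable.readMatrix(in);
          System.out.println(copy.get(0, 1)); // 3.0
        }
      }
    }
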
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/hdfs/src/main/java/org/apache/mahout/math/VarIntWritable.java
----------------------------------------------------------------------
diff --git a/hdfs/src/main/java/org/apache/mahout/math/VarIntWritable.java b/hdfs/src/main/java/org/apache/mahout/math/VarIntWritable.java
deleted file mode 100644
index e5cb173..0000000
--- a/hdfs/src/main/java/org/apache/mahout/math/VarIntWritable.java
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math;
-
-import java.io.DataInput;
-import java.io.DataOutput;
-import java.io.IOException;
-
-import org.apache.hadoop.io.WritableComparable;
-
-public class VarIntWritable implements WritableComparable<VarIntWritable>, Cloneable {
-
- private int value;
-
- public VarIntWritable() {
- }
-
- public VarIntWritable(int value) {
- this.value = value;
- }
-
- public int get() {
- return value;
- }
-
- public void set(int value) {
- this.value = value;
- }
-
- @Override
- public boolean equals(Object other) {
- return other instanceof VarIntWritable && ((VarIntWritable) other).value == value;
- }
-
- @Override
- public int hashCode() {
- return value;
- }
-
- @Override
- public String toString() {
- return String.valueOf(value);
- }
-
- @Override
- public VarIntWritable clone() {
- return new VarIntWritable(value);
- }
-
- @Override
- public int compareTo(VarIntWritable other) {
- if (value < other.value) {
- return -1;
- }
- if (value > other.value) {
- return 1;
- }
- return 0;
- }
-
- @Override
- public void write(DataOutput out) throws IOException {
- Varint.writeSignedVarInt(value, out);
- }
-
- @Override
- public void readFields(DataInput in) throws IOException {
- value = Varint.readSignedVarInt(in);
- }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/hdfs/src/main/java/org/apache/mahout/math/VarLongWritable.java
----------------------------------------------------------------------
diff --git a/hdfs/src/main/java/org/apache/mahout/math/VarLongWritable.java b/hdfs/src/main/java/org/apache/mahout/math/VarLongWritable.java
deleted file mode 100644
index 7b0d9c4..0000000
--- a/hdfs/src/main/java/org/apache/mahout/math/VarLongWritable.java
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math;
-
-import java.io.DataInput;
-import java.io.DataOutput;
-import java.io.IOException;
-
-import com.google.common.primitives.Longs;
-import org.apache.hadoop.io.WritableComparable;
-
-public class VarLongWritable implements WritableComparable<VarLongWritable> {
-
- private long value;
-
- public VarLongWritable() {
- }
-
- public VarLongWritable(long value) {
- this.value = value;
- }
-
- public long get() {
- return value;
- }
-
- public void set(long value) {
- this.value = value;
- }
-
- @Override
- public boolean equals(Object other) {
- return other != null && getClass().equals(other.getClass()) && ((VarLongWritable) other).value == value;
- }
-
- @Override
- public int hashCode() {
- return Longs.hashCode(value);
- }
-
- @Override
- public String toString() {
- return String.valueOf(value);
- }
-
- @Override
- public int compareTo(VarLongWritable other) {
- if (value < other.value) {
- return -1;
- }
- if (value > other.value) {
- return 1;
- }
- return 0;
- }
-
- @Override
- public void write(DataOutput out) throws IOException {
- Varint.writeSignedVarLong(value, out);
- }
-
- @Override
- public void readFields(DataInput in) throws IOException {
- value = Varint.readSignedVarLong(in);
- }
-
-}

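The point of these Writables is size on the wire: small magnitudes serialize to a single byte instead of the fixed four or eight bytes of Hadoop's IntWritable/LongWritable. A short sketch (hypothetical demo class):

    import java.io.ByteArrayOutputStream;
    import java.io.DataOutputStream;
    import java.io.IOException;

    import org.apache.mahout.math.VarLongWritable;

    public class VarLongSizeDemo {
      public static void main(String[] args) throws IOException {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        try (DataOutputStream out = new DataOutputStream(baos)) {
          new VarLongWritable(5L).write(out); // zig-zag: 5 -> 10, one byte
        }
        System.out.println(baos.size()); // 1, versus 8 for LongWritable
      }
    }
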
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/hdfs/src/main/java/org/apache/mahout/math/Varint.java
----------------------------------------------------------------------
diff --git a/hdfs/src/main/java/org/apache/mahout/math/Varint.java b/hdfs/src/main/java/org/apache/mahout/math/Varint.java
deleted file mode 100644
index f380c6c..0000000
--- a/hdfs/src/main/java/org/apache/mahout/math/Varint.java
+++ /dev/null
@@ -1,167 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math;
-
-import java.io.DataInput;
-import java.io.DataOutput;
-import java.io.IOException;
-
-import com.google.common.base.Preconditions;
-
-/**
- * <p>Encodes signed and unsigned values using a common variable-length
- * scheme, found for example in
- * <a href="http://code.google.com/apis/protocolbuffers/docs/encoding.html">
- * Google's Protocol Buffers</a>. It uses fewer bytes to encode smaller values,
- * but will use slightly more bytes to encode large values.</p>
- *
- * <p>Signed values are further encoded using so-called zig-zag encoding
- * in order to make them "compatible" with variable-length encoding.</p>
- */
-public final class Varint {
-
- private Varint() {
- }
-
- /**
- * Encodes a value using the variable-length encoding from
- * <a href="http://code.google.com/apis/protocolbuffers/docs/encoding.html">
- * Google Protocol Buffers</a>. It uses zig-zag encoding to efficiently
- * encode signed values. If values are known to be nonnegative,
- * {@link #writeUnsignedVarLong(long, java.io.DataOutput)} should be used.
- *
- * @param value value to encode
- * @param out to write bytes to
- * @throws java.io.IOException if {@link java.io.DataOutput} throws {@link java.io.IOException}
- */
- public static void writeSignedVarLong(long value, DataOutput out) throws IOException {
- // Great trick from http://code.google.com/apis/protocolbuffers/docs/encoding.html#types
- writeUnsignedVarLong((value << 1) ^ (value >> 63), out);
- }
-
- /**
- * Encodes a value using the variable-length encoding from
- * <a href="http://code.google.com/apis/protocolbuffers/docs/encoding.html">
- * Google Protocol Buffers</a>. Zig-zag is not used, so input must not be negative.
- * If values can be negative, use {@link #writeSignedVarLong(long, java.io.DataOutput)}
- * instead. This method treats negative input as if it were a large unsigned value.
- *
- * @param value value to encode
- * @param out to write bytes to
- * @throws java.io.IOException if {@link java.io.DataOutput} throws {@link java.io.IOException}
- */
- public static void writeUnsignedVarLong(long value, DataOutput out) throws IOException {
- while ((value & 0xFFFFFFFFFFFFFF80L) != 0L) {
- out.writeByte(((int) value & 0x7F) | 0x80);
- value >>>= 7;
- }
- out.writeByte((int) value & 0x7F);
- }
-
- /**
- * @see #writeSignedVarLong(long, java.io.DataOutput)
- */
- public static void writeSignedVarInt(int value, DataOutput out) throws IOException {
- // Great trick from http://code.google.com/apis/protocolbuffers/docs/encoding.html#types
- writeUnsignedVarInt((value << 1) ^ (value >> 31), out);
- }
-
- /**
- * @see #writeUnsignedVarLong(long, java.io.DataOutput)
- */
- public static void writeUnsignedVarInt(int value, DataOutput out) throws IOException {
- while ((value & 0xFFFFFF80) != 0L) {
- out.writeByte((value & 0x7F) | 0x80);
- value >>>= 7;
- }
- out.writeByte(value & 0x7F);
- }
-
- /**
- * @param in to read bytes from
- * @return the decoded value
- * @throws java.io.IOException if {@link java.io.DataInput} throws {@link java.io.IOException}
- * @throws IllegalArgumentException if variable-length value does not terminate
- * after 9 bytes have been read
- * @see #writeSignedVarLong(long, java.io.DataOutput)
- */
- public static long readSignedVarLong(DataInput in) throws IOException {
- long raw = readUnsignedVarLong(in);
- // This undoes the trick in writeSignedVarLong()
- long temp = (((raw << 63) >> 63) ^ raw) >> 1;
- // This extra step lets us deal with the largest signed values by treating
- // negative results from the unsigned read methods as if they were unsigned values.
- // Must re-flip the top bit if the original read value had it set.
- return temp ^ (raw & (1L << 63));
- }
-
- /**
- * @param in to read bytes from
- * @return the decoded value
- * @throws java.io.IOException if {@link java.io.DataInput} throws {@link java.io.IOException}
- * @throws IllegalArgumentException if variable-length value does not terminate
- * after 9 bytes have been read
- * @see #writeUnsignedVarLong(long, java.io.DataOutput)
- */
- public static long readUnsignedVarLong(DataInput in) throws IOException {
- long value = 0L;
- int i = 0;
- long b;
- while (((b = in.readByte()) & 0x80L) != 0) {
- value |= (b & 0x7F) << i;
- i += 7;
- Preconditions.checkArgument(i <= 63, "Variable length quantity is too long (must be <= 63)");
- }
- return value | (b << i);
- }
-
- /**
- * @throws IllegalArgumentException if variable-length value does not terminate
- * after 5 bytes have been read
- * @throws java.io.IOException if {@link java.io.DataInput} throws {@link java.io.IOException}
- * @see #readSignedVarLong(java.io.DataInput)
- */
- public static int readSignedVarInt(DataInput in) throws IOException {
- int raw = readUnsignedVarInt(in);
- // This undoes the trick in writeSignedVarInt()
- int temp = (((raw << 31) >> 31) ^ raw) >> 1;
- // This extra step lets us deal with the largest signed values by treating
- // negative results from the unsigned read methods as if they were unsigned values.
- // Must re-flip the top bit if the original read value had it set.
- return temp ^ (raw & (1 << 31));
- }
-
- /**
- * @throws IllegalArgumentException if variable-length value does not terminate
- * after 5 bytes have been read
- * @throws java.io.IOException if {@link java.io.DataInput} throws {@link java.io.IOException}
- * @see #readUnsignedVarLong(java.io.DataInput)
- */
- public static int readUnsignedVarInt(DataInput in) throws IOException {
- int value = 0;
- int i = 0;
- int b;
- while (((b = in.readByte()) & 0x80) != 0) {
- value |= (b & 0x7F) << i;
- i += 7;
- Preconditions.checkArgument(i <= 35, "Variable length quantity is too long (must be <= 35)");
- }
- return value | (b << i);
- }
-
-}

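For reference, the zig-zag step maps 0, -1, 1, -2, 2, ... to 0, 1, 2, 3, 4, ..., so small magnitudes of either sign stay small before varint encoding. A standalone sketch (hypothetical demo class); the decode below is the textbook inverse, equivalent to the two-step decode in readSignedVarLong above:

    public class ZigZagDemo {
      static long zigZag(long value) {
        // Same trick as writeSignedVarLong: arithmetic shift smears the sign bit.
        return (value << 1) ^ (value >> 63);
      }

      static long unZigZag(long raw) {
        // Logical shift recovers the magnitude; -(raw & 1) restores the sign.
        return (raw >>> 1) ^ -(raw & 1);
      }

      public static void main(String[] args) {
        long[] samples = {0L, -1L, 1L, -2L, 2L, Long.MIN_VALUE, Long.MAX_VALUE};
        for (long v : samples) {
          long z = zigZag(v);
          assert unZigZag(z) == v;
          System.out.println(v + " -> " + z); // 0->0, -1->1, 1->2, -2->3, 2->4, ...
        }
      }
    }
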
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/hdfs/src/main/java/org/apache/mahout/math/VectorWritable.java
----------------------------------------------------------------------
diff --git a/hdfs/src/main/java/org/apache/mahout/math/VectorWritable.java b/hdfs/src/main/java/org/apache/mahout/math/VectorWritable.java
deleted file mode 100644
index 491ae3b..0000000
--- a/hdfs/src/main/java/org/apache/mahout/math/VectorWritable.java
+++ /dev/null
@@ -1,267 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
- * agreements. See the NOTICE file distributed with this work for additional information regarding
- * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance with the License. You may obtain a
- * copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software distributed under the License
- * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
- * or implied. See the License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package org.apache.mahout.math;
-
-import java.io.DataInput;
-import java.io.DataOutput;
-import java.io.IOException;
-import java.util.Iterator;
-
-import org.apache.hadoop.conf.Configured;
-import org.apache.hadoop.io.Writable;
-import org.apache.mahout.math.Vector.Element;
-
-import com.google.common.base.Preconditions;
-
-public final class VectorWritable extends Configured implements Writable {
-
- public static final int FLAG_DENSE = 0x01;
- public static final int FLAG_SEQUENTIAL = 0x02;
- public static final int FLAG_NAMED = 0x04;
- public static final int FLAG_LAX_PRECISION = 0x08;
- public static final int NUM_FLAGS = 4;
-
- private Vector vector;
- private boolean writesLaxPrecision;
-
- public VectorWritable() {}
-
- public VectorWritable(boolean writesLaxPrecision) {
- setWritesLaxPrecision(writesLaxPrecision);
- }
-
- public VectorWritable(Vector vector) {
- this.vector = vector;
- }
-
- public VectorWritable(Vector vector, boolean writesLaxPrecision) {
- this(vector);
- setWritesLaxPrecision(writesLaxPrecision);
- }
-
- /**
- * @return the {@link org.apache.mahout.math.Vector} that this will write, or that it
- * has just read
- */
- public Vector get() {
- return vector;
- }
-
- public void set(Vector vector) {
- this.vector = vector;
- }
-
- /**
- * @return true if this is allowed to encode {@link org.apache.mahout.math.Vector}
- * values using fewer bytes, possibly losing precision. In particular this means
- * that floating point values will be encoded as floats, not doubles.
- */
- public boolean isWritesLaxPrecision() {
- return writesLaxPrecision;
- }
-
- public void setWritesLaxPrecision(boolean writesLaxPrecision) {
- this.writesLaxPrecision = writesLaxPrecision;
- }
-
- @Override
- public void write(DataOutput out) throws IOException {
- writeVector(out, this.vector, this.writesLaxPrecision);
- }
-
- @Override
- public void readFields(DataInput in) throws IOException {
- int flags = in.readByte();
- int size = Varint.readUnsignedVarInt(in);
- readFields(in, (byte) flags, size);
- }
-
- private void readFields(DataInput in, byte flags, int size) throws IOException {
-
- Preconditions.checkArgument(flags >> NUM_FLAGS == 0, "Unknown flags set: %s", Integer.toString(flags, 2));
- boolean dense = (flags & FLAG_DENSE) != 0;
- boolean sequential = (flags & FLAG_SEQUENTIAL) != 0;
- boolean named = (flags & FLAG_NAMED) != 0;
- boolean laxPrecision = (flags & FLAG_LAX_PRECISION) != 0;
-
- Vector v;
- if (dense) {
- double[] values = new double[size];
- for (int i = 0; i < size; i++) {
- values[i] = laxPrecision ? in.readFloat() : in.readDouble();
- }
- v = new DenseVector(values);
- } else {
- int numNonDefaultElements = Varint.readUnsignedVarInt(in);
- v = sequential
- ? new SequentialAccessSparseVector(size, numNonDefaultElements)
- : new RandomAccessSparseVector(size, numNonDefaultElements);
- if (sequential) {
- int lastIndex = 0;
- for (int i = 0; i < numNonDefaultElements; i++) {
- int delta = Varint.readUnsignedVarInt(in);
- int index = lastIndex + delta;
- lastIndex = index;
- double value = laxPrecision ? in.readFloat() : in.readDouble();
- v.setQuick(index, value);
- }
- } else {
- for (int i = 0; i < numNonDefaultElements; i++) {
- int index = Varint.readUnsignedVarInt(in);
- double value = laxPrecision ? in.readFloat() : in.readDouble();
- v.setQuick(index, value);
- }
- }
- }
- if (named) {
- String name = in.readUTF();
- v = new NamedVector(v, name);
- }
- vector = v;
- }
-
- /** Write the vector to the output */
- public static void writeVector(DataOutput out, Vector vector) throws IOException {
- writeVector(out, vector, false);
- }
-
- public static byte flags(Vector vector, boolean laxPrecision) {
- boolean dense = vector.isDense();
- boolean sequential = vector.isSequentialAccess();
- boolean named = vector instanceof NamedVector;
-
- return (byte) ((dense ? FLAG_DENSE : 0)
- | (sequential ? FLAG_SEQUENTIAL : 0)
- | (named ? FLAG_NAMED : 0)
- | (laxPrecision ? FLAG_LAX_PRECISION : 0));
- }
-
- /** Write out type information and size of the vector */
- public static void writeVectorFlagsAndSize(DataOutput out, byte flags, int size) throws IOException {
- out.writeByte(flags);
- Varint.writeUnsignedVarInt(size, out);
- }
-
- public static void writeVector(DataOutput out, Vector vector, boolean laxPrecision) throws IOException {
-
- byte flags = flags(vector, laxPrecision);
-
- writeVectorFlagsAndSize(out, flags, vector.size());
- writeVectorContents(out, vector, flags);
- }
-
- /** Write out contents of the vector */
- public static void writeVectorContents(DataOutput out, Vector vector, byte flags) throws IOException {
-
- boolean dense = (flags & FLAG_DENSE) != 0;
- boolean sequential = (flags & FLAG_SEQUENTIAL) != 0;
- boolean named = (flags & FLAG_NAMED) != 0;
- boolean laxPrecision = (flags & FLAG_LAX_PRECISION) != 0;
-
- if (dense) {
- for (Element element : vector.all()) {
- if (laxPrecision) {
- out.writeFloat((float) element.get());
- } else {
- out.writeDouble(element.get());
- }
- }
- } else {
- Varint.writeUnsignedVarInt(vector.getNumNonZeroElements(), out);
- Iterator<Element> iter = vector.nonZeroes().iterator();
- if (sequential) {
- int lastIndex = 0;
- while (iter.hasNext()) {
- Element element = iter.next();
- if (element.get() == 0) {
- continue;
- }
- int thisIndex = element.index();
- // Delta-code indices:
- Varint.writeUnsignedVarInt(thisIndex - lastIndex, out);
- lastIndex = thisIndex;
- if (laxPrecision) {
- out.writeFloat((float) element.get());
- } else {
- out.writeDouble(element.get());
- }
- }
- } else {
- while (iter.hasNext()) {
- Element element = iter.next();
- if (element.get() == 0) {
- // TODO(robinanil): Fix the damn iterator for the zero element.
- continue;
- }
- Varint.writeUnsignedVarInt(element.index(), out);
- if (laxPrecision) {
- out.writeFloat((float) element.get());
- } else {
- out.writeDouble(element.get());
- }
- }
- }
- }
- if (named) {
- String name = ((NamedVector) vector).getName();
- out.writeUTF(name == null ? "" : name);
- }
- }
-
- public static Vector readVector(DataInput in) throws IOException {
- VectorWritable v = new VectorWritable();
- v.readFields(in);
- return v.get();
- }
-
- public static Vector readVector(DataInput in, byte vectorFlags, int size) throws IOException {
- VectorWritable v = new VectorWritable();
- v.readFields(in, vectorFlags, size);
- return v.get();
- }
-
- public static VectorWritable merge(Iterator<VectorWritable> vectors) {
- return new VectorWritable(mergeToVector(vectors));
- }
-
- public static Vector mergeToVector(Iterator<VectorWritable> vectors) {
- Vector accumulator = vectors.next().get();
- while (vectors.hasNext()) {
- VectorWritable v = vectors.next();
- if (v != null) {
- for (Element nonZeroElement : v.get().nonZeroes()) {
- accumulator.setQuick(nonZeroElement.index(), nonZeroElement.get());
- }
- }
- }
- return accumulator;
- }
-
- @Override
- public boolean equals(Object o) {
- return o instanceof VectorWritable && vector.equals(((VectorWritable) o).get());
- }
-
- @Override
- public int hashCode() {
- return vector.hashCode();
- }
-
- @Override
- public String toString() {
- return vector.toString();
- }
-}

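For sequential-access sparse vectors the writer above delta-codes indices: absolute indices {3, 10, 12} travel as gaps {3, 7, 2}, which are smaller and therefore cheaper as unsigned varints. A round-trip sketch (hypothetical demo class) using the API above:

    import java.io.ByteArrayInputStream;
    import java.io.ByteArrayOutputStream;
    import java.io.DataInputStream;
    import java.io.DataOutputStream;
    import java.io.IOException;

    import org.apache.mahout.math.SequentialAccessSparseVector;
    import org.apache.mahout.math.Vector;
    import org.apache.mahout.math.VectorWritable;

    public class VectorRoundTripDemo {
      public static void main(String[] args) throws IOException {
        Vector v = new SequentialAccessSparseVector(100);
        v.set(3, 1.5);
        v.set(10, 2.5);
        v.set(12, -4.0);

        // Indices are written as deltas {3, 7, 2}; values as full doubles here.
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        try (DataOutputStream out = new DataOutputStream(baos)) {
          VectorWritable.writeVector(out, v, /* laxPrecision = */ false);
        }

        try (DataInputStream in = new DataInputStream(new ByteArrayInputStream(baos.toByteArray()))) {
          Vector copy = VectorWritable.readVector(in);
          System.out.println(v.equals(copy)); // true
        }
      }
    }
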
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/hdfs/src/test/java/org/apache/mahout/math/MatrixWritableTest.java
----------------------------------------------------------------------
diff --git a/hdfs/src/test/java/org/apache/mahout/math/MatrixWritableTest.java b/hdfs/src/test/java/org/apache/mahout/math/MatrixWritableTest.java
deleted file mode 100644
index 31e6947..0000000
--- a/hdfs/src/test/java/org/apache/mahout/math/MatrixWritableTest.java
+++ /dev/null
@@ -1,141 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math;
-
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.DataInputStream;
-import java.io.DataOutputStream;
-import java.io.IOException;
-import java.util.HashMap;
-import java.util.Map;
-
-import org.apache.hadoop.io.Writable;
-import org.junit.Test;
-
-public final class MatrixWritableTest extends MahoutTestCase {
-
- @Test
- public void testSparseMatrixWritable() throws Exception {
- Matrix m = new SparseMatrix(5, 5);
- m.set(1, 2, 3.0);
- m.set(3, 4, 5.0);
- Map<String, Integer> bindings = new HashMap<>();
- bindings.put("A", 0);
- bindings.put("B", 1);
- bindings.put("C", 2);
- bindings.put("D", 3);
- bindings.put("default", 4);
- m.setRowLabelBindings(bindings);
- m.setColumnLabelBindings(bindings);
- doTestMatrixWritableEquals(m);
- }
-
- @Test
- public void testSparseRowMatrixWritable() throws Exception {
- Matrix m = new SparseRowMatrix(5, 5);
- m.set(1, 2, 3.0);
- m.set(3, 4, 5.0);
- Map<String, Integer> bindings = new HashMap<>();
- bindings.put("A", 0);
- bindings.put("B", 1);
- bindings.put("C", 2);
- bindings.put("D", 3);
- bindings.put("default", 4);
- m.setRowLabelBindings(bindings);
- m.setColumnLabelBindings(bindings);
- doTestMatrixWritableEquals(m);
- }
-
- @Test
- public void testDenseMatrixWritable() throws Exception {
- Matrix m = new DenseMatrix(5,5);
- m.set(1, 2, 3.0);
- m.set(3, 4, 5.0);
- Map<String, Integer> bindings = new HashMap<>();
- bindings.put("A", 0);
- bindings.put("B", 1);
- bindings.put("C", 2);
- bindings.put("D", 3);
- bindings.put("default", 4);
- m.setRowLabelBindings(bindings);
- m.setColumnLabelBindings(bindings);
- doTestMatrixWritableEquals(m);
- }
-
- private static void doTestMatrixWritableEquals(Matrix m) throws IOException {
- Writable matrixWritable = new MatrixWritable(m);
- MatrixWritable matrixWritable2 = new MatrixWritable();
- writeAndRead(matrixWritable, matrixWritable2);
- Matrix m2 = matrixWritable2.get();
- compareMatrices(m, m2);
- doCheckBindings(m2.getRowLabelBindings());
- doCheckBindings(m2.getColumnLabelBindings());
- }
-
- private static void compareMatrices(Matrix m, Matrix m2) {
- assertEquals(m.numRows(), m2.numRows());
- assertEquals(m.numCols(), m2.numCols());
- for (int r = 0; r < m.numRows(); r++) {
- for (int c = 0; c < m.numCols(); c++) {
- assertEquals(m.get(r, c), m2.get(r, c), EPSILON);
- }
- }
- Map<String,Integer> bindings = m.getRowLabelBindings();
- Map<String, Integer> bindings2 = m2.getRowLabelBindings();
- assertEquals(bindings == null, bindings2 == null);
- if (bindings != null) {
- assertEquals(bindings.size(), m.numRows());
- assertEquals(bindings.size(), bindings2.size());
- for (Map.Entry<String,Integer> entry : bindings.entrySet()) {
- assertEquals(entry.getValue(), bindings2.get(entry.getKey()));
- }
- }
- bindings = m.getColumnLabelBindings();
- bindings2 = m2.getColumnLabelBindings();
- assertEquals(bindings == null, bindings2 == null);
- if (bindings != null) {
- assertEquals(bindings.size(), bindings2.size());
- for (Map.Entry<String,Integer> entry : bindings.entrySet()) {
- assertEquals(entry.getValue(), bindings2.get(entry.getKey()));
- }
- }
- }
-
- private static void doCheckBindings(Map<String,Integer> labels) {
- assertTrue("Missing label", labels.keySet().contains("A"));
- assertTrue("Missing label", labels.keySet().contains("B"));
- assertTrue("Missing label", labels.keySet().contains("C"));
- assertTrue("Missing label", labels.keySet().contains("D"));
- assertTrue("Missing label", labels.keySet().contains("default"));
- }
-
- private static void writeAndRead(Writable toWrite, Writable toRead) throws IOException {
- ByteArrayOutputStream baos = new ByteArrayOutputStream();
- try (DataOutputStream dos = new DataOutputStream(baos)){
- toWrite.write(dos);
- }
-
- ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray());
- try (DataInputStream dis = new DataInputStream(bais)) {
- toRead.readFields(dis);
- }
- }
-
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/hdfs/src/test/java/org/apache/mahout/math/VarintTest.java
----------------------------------------------------------------------
diff --git a/hdfs/src/test/java/org/apache/mahout/math/VarintTest.java b/hdfs/src/test/java/org/apache/mahout/math/VarintTest.java
deleted file mode 100644
index 0b1a664..0000000
--- a/hdfs/src/test/java/org/apache/mahout/math/VarintTest.java
+++ /dev/null
@@ -1,189 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math;
-
-import org.junit.Test;
-
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.DataInput;
-import java.io.DataInputStream;
-import java.io.DataOutput;
-import java.io.DataOutputStream;
-
-/**
- * Tests {@link Varint}.
- */
-public final class VarintTest extends MahoutTestCase {
-
- @Test
- public void testUnsignedLong() throws Exception {
- ByteArrayOutputStream baos = new ByteArrayOutputStream();
- DataOutput out = new DataOutputStream(baos);
- Varint.writeUnsignedVarLong(0L, out);
- for (long i = 1L; i > 0L && i <= (1L << 62); i <<= 1) {
- Varint.writeUnsignedVarLong(i-1, out);
- Varint.writeUnsignedVarLong(i, out);
- }
- Varint.writeUnsignedVarLong(Long.MAX_VALUE, out);
-
- DataInput in = new DataInputStream(new ByteArrayInputStream(baos.toByteArray()));
- assertEquals(0L, Varint.readUnsignedVarLong(in));
- for (long i = 1L; i > 0L && i <= (1L << 62); i <<= 1) {
- assertEquals(i-1, Varint.readUnsignedVarLong(in));
- assertEquals(i, Varint.readUnsignedVarLong(in));
- }
- assertEquals(Long.MAX_VALUE, Varint.readUnsignedVarLong(in));
- }
-
- @Test
- public void testSignedPositiveLong() throws Exception {
- ByteArrayOutputStream baos = new ByteArrayOutputStream();
- DataOutput out = new DataOutputStream(baos);
- Varint.writeSignedVarLong(0L, out);
- for (long i = 1L; i <= (1L << 61); i <<= 1) {
- Varint.writeSignedVarLong(i-1, out);
- Varint.writeSignedVarLong(i, out);
- }
- Varint.writeSignedVarLong((1L << 62) - 1, out);
- Varint.writeSignedVarLong((1L << 62), out);
- Varint.writeSignedVarLong(Long.MAX_VALUE, out);
-
- DataInput in = new DataInputStream(new ByteArrayInputStream(baos.toByteArray()));
- assertEquals(0L, Varint.readSignedVarLong(in));
- for (long i = 1L; i <= (1L << 61); i <<= 1) {
- assertEquals(i-1, Varint.readSignedVarLong(in));
- assertEquals(i, Varint.readSignedVarLong(in));
- }
- assertEquals((1L << 62) - 1, Varint.readSignedVarLong(in));
- assertEquals((1L << 62), Varint.readSignedVarLong(in));
- assertEquals(Long.MAX_VALUE, Varint.readSignedVarLong(in));
- }
-
- @Test
- public void testSignedNegativeLong() throws Exception {
- ByteArrayOutputStream baos = new ByteArrayOutputStream();
- DataOutput out = new DataOutputStream(baos);
- for (long i = -1L; i >= -(1L << 62); i <<= 1) {
- Varint.writeSignedVarLong(i, out);
- Varint.writeSignedVarLong(i+1, out);
- }
- Varint.writeSignedVarLong(Long.MIN_VALUE, out);
- Varint.writeSignedVarLong(Long.MIN_VALUE+1, out);
- DataInput in = new DataInputStream(new ByteArrayInputStream(baos.toByteArray()));
- for (long i = -1L; i >= -(1L << 62); i <<= 1) {
- assertEquals(i, Varint.readSignedVarLong(in));
- assertEquals(i+1, Varint.readSignedVarLong(in));
- }
- assertEquals(Long.MIN_VALUE, Varint.readSignedVarLong(in));
- assertEquals(Long.MIN_VALUE+1, Varint.readSignedVarLong(in));
- }
-
- @Test
- public void testUnsignedInt() throws Exception {
- ByteArrayOutputStream baos = new ByteArrayOutputStream();
- DataOutput out = new DataOutputStream(baos);
- Varint.writeUnsignedVarInt(0, out);
- for (int i = 1; i > 0 && i <= (1 << 30); i <<= 1) {
- Varint.writeUnsignedVarInt(i-1, out);
- Varint.writeUnsignedVarInt(i, out);
- }
- Varint.writeUnsignedVarInt(Integer.MAX_VALUE, out);
-
- DataInput in = new DataInputStream(new ByteArrayInputStream(baos.toByteArray()));
- assertEquals(0, Varint.readUnsignedVarInt(in));
- for (int i = 1; i > 0 && i <= (1 << 30); i <<= 1) {
- assertEquals(i-1, Varint.readUnsignedVarInt(in));
- assertEquals(i, Varint.readUnsignedVarInt(in));
- }
- assertEquals(Integer.MAX_VALUE, Varint.readUnsignedVarInt(in));
- }
-
- @Test
- public void testSignedPositiveInt() throws Exception {
- ByteArrayOutputStream baos = new ByteArrayOutputStream();
- DataOutput out = new DataOutputStream(baos);
- Varint.writeSignedVarInt(0, out);
- for (int i = 1; i <= (1 << 29); i <<= 1) {
- Varint.writeSignedVarInt(i-1, out);
- Varint.writeSignedVarInt(i, out);
- }
- Varint.writeSignedVarInt((1 << 30) - 1, out);
- Varint.writeSignedVarInt((1 << 30), out);
- Varint.writeSignedVarInt(Integer.MAX_VALUE, out);
-
- DataInput in = new DataInputStream(new ByteArrayInputStream(baos.toByteArray()));
- assertEquals(0, Varint.readSignedVarInt(in));
- for (int i = 1; i <= (1 << 29); i <<= 1) {
- assertEquals(i-1, Varint.readSignedVarInt(in));
- assertEquals(i, Varint.readSignedVarInt(in));
- }
- assertEquals((1L << 30) - 1, Varint.readSignedVarInt(in));
- assertEquals((1L << 30), Varint.readSignedVarInt(in));
- assertEquals(Integer.MAX_VALUE, Varint.readSignedVarInt(in));
- }
-
- @Test
- public void testSignedNegativeInt() throws Exception {
- ByteArrayOutputStream baos = new ByteArrayOutputStream();
- DataOutput out = new DataOutputStream(baos);
- for (int i = -1; i >= -(1 << 30); i <<= 1) {
- Varint.writeSignedVarInt(i, out);
- Varint.writeSignedVarInt(i+1, out);
- }
- Varint.writeSignedVarInt(Integer.MIN_VALUE, out);
- Varint.writeSignedVarInt(Integer.MIN_VALUE+1, out);
- DataInput in = new DataInputStream(new ByteArrayInputStream(baos.toByteArray()));
- for (int i = -1; i >= -(1 << 30); i <<= 1) {
- assertEquals(i, Varint.readSignedVarInt(in));
- assertEquals(i+1, Varint.readSignedVarInt(in));
- }
- assertEquals(Integer.MIN_VALUE, Varint.readSignedVarInt(in));
- assertEquals(Integer.MIN_VALUE+1, Varint.readSignedVarInt(in));
- }
-
- @Test
- public void testUnsignedSize() throws Exception {
- ByteArrayOutputStream baos = new ByteArrayOutputStream();
- DataOutput out = new DataOutputStream(baos);
- int expectedSize = 0;
- for (int exponent = 0; exponent <= 62; exponent++) {
- Varint.writeUnsignedVarLong(1L << exponent, out);
- expectedSize += 1 + exponent / 7;
- assertEquals(expectedSize, baos.size());
- }
- }
-
- @Test
- public void testSignedSize() throws Exception {
- ByteArrayOutputStream baos = new ByteArrayOutputStream();
- DataOutput out = new DataOutputStream(baos);
- int expectedSize = 0;
- for (int exponent = 0; exponent <= 61; exponent++) {
- Varint.writeSignedVarLong(1L << exponent, out);
- expectedSize += 1 + ((exponent + 1) / 7);
- assertEquals(expectedSize, baos.size());
- }
- for (int exponent = 0; exponent <= 61; exponent++) {
- Varint.writeSignedVarLong(-(1L << exponent)-1, out);
- expectedSize += 1 + ((exponent + 1) / 7);
- assertEquals(expectedSize, baos.size());
- }
- }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/hdfs/src/test/java/org/apache/mahout/math/VectorWritableTest.java
----------------------------------------------------------------------
diff --git a/hdfs/src/test/java/org/apache/mahout/math/VectorWritableTest.java b/hdfs/src/test/java/org/apache/mahout/math/VectorWritableTest.java
deleted file mode 100644
index 991be6e..0000000
--- a/hdfs/src/test/java/org/apache/mahout/math/VectorWritableTest.java
+++ /dev/null
@@ -1,116 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
- * agreements. See the NOTICE file distributed with this work for additional information regarding
- * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance with the License. You may obtain a
- * copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software distributed under the License
- * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
- * or implied. See the License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package org.apache.mahout.math;
-
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.DataInputStream;
-import java.io.DataOutputStream;
-import java.io.IOException;
-
-import org.apache.hadoop.io.Writable;
-import org.apache.mahout.math.Vector.Element;
-import org.junit.Test;
-
-import com.carrotsearch.randomizedtesting.RandomizedTest;
-import com.carrotsearch.randomizedtesting.annotations.Repeat;
-
-public final class VectorWritableTest extends RandomizedTest {
- private static final int MAX_VECTOR_SIZE = 100;
-
- public void createRandom(Vector v) {
- int size = randomInt(v.size() - 1);
- for (int i = 0; i < size; ++i) {
- v.set(randomInt(v.size() - 1), randomDouble());
- }
-
- int zeros = Math.max(2, size / 4);
- for (Element e : v.nonZeroes()) {
- if (e.index() % zeros == 0) {
- e.set(0.0);
- }
- }
- }
-
- @Test
- @Repeat(iterations = 20)
- public void testViewSequentialAccessSparseVectorWritable() throws Exception {
- Vector v = new SequentialAccessSparseVector(MAX_VECTOR_SIZE);
- createRandom(v);
- Vector view = new VectorView(v, 0, v.size());
- doTestVectorWritableEquals(view);
- }
-
- @Test
- @Repeat(iterations = 20)
- public void testSequentialAccessSparseVectorWritable() throws Exception {
- Vector v = new SequentialAccessSparseVector(MAX_VECTOR_SIZE);
- createRandom(v);
- doTestVectorWritableEquals(v);
- }
-
- @Test
- @Repeat(iterations = 20)
- public void testRandomAccessSparseVectorWritable() throws Exception {
- Vector v = new RandomAccessSparseVector(MAX_VECTOR_SIZE);
- createRandom(v);
- doTestVectorWritableEquals(v);
- }
-
- @Test
- @Repeat(iterations = 20)
- public void testDenseVectorWritable() throws Exception {
- Vector v = new DenseVector(MAX_VECTOR_SIZE);
- createRandom(v);
- doTestVectorWritableEquals(v);
- }
-
- @Test
- @Repeat(iterations = 20)
- public void testNamedVectorWritable() throws Exception {
- Vector v = new DenseVector(MAX_VECTOR_SIZE);
- v = new NamedVector(v, "Victor");
- createRandom(v);
- doTestVectorWritableEquals(v);
- }
-
- private static void doTestVectorWritableEquals(Vector v) throws IOException {
- Writable vectorWritable = new VectorWritable(v);
- VectorWritable vectorWritable2 = new VectorWritable();
- writeAndRead(vectorWritable, vectorWritable2);
- Vector v2 = vectorWritable2.get();
- if (v instanceof NamedVector) {
- assertTrue(v2 instanceof NamedVector);
- NamedVector nv = (NamedVector) v;
- NamedVector nv2 = (NamedVector) v2;
- assertEquals(nv.getName(), nv2.getName());
- assertEquals("Victor", nv.getName());
- }
- assertEquals(v, v2);
- }
-
- private static void writeAndRead(Writable toWrite, Writable toRead) throws IOException {
- ByteArrayOutputStream baos = new ByteArrayOutputStream();
- try (DataOutputStream dos = new DataOutputStream(baos)){
- toWrite.write(dos);
- }
-
- ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray());
- try (DataInputStream dis = new DataInputStream(bais)) {
- toRead.readFields(dis);
- }
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/bin/prep_asf_mail_archives.sh
----------------------------------------------------------------------
diff --git a/integration/bin/prep_asf_mail_archives.sh b/integration/bin/prep_asf_mail_archives.sh
deleted file mode 100755
index 77f5d13..0000000
--- a/integration/bin/prep_asf_mail_archives.sh
+++ /dev/null
@@ -1,106 +0,0 @@
-#!/bin/bash
-#
-# Performs the setup procedures for clustering the ASF mail archives
-# described in Taming Text.
-#
-# Required Command-line Parameters:
-#
-# $1 - Path to this script's working directory; you will need about
-# 22GB of free space to run this script.
-#
-# $2 - Path to where the ASF Public Archive data is, untarred.
-# If you are running Hadoop and the files are in HDFS, then
-# this will need to be an HDFS path. Default is $1/input
-# $3 - Path to where this script saves the SequenceFile output.
-# If you are running Hadoop and you want the sequence files
-# saved to your HDFS then you need to set this value to an
-# HDFS path and make sure you set HADOOP_HOME so Mahout can
-# find Hadoop. Default is $1/sequence-files
-#
-#
-# Required Environment Variables:
-#
-# MAHOUT_HOME
-# Root directory of your Mahout distribution
-#
-# HADOOP_HOME
-# Only needed if you want to send output to HDFS
-#
-# Example:
-# ./prep_asf_mail_archives.sh /mnt/asf-mail-archives /mnt/asf-archives/asf-mail-archives-7-18-2011 /mnt/asf-mail-archives/output
-#
-# This will run the Mahout org.apache.mahout.text.SequenceFilesFromMailArchives
-# job over the extracted archives to create Hadoop SequenceFiles in
-# /mnt/asf-mail-archives/output
-#
-#/**
-# * Licensed to the Apache Software Foundation (ASF) under one or more
-# * contributor license agreements. See the NOTICE file distributed with
-# * this work for additional information regarding copyright ownership.
-# * The ASF licenses this file to You under the Apache License, Version 2.0
-# * (the "License"); you may not use this file except in compliance with
-# * the License. You may obtain a copy of the License at
-# *
-# * http://www.apache.org/licenses/LICENSE-2.0
-# *
-# * Unless required by applicable law or agreed to in writing, software
-# * distributed under the License is distributed on an "AS IS" BASIS,
-# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# * See the License for the specific language governing permissions and
-# * limitations under the License.
-# */
-
-if [ "$MAHOUT_HOME" = "" ]; then
- echo "Error: MAHOUT_HOME is not set."
- exit 1
-fi
-
-if [ "$1" = "" ]; then
- echo "Error: Please pass the path to your prep directory, such as /mnt/asf-mail-archives.\n\n\tUsage: $0 workingDir inputPath outputPath\n"
- exit 1
-fi
-
-# Location where this script saves files
-PREP_DIR=$1
-
-if [ "$2" != "" ]; then
- SEQFILE_INPUT_DIR=$2
-else
- SEQFILE_INPUT_DIR=$PREP_DIR/input
-fi
-
-
-# Change this to an HDFS path if you are running Hadoop
-if [ "$3" != "" ]; then
- SEQFILE_OUTPUT_DIR=$3
-else
- SEQFILE_OUTPUT_DIR=$PREP_DIR/sequence-files
-fi
-
-# If output sent to HDFS, clear MAHOUT_LOCAL and make sure HADOOP_HOME is set
-if [[ "$SEQFILE_OUTPUT_DIR" = hdfs://* ]]; then
- export MAHOUT_LOCAL=
- if [ "$HADOOP_HOME" = "" ]; then
- echo "Error: HADOOP_HOME must be set if you want to send output to HDFS."
- exit 1
- fi
-else
- export MAHOUT_LOCAL=$PREP_DIR
-fi
-
-echo "Running $0 with:
- PREP_DIR = $PREP_DIR
- SEQFILE_INPUT_DIR = $SEQFILE_INPUT_DIR
- SEQFILE_OUTPUT_DIR = $SEQFILE_OUTPUT_DIR
- MAHOUT_LOCAL = $MAHOUT_LOCAL
- HADOOP_HOME = $HADOOP_HOME"
-
-# MAHOUT_LOCAL (set above) keeps Mahout in local mode; it is cleared
-# automatically when the output path is on HDFS.
-
-# convert the extracted gz files into Hadoop SequenceFiles
-echo "Converting extracted directories to SequenceFiles ..."
-$MAHOUT_HOME/bin/mahout org.apache.mahout.text.SequenceFilesFromMailArchives \
---input $SEQFILE_INPUT_DIR --output $SEQFILE_OUTPUT_DIR --subject --body \
--c UTF-8 -chunk 1024 -prefix asf_archives
r***@apache.org
2018-06-27 14:52:08 UTC
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/MultipleFieldsDocument.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/MultipleFieldsDocument.java b/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/MultipleFieldsDocument.java
new file mode 100644
index 0000000..7483b2d
--- /dev/null
+++ b/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/MultipleFieldsDocument.java
@@ -0,0 +1,58 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.text.doc;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.TextField;
+
+/**
+ * Used for testing lucene2seq
+ */
+@Deprecated
+public class MultipleFieldsDocument extends SingleFieldDocument {
+
+ public static final String FIELD1 = "field1";
+ public static final String FIELD2 = "field2";
+
+ private String field1;
+ private String field2;
+
+ public MultipleFieldsDocument(String id, String field, String field1, String field2) {
+ super(id, field);
+ this.field1 = field1;
+ this.field2 = field2;
+ }
+
+ public String getField1() {
+ return field1;
+ }
+
+ public String getField2() {
+ return field2;
+ }
+
+ @Override
+ public Document asLuceneDocument() {
+ Document document = super.asLuceneDocument();
+
+ document.add(new TextField(FIELD1, this.field1, Field.Store.YES));
+ document.add(new TextField(FIELD2, this.field2, Field.Store.YES));
+
+ return document;
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/NumericFieldDocument.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/NumericFieldDocument.java b/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/NumericFieldDocument.java
new file mode 100644
index 0000000..e06e8d6
--- /dev/null
+++ b/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/NumericFieldDocument.java
@@ -0,0 +1,54 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.text.doc;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.IntField;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.document.TextField;
+
+/**
+ * Document with numeric field.
+ */
+@Deprecated
+public class NumericFieldDocument extends SingleFieldDocument {
+
+ public static final String NUMERIC_FIELD = "numeric";
+
+ private int numericField;
+
+ public NumericFieldDocument(String id, String field, int numericField) {
+ super(id, field);
+ this.numericField = numericField;
+ }
+
+ @Override
+ public Document asLuceneDocument() {
+ Document document = new Document();
+
+ document.add(new StringField(ID_FIELD, getId(), Field.Store.YES));
+ document.add(new TextField(FIELD, getField(), Field.Store.YES));
+ document.add(new IntField(NUMERIC_FIELD, numericField, Field.Store.YES));
+
+ return document;
+ }
+
+ public int getNumericField() {
+ return numericField;
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/SingleFieldDocument.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/SingleFieldDocument.java b/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/SingleFieldDocument.java
new file mode 100644
index 0000000..4636a51
--- /dev/null
+++ b/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/SingleFieldDocument.java
@@ -0,0 +1,63 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.text.doc;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.document.TextField;
+
+/**
+ * Used for testing lucene2seq
+ */
+@Deprecated
+public class SingleFieldDocument implements TestDocument {
+
+ public static final String ID_FIELD = "idField";
+ public static final String FIELD = "field";
+
+ private String id;
+ private String field;
+
+ public SingleFieldDocument(String id, String field) {
+ this.id = id;
+ this.field = field;
+ }
+
+ @Override
+ public String getId() {
+ return id;
+ }
+
+ @Override
+ public String getField() {
+ return field;
+ }
+
+ @Override
+ public Document asLuceneDocument() {
+ Document document = new Document();
+
+ Field idField = new StringField(ID_FIELD, getId(), Field.Store.YES);
+ Field field = new TextField(FIELD, getField(), Field.Store.YES);
+
+ document.add(idField);
+ document.add(field);
+
+ return document;
+ }
+}

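A minimal usage sketch (hypothetical demo class) for these test documents: both fields are added with Field.Store.YES, so Document.get(...) returns them after conversion.

    import org.apache.lucene.document.Document;
    import org.apache.mahout.text.doc.SingleFieldDocument;

    public class SingleFieldDocumentDemo {
      public static void main(String[] args) {
        SingleFieldDocument doc = new SingleFieldDocument("doc1", "some text");
        Document luceneDoc = doc.asLuceneDocument();
        // Stored fields can be read straight back off the Lucene document.
        System.out.println(luceneDoc.get(SingleFieldDocument.ID_FIELD)); // "doc1"
        System.out.println(luceneDoc.get(SingleFieldDocument.FIELD));    // "some text"
      }
    }
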
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/TestDocument.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/TestDocument.java b/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/TestDocument.java
new file mode 100644
index 0000000..7243c71
--- /dev/null
+++ b/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/TestDocument.java
@@ -0,0 +1,29 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.text.doc;
+
+import org.apache.lucene.document.Document;
+@Deprecated
+public interface TestDocument {
+
+ String getId();
+
+ String getField();
+
+ Document asLuceneDocument();
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/UnstoredFieldsDocument.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/UnstoredFieldsDocument.java b/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/UnstoredFieldsDocument.java
new file mode 100644
index 0000000..6eb43f6
--- /dev/null
+++ b/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/UnstoredFieldsDocument.java
@@ -0,0 +1,43 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.text.doc;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.StringField;
+
+/**
+ * Used for testing lucene2seq
+ */
+@Deprecated
+public class UnstoredFieldsDocument extends SingleFieldDocument {
+
+ public static final String UNSTORED_FIELD = "unstored";
+
+ public UnstoredFieldsDocument(String id, String field) {
+ super(id, field);
+ }
+
+ @Override
+ public Document asLuceneDocument() {
+ Document document = super.asLuceneDocument();
+
+ document.add(new StringField(UNSTORED_FIELD, "", Field.Store.NO));
+
+ return document;
+ }
+}
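
A note on the Store.NO field above: a Lucene StringField with Field.Store.NO is
indexed but not stored, so its value cannot be read back from the index.
Presumably that is the point of this fixture: lucene2seq extracts stored field
values, so the "unstored" field should never appear in the generated sequence
files.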

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/Bump125Test.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/Bump125Test.java b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/Bump125Test.java
new file mode 100644
index 0000000..65b308f
--- /dev/null
+++ b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/Bump125Test.java
@@ -0,0 +1,42 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils;
+
+import com.google.common.collect.Lists;
+
+import org.apache.mahout.common.MahoutTestCase;
+import org.junit.Test;
+
+import java.util.Iterator;
+
+public class Bump125Test extends MahoutTestCase {
+ @Test
+ public void testIncrement() throws Exception {
+ Iterator<Integer> ref = Lists.newArrayList(1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 12, 14, 16, 18, 20, 25, 30, 35, 40, 50, 60,
+ 70, 80, 100, 120, 140, 160, 180, 200, 250, 300, 350,
+ 400, 500, 600, 700, 800, 1000, 1200, 1400, 1600, 1800,
+ 2000, 2500, 3000, 3500, 4000, 5000, 6000, 7000)
+ .iterator();
+ Bump125 b = new Bump125();
+ for (int i = 0; i < 50; i++) {
+ long x = b.increment();
+ assertEquals(ref.next().longValue(), x);
+ }
+ }
+}
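
The reference sequence above is a 1-2-5 progression scaled by powers of ten:
steps of 1 up to 10, then 2, then 5, then 10, and so on, which yields roughly
evenly spaced checkpoints on a log scale (useful for progress reporting). A
minimal sketch that reproduces the same progression -- an illustration only,
not the actual Bump125 implementation:

    // Hypothetical helper: next value in the 1-2-5 progression after v.
    static long next(long v) {
      if (v < 10) {
        return v + 1;                   // count by ones below 10
      }
      long base = (long) Math.pow(10, (long) Math.floor(Math.log10(v)));
      double r = (double) v / base;
      long delta;
      if (r < 2) {
        delta = base / 5;               // 10, 12, 14, 16, 18, 20
      } else if (r < 4) {
        delta = base / 2;               // 20, 25, 30, 35, 40
      } else if (r < 8) {
        delta = base;                   // 40, 50, 60, 70, 80
      } else {
        delta = 2 * base;               // 80 -> 100
      }
      return v + delta;
    }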

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/SplitInputTest.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/SplitInputTest.java b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/SplitInputTest.java
new file mode 100644
index 0000000..7ffa690
--- /dev/null
+++ b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/SplitInputTest.java
@@ -0,0 +1,418 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils;
+
+import java.io.BufferedWriter;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.nio.charset.Charset;
+
+import com.google.common.io.Closeables;
+import org.apache.commons.io.Charsets;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.classifier.ClassifierData;
+import org.apache.mahout.common.MahoutTestCase;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterable;
+import org.apache.mahout.math.SequentialAccessSparseVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.math.map.OpenObjectIntHashMap;
+import org.junit.Before;
+import org.junit.Test;
+
+public final class SplitInputTest extends MahoutTestCase {
+
+ private OpenObjectIntHashMap<String> countMap;
+ private Charset charset;
+ private FileSystem fs;
+ private Path tempInputFile;
+ private Path tempTrainingDirectory;
+ private Path tempTestDirectory;
+ private Path tempMapRedOutputDirectory;
+ private Path tempInputDirectory;
+ private Path tempSequenceDirectory;
+ private SplitInput si;
+
+ @Override
+ @Before
+ public void setUp() throws Exception {
+ Configuration conf = getConfiguration();
+ fs = FileSystem.get(conf);
+
+ super.setUp();
+
+ countMap = new OpenObjectIntHashMap<>();
+
+ charset = Charsets.UTF_8;
+ tempSequenceDirectory = getTestTempFilePath("tmpsequence");
+ tempInputFile = getTestTempFilePath("bayesinputfile");
+ tempTrainingDirectory = getTestTempDirPath("bayestrain");
+ tempTestDirectory = getTestTempDirPath("bayestest");
+ tempMapRedOutputDirectory = new Path(getTestTempDirPath(), "mapRedOutput");
+ tempInputDirectory = getTestTempDirPath("bayesinputdir");
+
+ si = new SplitInput();
+ si.setTrainingOutputDirectory(tempTrainingDirectory);
+ si.setTestOutputDirectory(tempTestDirectory);
+ si.setInputDirectory(tempInputDirectory);
+ }
+
+ private void writeMultipleInputFiles() throws IOException {
+ Writer writer = null;
+ String currentLabel = null;
+ try {
+ for (String[] entry : ClassifierData.DATA) {
+ if (!entry[0].equals(currentLabel)) {
+ currentLabel = entry[0];
+ Closeables.close(writer, false);
+
+ writer = new BufferedWriter(new OutputStreamWriter(fs.create(new Path(tempInputDirectory, currentLabel)),
+ Charsets.UTF_8));
+ }
+ countMap.adjustOrPutValue(currentLabel, 1, 1);
+ writer.write(currentLabel + '\t' + entry[1] + '\n');
+ }
+    } finally {
+ Closeables.close(writer, false);
+ }
+ }
+
+ private void writeSingleInputFile() throws IOException {
+ Writer writer = new BufferedWriter(new OutputStreamWriter(fs.create(tempInputFile), Charsets.UTF_8));
+ try {
+ for (String[] entry : ClassifierData.DATA) {
+ writer.write(entry[0] + '\t' + entry[1] + '\n');
+ }
+ } finally {
+ Closeables.close(writer, true);
+ }
+ }
+
+ @Test
+ public void testSplitDirectory() throws Exception {
+
+ writeMultipleInputFiles();
+
+ final int testSplitSize = 1;
+ si.setTestSplitSize(testSplitSize);
+ si.setCallback(new SplitInput.SplitCallback() {
+ @Override
+ public void splitComplete(Path inputFile, int lineCount, int trainCount, int testCount, int testSplitStart) {
+ int trainingLines = countMap.get(inputFile.getName()) - testSplitSize;
+ assertSplit(fs, inputFile, charset, testSplitSize, trainingLines, tempTrainingDirectory, tempTestDirectory);
+ }
+ });
+
+ si.splitDirectory(tempInputDirectory);
+ }
+
+ @Test
+ public void testSplitFile() throws Exception {
+ writeSingleInputFile();
+ si.setTestSplitSize(2);
+ si.setCallback(new TestCallback(2, 10));
+ si.splitFile(tempInputFile);
+ }
+
+ @Test
+ public void testSplitFileLocation() throws Exception {
+ writeSingleInputFile();
+ si.setTestSplitSize(2);
+ si.setSplitLocation(50);
+ si.setCallback(new TestCallback(2, 10));
+ si.splitFile(tempInputFile);
+ }
+
+ @Test
+ public void testSplitFilePct() throws Exception {
+ writeSingleInputFile();
+ si.setTestSplitPct(25);
+
+ si.setCallback(new TestCallback(3, 9));
+ si.splitFile(tempInputFile);
+ }
+
+ @Test
+ public void testSplitFilePctLocation() throws Exception {
+ writeSingleInputFile();
+ si.setTestSplitPct(25);
+ si.setSplitLocation(50);
+ si.setCallback(new TestCallback(3, 9));
+ si.splitFile(tempInputFile);
+ }
+
+ @Test
+ public void testSplitFileRandomSelectionSize() throws Exception {
+ writeSingleInputFile();
+ si.setTestRandomSelectionSize(5);
+
+ si.setCallback(new TestCallback(5, 7));
+ si.splitFile(tempInputFile);
+ }
+
+ @Test
+ public void testSplitFileRandomSelectionPct() throws Exception {
+ writeSingleInputFile();
+ si.setTestRandomSelectionPct(25);
+
+ si.setCallback(new TestCallback(3, 9));
+ si.splitFile(tempInputFile);
+ }
+
+ /**
+   * Create a SequenceFile for testing consisting of IntWritable
+ * keys and VectorWritable values
+ * @param path path for test SequenceFile
+ * @param testPoints number of records in test SequenceFile
+ */
+ private void writeVectorSequenceFile(Path path, int testPoints) throws IOException {
+ Path tempSequenceFile = new Path(path, "part-00000");
+ Configuration conf = getConfiguration();
+ IntWritable key = new IntWritable();
+ VectorWritable value = new VectorWritable();
+ try (SequenceFile.Writer writer =
+ SequenceFile.createWriter(fs, conf, tempSequenceFile, IntWritable.class, VectorWritable.class)) {
+ for (int i = 0; i < testPoints; i++) {
+ key.set(i);
+ Vector v = new SequentialAccessSparseVector(4);
+ v.assign(i);
+ value.set(v);
+ writer.append(key, value);
+ }
+ }
+ }
+
+ /**
+   * Create a SequenceFile for testing consisting of IntWritable keys and Text values
+ * @param path path for test SequenceFile
+ * @param testPoints number of records in test SequenceFile
+ */
+ private void writeTextSequenceFile(Path path, int testPoints) throws IOException {
+ Path tempSequenceFile = new Path(path, "part-00000");
+ Configuration conf = getConfiguration();
+ Text key = new Text();
+ Text value = new Text();
+ try (SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, tempSequenceFile, Text.class, Text.class)){
+ for (int i = 0; i < testPoints; i++) {
+ key.set(Integer.toString(i));
+ value.set("Line " + i);
+ writer.append(key, value);
+ }
+ }
+ }
+
+ /**
+ * Display contents of a SequenceFile
+ * @param sequenceFilePath path to SequenceFile
+ */
+ private void displaySequenceFile(Path sequenceFilePath) throws IOException {
+ for (Pair<?,?> record : new SequenceFileIterable<>(sequenceFilePath, true, getConfiguration())) {
+ System.out.println(record.getFirst() + "\t" + record.getSecond());
+ }
+ }
+
+ /**
+ * Determine number of records in a SequenceFile
+ * @param sequenceFilePath path to SequenceFile
+ * @return number of records
+ */
+ private int getNumberRecords(Path sequenceFilePath) throws IOException {
+ int numberRecords = 0;
+ for (Object value : new SequenceFileValueIterable<>(sequenceFilePath, true, getConfiguration())) {
+ numberRecords++;
+ }
+ return numberRecords;
+ }
+
+ /**
+ * Test map reduce version of split input with Text, Text key value
+ * pairs in input
+ */
+ @Test
+ public void testSplitInputMapReduceText() throws Exception {
+ writeTextSequenceFile(tempSequenceDirectory, 1000);
+ testSplitInputMapReduce(1000);
+ }
+
+ /** Test map reduce version of split input with Text, Text key value pairs in input called from command line */
+ @Test
+ public void testSplitInputMapReduceTextCli() throws Exception {
+ writeTextSequenceFile(tempSequenceDirectory, 1000);
+ testSplitInputMapReduceCli(1000);
+ }
+
+ /**
+ * Test map reduce version of split input with IntWritable, Vector key value
+ * pairs in input
+ */
+ @Test
+ public void testSplitInputMapReduceVector() throws Exception {
+ writeVectorSequenceFile(tempSequenceDirectory, 1000);
+ testSplitInputMapReduce(1000);
+ }
+
+ /**
+ * Test map reduce version of split input with IntWritable, Vector key value
+ * pairs in input called from command line
+ */
+ @Test
+ public void testSplitInputMapReduceVectorCli() throws Exception {
+ writeVectorSequenceFile(tempSequenceDirectory, 1000);
+ testSplitInputMapReduceCli(1000);
+ }
+
+ /**
+ * Test map reduce version of split input through CLI
+ */
+ private void testSplitInputMapReduceCli(int numPoints) throws Exception {
+ int randomSelectionPct = 25;
+ int keepPct = 10;
+ String[] args =
+ { "--method", "mapreduce", "--input", tempSequenceDirectory.toString(),
+ "--mapRedOutputDir", tempMapRedOutputDirectory.toString(),
+ "--randomSelectionPct", Integer.toString(randomSelectionPct),
+ "--keepPct", Integer.toString(keepPct), "-ow" };
+ ToolRunner.run(getConfiguration(), new SplitInput(), args);
+ validateSplitInputMapReduce(numPoints, randomSelectionPct, keepPct);
+ }
+
+ /**
+ * Test map reduce version of split input through method call
+ */
+ private void testSplitInputMapReduce(int numPoints) throws Exception {
+ int randomSelectionPct = 25;
+ si.setTestRandomSelectionPct(randomSelectionPct);
+ int keepPct = 10;
+ si.setKeepPct(keepPct);
+ si.setMapRedOutputDirectory(tempMapRedOutputDirectory);
+ si.setUseMapRed(true);
+ si.splitDirectory(getConfiguration(), tempSequenceDirectory);
+
+ validateSplitInputMapReduce(numPoints, randomSelectionPct, keepPct);
+ }
+
+ /**
+ * Validate that number of test records and number of training records
+   * are consistent with keepPct and randomSelectionPct
+ */
+ private void validateSplitInputMapReduce(int numPoints, int randomSelectionPct, int keepPct) throws IOException {
+ Path testPath = new Path(tempMapRedOutputDirectory, "test-r-00000");
+ Path trainingPath = new Path(tempMapRedOutputDirectory, "training-r-00000");
+ int numberTestRecords = getNumberRecords(testPath);
+ int numberTrainingRecords = getNumberRecords(trainingPath);
+ System.out.printf("Test data: %d records\n", numberTestRecords);
+ displaySequenceFile(testPath);
+ System.out.printf("Training data: %d records\n", numberTrainingRecords);
+ displaySequenceFile(trainingPath);
+ assertEquals((randomSelectionPct / 100.0) * (keepPct / 100.0) * numPoints,
+ numberTestRecords, 2);
+ assertEquals(
+ (1 - randomSelectionPct / 100.0) * (keepPct / 100.0) * numPoints,
+ numberTrainingRecords, 2);
+ }
+
+ @Test
+ public void testValidate() throws Exception {
+ SplitInput st = new SplitInput();
+ assertValidateException(st);
+
+ st.setTestSplitSize(100);
+ assertValidateException(st);
+
+ st.setTestOutputDirectory(tempTestDirectory);
+ assertValidateException(st);
+
+ st.setTrainingOutputDirectory(tempTrainingDirectory);
+ st.validate();
+
+ st.setTestSplitPct(50);
+ assertValidateException(st);
+
+ st = new SplitInput();
+ st.setTestRandomSelectionPct(50);
+ st.setTestOutputDirectory(tempTestDirectory);
+ st.setTrainingOutputDirectory(tempTrainingDirectory);
+ st.validate();
+
+ st.setTestSplitPct(50);
+ assertValidateException(st);
+
+ st = new SplitInput();
+ st.setTestRandomSelectionPct(50);
+ st.setTestOutputDirectory(tempTestDirectory);
+ st.setTrainingOutputDirectory(tempTrainingDirectory);
+ st.validate();
+
+ st.setTestSplitSize(100);
+ assertValidateException(st);
+ }
+
+ private class TestCallback implements SplitInput.SplitCallback {
+ private final int testSplitSize;
+ private final int trainingLines;
+
+ private TestCallback(int testSplitSize, int trainingLines) {
+ this.testSplitSize = testSplitSize;
+ this.trainingLines = trainingLines;
+ }
+
+ @Override
+ public void splitComplete(Path inputFile, int lineCount, int trainCount, int testCount, int testSplitStart) {
+ assertSplit(fs, tempInputFile, charset, testSplitSize, trainingLines, tempTrainingDirectory, tempTestDirectory);
+ }
+ }
+
+ private static void assertValidateException(SplitInput st) throws IOException {
+ try {
+ st.validate();
+ fail("Expected IllegalArgumentException");
+ } catch (IllegalArgumentException iae) {
+ // good
+ }
+ }
+
+ private static void assertSplit(FileSystem fs,
+ Path tempInputFile,
+ Charset charset,
+ int testSplitSize,
+ int trainingLines,
+ Path tempTrainingDirectory,
+ Path tempTestDirectory) {
+
+ try {
+ Path testFile = new Path(tempTestDirectory, tempInputFile.getName());
+ //assertTrue("test file exists", testFile.isFile());
+ assertEquals("test line count", testSplitSize, SplitInput.countLines(fs, testFile, charset));
+
+ Path trainingFile = new Path(tempTrainingDirectory, tempInputFile.getName());
+ //assertTrue("training file exists", trainingFile.isFile());
+ assertEquals("training line count", trainingLines, SplitInput.countLines(fs, trainingFile, charset));
+ } catch (IOException ioe) {
+ fail(ioe.toString());
+ }
+ }
+}
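
For the map-reduce validation above, the expected counts follow directly from
the percentages: with numPoints = 1000, keepPct = 10 and randomSelectionPct =
25, roughly 1000 * 0.10 = 100 records survive the keep filter, of which about
25 land in the test split and 75 in the training split. The assertions allow a
slack of 2 either way because the selection is random.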

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/email/MailProcessorTest.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/email/MailProcessorTest.java b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/email/MailProcessorTest.java
new file mode 100644
index 0000000..c519f85
--- /dev/null
+++ b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/email/MailProcessorTest.java
@@ -0,0 +1,72 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.email;
+
+import java.io.File;
+import java.io.StringWriter;
+import java.net.URL;
+import java.util.regex.Pattern;
+
+import org.apache.commons.io.Charsets;
+import org.apache.mahout.common.MahoutTestCase;
+import org.junit.Test;
+
+public final class MailProcessorTest extends MahoutTestCase {
+
+ @Test
+ public void testLabel() throws Exception {
+ StringWriter writer = new StringWriter();
+ MailOptions options = new MailOptions();
+ options.setSeparator(":::");
+ options.setCharset(Charsets.UTF_8);
+ options.setPatternsToMatch(new Pattern[]{
+ MailProcessor.FROM_PREFIX, MailProcessor.SUBJECT_PREFIX, MailProcessor.TO_PREFIX});
+ options.setInput(new File(System.getProperty("user.dir")));
+ MailProcessor proc = new MailProcessor(options, "", writer);
+ URL url = MailProcessorTest.class.getClassLoader().getResource("test.mbox");
+ File file = new File(url.toURI());
+ long count = proc.parseMboxLineByLine(file);
+ assertEquals(7, count);
+ }
+
+ @Test
+ public void testStripQuoted() throws Exception {
+ StringWriter writer = new StringWriter();
+ MailOptions options = new MailOptions();
+ options.setSeparator(":::");
+ options.setCharset(Charsets.UTF_8);
+ options.setPatternsToMatch(new Pattern[]{
+ MailProcessor.SUBJECT_PREFIX});
+ options.setInput(new File(System.getProperty("user.dir")));
+ options.setIncludeBody(true);
+ MailProcessor proc = new MailProcessor(options, "", writer);
+ URL url = MailProcessorTest.class.getClassLoader().getResource("test.mbox");
+ File file = new File(url.toURI());
+ long count = proc.parseMboxLineByLine(file);
+ assertEquals(7, count);
+ assertTrue(writer.getBuffer().toString().contains("> Cocoon Cron Block Configurable Clustering"));
+ writer = new StringWriter();
+ proc = new MailProcessor(options, "", writer);
+ options.setStripQuotedText(true);
+ count = proc.parseMboxLineByLine(file);
+ assertEquals(7, count);
+ assertFalse(writer.getBuffer().toString().contains("> Cocoon Cron Block Configurable Clustering"));
+
+ }
+
+}
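
The second half of testStripQuoted leans on the mbox convention that quoted
reply text is prefixed with "> ": with stripQuotedText enabled the same seven
messages are still parsed, but the quoted "> Cocoon Cron Block ..." line no
longer appears in the writer's output.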

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java
new file mode 100644
index 0000000..4fdbbbc
--- /dev/null
+++ b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java
@@ -0,0 +1,154 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.mahout.utils.nlp.collocations.llr;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.CharsetEncoder;
+
+import org.apache.commons.io.Charsets;
+import org.apache.hadoop.util.bloom.BloomFilter;
+import org.apache.hadoop.util.bloom.Filter;
+import org.apache.hadoop.util.bloom.Key;
+import org.apache.hadoop.util.hash.Hash;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
+import org.apache.lucene.analysis.shingle.ShingleFilter;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.mahout.common.MahoutTestCase;
+import org.junit.Test;
+
+public final class BloomTokenFilterTest extends MahoutTestCase {
+
+ private static final CharsetEncoder encoder = Charsets.UTF_8.newEncoder();
+
+ private static final String input = "The best of times the worst of times";
+ private static final String[] allTokens = {
+ "The", "best", "of", "times", "the", "worst", "of", "times"
+ };
+ private static final String[] expectedNonKeepTokens = { "best", "times", "the", "worst", "times" };
+ private static final String[] expectedKeepTokens = { "The", "of", "of" };
+ private static final String[] filterTokens = { "The", "of" };
+ private static final String[] notFilterTokens = { "best", "worst", "the", "times"};
+ private static final String[] shingleKeepTokens = {
+ "The best", "best of times", "the worst", "worst of times", "of times"
+ };
+ private static final String[] expectedShingleTokens = {
+ "The best", "best of times", "of times", "the worst", "worst of times", "of times"
+ };
+
+  /** Test the standalone filter without TokenFilter wrapping. */
+ @Test
+ public void testFilter() throws IOException {
+ Filter filter = getFilter(filterTokens);
+ Key k = new Key();
+ for (String s: filterTokens) {
+ setKey(k,s);
+ assertTrue("Key for string " + s + " should be filter member", filter.membershipTest(k));
+ }
+
+ for (String s: notFilterTokens) {
+ setKey(k,s);
+ assertFalse("Key for string " + s + " should not be filter member", filter.membershipTest(k));
+ }
+ }
+
+ /** normal case, unfiltered analyzer */
+ @Test
+ public void testAnalyzer() throws IOException {
+ Reader reader = new StringReader(input);
+ Analyzer analyzer = new WhitespaceAnalyzer();
+ TokenStream ts = analyzer.tokenStream(null, reader);
+ ts.reset();
+ validateTokens(allTokens, ts);
+ ts.end();
+ ts.close();
+ }
+
+ /** filtered analyzer */
+ @Test
+  public void testNonKeepAnalyzer() throws IOException {
+ Reader reader = new StringReader(input);
+ Analyzer analyzer = new WhitespaceAnalyzer();
+ TokenStream ts = analyzer.tokenStream(null, reader);
+ ts.reset();
+ TokenStream f = new BloomTokenFilter(getFilter(filterTokens), false /* toss matching tokens */, ts);
+ validateTokens(expectedNonKeepTokens, f);
+ ts.end();
+ ts.close();
+ }
+
+ /** keep analyzer */
+ @Test
+ public void testKeepAnalyzer() throws IOException {
+ Reader reader = new StringReader(input);
+ Analyzer analyzer = new WhitespaceAnalyzer();
+ TokenStream ts = analyzer.tokenStream(null, reader);
+ ts.reset();
+ TokenStream f = new BloomTokenFilter(getFilter(filterTokens), true /* keep matching tokens */, ts);
+ validateTokens(expectedKeepTokens, f);
+ ts.end();
+ ts.close();
+ }
+
+ /** shingles, keep those matching whitelist */
+ @Test
+ public void testShingleFilteredAnalyzer() throws IOException {
+ Reader reader = new StringReader(input);
+ Analyzer analyzer = new WhitespaceAnalyzer();
+ TokenStream ts = analyzer.tokenStream(null, reader);
+ ts.reset();
+ ShingleFilter sf = new ShingleFilter(ts, 3);
+ TokenStream f = new BloomTokenFilter(getFilter(shingleKeepTokens), true, sf);
+ validateTokens(expectedShingleTokens, f);
+ ts.end();
+ ts.close();
+ }
+
+ private static void setKey(Key k, String s) throws IOException {
+ ByteBuffer buffer = encoder.encode(CharBuffer.wrap(s.toCharArray()));
+ k.set(buffer.array(), 1.0);
+ }
+
+ private static void validateTokens(String[] expected, TokenStream ts) throws IOException {
+ int pos = 0;
+ while (ts.incrementToken()) {
+      assertTrue("Analyzer produced too many tokens", pos < expected.length);
+ CharTermAttribute termAttr = ts.getAttribute(CharTermAttribute.class);
+ assertEquals("Unexpected term", expected[pos++], termAttr.toString());
+ }
+ assertEquals("Analyzer produced too few terms", expected.length, pos);
+ }
+
+ private static Filter getFilter(String[] tokens) throws IOException {
+    Filter filter = new BloomFilter(100, 50, Hash.JENKINS_HASH);
+ Key k = new Key();
+ for (String s: tokens) {
+ setKey(k,s);
+ filter.add(k);
+ }
+ return filter;
+ }
+
+}
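
One property worth keeping in mind when reading testFilter: a Bloom filter can
return false positives but never false negatives, so membershipTest is
guaranteed to be true for added keys, while the assertFalse checks on
notFilterTokens hold only probabilistically. With a 100-bit vector, 50 hash
functions and just two inserted keys, the false-positive odds are negligible,
which is presumably why the test can assert them deterministically.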

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/regex/RegexMapperTest.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/regex/RegexMapperTest.java b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/regex/RegexMapperTest.java
new file mode 100644
index 0000000..8ab643b
--- /dev/null
+++ b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/regex/RegexMapperTest.java
@@ -0,0 +1,104 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.regex;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.common.DummyRecordWriter;
+import org.apache.mahout.common.MahoutTestCase;
+import org.junit.Test;
+
+import java.util.List;
+
+public final class RegexMapperTest extends MahoutTestCase {
+
+ @Test
+ public void testRegex() throws Exception {
+ RegexMapper mapper = new RegexMapper();
+ Configuration conf = getConfiguration();
+ conf.set(RegexMapper.REGEX, "(?<=(\\?|&)q=).*?(?=&|$)");
+ conf.set(RegexMapper.TRANSFORMER_CLASS, URLDecodeTransformer.class.getName());
+ DummyRecordWriter<LongWritable, Text> mapWriter = new DummyRecordWriter<>();
+ Mapper<LongWritable, Text, LongWritable, Text>.Context mapContext = DummyRecordWriter
+ .build(mapper, conf, mapWriter);
+
+ mapper.setup(mapContext);
+ for (int i = 0; i < RegexUtilsTest.TEST_STRS.length; i++) {
+ String testStr = RegexUtilsTest.TEST_STRS[i];
+
+ LongWritable key = new LongWritable(i);
+ mapper.map(key, new Text(testStr), mapContext);
+ List<Text> value = mapWriter.getValue(key);
+ if (!RegexUtilsTest.GOLD[i].isEmpty()) {
+ assertEquals(1, value.size());
+ assertEquals(RegexUtilsTest.GOLD[i], value.get(0).toString());
+ }
+ }
+ }
+
+ @Test
+ public void testGroups() throws Exception {
+ RegexMapper mapper = new RegexMapper();
+ Configuration conf = getConfiguration();
+ conf.set(RegexMapper.REGEX, "(\\d+)\\.(\\d+)\\.(\\d+)");
+ conf.set(RegexMapper.TRANSFORMER_CLASS, URLDecodeTransformer.class.getName());
+ conf.setStrings(RegexMapper.GROUP_MATCHERS, "1", "3");
+ DummyRecordWriter<LongWritable, Text> mapWriter = new DummyRecordWriter<>();
+ Mapper<LongWritable, Text, LongWritable, Text>.Context mapContext = DummyRecordWriter
+ .build(mapper, conf, mapWriter);
+
+ mapper.setup(mapContext);
+ for (int i = 0; i < RegexUtilsTest.TEST_STRS.length; i++) {
+ String testStr = RegexUtilsTest.TEST_STRS[i];
+
+ LongWritable key = new LongWritable(i);
+ mapper.map(key, new Text(testStr), mapContext);
+ List<Text> value = mapWriter.getValue(key);
+ assertEquals(1, value.size());
+ assertEquals("127 0", value.get(0).toString());
+ }
+ }
+
+ @Test
+ public void testFPGFormatter() throws Exception {
+ RegexMapper mapper = new RegexMapper();
+ Configuration conf = getConfiguration();
+ conf.set(RegexMapper.REGEX, "(?<=(\\?|&)q=).*?(?=&|$)");
+ conf.set(RegexMapper.TRANSFORMER_CLASS, URLDecodeTransformer.class.getName());
+ conf.set(RegexMapper.FORMATTER_CLASS, FPGFormatter.class.getName());
+ DummyRecordWriter<LongWritable, Text> mapWriter = new DummyRecordWriter<>();
+ Mapper<LongWritable, Text, LongWritable, Text>.Context mapContext = DummyRecordWriter
+ .build(mapper, conf, mapWriter);
+
+ mapper.setup(mapContext);
+ RegexFormatter formatter = new FPGFormatter();
+ for (int i = 0; i < RegexUtilsTest.TEST_STRS.length; i++) {
+ String testStr = RegexUtilsTest.TEST_STRS[i];
+
+ LongWritable key = new LongWritable(i);
+ mapper.map(key, new Text(testStr), mapContext);
+ List<Text> value = mapWriter.getValue(key);
+ if (!RegexUtilsTest.GOLD[i].isEmpty()) {
+ assertEquals(1, value.size());
+ assertEquals(formatter.format(RegexUtilsTest.GOLD[i]), value.get(0).toString());
+ }
+ }
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/regex/RegexUtilsTest.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/regex/RegexUtilsTest.java b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/regex/RegexUtilsTest.java
new file mode 100644
index 0000000..8ae10a5
--- /dev/null
+++ b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/regex/RegexUtilsTest.java
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.regex;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.regex.Pattern;
+
+import org.apache.mahout.common.MahoutTestCase;
+import org.junit.Test;
+
+public final class RegexUtilsTest extends MahoutTestCase {
+
+ static final String[] TEST_STRS = {
+ "127.0.0.1 - - [01/10/2011:00:01:51 +0000] \"GET /solr/collection1/browse?q=foo&rows=10&wt=json&hl=true&hl.fl=body&hl.fl=content",
+ "127.0.0.1 - - [01/10/2011:00:20:58 +0000] \"GET /solr/collection1/browse?q=Using+Solr+Search+RDBMS&fq=%7B%21tag%3Dsource%7D%28%28source%3Alucid+AND+lucid_facet%3A%28site%29%29%29&rows=10",
+ "127.0.0.1 - - [01/10/2011:00:21:21 +0000] \"GET /solr/collection1/browse?q=language+detection&start=560&rows=10 HTTP/1.1\" 200 45071",
+ "127.0.0.1 - - [01/10/2011:00:21:21 +0000] \"GET /solr/collection1/browse?q=&start=560&rows=10 HTTP/1.1\" 200 45071"
+ };
+ static final String[] GOLD = {"foo", "Using Solr Search RDBMS", "language detection", ""};
+
+ @Test
+ public void testExtract() throws Exception {
+ Pattern pattern = Pattern.compile("(?<=(\\?|&)q=).*?(?=&|$)");
+ String line = "127.0.0.1 - - [24/05/2010:01:19:22 +0000] \"GET /solr/select?q=import statement&start=1 HTTP/1.1\" 200 37571";
+ String res = RegexUtils.extract(line, pattern, Collections.<Integer>emptyList(), " ", RegexUtils.IDENTITY_TRANSFORMER);
+    assertEquals("import statement", res);
+
+ for (int i = 0; i < TEST_STRS.length; i++) {
+ String testStr = TEST_STRS[i];
+ res = RegexUtils.extract(testStr, pattern, Collections.<Integer>emptyList(), " ", new URLDecodeTransformer());
+ assertEquals(GOLD[i], res);
+ }
+
+ pattern = Pattern.compile("((?<=(\\?|&)q=)(.*?)(?=(&|$))|(?<=((\\?|&)start=))(\\d+))");
+ res = RegexUtils.extract(line, pattern, Collections.<Integer>emptyList(), " ", RegexUtils.IDENTITY_TRANSFORMER);
+    assertEquals("import statement 1", res);
+
+ pattern = Pattern.compile("(start=1) HTTP");
+ Collection<Integer> groupsToKeep = new ArrayList<>();
+ groupsToKeep.add(1);
+ res = RegexUtils.extract(line, pattern, groupsToKeep, " ", RegexUtils.IDENTITY_TRANSFORMER);
+    assertEquals("start=1", res);
+ }
+}
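
The pattern shared by these regex tests, (?<=(\?|&)q=).*?(?=&|$), uses a
lookbehind to anchor just after a "?q=" or "&q=" query parameter and a lazy
match plus lookahead to stop at the next "&" or end of input, so only the
parameter value itself is returned. A small standalone illustration using
plain java.util.regex, independent of the Mahout helpers:

    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    public class QueryParamDemo {
      public static void main(String[] args) {
        Pattern p = Pattern.compile("(?<=(\\?|&)q=).*?(?=&|$)");
        Matcher m = p.matcher("GET /solr/select?q=import statement&start=1");
        if (m.find()) {
          System.out.println(m.group());  // prints "import statement"
        }
      }
    }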

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/RandomVectorIterable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/RandomVectorIterable.java b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/RandomVectorIterable.java
new file mode 100644
index 0000000..2ddce14
--- /dev/null
+++ b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/RandomVectorIterable.java
@@ -0,0 +1,73 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.vectors;
+
+import java.util.Iterator;
+import java.util.Random;
+
+import com.google.common.base.Function;
+import com.google.common.collect.Iterators;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.common.iterator.CountingIterator;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.RandomAccessSparseVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.function.DoubleFunction;
+
+public final class RandomVectorIterable implements Iterable<Vector> {
+
+ public enum VectorType {DENSE, SPARSE}
+
+ private final int numItems;
+ private final VectorType type;
+
+ public RandomVectorIterable() {
+ this(100, VectorType.SPARSE);
+ }
+
+ public RandomVectorIterable(int numItems) {
+ this(numItems, VectorType.SPARSE);
+ }
+
+ public RandomVectorIterable(int numItems, VectorType type) {
+ this.numItems = numItems;
+ this.type = type;
+ }
+
+ @Override
+ public Iterator<Vector> iterator() {
+ return Iterators.transform(
+ new CountingIterator(numItems),
+ new Function<Integer, Vector>() {
+ private final Random random = RandomUtils.getRandom();
+ @Override
+ public Vector apply(Integer dummy) {
+ Vector result =
+ type == VectorType.SPARSE ? new RandomAccessSparseVector(numItems) : new DenseVector(numItems);
+ result.assign(new DoubleFunction() {
+ @Override
+ public double apply(double ignored) {
+ return random.nextDouble();
+ }
+ });
+ return result;
+ }
+ });
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/VectorHelperTest.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/VectorHelperTest.java b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/VectorHelperTest.java
new file mode 100644
index 0000000..c55fd8d
--- /dev/null
+++ b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/VectorHelperTest.java
@@ -0,0 +1,140 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.vectors;
+
+import java.util.Random;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.mahout.common.MahoutTestCase;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.math.SequentialAccessSparseVector;
+import org.apache.mahout.math.Vector;
+import org.junit.Before;
+import org.junit.Test;
+
+public final class VectorHelperTest extends MahoutTestCase {
+
+ private static final int NUM_DOCS = 100;
+
+ private Path inputPathOne;
+ private Path inputPathTwo;
+
+ private Configuration conf;
+
+ @Override
+ @Before
+ public void setUp() throws Exception {
+ super.setUp();
+ conf = getConfiguration();
+
+ inputPathOne = getTestTempFilePath("documents/docs-one.file");
+ FileSystem fs = FileSystem.get(inputPathOne.toUri(), conf);
+ try (SequenceFile.Writer writer =
+ new SequenceFile.Writer(fs, conf, inputPathOne, Text.class, IntWritable.class)) {
+ Random rd = RandomUtils.getRandom();
+ for (int i = 0; i < NUM_DOCS; i++) {
+ // Make all indices higher than dictionary size
+ writer.append(new Text("Document::ID::" + i), new IntWritable(NUM_DOCS + rd.nextInt(NUM_DOCS)));
+ }
+ }
+
+ inputPathTwo = getTestTempFilePath("documents/docs-two.file");
+ fs = FileSystem.get(inputPathTwo.toUri(), conf);
+ try (SequenceFile.Writer writer =
+ new SequenceFile.Writer(fs, conf, inputPathTwo, Text.class, IntWritable.class)) {
+ Random rd = RandomUtils.getRandom();
+ for (int i = 0; i < NUM_DOCS; i++) {
+ // Keep indices within number of documents
+ writer.append(new Text("Document::ID::" + i), new IntWritable(rd.nextInt(NUM_DOCS)));
+ }
+ }
+ }
+
+ @Test
+ public void testJsonFormatting() throws Exception {
+ Vector v = new SequentialAccessSparseVector(10);
+ v.set(2, 3.1);
+ v.set(4, 1.0);
+ v.set(6, 8.1);
+ v.set(7, -100);
+ v.set(9, 12.2);
+ String UNUSED = "UNUSED";
+ String[] dictionary = {
+ UNUSED, UNUSED, "two", UNUSED, "four", UNUSED, "six", "seven", UNUSED, "nine"
+ };
+
+ assertEquals("sorted json form incorrect: ", "{nine:12.2,six:8.1,two:3.1}",
+ VectorHelper.vectorToJson(v, dictionary, 3, true));
+ assertEquals("unsorted form incorrect: ", "{two:3.1,four:1.0}",
+ VectorHelper.vectorToJson(v, dictionary, 2, false));
+ assertEquals("sorted json form incorrect: ", "{nine:12.2,six:8.1,two:3.1,four:1.0}",
+ VectorHelper.vectorToJson(v, dictionary, 4, true));
+ assertEquals("sorted json form incorrect: ", "{nine:12.2,six:8.1,two:3.1,four:1.0,seven:-100.0}",
+ VectorHelper.vectorToJson(v, dictionary, 5, true));
+ assertEquals("sorted json form incorrect: ", "{nine:12.2,six:8.1}",
+ VectorHelper.vectorToJson(v, dictionary, 2, true));
+ assertEquals("unsorted form incorrect: ", "{two:3.1,four:1.0}",
+ VectorHelper.vectorToJson(v, dictionary, 2, false));
+ }
+
+ @Test
+ public void testTopEntries() throws Exception {
+ Vector v = new SequentialAccessSparseVector(10);
+ v.set(2, 3.1);
+ v.set(4, 1.0);
+ v.set(6, 8.1);
+ v.set(7, -100);
+ v.set(9, 12.2);
+ v.set(1, 0.0);
+ v.set(3, 0.0);
+ v.set(8, 2.7);
+    // check if sizeOfNonZeroElementsInVector == maxEntries
+ assertEquals(6, VectorHelper.topEntries(v, 6).size());
+ // check if sizeOfNonZeroElementsInVector < maxEntries
+ assertTrue(VectorHelper.topEntries(v, 9).size() < 9);
+ // check if sizeOfNonZeroElementsInVector > maxEntries
+ assertTrue(VectorHelper.topEntries(v, 5).size() < v.getNumNonZeroElements());
+ }
+
+ @Test
+ public void testTopEntriesWhenAllZeros() throws Exception {
+ Vector v = new SequentialAccessSparseVector(10);
+ v.set(2, 0.0);
+ v.set(4, 0.0);
+ v.set(6, 0.0);
+ v.set(7, 0);
+ v.set(9, 0.0);
+ v.set(1, 0.0);
+ v.set(3, 0.0);
+ v.set(8, 0.0);
+ assertEquals(0, VectorHelper.topEntries(v, 6).size());
+ }
+
+ @Test
+ public void testLoadTermDictionary() throws Exception {
+ // With indices higher than dictionary size
+ VectorHelper.loadTermDictionary(conf, inputPathOne.toString());
+ // With dictionary size higher than indices
+ VectorHelper.loadTermDictionary(conf, inputPathTwo.toString());
+ }
+}
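
As the testJsonFormatting assertions show, vectorToJson with the final argument
set to true emits the top maxEntries entries ordered by descending weight
(which is why seven:-100.0 only shows up once five entries are requested),
while false simply walks the nonzero entries in index order and truncates.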

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFTypeTest.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFTypeTest.java b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFTypeTest.java
new file mode 100644
index 0000000..2ea8b89
--- /dev/null
+++ b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFTypeTest.java
@@ -0,0 +1,35 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.vectors.arff;
+
+import org.apache.mahout.common.MahoutTestCase;
+import org.junit.Test;
+
+public final class ARFFTypeTest extends MahoutTestCase {
+
+ @Test
+ public void removeQuotes() {
+ assertNull(ARFFType.removeQuotes(null));
+ assertEquals("", ARFFType.removeQuotes("\"\""));
+ assertEquals("", ARFFType.removeQuotes("''"));
+ assertEquals("", ARFFType.removeQuotes(""));
+ assertEquals("", ARFFType.removeQuotes(" "));
+ assertEquals("single", ARFFType.removeQuotes("'single'"));
+ assertEquals("double", ARFFType.removeQuotes("\"double\""));
+ assertEquals("trim", ARFFType.removeQuotes(" trim "));
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java
new file mode 100644
index 0000000..4c7f17a
--- /dev/null
+++ b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java
@@ -0,0 +1,289 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.vectors.arff;
+
+import java.io.IOException;
+import java.text.DateFormat;
+import java.text.SimpleDateFormat;
+import java.util.Date;
+import java.util.Iterator;
+import java.util.Locale;
+import java.util.Map;
+
+import com.google.common.io.Resources;
+import org.apache.commons.io.Charsets;
+import org.apache.mahout.common.MahoutTestCase;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.RandomAccessSparseVector;
+import org.apache.mahout.math.Vector;
+import org.junit.Test;
+
+public final class ARFFVectorIterableTest extends MahoutTestCase {
+
+ @Test
+ public void testValues() throws Exception {
+ ARFFVectorIterable iterable = readModelFromResource("sample.arff");
+
+ assertEquals("Mahout", iterable.getModel().getRelation());
+ Map<String, Integer> bindings = iterable.getModel().getLabelBindings();
+ assertNotNull(bindings);
+ assertEquals(5, bindings.size());
+ Iterator<Vector> iter = iterable.iterator();
+ assertTrue(iter.hasNext());
+ Vector next = iter.next();
+ assertNotNull(next);
+ assertTrue("Wrong instanceof", next instanceof DenseVector);
+ assertEquals(1.0, next.get(0), EPSILON);
+ assertEquals(2.0, next.get(1), EPSILON);
+ assertTrue(iter.hasNext());
+ next = iter.next();
+ assertNotNull(next);
+ assertTrue("Wrong instanceof", next instanceof DenseVector);
+ assertEquals(2.0, next.get(0), EPSILON);
+ assertEquals(3.0, next.get(1), EPSILON);
+
+ assertTrue(iter.hasNext());
+ next = iter.next();
+ assertNotNull(next);
+ assertTrue("Wrong instanceof", next instanceof RandomAccessSparseVector);
+ assertEquals(5.0, next.get(0), EPSILON);
+ assertEquals(23.0, next.get(1), EPSILON);
+
+ assertFalse(iter.hasNext());
+ }
+
+ @Test
+ public void testDense() throws Exception {
+ Iterable<Vector> iterable = readModelFromResource("sample-dense.arff");
+ Vector firstVector = iterable.iterator().next();
+ assertEquals(1.0, firstVector.get(0), 0);
+ assertEquals(65.0, firstVector.get(1), 0);
+ assertEquals(1.0, firstVector.get(3), 0);
+ assertEquals(1.0, firstVector.get(4), 0);
+
+ int count = 0;
+ for (Vector vector : iterable) {
+ assertTrue("Vector is not dense", vector instanceof DenseVector);
+ count++;
+ }
+ assertEquals(5, count);
+ }
+
+ @Test
+ public void testSparse() throws Exception {
+ Iterable<Vector> iterable = readModelFromResource("sample-sparse.arff");
+
+ Vector firstVector = iterable.iterator().next();
+ assertEquals(23.1, firstVector.get(1), 0);
+ assertEquals(3.23, firstVector.get(2), 0);
+ assertEquals(1.2, firstVector.get(3), 0);
+
+ int count = 0;
+ for (Vector vector : iterable) {
+ assertTrue("Vector is not dense", vector instanceof RandomAccessSparseVector);
+ count++;
+ }
+ assertEquals(9, count);
+ }
+
+ @Test
+ public void testNonNumeric() throws Exception {
+ MapBackedARFFModel model = new MapBackedARFFModel();
+ ARFFVectorIterable iterable = getVectors("non-numeric-1.arff", model);
+ int count = 0;
+ for (Vector vector : iterable) {
+ assertTrue("Vector is not dense", vector instanceof RandomAccessSparseVector);
+ count++;
+ }
+
+ iterable = getVectors("non-numeric-1.arff", model);
+ Iterator<Vector> iter = iterable.iterator();
+ Vector firstVector = iter.next();
+
+ assertEquals(1.0, firstVector.get(2), 0);
+
+ assertEquals(10, count);
+ Map<String, Map<String, Integer>> nominalMap = iterable.getModel().getNominalMap();
+ assertNotNull(nominalMap);
+ assertEquals(1, nominalMap.size());
+ Map<String, Integer> noms = nominalMap.get("bar");
+ assertNotNull("nominals for bar are null", noms);
+ assertEquals(5, noms.size());
+ Map<Integer, ARFFType> integerARFFTypeMap = model.getTypeMap();
+ assertNotNull("Type map null", integerARFFTypeMap);
+ assertEquals(5, integerARFFTypeMap.size());
+ Map<String, Long> words = model.getWords();
+ assertNotNull("words null", words);
+ assertEquals(10, words.size());
+ Map<Integer, DateFormat> integerDateFormatMap = model.getDateMap();
+ assertNotNull("date format null", integerDateFormatMap);
+ assertEquals(1, integerDateFormatMap.size());
+ }
+
+ @Test
+ public void testDate() throws Exception {
+ ARFFVectorIterable iterable = readModelFromResource("date.arff");
+ Iterator<Vector> iter = iterable.iterator();
+ Vector firstVector = iter.next();
+
+ DateFormat format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", Locale.ENGLISH);
+ Date date = format.parse("2001-07-04T12:08:56");
+ long result = date.getTime();
+ assertEquals(result, firstVector.get(1), 0);
+
+ format = new SimpleDateFormat("yyyy.MM.dd G 'at' HH:mm:ss z", Locale.ENGLISH);
+ date = format.parse("2001.07.04 AD at 12:08:56 PDT");
+ result = date.getTime();
+ assertEquals(result, firstVector.get(2), 0);
+
+ format = new SimpleDateFormat("EEE, MMM d, ''yy", Locale.ENGLISH);
+ date = format.parse("Wed, Jul 4, '01,4 0:08 PM, PDT");
+ result = date.getTime();
+ assertEquals(result, firstVector.get(3), 0);
+
+ format = new SimpleDateFormat("K:mm a, z", Locale.ENGLISH);
+ date = format.parse("0:08 PM, PDT");
+ result = date.getTime();
+ assertEquals(result, firstVector.get(4), 0);
+
+ format = new SimpleDateFormat("yyyyy.MMMMM.dd GGG hh:mm aaa", Locale.ENGLISH);
+ date = format.parse("02001.July.04 AD 12:08 PM");
+ result = date.getTime();
+ assertEquals(result, firstVector.get(5), 0);
+
+ format = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss Z", Locale.ENGLISH);
+ date = format.parse("Wed, 4 Jul 2001 12:08:56 -0700");
+ result = date.getTime();
+ assertEquals(result, firstVector.get(6), 0);
+
+ }
+
+ @Test
+ public void testMultipleNoms() throws Exception {
+ MapBackedARFFModel model = new MapBackedARFFModel();
+ ARFFVectorIterable iterable = getVectors("non-numeric-1.arff", model);
+ int count = 0;
+ for (Vector vector : iterable) {
+ assertTrue("Vector is not dense", vector instanceof RandomAccessSparseVector);
+ count++;
+ }
+ assertEquals(10, count);
+ Map<String,Map<String,Integer>> nominalMap = iterable.getModel().getNominalMap();
+ assertNotNull(nominalMap);
+ assertEquals(1, nominalMap.size());
+ Map<String,Integer> noms = nominalMap.get("bar");
+ assertNotNull("nominals for bar are null", noms);
+ assertEquals(5, noms.size());
+ Map<Integer,ARFFType> integerARFFTypeMap = model.getTypeMap();
+ assertNotNull("Type map null", integerARFFTypeMap);
+ assertEquals(5, integerARFFTypeMap.size());
+ Map<String,Long> words = model.getWords();
+ assertNotNull("words null", words);
+ assertEquals(10, words.size());
+
+ Map<Integer,DateFormat> integerDateFormatMap = model.getDateMap();
+ assertNotNull("date format null", integerDateFormatMap);
+ assertEquals(1, integerDateFormatMap.size());
+
+
+ iterable = getVectors("non-numeric-2.arff", model);
+ count = 0;
+ for (Vector vector : iterable) {
+ assertTrue("Vector is not dense", vector instanceof RandomAccessSparseVector);
+ count++;
+ }
+ nominalMap = model.getNominalMap();
+ assertNotNull(nominalMap);
+ assertEquals(2, nominalMap.size());
+ noms = nominalMap.get("test");
+ assertNotNull("nominals for bar are null", noms);
+ assertEquals(2, noms.size());
+ }
+
+ @Test
+ public void testNumerics() throws Exception {
+ String arff = "@RELATION numerics\n"
+ + "@ATTRIBUTE theNumeric NUMERIC\n"
+ + "@ATTRIBUTE theInteger INTEGER\n"
+ + "@ATTRIBUTE theReal REAL\n"
+ + "@DATA\n"
+ + "1.0,2,3.0";
+ ARFFModel model = new MapBackedARFFModel();
+ ARFFVectorIterable iterable = new ARFFVectorIterable(arff, model);
+ model = iterable.getModel();
+ assertNotNull(model);
+ assertEquals(3, model.getLabelSize());
+ assertEquals(ARFFType.NUMERIC, model.getARFFType(0));
+ assertEquals(ARFFType.INTEGER, model.getARFFType(1));
+ assertEquals(ARFFType.REAL, model.getARFFType(2));
+ Iterator<Vector> it = iterable.iterator();
+ Vector vector = it.next();
+ assertEquals(1.0, vector.get(0), EPSILON);
+ assertEquals(2.0, vector.get(1), EPSILON);
+ assertEquals(3.0, vector.get(2), EPSILON);
+ }
+
+ @Test
+ public void testQuotes() throws Exception {
+ // ARFF allows quotes on identifiers
+ ARFFModel model = new MapBackedARFFModel();
+ ARFFVectorIterable iterable = getVectors("quoted-id.arff", model);
+ model = iterable.getModel();
+ assertNotNull(model);
+ assertEquals("quotes", model.getRelation());
+
+ // check attribute labels
+ assertEquals(4, model.getLabelSize());
+ assertEquals(ARFFType.NUMERIC, model.getARFFType(0));
+ assertEquals(ARFFType.INTEGER, model.getARFFType(1));
+ assertEquals(ARFFType.REAL, model.getARFFType(2));
+ assertEquals(ARFFType.NOMINAL, model.getARFFType(3));
+
+ Map<String, Integer> labelBindings = model.getLabelBindings();
+ assertTrue(labelBindings.keySet().contains("thenumeric"));
+ assertTrue(labelBindings.keySet().contains("theinteger"));
+ assertTrue(labelBindings.keySet().contains("thereal"));
+ assertTrue(labelBindings.keySet().contains("thenominal"));
+
+ // check nominal values
+ Map<String, Integer> nominalMap = model.getNominalMap().get("thenominal");
+ assertNotNull(nominalMap);
+ assertEquals(3, nominalMap.size());
+ assertTrue(nominalMap.keySet().contains("double-quote"));
+ assertTrue(nominalMap.keySet().contains("single-quote"));
+ assertTrue(nominalMap.keySet().contains("no-quote"));
+
+ // check data values
+ Iterator<Vector> it = iterable.iterator();
+ Vector vector = it.next();
+ assertEquals(nominalMap.get("no-quote"), vector.get(3), EPSILON);
+ assertEquals(nominalMap.get("single-quote"), it.next().get(3), EPSILON);
+ assertEquals(nominalMap.get("double-quote"), it.next().get(3), EPSILON);
+ }
+
+ static ARFFVectorIterable getVectors(String resourceName, ARFFModel model) throws IOException {
+ String sample = Resources.toString(Resources.getResource(resourceName), Charsets.UTF_8);
+ return new ARFFVectorIterable(sample, model);
+ }
+
+ private static ARFFVectorIterable readModelFromResource(String resourceName) throws IOException {
+ ARFFModel model = new MapBackedARFFModel();
+ return getVectors(resourceName, model);
+ }
+
+}
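
For readers unfamiliar with the two ARFF data layouts these tests exercise:
dense rows list every attribute value in order, while sparse rows list only the
nonzero entries as {index value, ...} pairs, which is why testSparse expects
RandomAccessSparseVector instances. A hypothetical sample-sparse.arff row
consistent with the first-vector assertions above (the real resource file may
declare more attributes and instances):

    @RELATION sample-sparse
    @ATTRIBUTE a NUMERIC
    @ATTRIBUTE b NUMERIC
    @ATTRIBUTE c NUMERIC
    @ATTRIBUTE d NUMERIC
    @DATA
    {1 23.1, 2 3.23, 3 1.2}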

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/arff/DriverTest.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/arff/DriverTest.java b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/arff/DriverTest.java
new file mode 100644
index 0000000..7e7623e
--- /dev/null
+++ b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/arff/DriverTest.java
@@ -0,0 +1,54 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.utils.vectors.arff;
+
+import java.io.IOException;
+import java.io.StringWriter;
+
+import com.google.common.io.Resources;
+import org.apache.commons.io.Charsets;
+import org.apache.mahout.common.MahoutTestCase;
+import org.junit.Test;
+
+/**
+ * Test case for {@link Driver}
+ */
+public class DriverTest extends MahoutTestCase {
+
+ @Test
+ public void dictionary() throws IOException {
+
+ ARFFModel model = new MapBackedARFFModel();
+ ARFFVectorIterableTest.getVectors("sample-dense.arff", model);
+ StringWriter writer = new StringWriter();
+ Driver.writeLabelBindings(writer, model, ",");
+ String expected1 = Resources.toString(Resources.getResource("expected-arff-dictionary.csv"), Charsets.UTF_8);
+ String expected2 = Resources.toString(Resources.getResource("expected-arff-dictionary-2.csv"), Charsets.UTF_8);
+ assertTrue(expected1.equals(writer.toString()) || expected2.equals(writer.toString()));
+ }
+
+
+ @Test
+ public void dictionaryJSON() throws IOException {
+ ARFFModel model = new MapBackedARFFModel();
+ ARFFVectorIterableTest.getVectors("sample-dense.arff", model);
+ StringWriter writer = new StringWriter();
+ Driver.writeLabelBindingsJSON(writer, model);
+ String expected1 = Resources.toString(Resources.getResource("expected-arff-schema.json"), Charsets.UTF_8);
+ String expected2 = Resources.toString(Resources.getResource("expected-arff-schema-2.json"), Charsets.UTF_8);
+ assertTrue(expected1.equals(writer.toString()) || expected2.equals(writer.toString()));
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModelTest.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModelTest.java b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModelTest.java
new file mode 100644
index 0000000..2867640
--- /dev/null
+++ b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModelTest.java
@@ -0,0 +1,60 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.vectors.arff;
+
+import org.apache.mahout.common.MahoutTestCase;
+import org.junit.Test;
+
+import java.util.Map;
+
+public class MapBackedARFFModelTest extends MahoutTestCase {
+
+ @Test
+ public void processNominal() {
+ String windy = "windy";
+ String breezy = "breezy";
+
+ ARFFModel model = new MapBackedARFFModel();
+ model.addNominal(windy, breezy, 77);
+ model.addNominal(windy, "strong", 23);
+ model.addNominal(windy, "nuking", 55);
+ Map<String, Map<String, Integer>> nominalMap = model.getNominalMap();
+
+ assertEquals(1, nominalMap.size());
+ Map<String, Integer> windyValues = nominalMap.get(windy);
+ assertEquals(77, windyValues.get(breezy).intValue());
+ }
+
+ @Test
+ public void processBadNumeric() {
+ ARFFModel model = new MapBackedARFFModel();
+ model.addLabel("b1shkt70694difsmmmdv0ikmoh", 77);
+ model.addType(77, ARFFType.REAL);
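+ // a label that cannot be parsed as a REAL value is expected to map to NaN rather than throw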
+ assertTrue(Double.isNaN(model.getValue("b1shkt70694difsmmmdv0ikmoh", 77)));
+ }
+
+ @Test
+ public void processGoodNumeric() {
+ ARFFModel model = new MapBackedARFFModel();
+ model.addLabel("1234", 77);
+ model.addType(77, ARFFType.INTEGER);
+ assertEquals(1234, model.getValue("1234", 77), EPSILON);
+ model.addLabel("131.34", 78);
+ model.addType(78, ARFFType.REAL);
+ assertEquals(131.34, model.getValue("131.34", 78), EPSILON);
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIteratorTest.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIteratorTest.java b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIteratorTest.java
new file mode 100644
index 0000000..e76cf70
--- /dev/null
+++ b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIteratorTest.java
@@ -0,0 +1,57 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.vectors.csv;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.io.StringWriter;
+import java.util.Iterator;
+
+import org.apache.mahout.common.MahoutTestCase;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.utils.vectors.RandomVectorIterable;
+import org.apache.mahout.utils.vectors.VectorHelper;
+import org.apache.mahout.utils.vectors.io.TextualVectorWriter;
+import org.junit.Test;
+
+public class CSVVectorIteratorTest extends MahoutTestCase {
+
+ @Test
+ public void testCount() throws Exception {
+
+ StringWriter sWriter = new StringWriter();
+ try (TextualVectorWriter writer = new TextualVectorWriter(sWriter) {
+ @Override
+ public void write(Vector vector) throws IOException {
+ String vecStr = VectorHelper.vectorToCSVString(vector, false);
+ getWriter().write(vecStr);
+ }
+ }) {
+ Iterable<Vector> iter = new RandomVectorIterable(50);
+ writer.write(iter);
+ }
+
+ Iterator<Vector> csvIter = new CSVVectorIterator(new StringReader(sWriter.getBuffer().toString()));
+ int count = 0;
+ while (csvIter.hasNext()) {
+ csvIter.next();
+ count++;
+ }
+ assertEquals(50, count);
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/io/VectorWriterTest.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/io/VectorWriterTest.java b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/io/VectorWriterTest.java
new file mode 100644
index 0000000..e2f7032
--- /dev/null
+++ b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/io/VectorWriterTest.java
@@ -0,0 +1,67 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.vectors.io;
+
+import java.io.StringWriter;
+import java.util.ArrayList;
+import java.util.Collection;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.MahoutTestCase;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.utils.vectors.RandomVectorIterable;
+import org.junit.Test;
+
+public final class VectorWriterTest extends MahoutTestCase {
+
+ @Test
+ public void testSFVW() throws Exception {
+ Path path = getTestTempFilePath("sfvw");
+ Configuration conf = getConfiguration();
+ FileSystem fs = FileSystem.get(conf);
+ SequenceFile.Writer seqWriter = new SequenceFile.Writer(fs, conf, path, LongWritable.class, VectorWritable.class);
+ try (SequenceFileVectorWriter writer = new SequenceFileVectorWriter(seqWriter)) {
+ writer.write(new RandomVectorIterable(50));
+ }
+
+ long count = HadoopUtil.countRecords(path, conf);
+ assertEquals(50, count);
+ }
+
+ @Test
+ public void testTextOutputSize() throws Exception {
+ StringWriter strWriter = new StringWriter();
+ try (VectorWriter writer = new TextualVectorWriter(strWriter)) {
+ Collection<Vector> vectors = new ArrayList<>();
+ vectors.add(new DenseVector(new double[]{0.3, 1.5, 4.5}));
+ vectors.add(new DenseVector(new double[]{1.3, 1.5, 3.5}));
+ writer.write(vectors);
+ }
+ String buffer = strWriter.toString();
+ assertNotNull(buffer);
+ assertFalse(buffer.isEmpty());
+
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfoTest.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfoTest.java b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfoTest.java
new file mode 100644
index 0000000..890a14b
--- /dev/null
+++ b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfoTest.java
@@ -0,0 +1,121 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.vectors.lucene;
+
+
+import java.io.IOException;
+
+import com.google.common.io.Closeables;
+
+import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.FieldType;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexOptions;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.mahout.common.MahoutTestCase;
+import org.junit.Before;
+import org.junit.Test;
+
+public class CachedTermInfoTest extends MahoutTestCase {
+ private RAMDirectory directory;
+ private static final String[] DOCS = {
+ "a a b b c c",
+ "a b a b a b a b",
+ "a b a",
+ "a",
+ "b",
+ "a",
+ "a"
+ };
+
+ private static final String[] DOCS2 = {
+ "d d d d",
+ "e e e e",
+ "d e d e",
+ "d",
+ "e",
+ "d",
+ "e"
+ };
+
+ @Before
+ public void before() throws IOException {
+ directory = new RAMDirectory();
+
+ FieldType fieldType = new FieldType();
+ fieldType.setStored(false);
+ fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+ fieldType.setTokenized(true);
+ fieldType.setStoreTermVectors(false);
+ fieldType.setStoreTermVectorPositions(false);
+ fieldType.setStoreTermVectorOffsets(false);
+ fieldType.freeze();
+
+ directory = createTestIndex(fieldType, directory, 0);
+ }
+
+ @Test
+ public void test() throws Exception {
+ IndexReader reader = DirectoryReader.open(directory);
+ CachedTermInfo cti = new CachedTermInfo(reader, "content", 0, 100);
+ assertEquals(3, cti.totalTerms("content"));
+ assertNotNull(cti.getTermEntry("content", "a"));
+ assertNull(cti.getTermEntry("content", "e"));
+ //minDf
+ cti = new CachedTermInfo(reader, "content", 3, 100);
+ assertEquals(2, cti.totalTerms("content"));
+ assertNotNull(cti.getTermEntry("content", "a"));
+ assertNull(cti.getTermEntry("content", "c"));
+ //maxDFPercent, a is in 6 of 7 docs: numDocs * maxDfPercent / 100 < 6 to exclude, 85% should suffice to exclude a
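+ //worked check: 7 * 85 / 100 = 5.95 &lt; 6, so "a" (df = 6) is excluded while "b" (df 4) and "c" (df 1) survive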
+ cti = new CachedTermInfo(reader, "content", 0, 85);
+ assertEquals(2, cti.totalTerms("content"));
+ assertNotNull(cti.getTermEntry("content", "b"));
+ assertNotNull(cti.getTermEntry("content", "c"));
+ assertNull(cti.getTermEntry("content", "a"));
+
+
+ }
+
+ static RAMDirectory createTestIndex(FieldType fieldType,
+ RAMDirectory directory,
+ int startingId) throws IOException {
+ IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(new WhitespaceAnalyzer()));
+
+ try {
+ for (int i = 0; i < DOCS.length; i++) {
+ Document doc = new Document();
+ Field id = new StringField("id", "doc_" + (i + startingId), Field.Store.YES);
+ doc.add(id);
+ Field text = new Field("content", DOCS[i], fieldType);
+ doc.add(text);
+ Field text2 = new Field("content2", DOCS2[i], fieldType);
+ doc.add(text2);
+ writer.addDocument(doc);
+ }
+ } finally {
+ Closeables.close(writer, false);
+ }
+ return directory;
+ }
+}
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/DriverTest.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/DriverTest.java b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/DriverTest.java
new file mode 100644
index 0000000..86c8305
--- /dev/null
+++ b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/DriverTest.java
@@ -0,0 +1,136 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.vectors.lucene;
+
+import com.google.common.collect.Sets;
+import com.google.common.io.Closeables;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.FieldType;
+import org.apache.lucene.index.IndexOptions;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.SimpleFSDirectory;
+import org.apache.mahout.common.MahoutTestCase;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.file.Paths;
+import java.util.Set;
+
+public class DriverTest extends MahoutTestCase {
+
+ private File indexDir;
+ private File outputDir;
+ private Configuration conf;
+
+ @Before
+ @Override
+ public void setUp() throws Exception {
+ super.setUp();
+ indexDir = getTestTempDir("intermediate");
+ indexDir.delete();
+ outputDir = getTestTempDir("output");
+ outputDir.delete();
+
+ conf = getConfiguration();
+ }
+
+ private Document asDocument(String line) {
+ Document doc = new Document();
+ doc.add(new TextFieldWithTermVectors("text", line));
+ return doc;
+ }
+
+ static class TextFieldWithTermVectors extends Field {
+
+ public static final FieldType TYPE = new FieldType();
+
+ static {
+ TYPE.setOmitNorms(true);
+ TYPE.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
+ TYPE.setStored(true);
+ TYPE.setTokenized(true);
+ TYPE.setStoreTermVectors(true);
+ TYPE.freeze();
+ }
+
+ public TextFieldWithTermVectors(String name, String value) {
+ super(name, value, TYPE);
+ }
+ }
+
+ @Test
+ public void sequenceFileDictionary() throws IOException {
+
+ Directory index = new SimpleFSDirectory(Paths.get(indexDir.getAbsolutePath()));
+ Analyzer analyzer = new StandardAnalyzer();
+ IndexWriterConfig config = new IndexWriterConfig(analyzer);
+ config.setCommitOnClose(true);
+ final IndexWriter writer = new IndexWriter(index, config);
+
+ try {
+ writer.addDocument(asDocument("One Ring to rule them all"));
+ writer.addDocument(asDocument("One Ring to find them,"));
+ writer.addDocument(asDocument("One Ring to bring them all"));
+ writer.addDocument(asDocument("and in the darkness bind them"));
+ } finally {
+ writer.close();
+ }
+
+ File seqDict = new File(outputDir, "dict.seq");
+
+ Driver.main(new String[] {
+ "--dir", indexDir.getAbsolutePath(),
+ "--output", new File(outputDir, "out").getAbsolutePath(),
+ "--field", "text",
+ "--dictOut", new File(outputDir, "dict.txt").getAbsolutePath(),
+ "--seqDictOut", seqDict.getAbsolutePath(),
+ });
+
+ SequenceFile.Reader reader = null;
+ Set<String> indexTerms = Sets.newHashSet();
+ try {
+ reader = new SequenceFile.Reader(FileSystem.getLocal(conf), new Path(seqDict.getAbsolutePath()), conf);
+ Text term = new Text();
+ IntWritable termIndex = new IntWritable();
+
+ while (reader.next(term, termIndex)) {
+ indexTerms.add(term.toString());
+ }
+ } finally {
+ Closeables.close(reader, true);
+ }
+
+ Set<String> expectedIndexTerms = Sets.newHashSet("all", "bind", "bring", "darkness", "find", "one", "ring", "rule");
+
+ // the dictionary must not contain any term outside the expected set:
+ // if the union adds nothing new, indexTerms is a subset of expectedIndexTerms
+ assertEquals(expectedIndexTerms.size(), Sets.union(expectedIndexTerms, indexTerms).size());
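+ // (a stricter check could also assert Sets.symmetricDifference(expectedIndexTerms, indexTerms).isEmpty())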
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java
new file mode 100644
index 0000000..8d92551
--- /dev/null
+++ b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java
@@ -0,0 +1,195 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.vectors.lucene;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Iterators;
+
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.FieldType;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexOptions;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.mahout.common.MahoutTestCase;
+import org.apache.mahout.math.NamedVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.utils.vectors.TermInfo;
+import org.apache.mahout.vectorizer.TFIDF;
+import org.apache.mahout.vectorizer.Weight;
+import org.junit.Before;
+import org.junit.Test;
+
+public final class LuceneIterableTest extends MahoutTestCase {
+
+ private static final String [] DOCS = {
+ "The quick red fox jumped over the lazy brown dogs.",
+ "Mary had a little lamb whose fleece was white as snow.",
+ "Moby Dick is a story of a whale and a man obsessed.",
+ "The robber wore a black fleece jacket and a baseball cap.",
+ "The English Springer Spaniel is the best of all dogs."
+ };
+
+ private RAMDirectory directory;
+
+ private final FieldType TYPE_NO_TERM_VECTORS = new FieldType();
+
+ private final FieldType TYPE_TERM_VECTORS = new FieldType();
+
+ @Before
+ public void before() throws IOException {
+
+ TYPE_NO_TERM_VECTORS.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+ TYPE_NO_TERM_VECTORS.setTokenized(true);
+ TYPE_NO_TERM_VECTORS.setStoreTermVectors(false);
+ TYPE_NO_TERM_VECTORS.setStoreTermVectorPositions(false);
+ TYPE_NO_TERM_VECTORS.setStoreTermVectorOffsets(false);
+ TYPE_NO_TERM_VECTORS.freeze();
+
+ TYPE_TERM_VECTORS.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+ TYPE_TERM_VECTORS.setTokenized(true);
+ TYPE_TERM_VECTORS.setStored(true);
+ TYPE_TERM_VECTORS.setStoreTermVectors(true);
+ TYPE_TERM_VECTORS.setStoreTermVectorPositions(true);
+ TYPE_TERM_VECTORS.setStoreTermVectorOffsets(true);
+ TYPE_TERM_VECTORS.freeze();
+
+ directory = createTestIndex(TYPE_TERM_VECTORS);
+ }
+
+ @Test
+ public void testIterable() throws Exception {
+ IndexReader reader = DirectoryReader.open(directory);
+ Weight weight = new TFIDF();
+ TermInfo termInfo = new CachedTermInfo(reader, "content", 1, 100);
+ LuceneIterable iterable = new LuceneIterable(reader, "id", "content", termInfo,weight);
+
+ //TODO: do something more meaningful here
+ for (Vector vector : iterable) {
+ assertNotNull(vector);
+ assertTrue("vector is not an instanceof " + NamedVector.class, vector instanceof NamedVector);
+ assertTrue("vector Size: " + vector.size() + " is not greater than: " + 0, vector.size() > 0);
+ assertTrue(((NamedVector) vector).getName().startsWith("doc_"));
+ }
+
+ iterable = new LuceneIterable(reader, "id", "content", termInfo,weight, 3);
+
+ //TODO: do something more meaningful here
+ for (Vector vector : iterable) {
+ assertNotNull(vector);
+ assertTrue("vector is not an instanceof " + NamedVector.class, vector instanceof NamedVector);
+ assertTrue("vector Size: " + vector.size() + " is not greater than: " + 0, vector.size() > 0);
+ assertTrue(((NamedVector) vector).getName().startsWith("doc_"));
+ }
+
+ }
+
+ @Test(expected = IllegalStateException.class)
+ public void testIterableNoTermVectors() throws IOException {
+ RAMDirectory directory = createTestIndex(TYPE_NO_TERM_VECTORS);
+ IndexReader reader = DirectoryReader.open(directory);
+
+ Weight weight = new TFIDF();
+ TermInfo termInfo = new CachedTermInfo(reader, "content", 1, 100);
+ LuceneIterable iterable = new LuceneIterable(reader, "id", "content", termInfo,weight);
+
+ Iterator<Vector> iterator = iterable.iterator();
+ Iterators.advance(iterator, 1);
+ }
+
+ @Test
+ public void testIterableSomeNoiseTermVectors() throws IOException {
+ //docs with term vectors
+ RAMDirectory directory = createTestIndex(TYPE_TERM_VECTORS, new RAMDirectory(), 0);
+ //docs without term vectors: the "noise" the tolerance must absorb
+ createTestIndex(TYPE_NO_TERM_VECTORS, directory, 5);
+ IndexReader reader = DirectoryReader.open(directory);
+
+ Weight weight = new TFIDF();
+ TermInfo termInfo = new CachedTermInfo(reader, "content", 1, 100);
+
+ boolean exceptionThrown;
+ //0 percent tolerance
+ LuceneIterable iterable = new LuceneIterable(reader, "id", "content", termInfo, weight);
+ try {
+ Iterables.skip(iterable, Iterables.size(iterable));
+ exceptionThrown = false;
+ } catch (IllegalStateException ise) {
+ exceptionThrown = true;
+ }
+ assertTrue(exceptionThrown);
+
+ //100 percent tolerance
+ iterable = new LuceneIterable(reader, "id", "content", termInfo,weight, -1, 1.0);
+ try {
+ Iterables.skip(iterable, Iterables.size(iterable));
+ exceptionThrown = false;
+ } catch (IllegalStateException ise) {
+ exceptionThrown = true;
+ }
+ assertFalse(exceptionThrown);
+
+ //50 percent tolerance
+ iterable = new LuceneIterable(reader, "id", "content", termInfo,weight, -1, 0.5);
+ Iterator<Vector> iterator = iterable.iterator();
+ Iterators.advance(iterator, 5);
+
+ try {
+ Iterators.advance(iterator, Iterators.size(iterator));
+ exceptionThrown = false;
+ } catch (IllegalStateException ise) {
+ exceptionThrown = true;
+ }
+ assertTrue(exceptionThrown);
+ }
+
+ static RAMDirectory createTestIndex(FieldType fieldType) throws IOException {
+ return createTestIndex(fieldType, new RAMDirectory(), 0);
+ }
+
+ static RAMDirectory createTestIndex(FieldType fieldType,
+ RAMDirectory directory,
+ int startingId) throws IOException {
+
+ try (IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(new StandardAnalyzer()))) {
+ for (int i = 0; i < DOCS.length; i++) {
+ Document doc = new Document();
+ Field id = new StringField("id", "doc_" + (i + startingId), Field.Store.YES);
+ doc.add(id);
+ //Store both position and offset information
+ Field text = new Field("content", DOCS[i], fieldType);
+ doc.add(text);
+ Field text2 = new Field("content2", DOCS[i], fieldType);
+ doc.add(text2);
+ writer.addDocument(doc);
+ }
+ }
+ return directory;
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/resources/date.arff
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/test/resources/date.arff b/community/mahout-mr/integration/src/test/resources/date.arff
new file mode 100644
index 0000000..9daeb52
--- /dev/null
+++ b/community/mahout-mr/integration/src/test/resources/date.arff
@@ -0,0 +1,18 @@
+ % Comments
+ %
+ % Comments go here %
+ @RELATION MahoutDateTest
+
+ @ATTRIBUTE junk NUMERIC
+ @ATTRIBUTE date1 date
+ @ATTRIBUTE date2 date "yyyy.MM.dd G 'at' HH:mm:ss z"
+ @ATTRIBUTE date3 date "EEE, MMM d, ''yy"
+ @ATTRIBUTE date4 date "K:mm a, z"
+ @ATTRIBUTE date5 date "yyyyy.MMMMM.dd GGG hh:mm aaa"
+ @ATTRIBUTE date6 date "EEE, d MMM yyyy HH:mm:ss Z"
+
+
+
+ @DATA
+ {0 1,1 "2001-07-04T12:08:56",2 "2001.07.04 AD at 12:08:56 PDT",3 "Wed, Jul 4, '01,4 0:08 PM, PDT",4 "0:08 PM, PDT", 5 "02001.July.04 AD 12:08 PM" ,6 "Wed, 4 Jul 2001 12:08:56 -0700" }
+ {0 2,1 "2001-08-04T12:09:56",2 "2011.07.04 AD at 12:08:56 PDT",3 "Mon, Jul 4, '11,4 0:08 PM, PDT",4 "0:08 PM, PDT", 5 "02001.July.14 AD 12:08 PM" ,6 "Mon, 4 Jul 2011 12:08:56 -0700" }

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/resources/expected-arff-dictionary-2.csv
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/test/resources/expected-arff-dictionary-2.csv b/community/mahout-mr/integration/src/test/resources/expected-arff-dictionary-2.csv
new file mode 100644
index 0000000..acb1c43
--- /dev/null
+++ b/community/mahout-mr/integration/src/test/resources/expected-arff-dictionary-2.csv
@@ -0,0 +1,22 @@
+Label bindings for Relation golf
+temperature,1
+humidity,2
+outlook,0
+class,4
+windy,3
+
+Values for nominal attributes
+3
+outlook
+3
+rain,3
+overcast,2
+sunny,1
+class
+2
+play,2
+dont_play,1
+windy
+2
+false,1
+true,2

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/resources/expected-arff-dictionary.csv
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/test/resources/expected-arff-dictionary.csv b/community/mahout-mr/integration/src/test/resources/expected-arff-dictionary.csv
new file mode 100644
index 0000000..f2dac13
--- /dev/null
+++ b/community/mahout-mr/integration/src/test/resources/expected-arff-dictionary.csv
@@ -0,0 +1,22 @@
+Label bindings for Relation golf
+humidity,2
+windy,3
+outlook,0
+class,4
+temperature,1
+
+Values for nominal attributes
+3
+windy
+2
+true,2
+false,1
+outlook
+3
+sunny,1
+overcast,2
+rain,3
+class
+2
+play,2
+dont_play,1

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/resources/expected-arff-schema-2.json
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/test/resources/expected-arff-schema-2.json b/community/mahout-mr/integration/src/test/resources/expected-arff-schema-2.json
new file mode 100644
index 0000000..b73f55c
--- /dev/null
+++ b/community/mahout-mr/integration/src/test/resources/expected-arff-schema-2.json
@@ -0,0 +1 @@
+[{"values":["rain","overcast","sunny"],"label":"false","attribute":"outlook","type":"categorical"},{"label":"false","attribute":"temperature","type":"numerical"},{"label":"false","attribute":"humidity","type":"numerical"},{"values":["false","true"],"label":"false","attribute":"windy","type":"categorical"},{"values":["play","dont_play"],"label":"true","attribute":"class","type":"categorical"}]
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/resources/expected-arff-schema.json
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/test/resources/expected-arff-schema.json b/community/mahout-mr/integration/src/test/resources/expected-arff-schema.json
new file mode 100644
index 0000000..36e0c89
--- /dev/null
+++ b/community/mahout-mr/integration/src/test/resources/expected-arff-schema.json
@@ -0,0 +1 @@
+[{"values":["sunny","overcast","rain"],"attribute":"outlook","label":"false","type":"categorical"},{"attribute":"temperature","label":"false","type":"numerical"},{"attribute":"humidity","label":"false","type":"numerical"},{"values":["true","false"],"attribute":"windy","label":"false","type":"categorical"},{"values":["play","dont_play"],"attribute":"class","label":"true","type":"categorical"}]
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/resources/non-numeric-1.arff
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/test/resources/non-numeric-1.arff b/community/mahout-mr/integration/src/test/resources/non-numeric-1.arff
new file mode 100644
index 0000000..bf0c746
--- /dev/null
+++ b/community/mahout-mr/integration/src/test/resources/non-numeric-1.arff
@@ -0,0 +1,24 @@
+ % Comments
+ %
+ % Comments go here %
+ @RELATION Mahout
+
+ @ATTRIBUTE junk NUMERIC
+ @ATTRIBUTE foo NUMERIC
+ @ATTRIBUTE bar {c,d,'xy, numeric','marc o\'polo', e}
+ @ATTRIBUTE hockey string
+ @ATTRIBUTE football date "yyyy-MM-dd"
+
+
+
+ @DATA
+ {2 c,3 gretzky,4 1973-10-23}
+ {1 2.9,2 d,3 orr,4 1973-11-23}
+ {2 c,3 bossy,4 1981-10-23}
+ {1 2.6,2 c,3 lefleur,4 1989-10-23}
+ {3 esposito,4 1973-04-23}
+ {1 23.2,2 d,3 chelios,4 1999-2-23}
+ {3 richard,4 1973-10-12}
+ {3 howe,4 1983-06-23}
+ {0 2.2,2 d,3 messier,4 2008-11-23}
+ {2 c,3 roy,4 1973-10-13}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/resources/non-numeric-2.arff
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/test/resources/non-numeric-2.arff b/community/mahout-mr/integration/src/test/resources/non-numeric-2.arff
new file mode 100644
index 0000000..6df35b5
--- /dev/null
+++ b/community/mahout-mr/integration/src/test/resources/non-numeric-2.arff
@@ -0,0 +1,24 @@
+ % Comments
+ %
+ % Comments go here %
+ @RELATION Mahout
+
+ @ATTRIBUTE junk NUMERIC
+ @ATTRIBUTE foo NUMERIC
+ @ATTRIBUTE test {f,z}
+ @ATTRIBUTE hockey string
+ @ATTRIBUTE football date "yyyy-MM-dd"
+
+
+
+ @DATA
+ {2 f,3 gretzky,4 1973-10-23}
+ {1 2.9,2 z,3 orr,4 1973-11-23}
+ {2 f,3 bossy,4 1981-10-23}
+ {1 2.6,2 f,3 lefleur,4 1989-10-23}
+ {3 esposito,4 1973-04-23}
+ {1 23.2,2 z,3 chelios,4 1999-2-23}
+ {3 richard,4 1973-10-12}
+ {3 howe,4 1983-06-23}
+ {0 2.2,2 f,3 messier,4 2008-11-23}
+ {2 f,3 roy,4 1973-10-13}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/resources/quoted-id.arff
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/test/resources/quoted-id.arff b/community/mahout-mr/integration/src/test/resources/quoted-id.arff
new file mode 100644
index 0000000..1f724ed
--- /dev/null
+++ b/community/mahout-mr/integration/src/test/resources/quoted-id.arff
@@ -0,0 +1,9 @@
+@RELATION 'quotes'
+@ATTRIBUTE 'theNumeric' NUMERIC
+@ATTRIBUTE "theInteger" INTEGER
+@ATTRIBUTE theReal REAL
+@ATTRIBUTE theNominal {"double-quote", 'single-quote', no-quote}
+@DATA
+1.0,2,3.0,"no-quote"
+4.0,5,6.0,single-quote
+7.0,8,9.0,'double-quote'

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/resources/sample-dense.arff
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/test/resources/sample-dense.arff b/community/mahout-mr/integration/src/test/resources/sample-dense.arff
new file mode 100644
index 0000000..dbf5dd2
--- /dev/null
+++ b/community/mahout-mr/integration/src/test/resources/sample-dense.arff
@@ -0,0 +1,20 @@
+ % Comments
+ %
+ % Comments go here %
+ @RELATION golf
+
+ @ATTRIBUTE outlook {sunny,overcast, rain}
+ @ATTRIBUTE temperature NUMERIC
+ @ATTRIBUTE humidity NUMERIC
+ @ATTRIBUTE windy {false, true}
+ @ATTRIBUTE class {dont_play, play}
+
+
+
+ @DATA
+ sunny, 65, ?, false, dont_play, {2}
+ sunny, 80, 90, true, dont_play
+ overcast, 83, 78, false, play ,{3}
+ rain, 70, 96, false, play
+ rain, 68, 80, false, play
+ rain, 65, 70, true, play

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/resources/sample-sparse.arff
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/test/resources/sample-sparse.arff b/community/mahout-mr/integration/src/test/resources/sample-sparse.arff
new file mode 100644
index 0000000..25e1f9c
--- /dev/null
+++ b/community/mahout-mr/integration/src/test/resources/sample-sparse.arff
@@ -0,0 +1,24 @@
+ % Comments
+ %
+ % Comments go here %
+ @RELATION Mahout
+
+ @ATTRIBUTE foo NUMERIC
+ @ATTRIBUTE bar NUMERIC
+ @ATTRIBUTE hockey NUMERIC
+ @ATTRIBUTE football NUMERIC
+ @ATTRIBUTE tennis NUMERIC
+
+
+
+ @DATA
+ {1 23.1,2 3.23,3 1.2,4 ?} {5}
+ {0 2.9}
+ {0 2.7,2 3.2,3 1.3,4 0.2} {10}
+ {1 2.6,2 3.1,3 1.23,4 0.2}
+ {1 23.0,2 3.6,3 1.2,4 0.2}
+ {0 23.2,1 3.9,3 1.7,4 0.2}
+ {0 2.6,1 3.2,2 1.2,4 0.3}
+ {1 23.0,2 3.2,3 1.23}
+ {1 2.2,2 2.94,3 0.2}
+ {1 2.9,2 3.1}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/resources/sample.arff
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/test/resources/sample.arff b/community/mahout-mr/integration/src/test/resources/sample.arff
new file mode 100644
index 0000000..cd04b32
--- /dev/null
+++ b/community/mahout-mr/integration/src/test/resources/sample.arff
@@ -0,0 +1,11 @@
+%comments
+@RELATION Mahout
+@ATTRIBUTE foo numeric
+@ATTRIBUTE bar numeric
+@ATTRIBUTE timestamp DATE "yyyy-MM-dd HH:mm:ss"
+@ATTRIBUTE junk string
+@ATTRIBUTE theNominal {c,b,a}
+@DATA
+1,2, "2009-01-01 5:55:55", foo, c
+2,3
+{0 5,1 23}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/resources/test.mbox
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/test/resources/test.mbox b/community/mahout-mr/integration/src/test/resources/test.mbox
new file mode 100644
index 0000000..99017c0
--- /dev/null
+++ b/community/mahout-mr/integration/src/test/resources/test.mbox
@@ -0,0 +1,1038 @@
+From dev-return-102527-apmail-cocoon-dev-archive=***@cocoon.apache.org Wed Sep 01 21:01:35 2010
+Return-Path: <dev-return-102527-apmail-cocoon-dev-archive=***@cocoon.apache.org>
+Delivered-To: apmail-cocoon-dev-***@www.apache.org
+Received: (qmail 34434 invoked from network); 1 Sep 2010 21:01:34 -0000
+Received: from unknown (HELO mail.apache.org) (140.211.11.3)
+ by 140.211.11.9 with SMTP; 1 Sep 2010 21:01:34 -0000
+Received: (qmail 26895 invoked by uid 500); 1 Sep 2010 21:01:34 -0000
+Delivered-To: apmail-cocoon-dev-***@cocoon.apache.org
+Received: (qmail 26771 invoked by uid 500); 1 Sep 2010 21:01:33 -0000
+Mailing-List: contact dev-***@cocoon.apache.org; run by ezmlm
+Precedence: bulk
+list-help: <mailto:dev-***@cocoon.apache.org>
+list-unsubscribe: <mailto:dev-***@cocoon.apache.org>
+List-Post: <mailto:***@cocoon.apache.org>
+Reply-To: ***@cocoon.apache.org
+List-Id: <dev.cocoon.apache.org>
+Delivered-To: mailing list ***@cocoon.apache.org
+Received: (qmail 26764 invoked by uid 99); 1 Sep 2010 21:01:33 -0000
+Received: from Unknown (HELO nike.apache.org) (192.87.106.230)
+ by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 01 Sep 2010 21:01:33 +0000
+X-ASF-Spam-Status: No, hits=-2000.0 required=10.0
+ tests=ALL_TRUSTED
+X-Spam-Check-By: apache.org
+Received: from [140.211.11.22] (HELO thor.apache.org) (140.211.11.22)
+ by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 01 Sep 2010 21:01:16 +0000
+Received: from thor (localhost [127.0.0.1])
+ by thor.apache.org (8.13.8+Sun/8.13.8) with ESMTP id o81L0sNK020435
+ for <***@cocoon.apache.org>; Wed, 1 Sep 2010 21:00:54 GMT
+Message-ID: <***@thor>
+Date: Wed, 1 Sep 2010 17:00:54 -0400 (EDT)
+From: "Douglas Hurbon (JIRA)" <***@apache.org>
+To: ***@cocoon.apache.org
+Subject: [jira] Created: (COCOON-2300) jboss-5.1.0.GA vfszip protocol in
+ CharsetFactory
+MIME-Version: 1.0
+Content-Type: text/plain; charset=utf-8
+Content-Transfer-Encoding: 7bit
+X-JIRA-FingerPrint: 30527f35849b9dde25b450d4833f0394
+X-Virus-Checked: Checked by ClamAV on apache.org
+
+jboss-5.1.0.GA vfszip protocol in CharsetFactory
+------------------------------------------------
+
+ Key: COCOON-2300
+ URL: https://issues.apache.org/jira/browse/COCOON-2300
+ Project: Cocoon
+ Issue Type: Bug
+ Components: Blocks: Serializers
+ Affects Versions: 2.1.11
+ Reporter: Douglas Hurbon
+ Fix For: 2.1.12-dev (Current SVN)
+
+
+Cocoon fails to initialize on Jboss 5.1 due to the new vfszip protocol it uses for class loading. CharsetFactory expects either jar:/ or file:/
+
+Parsing the vfszip protocol in CharsetFactory solves the problem.
+
+--
+This message is automatically generated by JIRA.
+-
+You can reply to this email to add a comment to the issue online.
+
+
+From dev-return-102528-apmail-cocoon-dev-archive=***@cocoon.apache.org Wed Sep 01 21:03:16 2010
+Return-Path: <dev-return-102528-apmail-cocoon-dev-archive=***@cocoon.apache.org>
+Delivered-To: apmail-cocoon-dev-***@www.apache.org
+Received: (qmail 34824 invoked from network); 1 Sep 2010 21:03:16 -0000
+Received: from unknown (HELO mail.apache.org) (140.211.11.3)
+ by 140.211.11.9 with SMTP; 1 Sep 2010 21:03:16 -0000
+Received: (qmail 29126 invoked by uid 500); 1 Sep 2010 21:03:16 -0000
+Delivered-To: apmail-cocoon-dev-***@cocoon.apache.org
+Received: (qmail 29044 invoked by uid 500); 1 Sep 2010 21:03:15 -0000
+Mailing-List: contact dev-***@cocoon.apache.org; run by ezmlm
+Precedence: bulk
+list-help: <mailto:dev-***@cocoon.apache.org>
+list-unsubscribe: <mailto:dev-***@cocoon.apache.org>
+List-Post: <mailto:***@cocoon.apache.org>
+Reply-To: ***@cocoon.apache.org
+List-Id: <dev.cocoon.apache.org>
+Delivered-To: mailing list ***@cocoon.apache.org
+Received: (qmail 28904 invoked by uid 99); 1 Sep 2010 21:03:15 -0000
+Received: from athena.apache.org (HELO athena.apache.org) (140.211.11.136)
+ by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 01 Sep 2010 21:03:15 +0000
+X-ASF-Spam-Status: No, hits=-2000.0 required=10.0
+ tests=ALL_TRUSTED
+X-Spam-Check-By: apache.org
+Received: from [140.211.11.22] (HELO thor.apache.org) (140.211.11.22)
+ by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 01 Sep 2010 21:03:14 +0000
+Received: from thor (localhost [127.0.0.1])
+ by thor.apache.org (8.13.8+Sun/8.13.8) with ESMTP id o81L2sFQ020591
+ for <***@cocoon.apache.org>; Wed, 1 Sep 2010 21:02:54 GMT
+Message-ID: <***@thor>
+Date: Wed, 1 Sep 2010 17:02:54 -0400 (EDT)
+From: "Douglas Hurbon (JIRA)" <***@apache.org>
+To: ***@cocoon.apache.org
+Subject: [jira] Updated: (COCOON-2300) jboss-5.1.0.GA vfszip protocol in
+ CharsetFactory
+In-Reply-To: <***@thor>
+MIME-Version: 1.0
+Content-Type: text/plain; charset=utf-8
+Content-Transfer-Encoding: 7bit
+X-JIRA-FingerPrint: 30527f35849b9dde25b450d4833f0394
+
+
+ [ https://issues.apache.org/jira/browse/COCOON-2300?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
+
+Douglas Hurbon updated COCOON-2300:
+-----------------------------------
+
+ Attachment: CharsetFactory.patch
+
+Patch for the CharsetFactory running on Jboss 5.1.
+
+> jboss-5.1.0.GA vfszip protocol in CharsetFactory
+> ------------------------------------------------
+>
+> Key: COCOON-2300
+> URL: https://issues.apache.org/jira/browse/COCOON-2300
+> Project: Cocoon
+> Issue Type: Bug
+> Components: Blocks: Serializers
+> Affects Versions: 2.1.11
+> Reporter: Douglas Hurbon
+> Fix For: 2.1.12-dev (Current SVN)
+>
+> Attachments: CharsetFactory.patch
+>
+>
+> Cocoon fails to initialize on Jboss 5.1 due to the new vfszip protocol it uses for class loading. CharsetFactory expects either jar:/ or file:/
+> Parsing the vfszip protocol in CharsetFactory solves the problem.
+
+--
+This message is automatically generated by JIRA.
+-
+You can reply to this email to add a comment to the issue online.
+
+
+From dev-return-102529-apmail-cocoon-dev-archive=***@cocoon.apache.org Wed Sep 08 14:41:10 2010
+Return-Path: <dev-return-102529-apmail-cocoon-dev-archive=***@cocoon.apache.org>
+Delivered-To: apmail-cocoon-dev-***@www.apache.org
+Received: (qmail 13040 invoked from network); 8 Sep 2010 14:41:09 -0000
+Received: from unknown (HELO mail.apache.org) (140.211.11.3)
+ by 140.211.11.9 with SMTP; 8 Sep 2010 14:41:09 -0000
+Received: (qmail 76345 invoked by uid 500); 8 Sep 2010 14:41:09 -0000
+Delivered-To: apmail-cocoon-dev-***@cocoon.apache.org
+Received: (qmail 75377 invoked by uid 500); 8 Sep 2010 14:41:05 -0000
+Mailing-List: contact dev-***@cocoon.apache.org; run by ezmlm
+Precedence: bulk
+list-help: <mailto:dev-***@cocoon.apache.org>
+list-unsubscribe: <mailto:dev-***@cocoon.apache.org>
+List-Post: <mailto:***@cocoon.apache.org>
+Reply-To: ***@cocoon.apache.org
+List-Id: <dev.cocoon.apache.org>
+Delivered-To: mailing list ***@cocoon.apache.org
+Received: (qmail 75370 invoked by uid 99); 8 Sep 2010 14:41:03 -0000
+Received: from nike.apache.org (HELO nike.apache.org) (192.87.106.230)
+ by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 08 Sep 2010 14:41:03 +0000
+X-ASF-Spam-Status: No, hits=-2000.0 required=10.0
+ tests=ALL_TRUSTED
+X-Spam-Check-By: apache.org
+Received: from [140.211.11.22] (HELO thor.apache.org) (140.211.11.22)
+ by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 08 Sep 2010 14:40:59 +0000
+Received: from thor (localhost [127.0.0.1])
+ by thor.apache.org (8.13.8+Sun/8.13.8) with ESMTP id o88EebFT004291
+ for <***@cocoon.apache.org>; Wed, 8 Sep 2010 14:40:38 GMT
+Message-ID: <***@thor>
+Date: Wed, 8 Sep 2010 10:40:37 -0400 (EDT)
+From: ***@apache.org
+To: ***@cocoon.apache.org
+Subject: [jira] Subscription: COCOON-open-with-patch
+MIME-Version: 1.0
+Content-Type: text/plain; charset=utf-8
+Content-Transfer-Encoding: 7bit
+X-JIRA-FingerPrint: 30527f35849b9dde25b450d4833f0394
+X-Virus-Checked: Checked by ClamAV on apache.org
+
+Issue Subscription
+Filter: COCOON-open-with-patch (114 issues)
+Subscriber: cocoon
+
+Key Summary
+COCOON-2300 jboss-5.1.0.GA vfszip protocol in CharsetFactory
+ https://issues.apache.org/jira/browse/COCOON-2300
+COCOON-2298 IncludeTransformer does not handle multi-valued parameters
+ https://issues.apache.org/jira/browse/COCOON-2298
+COCOON-2297 Character encoding does not follow JTidy properties
+ https://issues.apache.org/jira/browse/COCOON-2297
+COCOON-2296 [PATCH] Make flowscript work with Commons JXPath 1.3
+ https://issues.apache.org/jira/browse/COCOON-2296
+COCOON-2295 integrating FOP-1.0 into Cocoon-2.1.12-dev
+ https://issues.apache.org/jira/browse/COCOON-2295
+COCOON-2294 Wrong version number for cocoon-serializers-impl in parent pom for revision 964648
+ https://issues.apache.org/jira/browse/COCOON-2294
+COCOON-2290 CLONE - Add a read method to the SitemapComponentTestCase
+ https://issues.apache.org/jira/browse/COCOON-2290
+COCOON-2288 Allow usage of SLF4J for traces
+ https://issues.apache.org/jira/browse/COCOON-2288
+COCOON-2281 "Communication tools that we use" link to dev mailing list archive comes out at user mailing list archive
+ https://issues.apache.org/jira/browse/COCOON-2281
+COCOON-2268 To extend the image reader we need to change the visibility to the parameter of the ImageReader
+ https://issues.apache.org/jira/browse/COCOON-2268
+COCOON-2262 container.refresh() is called before embeddedServlet.init()
+ https://issues.apache.org/jira/browse/COCOON-2262
+COCOON-2260 wrong parent version in pom of cocoon-flowscript-impl
+ https://issues.apache.org/jira/browse/COCOON-2260
+COCOON-2249 XHTMLSerializer uses entity references &quot; and &apos; which cause JavaScript parse errors
+ https://issues.apache.org/jira/browse/COCOON-2249
+COCOON-2246 HttpRequest should handle encoding in getParameter and getParameterValues in the same way
+ https://issues.apache.org/jira/browse/COCOON-2246
+COCOON-2233 Update archetypes to current trunk artifact versions
+ https://issues.apache.org/jira/browse/COCOON-2233
+COCOON-2222 Add SaxParser configuration properties
+ https://issues.apache.org/jira/browse/COCOON-2222
+COCOON-2216 IncludeCacheManager can not perfom parallel includes
+ https://issues.apache.org/jira/browse/COCOON-2216
+COCOON-2212 jx:attribute does not check name is correct before proceeding
+ https://issues.apache.org/jira/browse/COCOON-2212
+COCOON-2197 Making the cocoon-auth-block acegi-security-sample work
+ https://issues.apache.org/jira/browse/COCOON-2197
+COCOON-2173 AbstractCachingProcessingPipeline: Two requests can deadlock each other
+ https://issues.apache.org/jira/browse/COCOON-2173
+COCOON-2162 [PATCH] Fix for Paginator when accessing out of bounds Pagination page
+ https://issues.apache.org/jira/browse/COCOON-2162
+COCOON-2137 XSD Schemas for CForms Development
+ https://issues.apache.org/jira/browse/COCOON-2137
+COCOON-2114 fix sorting in TraversableGenerator
+ https://issues.apache.org/jira/browse/COCOON-2114
+COCOON-2108 xmodule:flow-attr Does not accept document objects
+ https://issues.apache.org/jira/browse/COCOON-2108
+COCOON-2100 Retrieving mimeType returned by pipeline executed from Flow
+ https://issues.apache.org/jira/browse/COCOON-2100
+COCOON-2040 Union widget does not work with booleanfield set as case widget
+ https://issues.apache.org/jira/browse/COCOON-2040
+COCOON-2037 New DynamicGroup widget
+ https://issues.apache.org/jira/browse/COCOON-2037
+COCOON-2032 [PATCH] Sort order in paginated repeater
+ https://issues.apache.org/jira/browse/COCOON-2032
+COCOON-2030 submit-on-change doesn't work for a multivaluefield with list-type="checkbox"
+ https://issues.apache.org/jira/browse/COCOON-2030
+COCOON-2018 Use thread context class loader to load custom binding classes
+ https://issues.apache.org/jira/browse/COCOON-2018
+COCOON-2017 More output beautification options for serializers
+ https://issues.apache.org/jira/browse/COCOON-2017
+COCOON-2015 Doctype added twice because root element (html) is inlined
+ https://issues.apache.org/jira/browse/COCOON-2015
+COCOON-2002 HTML transformer only works with latin-1 characters
+ https://issues.apache.org/jira/browse/COCOON-2002
+COCOON-1974 Donating ContextAttributeInputModule
+ https://issues.apache.org/jira/browse/COCOON-1974
+COCOON-1973 CaptchaValidator: allow case-insensitive matching
+ https://issues.apache.org/jira/browse/COCOON-1973
+COCOON-1964 Redirects inside a block called via the servlet protocol fail
+ https://issues.apache.org/jira/browse/COCOON-1964
+COCOON-1963 Add a redirect action to the browser update handler
+ https://issues.apache.org/jira/browse/COCOON-1963
+COCOON-1960 Pipeline errors for "generator/reader already set" should provide more information
+ https://issues.apache.org/jira/browse/COCOON-1960
+COCOON-1949 [PATCH] load flowscript from file into specified Rhino context object
+ https://issues.apache.org/jira/browse/COCOON-1949
+COCOON-1946 [PATCH] - Javaflow Sample errors trying to enhance Javaflow classes and showing cform templates
+ https://issues.apache.org/jira/browse/COCOON-1946
+COCOON-1943 [Patch] Parameters in blocks-protocol URIs get decoded too early
+ https://issues.apache.org/jira/browse/COCOON-1943
+COCOON-1932 [PATCH] correct styling of disabled suggestion lists
+ https://issues.apache.org/jira/browse/COCOON-1932
+COCOON-1929 [PATCH] Reloading classloader in Cocoon 2.2
+ https://issues.apache.org/jira/browse/COCOON-1929
+COCOON-1917 Request Encoding problem: multipart/form vs. url encoded
+ https://issues.apache.org/jira/browse/COCOON-1917
+COCOON-1915 Nullable value with additional String or XMLizable in JavaSelectionList
+ https://issues.apache.org/jira/browse/COCOON-1915
+COCOON-1914 Text as XMLizable in EmptySelectionList
+ https://issues.apache.org/jira/browse/COCOON-1914
+COCOON-1899 [PATCH] Cocoon XML:DB Implementation should not depend on Xindice
+ https://issues.apache.org/jira/browse/COCOON-1899
+COCOON-1898 [PATCH] XPatch support for maven-cocoon-deployer-plugin
+ https://issues.apache.org/jira/browse/COCOON-1898
+COCOON-1893 XML-Binding: Problem creating a new element
+ https://issues.apache.org/jira/browse/COCOON-1893
+COCOON-1877 [PATCH] Pageable Repeater
+ https://issues.apache.org/jira/browse/COCOON-1877
+COCOON-1870 Lucene block does not store attributes when instructed so
+ https://issues.apache.org/jira/browse/COCOON-1870
+COCOON-1846 [PATCH] BooleanField and radio do not send on-value-changed at the rigth time with IE
+ https://issues.apache.org/jira/browse/COCOON-1846
+COCOON-1843 LDAPTransformer: add-entry tag doesn't work
+ https://issues.apache.org/jira/browse/COCOON-1843
+COCOON-1842 LDAPTransformer: ClassCastException with Binary fields
+ https://issues.apache.org/jira/browse/COCOON-1842
+COCOON-1810 [PATCH] JMSEventMessageListener does not work
+ https://issues.apache.org/jira/browse/COCOON-1810
+COCOON-1807 Workaround for IE Bug in <button>
+ https://issues.apache.org/jira/browse/COCOON-1807
+COCOON-1794 [PATCH] Propagation of namespaces to a repeaters child bindings and implementation of a move-node binding
+ https://issues.apache.org/jira/browse/COCOON-1794
+COCOON-1738 double-listbox problem in repeaters
+ https://issues.apache.org/jira/browse/COCOON-1738
+COCOON-1726 Implementation of Source that supports conditional GETs
+ https://issues.apache.org/jira/browse/COCOON-1726
+COCOON-1717 Use custom cache keys for caching uri coplets using input modules.
+ https://issues.apache.org/jira/browse/COCOON-1717
+COCOON-1697 Allow request parameters to be used in "for (var k in h)" kind of Javascript Loops
+ https://issues.apache.org/jira/browse/COCOON-1697
+COCOON-1648 Add support for ISO8601 in I18nTransformer and Forms
+ https://issues.apache.org/jira/browse/COCOON-1648
+COCOON-1618 [PATCH] SoapGenerator/Serializer for Axis Block
+ https://issues.apache.org/jira/browse/COCOON-1618
+COCOON-1611 [PATCH] Add additonal constructor to FormInstance.java to be able to pass a locale
+ https://issues.apache.org/jira/browse/COCOON-1611
+COCOON-1603 [PATCH] handling of alternatives in MailTransformer
+ https://issues.apache.org/jira/browse/COCOON-1603
+COCOON-1573 Improvement SetAttributeJXPathBinding and Contribution SetNodeValueJXPathBinding
+ https://issues.apache.org/jira/browse/COCOON-1573
+COCOON-1556 [PATCH] Add a JXPathConvertor for conversion betwean beans and Strings
+ https://issues.apache.org/jira/browse/COCOON-1556
+COCOON-1535 [PATCH] enhancement to {global:} input module: return all sitemap globals
+ https://issues.apache.org/jira/browse/COCOON-1535
+COCOON-1527 [PATCH] Cache control logic sheets for XSP to override getKey and getValidity
+ https://issues.apache.org/jira/browse/COCOON-1527
+COCOON-1526 [PATCH] processToDOM returns a read-only DOM
+ https://issues.apache.org/jira/browse/COCOON-1526
+COCOON-1519 [PATCH] TeeTransformer refactoring
+ https://issues.apache.org/jira/browse/COCOON-1519
+COCOON-1508 [PATCH] Avalonize TranscoderFactory
+ https://issues.apache.org/jira/browse/COCOON-1508
+COCOON-1506 [PATCH] Manually specifying a mounted sitemap's context
+ https://issues.apache.org/jira/browse/COCOON-1506
+COCOON-1488 [PATCH] htmlunit-based testing, needs to be ported to 2.2
+ https://issues.apache.org/jira/browse/COCOON-1488
+COCOON-1467 ESQL exception handling problem
+ https://issues.apache.org/jira/browse/COCOON-1467
+COCOON-1439 [poi] vertical text orientation and font cache
+ https://issues.apache.org/jira/browse/COCOON-1439
+COCOON-1398 New CachingPortletAdapter
+ https://issues.apache.org/jira/browse/COCOON-1398
+COCOON-1395 [PATCH] Missing ContextAttributeInputModule
+ https://issues.apache.org/jira/browse/COCOON-1395
+COCOON-1394 [PATCH] Implementation of PortletRequest#getQueryString()
+ https://issues.apache.org/jira/browse/COCOON-1394
+COCOON-1384 [PATCH] flow redirector should allow explicit 'cocoon:' scheme
+ https://issues.apache.org/jira/browse/COCOON-1384
+COCOON-1370 [PATCH] proxy block can now use JTidy and handle multipart POST
+ https://issues.apache.org/jira/browse/COCOON-1370
+COCOON-1368 [PATCH] HTTPRequestTransformer
+ https://issues.apache.org/jira/browse/COCOON-1368
+COCOON-1362 [PATCH] log4j.xconf should have the same default config as logkit.xconf
+ https://issues.apache.org/jira/browse/COCOON-1362
+COCOON-1360 [patch] client side validation for CForms
+ https://issues.apache.org/jira/browse/COCOON-1360
+COCOON-1345 [PATCH] Extract convertors into their own block
+ https://issues.apache.org/jira/browse/COCOON-1345
+COCOON-1340 [PATCH] lucene block contribution : a AnalyzerManager component
+ https://issues.apache.org/jira/browse/COCOON-1340
+COCOON-1337 [PATCH] Suggestion for widget population
+ https://issues.apache.org/jira/browse/COCOON-1337
+COCOON-1336 [PATCH] PortletWindowAspect: hiding portlet mode icons and new feature "force-sizable"
+ https://issues.apache.org/jira/browse/COCOON-1336
+COCOON-1332 [PATCH] content-length and content-type for portlet ActionRequest
+ https://issues.apache.org/jira/browse/COCOON-1332
+COCOON-1329 [PATCH] Fix for cocoon.jar bundled in ear common for portal.war and portlet.war
+ https://issues.apache.org/jira/browse/COCOON-1329
+COCOON-1325 [PATCH] commons-fileupload based multipart parser
+ https://issues.apache.org/jira/browse/COCOON-1325
+COCOON-1302 [Patch] Word Document Generator
+ https://issues.apache.org/jira/browse/COCOON-1302
+COCOON-1295 ParallelContentAggregator, multithreaded aggregating
+ https://issues.apache.org/jira/browse/COCOON-1295
+COCOON-1260 [PATCH] MultipartParser can now handle multipart/mixed
+ https://issues.apache.org/jira/browse/COCOON-1260
+COCOON-1254 [Patch] OWQLTransformer + RDQLTransformer
+ https://issues.apache.org/jira/browse/COCOON-1254
+COCOON-1249 [Patch] XMLDBSource should accept scheme://user:***@host:port/path URIs
+ https://issues.apache.org/jira/browse/COCOON-1249
+COCOON-1232 [PATCH] NEW--ModuleDB Action for ORACLE( auto. increment )
+ https://issues.apache.org/jira/browse/COCOON-1232
+COCOON-1203 [PATCH] inserver junit testing
+ https://issues.apache.org/jira/browse/COCOON-1203
+COCOON-1200 [PATCH] XML CSS engine
+ https://issues.apache.org/jira/browse/COCOON-1200
+COCOON-1185 [PATCH] BerkeleyDBStore
+ https://issues.apache.org/jira/browse/COCOON-1185
+COCOON-1147 [PATCH] namespace issues with XMLDBTransformer
+ https://issues.apache.org/jira/browse/COCOON-1147
+COCOON-1125 [PATCH] Updated CastorTransformer + samples
+ https://issues.apache.org/jira/browse/COCOON-1125
+COCOON-1027 [PATCH] CocoonBean add additional features for reprocessing pipelines and interrupt processing
+ https://issues.apache.org/jira/browse/COCOON-1027
+COCOON-996 [PATCH] LuceneIndexContentHandler.java produces CLOBs
+ https://issues.apache.org/jira/browse/COCOON-996
+COCOON-988 [PATCH] StreamGenerator can't handle multipart request parameters correctly
+ https://issues.apache.org/jira/browse/COCOON-988
+COCOON-881 [PATCH] file upload component for usage with flowscript
+ https://issues.apache.org/jira/browse/COCOON-881
+COCOON-871 [PATCH] XML posting from SourceWritingTransformer by using an enhanced HTTPClientSource
+ https://issues.apache.org/jira/browse/COCOON-871
+COCOON-867 [PATCH] wsinclude and htmlinclude transformers
+ https://issues.apache.org/jira/browse/COCOON-867
+COCOON-865 [PATCH] New ResourceLoadAction
+ https://issues.apache.org/jira/browse/COCOON-865
+COCOON-844 [PATCH] adding <wd:on-phase> and moving load() and save() to Form.
+ https://issues.apache.org/jira/browse/COCOON-844
+COCOON-825 [PATCH] Fix Bug: Better handling of CLOB in esql (get-xml) and handling of Oracle 'temporary lobs'
+ https://issues.apache.org/jira/browse/COCOON-825
+COCOON-719 [PATCH] Support for transactions in SQLTransformer
+ https://issues.apache.org/jira/browse/COCOON-719
+COCOON-717 [PATCH] Namespace cleanup in HTMLSerializer
+ https://issues.apache.org/jira/browse/COCOON-717
+COCOON-665 [PATCH] HSSFSerializer Support for FreezePane
+ https://issues.apache.org/jira/browse/COCOON-665
+
+You may edit this subscription at:
+https://issues.apache.org/jira/secure/FilterSubscription!default.jspa?subId=10311&filterId=12310771
+
+
+From dev-return-102530-apmail-cocoon-dev-archive=***@cocoon.apache.org Thu Sep 09 21:09:56 2010
+From: "Douglas Hurbon (JIRA)" <***@apache.org>
+To: ***@cocoon.apache.org
+Date: Thu, 9 Sep 2010 17:09:32 -0400 (EDT)
+Subject: [jira] Created: (COCOON-2301) Cocoon Cron Block Configurable Clustering
+
+Cocoon Cron Block Configurable Clustering
+-----------------------------------------
+
+ Key: COCOON-2301
+ URL: https://issues.apache.org/jira/browse/COCOON-2301
+ Project: Cocoon
+ Issue Type: Improvement
+ Components: Blocks: Cron
+ Affects Versions: 2.1.11
+ Reporter: Douglas Hurbon
+
+
+The QuartzJobScheduler is modified to respond to a configuration parameter, clustered=true, so that it can correctly use a clustered job store when Cocoon runs in a cluster.
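+
+For illustration only, a minimal hypothetical Java sketch of the idea (this is not the
+attached patch; the Quartz property names are standard, but the class and wiring here
+are invented for the example):
+
+    import java.util.Properties;
+    import org.quartz.Scheduler;
+    import org.quartz.SchedulerException;
+    import org.quartz.impl.StdSchedulerFactory;
+
+    // Hypothetical sketch: select the Quartz job store from a 'clustered' flag.
+    public final class ClusteredSchedulerSketch {
+      public static Scheduler create(boolean clustered) throws SchedulerException {
+        Properties p = new Properties();
+        p.setProperty("org.quartz.scheduler.instanceName", "CocoonQuartzScheduler");
+        p.setProperty("org.quartz.scheduler.instanceId", "AUTO");
+        p.setProperty("org.quartz.threadPool.class", "org.quartz.simpl.SimpleThreadPool");
+        p.setProperty("org.quartz.threadPool.threadCount", "3");
+        if (clustered) {
+          // Shared JDBC store so all nodes see the same jobs; a real setup
+          // also needs org.quartz.jobStore.dataSource and table configuration.
+          p.setProperty("org.quartz.jobStore.class", "org.quartz.impl.jdbcjobstore.JobStoreTX");
+          p.setProperty("org.quartz.jobStore.isClustered", "true");
+        } else {
+          // In-memory store, appropriate for a single node.
+          p.setProperty("org.quartz.jobStore.class", "org.quartz.simpl.RAMJobStore");
+        }
+        return new StdSchedulerFactory(p).getScheduler();
+      }
+    }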
+
+--
+This message is automatically generated by JIRA.
+-
+You can reply to this email to add a comment to the issue online.
+
+
+From dev-return-102531-apmail-cocoon-dev-archive=***@cocoon.apache.org Thu Sep 09 21:12:00 2010
+From: "Douglas Hurbon (JIRA)" <***@apache.org>
+To: ***@cocoon.apache.org
+Date: Thu, 9 Sep 2010 17:11:38 -0400 (EDT)
+Subject: [jira] Updated: (COCOON-2301) Cocoon Cron Block Configurable Clustering
+
+
+ [ https://issues.apache.org/jira/browse/COCOON-2301?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
+
+Douglas Hurbon updated COCOON-2301:
+-----------------------------------
+
+ Attachment: QuartzJobScheduler.patch
+
+Patch to make cocoon_2_1_x/src/blocks/cron/java/org/apache/cocoon/components/cron/QuartzJobScheduler.java respond correctly to configuration for clustering.
+
+> Cocoon Cron Block Configurable Clustering
+> -----------------------------------------
+>
+> Key: COCOON-2301
+> URL: https://issues.apache.org/jira/browse/COCOON-2301
+> Project: Cocoon
+> Issue Type: Improvement
+> Components: Blocks: Cron
+> Affects Versions: 2.1.11
+> Reporter: Douglas Hurbon
+> Attachments: QuartzJobScheduler.patch
+>
+>
+> The QuartzJobScheduler is modified to respond to a configuration parameter, clustered=true, so that it can correctly use a clustered job store when Cocoon runs in a cluster.
+
+--
+This message is automatically generated by JIRA.
+-
+You can reply to this email to add a comment to the issue online.
+
+
+From dev-return-102532-apmail-cocoon-dev-archive=***@cocoon.apache.org Wed Sep 15 14:42:02 2010
+From: ***@apache.org
+To: ***@cocoon.apache.org
+Date: Wed, 15 Sep 2010 10:41:34 -0400 (EDT)
+Subject: [jira] Subscription: COCOON-open-with-patch
+
+Issue Subscription
+Filter: COCOON-open-with-patch (115 issues)
+Subscriber: cocoon
+
+Key Summary
+COCOON-2301 Cocoon Cron Block Configurable Clustering
+ https://issues.apache.org/jira/browse/COCOON-2301
+COCOON-2300 jboss-5.1.0.GA vfszip protocol in CharsetFactory
+ https://issues.apache.org/jira/browse/COCOON-2300
+COCOON-2298 IncludeTransformer does not handle multi-valued parameters
+ https://issues.apache.org/jira/browse/COCOON-2298
+COCOON-2297 Character encoding does not follow JTidy properties
+ https://issues.apache.org/jira/browse/COCOON-2297
+COCOON-2296 [PATCH] Make flowscript work with Commons JXPath 1.3
+ https://issues.apache.org/jira/browse/COCOON-2296
+COCOON-2295 integrating FOP-1.0 into Cocoon-2.1.12-dev
+ https://issues.apache.org/jira/browse/COCOON-2295
+COCOON-2294 Wrong version number for cocoon-serializers-impl in parent pom for revision 964648
+ https://issues.apache.org/jira/browse/COCOON-2294
+COCOON-2290 CLONE - Add a read method to the SitemapComponentTestCase
+ https://issues.apache.org/jira/browse/COCOON-2290
+COCOON-2288 Allow usage of SLF4J for traces
+ https://issues.apache.org/jira/browse/COCOON-2288
+COCOON-2281 "Communication tools that we use" link to dev mailing list archive comes out at user mailing list archive
+ https://issues.apache.org/jira/browse/COCOON-2281
+COCOON-2268 To extend the image reader we need to change the visibility to the parameter of the ImageReader
+ https://issues.apache.org/jira/browse/COCOON-2268
+COCOON-2262 container.refresh() is called before embeddedServlet.init()
+ https://issues.apache.org/jira/browse/COCOON-2262
+COCOON-2260 wrong parent version in pom of cocoon-flowscript-impl
+ https://issues.apache.org/jira/browse/COCOON-2260
+COCOON-2249 XHTMLSerializer uses entity references &quot; and &apos; which cause JavaScript parse errors
+ https://issues.apache.org/jira/browse/COCOON-2249
+COCOON-2246 HttpRequest should handle encoding in getParameter and getParameterValues in the same way
+ https://issues.apache.org/jira/browse/COCOON-2246
+COCOON-2233 Update archetypes to current trunk artifact versions
+ https://issues.apache.org/jira/browse/COCOON-2233
+COCOON-2222 Add SaxParser configuration properties
+ https://issues.apache.org/jira/browse/COCOON-2222
+COCOON-2216 IncludeCacheManager can not perfom parallel includes
+ https://issues.apache.org/jira/browse/COCOON-2216
+COCOON-2212 jx:attribute does not check name is correct before proceeding
+ https://issues.apache.org/jira/browse/COCOON-2212
+COCOON-2197 Making the cocoon-auth-block acegi-security-sample work
+ https://issues.apache.org/jira/browse/COCOON-2197
+COCOON-2173 AbstractCachingProcessingPipeline: Two requests can deadlock each other
+ https://issues.apache.org/jira/browse/COCOON-2173
+COCOON-2162 [PATCH] Fix for Paginator when accessing out of bounds Pagination page
+ https://issues.apache.org/jira/browse/COCOON-2162
+COCOON-2137 XSD Schemas for CForms Development
+ https://issues.apache.org/jira/browse/COCOON-2137
+COCOON-2114 fix sorting in TraversableGenerator
+ https://issues.apache.org/jira/browse/COCOON-2114
+COCOON-2108 xmodule:flow-attr Does not accept document objects
+ https://issues.apache.org/jira/browse/COCOON-2108
+COCOON-2100 Retrieving mimeType returned by pipeline executed from Flow
+ https://issues.apache.org/jira/browse/COCOON-2100
+COCOON-2040 Union widget does not work with booleanfield set as case widget
+ https://issues.apache.org/jira/browse/COCOON-2040
+COCOON-2037 New DynamicGroup widget
+ https://issues.apache.org/jira/browse/COCOON-2037
+COCOON-2032 [PATCH] Sort order in paginated repeater
+ https://issues.apache.org/jira/browse/COCOON-2032
+COCOON-2030 submit-on-change doesn't work for a multivaluefield with list-type="checkbox"
+ https://issues.apache.org/jira/browse/COCOON-2030
+COCOON-2018 Use thread context class loader to load custom binding classes
+ https://issues.apache.org/jira/browse/COCOON-2018
+COCOON-2017 More output beautification options for serializers
+ https://issues.apache.org/jira/browse/COCOON-2017
+COCOON-2015 Doctype added twice because root element (html) is inlined
+ https://issues.apache.org/jira/browse/COCOON-2015
+COCOON-2002 HTML transformer only works with latin-1 characters
+ https://issues.apache.org/jira/browse/COCOON-2002
+COCOON-1974 Donating ContextAttributeInputModule
+ https://issues.apache.org/jira/browse/COCOON-1974
+COCOON-1973 CaptchaValidator: allow case-insensitive matching
+ https://issues.apache.org/jira/browse/COCOON-1973
+COCOON-1964 Redirects inside a block called via the servlet protocol fail
+ https://issues.apache.org/jira/browse/COCOON-1964
+COCOON-1963 Add a redirect action to the browser update handler
+ https://issues.apache.org/jira/browse/COCOON-1963
+COCOON-1960 Pipeline errors for "generator/reader already set" should provide more information
+ https://issues.apache.org/jira/browse/COCOON-1960
+COCOON-1949 [PATCH] load flowscript from file into specified Rhino context object
+ https://issues.apache.org/jira/browse/COCOON-1949
+COCOON-1946 [PATCH] - Javaflow Sample errors trying to enhance Javaflow classes and showing cform templates
+ https://issues.apache.org/jira/browse/COCOON-1946
+COCOON-1943 [Patch] Parameters in blocks-protocol URIs get decoded too early
+ https://issues.apache.org/jira/browse/COCOON-1943
+COCOON-1932 [PATCH] correct styling of disabled suggestion lists
+ https://issues.apache.org/jira/browse/COCOON-1932
+COCOON-1929 [PATCH] Reloading classloader in Cocoon 2.2
+ https://issues.apache.org/jira/browse/COCOON-1929
+COCOON-1917 Request Encoding problem: multipart/form vs. url encoded
+ https://issues.apache.org/jira/browse/COCOON-1917
+COCOON-1915 Nullable value with additional String or XMLizable in JavaSelectionList
+ https://issues.apache.org/jira/browse/COCOON-1915
+COCOON-1914 Text as XMLizable in EmptySelectionList
+ https://issues.apache.org/jira/browse/COCOON-1914
+COCOON-1899 [PATCH] Cocoon XML:DB Implementation should not depend on Xindice
+ https://issues.apache.org/jira/browse/COCOON-1899
+COCOON-1898 [PATCH] XPatch support for maven-cocoon-deployer-plugin
+ https://issues.apache.org/jira/browse/COCOON-1898
+COCOON-1893 XML-Binding: Problem creating a new element
+ https://issues.apache.org/jira/browse/COCOON-1893
+COCOON-1877 [PATCH] Pageable Repeater
+ https://issues.apache.org/jira/browse/COCOON-1877
+COCOON-1870 Lucene block does not store attributes when instructed so
+ https://issues.apache.org/jira/browse/COCOON-1870
+COCOON-1846 [PATCH] BooleanField and radio do not send on-value-changed at the right time with IE
+ https://issues.apache.org/jira/browse/COCOON-1846
+COCOON-1843 LDAPTransformer: add-entry tag doesn't work
+ https://issues.apache.org/jira/browse/COCOON-1843
+COCOON-1842 LDAPTransformer: ClassCastException with Binary fields
+ https://issues.apache.org/jira/browse/COCOON-1842
+COCOON-1810 [PATCH] JMSEventMessageListener does not work
+ https://issues.apache.org/jira/browse/COCOON-1810
+COCOON-1807 Workaround for IE Bug in <button>
+ https://issues.apache.org/jira/browse/COCOON-1807
+COCOON-1794 [PATCH] Propagation of namespaces to a repeaters child bindings and implementation of a move-node binding
+ https://issues.apache.org/jira/browse/COCOON-1794
+COCOON-1738 double-listbox problem in repeaters
+ https://issues.apache.org/jira/browse/COCOON-1738
+COCOON-1726 Implementation of Source that supports conditional GETs
+ https://issues.apache.org/jira/browse/COCOON-1726
+COCOON-1717 Use custom cache keys for caching uri coplets using input modules.
+ https://issues.apache.org/jira/browse/COCOON-1717
+COCOON-1697 Allow request parameters to be used in "for (var k in h)" kind of Javascript Loops
+ https://issues.apache.org/jira/browse/COCOON-1697
+COCOON-1648 Add support for ISO8601 in I18nTransformer and Forms
+ https://issues.apache.org/jira/browse/COCOON-1648
+COCOON-1618 [PATCH] SoapGenerator/Serializer for Axis Block
+ https://issues.apache.org/jira/browse/COCOON-1618
+COCOON-1611 [PATCH] Add additional constructor to FormInstance.java to be able to pass a locale
+ https://issues.apache.org/jira/browse/COCOON-1611
+COCOON-1603 [PATCH] handling of alternatives in MailTransformer
+ https://issues.apache.org/jira/browse/COCOON-1603
+COCOON-1573 Improvement SetAttributeJXPathBinding and Contribution SetNodeValueJXPathBinding
+ https://issues.apache.org/jira/browse/COCOON-1573
+COCOON-1556 [PATCH] Add a JXPathConvertor for conversion between beans and Strings
+ https://issues.apache.org/jira/browse/COCOON-1556
+COCOON-1535 [PATCH] enhancement to {global:} input module: return all sitemap globals
+ https://issues.apache.org/jira/browse/COCOON-1535
+COCOON-1527 [PATCH] Cache control logic sheets for XSP to override getKey and getValidity
+ https://issues.apache.org/jira/browse/COCOON-1527
+COCOON-1526 [PATCH] processToDOM returns a read-only DOM
+ https://issues.apache.org/jira/browse/COCOON-1526
+COCOON-1519 [PATCH] TeeTransformer refactoring
+ https://issues.apache.org/jira/browse/COCOON-1519
+COCOON-1508 [PATCH] Avalonize TranscoderFactory
+ https://issues.apache.org/jira/browse/COCOON-1508
+COCOON-1506 [PATCH] Manually specifying a mounted sitemap's context
+ https://issues.apache.org/jira/browse/COCOON-1506
+COCOON-1488 [PATCH] htmlunit-based testing, needs to be ported to 2.2
+ https://issues.apache.org/jira/browse/COCOON-1488
+COCOON-1467 ESQL exception handling problem
+ https://issues.apache.org/jira/browse/COCOON-1467
+COCOON-1439 [poi] vertical text orientation and font cache
+ https://issues.apache.org/jira/browse/COCOON-1439
+COCOON-1398 New CachingPortletAdapter
+ https://issues.apache.org/jira/browse/COCOON-1398
+COCOON-1395 [PATCH] Missing ContextAttributeInputModule
+ https://issues.apache.org/jira/browse/COCOON-1395
+COCOON-1394 [PATCH] Implementation of PortletRequest#getQueryString()
+ https://issues.apache.org/jira/browse/COCOON-1394
+COCOON-1384 [PATCH] flow redirector should allow explicit 'cocoon:' scheme
+ https://issues.apache.org/jira/browse/COCOON-1384
+COCOON-1370 [PATCH] proxy block can now use JTidy and handle multipart POST
+ https://issues.apache.org/jira/browse/COCOON-1370
+COCOON-1368 [PATCH] HTTPRequestTransformer
+ https://issues.apache.org/jira/browse/COCOON-1368
+COCOON-1362 [PATCH] log4j.xconf should have the same default config as logkit.xconf
+ https://issues.apache.org/jira/browse/COCOON-1362
+COCOON-1360 [patch] client side validation for CForms
+ https://issues.apache.org/jira/browse/COCOON-1360
+COCOON-1345 [PATCH] Extract convertors into their own block
+ https://issues.apache.org/jira/browse/COCOON-1345
+COCOON-1340 [PATCH] lucene block contribution : a AnalyzerManager component
+ https://issues.apache.org/jira/browse/COCOON-1340
+COCOON-1337 [PATCH] Suggestion for widget population
+ https://issues.apache.org/jira/browse/COCOON-1337
+COCOON-1336 [PATCH] PortletWindowAspect: hiding portlet mode icons and new feature "force-sizable"
+ https://issues.apache.org/jira/browse/COCOON-1336
+COCOON-1332 [PATCH] content-length and content-type for portlet ActionRequest
+ https://issues.apache.org/jira/browse/COCOON-1332
+COCOON-1329 [PATCH] Fix for cocoon.jar bundled in ear common for portal.war and portlet.war
+ https://issues.apache.org/jira/browse/COCOON-1329
+COCOON-1325 [PATCH] commons-fileupload based multipart parser
+ https://issues.apache.org/jira/browse/COCOON-1325
+COCOON-1302 [Patch] Word Document Generator
+ https://issues.apache.org/jira/browse/COCOON-1302
+COCOON-1295 ParallelContentAggregator, multithreaded aggregating
+ https://issues.apache.org/jira/browse/COCOON-1295
+COCOON-1260 [PATCH] MultipartParser can now handle multipart/mixed
+ https://issues.apache.org/jira/browse/COCOON-1260
+COCOON-1254 [Patch] OWQLTransformer + RDQLTransformer
+ https://issues.apache.org/jira/browse/COCOON-1254
+COCOON-1249 [Patch] XMLDBSource should accept scheme://user:***@host:port/path URIs
+ https://issues.apache.org/jira/browse/COCOON-1249
+COCOON-1232 [PATCH] NEW--ModuleDB Action for ORACLE( auto. increment )
+ https://issues.apache.org/jira/browse/COCOON-1232
+COCOON-1203 [PATCH] inserver junit testing
+ https://issues.apache.org/jira/browse/COCOON-1203
+COCOON-1200 [PATCH] XML CSS engine
+ https://issues.apache.org/jira/browse/COCOON-1200
+COCOON-1185 [PATCH] BerkeleyDBStore
+ https://issues.apache.org/jira/browse/COCOON-1185
+COCOON-1147 [PATCH] namespace issues with XMLDBTransformer
+ https://issues.apache.org/jira/browse/COCOON-1147
+COCOON-1125 [PATCH] Updated CastorTransformer + samples
+ https://issues.apache.org/jira/browse/COCOON-1125
+COCOON-1027 [PATCH] CocoonBean add additional features for reprocessing pipelines and interrupt processing
+ https://issues.apache.org/jira/browse/COCOON-1027
+COCOON-996 [PATCH] LuceneIndexContentHandler.java produces CLOBs
+ https://issues.apache.org/jira/browse/COCOON-996
+COCOON-988 [PATCH] StreamGenerator can't handle multipart request parameters correctly
+ https://issues.apache.org/jira/browse/COCOON-988
+COCOON-881 [PATCH] file upload component for usage with flowscript
+ https://issues.apache.org/jira/browse/COCOON-881
+COCOON-871 [PATCH] XML posting from SourceWritingTransformer by using an enhanced HTTPClientSource
+ https://issues.apache.org/jira/browse/COCOON-871
+COCOON-867 [PATCH] wsinclude and htmlinclude transformers
+ https://issues.apache.org/jira/browse/COCOON-867
+COCOON-865 [PATCH] New ResourceLoadAction
+ https://issues.apache.org/jira/browse/COCOON-865
+COCOON-844 [PATCH] adding <wd:on-phase> and moving load() and save() to Form.
+ https://issues.apache.org/jira/browse/COCOON-844
+COCOON-825 [PATCH] Fix Bug: Better handling of CLOB in esql (get-xml) and handling of Oracle 'temporary lobs'
+ https://issues.apache.org/jira/browse/COCOON-825
+COCOON-719 [PATCH] Support for transactions in SQLTransformer
+ https://issues.apache.org/jira/browse/COCOON-719
+COCOON-717 [PATCH] Namespace cleanup in HTMLSerializer
+ https://issues.apache.org/jira/browse/COCOON-717
+COCOON-665 [PATCH] HSSFSerializer Support for FreezePane
+ https://issues.apache.org/jira/browse/COCOON-665
+
+You may edit this subscription at:
+https://issues.apache.org/jira/secure/FilterSubscription!default.jspa?subId=10311&filterId=12310771
+
+
+From dev-return-102533-apmail-cocoon-dev-archive=***@cocoon.apache.org Sat Sep 18 00:15:21 2010
+From: "Florent ANDRE (JIRA)" <***@apache.org>
+To: ***@cocoon.apache.org
+Date: Fri, 17 Sep 2010 20:14:38 -0400 (EDT)
+Subject: [jira] Created: (COCOON-2302) C2.2 : unable to find daisy-..-1.5 jars in rev 959219
+
+C2.2 : unable to find daisy-..-1.5 jars in rev 959219
+-----------------------------------------------------
+
+ Key: COCOON-2302
+ URL: https://issues.apache.org/jira/browse/COCOON-2302
+ Project: Cocoon
+ Issue Type: Bug
+ Components: - Build System: Maven
+ Affects Versions: 2.2-dev (Current SVN)
+ Reporter: Florent ANDRE
+
+
+Hi,
+
+A fresh checkout of Cocoon trunk gives me these errors when running mvn install.
+
+I found this repository (http://daisycms.org/maven/maven2/dev/), but it only has 2.5 versions of the libraries.
+
+Should the dependencies be upgraded to 2.5, or should the build point at another repository that has the 1.5 jars?
+
+Thanks.
+
+[INFO] Unable to find resource 'daisy:daisy-util:jar:1.5-dev' in repository gkossakowski-maven2 (http://people.apache.org/~gkossakowski/maven2/repository)
+[INFO] ------------------------------------------------------------------------
+[ERROR] BUILD ERROR
+[INFO] ------------------------------------------------------------------------
+[INFO] Failed to resolve artifact.
+
+Missing:
+----------
+1) daisy:daisy-repository-api:jar:1.5-dev
+
+ Try downloading the file manually from the project website.
+
+ Then, install it using the command:
+ mvn install:install-file -DgroupId=daisy -DartifactId=daisy-repository-api -Dversion=1.5-dev -Dpackaging=jar -Dfile=/path/to/file
+
+ Alternatively, if you host your own repository you can deploy the file there:
+ mvn deploy:deploy-file -DgroupId=daisy -DartifactId=daisy-repository-api -Dversion=1.5-dev -Dpackaging=jar -Dfile=/path/to/file -Durl=[url] -DrepositoryId=[id]
+
+ Path to dependency:
+ 1) org.apache.cocoon:cocoon-sitemaptags2daisy-plugin:maven-plugin:1.0.0-SNAPSHOT
+ 2) daisy:daisy-repository-api:jar:1.5-dev
+
+2) nekodtd:nekodtd:jar:0.1.11
+
+ Try downloading the file manually from the project website.
+
+ Then, install it using the command:
+ mvn install:install-file -DgroupId=nekodtd -DartifactId=nekodtd -Dversion=0.1.11 -Dpackaging=jar -Dfile=/path/to/file
+
+ Alternatively, if you host your own repository you can deploy the file there:
+ mvn deploy:deploy-file -DgroupId=nekodtd -DartifactId=nekodtd -Dversion=0.1.11 -Dpackaging=jar -Dfile=/path/to/file -Durl=[url] -DrepositoryId=[id]
+
+ Path to dependency:
+ 1) org.apache.cocoon:cocoon-sitemaptags2daisy-plugin:maven-plugin:1.0.0-SNAPSHOT
+ 2) nekodtd:nekodtd:jar:0.1.11
+
+3) daisy:daisy-repository-xmlschema-bindings:jar:1.5-dev
+
+ Try downloading the file manually from the project website.
+
+ Then, install it using the command:
+ mvn install:install-file -DgroupId=daisy -DartifactId=daisy-repository-xmlschema-bindings -Dversion=1.5-dev -Dpackaging=jar -Dfile=/path/to/file
+
+ Alternatively, if you host your own repository you can deploy the file there:
+ mvn deploy:deploy-file -DgroupId=daisy -DartifactId=daisy-repository-xmlschema-bindings -Dversion=1.5-dev -Dpackaging=jar -Dfile=/path/to/file -Durl=[url] -DrepositoryId=[id]
+
+ Path to dependency:
+ 1) org.apache.cocoon:cocoon-sitemaptags2daisy-plugin:maven-plugin:1.0.0-SNAPSHOT
+ 2) daisy:daisy-repository-xmlschema-bindings:jar:1.5-dev
+
+4) daisy:daisy-repository-client-impl:jar:1.5-dev
+
+ Try downloading the file manually from the project website.
+
+ Then, install it using the command:
+ mvn install:install-file -DgroupId=daisy -DartifactId=daisy-repository-client-impl -Dversion=1.5-dev -Dpackaging=jar -Dfile=/path/to/file
+
+ Alternatively, if you host your own repository you can deploy the file there:
+ mvn deploy:deploy-file -DgroupId=daisy -DartifactId=daisy-repository-client-impl -Dversion=1.5-dev -Dpackaging=jar -Dfile=/path/to/file -Durl=[url] -DrepositoryId=[id]
+
+ Path to dependency:
+ 1) org.apache.cocoon:cocoon-sitemaptags2daisy-plugin:maven-plugin:1.0.0-SNAPSHOT
+ 2) daisy:daisy-repository-client-impl:jar:1.5-dev
+
+5) daisy:daisy-repository-common-impl:jar:1.5-dev
+
+ Try downloading the file manually from the project website.
+
+ Then, install it using the command:
+ mvn install:install-file -DgroupId=daisy -DartifactId=daisy-repository-common-impl -Dversion=1.5-dev -Dpackaging=jar -Dfile=/path/to/file
+
+ Alternatively, if you host your own repository you can deploy the file there:
+ mvn deploy:deploy-file -DgroupId=daisy -DartifactId=daisy-repository-common-impl -Dversion=1.5-dev -Dpackaging=jar -Dfile=/path/to/file -Durl=[url] -DrepositoryId=[id]
+
+ Path to dependency:
+ 1) org.apache.cocoon:cocoon-sitemaptags2daisy-plugin:maven-plugin:1.0.0-SNAPSHOT
+ 2) daisy:daisy-repository-common-impl:jar:1.5-dev
+
+6) daisy:daisy-repository-spi:jar:1.5-dev
+
+ Try downloading the file manually from the project website.
+
+ Then, install it using the command:
+ mvn install:install-file -DgroupId=daisy -DartifactId=daisy-repository-spi -Dversion=1.5-dev -Dpackaging=jar -Dfile=/path/to/file
+
+ Alternatively, if you host your own repository you can deploy the file there:
+ mvn deploy:deploy-file -DgroupId=daisy -DartifactId=daisy-repository-spi -Dversion=1.5-dev -Dpackaging=jar -Dfile=/path/to/file -Durl=[url] -DrepositoryId=[id]
+
+ Path to dependency:
+ 1) org.apache.cocoon:cocoon-sitemaptags2daisy-plugin:maven-plugin:1.0.0-SNAPSHOT
+ 2) daisy:daisy-repository-spi:jar:1.5-dev
+
+7) daisy:daisy-jmsclient-api:jar:1.5-dev
+
+ Try downloading the file manually from the project website.
+
+ Then, install it using the command:
+ mvn install:install-file -DgroupId=daisy -DartifactId=daisy-jmsclient-api -Dversion=1.5-dev -Dpackaging=jar -Dfile=/path/to/file
+
+ Alternatively, if you host your own repository you can deploy the file there:
+ mvn deploy:deploy-file -DgroupId=daisy -DartifactId=daisy-jmsclient-api -Dversion=1.5-dev -Dpackaging=jar -Dfile=/path/to/file -Durl=[url] -DrepositoryId=[id]
+
+ Path to dependency:
+ 1) org.apache.cocoon:cocoon-sitemaptags2daisy-plugin:maven-plugin:1.0.0-SNAPSHOT
+ 2) daisy:daisy-jmsclient-api:jar:1.5-dev
+
+8) daisy:daisy-htmlcleaner:jar:1.5-dev
+
+ Try downloading the file manually from the project website.
+
+ Then, install it using the command:
+ mvn install:install-file -DgroupId=daisy -DartifactId=daisy-htmlcleaner -Dversion=1.5-dev -Dpackaging=jar -Dfile=/path/to/file
+
+ Alternatively, if you host your own repository you can deploy the file there:
+ mvn deploy:deploy-file -DgroupId=daisy -DartifactId=daisy-htmlcleaner -Dversion=1.5-dev -Dpackaging=jar -Dfile=/path/to/file -Durl=[url] -DrepositoryId=[id]
+
+ Path to dependency:
+ 1) org.apache.cocoon:cocoon-sitemaptags2daisy-plugin:maven-plugin:1.0.0-SNAPSHOT
+ 2) daisy:daisy-htmlcleaner:jar:1.5-dev
+
+9) daisy:daisy-util:jar:1.5-dev
+
+ Try downloading the file manually from the project website.
+
+ Then, install it using the command:
+ mvn install:install-file -DgroupId=daisy -DartifactId=daisy-util -Dversion=1.5-dev -Dpackaging=jar -Dfile=/path/to/file
+
+ Alternatively, if you host your own repository you can deploy the file there:
+ mvn deploy:deploy-file -DgroupId=daisy -DartifactId=daisy-util -Dversion=1.5-dev -Dpackaging=jar -Dfile=/path/to/file -Durl=[url] -DrepositoryId=[id]
+
+ Path to dependency:
+ 1) org.apache.cocoon:cocoon-sitemaptags2daisy-plugin:maven-plugin:1.0.0-SNAPSHOT
+ 2) daisy:daisy-util:jar:1.5-dev
+
+----------
+9 required artifacts are missing.
+
+for artifact:
+
+org.apache.cocoon:cocoon-sitemaptags2daisy-plugin:maven-plugin:1.0.0-SNAPSHOT
+
+from the specified remote repositories:
+ apache.snapshots (http://people.apache.org/repo/m2-snapshot-repository),
+ central (http://repo1.maven.org/maven2),
+ maven-snapshot (http://snapshots.maven.codehaus.org/maven2/),
+ cocoondev (http://cocoondev.org/repository),
+ gkossakowski-maven2 (http://people.apache.org/~gkossakowski/maven2/repository)
+
+
+
+--
+This message is automatically generated by JIRA.
+-
+You can reply to this email to add a comment to the issue online.
\ No newline at end of file
r***@apache.org
2018-06-27 14:52:09 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java
new file mode 100644
index 0000000..6a8c659
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java
@@ -0,0 +1,99 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.vectors.lucene;
+
+import java.io.IOException;
+import java.util.Set;
+import java.util.TreeSet;
+
+import com.google.common.base.Preconditions;
+import org.apache.lucene.index.IndexReader;
+import org.apache.mahout.utils.vectors.TermInfo;
+import org.apache.mahout.vectorizer.Weight;
+
+/**
+ * An {@link java.util.Iterator} over {@link org.apache.mahout.math.Vector}s that uses a Lucene index as the source
+ * for creating the {@link org.apache.mahout.math.Vector}s. The field used to create the vectors currently must have
+ * term vectors stored for it.
+ */
+public class LuceneIterator extends AbstractLuceneIterator {
+
+ protected final Set<String> idFieldSelector;
+ protected final String idField;
+
+ /**
+   * Produce a LuceneIterator that can create the Vector for each document and normalize it.
+ *
+ * @param indexReader {@link IndexReader} to read the documents from.
+ * @param idField field containing the id. May be null.
+ * @param field field to use for the Vector
+ * @param termInfo termInfo
+ * @param weight weight
+ * @param normPower the normalization value. Must be non-negative, or {@link LuceneIterable#NO_NORMALIZING}
+ */
+ public LuceneIterator(IndexReader indexReader, String idField, String field, TermInfo termInfo, Weight weight,
+ double normPower) {
+ this(indexReader, idField, field, termInfo, weight, normPower, 0.0);
+ }
+
+ /**
+ * @param indexReader {@link IndexReader} to read the documents from.
+ * @param idField field containing the id. May be null.
+ * @param field field to use for the Vector
+ * @param termInfo termInfo
+ * @param weight weight
+ * @param normPower the normalization value. Must be non-negative, or {@link LuceneIterable#NO_NORMALIZING}
+   * @param maxPercentErrorDocs the maximum fraction of documents tolerated without a term frequency vector, in [0,1].
+ * @see #LuceneIterator(org.apache.lucene.index.IndexReader, String, String, org.apache.mahout.utils.vectors.TermInfo,
+ * org.apache.mahout.vectorizer.Weight, double)
+ */
+ public LuceneIterator(IndexReader indexReader,
+ String idField,
+ String field,
+ TermInfo termInfo,
+ Weight weight,
+ double normPower,
+ double maxPercentErrorDocs) {
+ super(termInfo, normPower, indexReader, weight, maxPercentErrorDocs, field);
+ // term docs(null) is a better way of iterating all the docs in Lucene
+ Preconditions.checkArgument(normPower == LuceneIterable.NO_NORMALIZING || normPower >= 0,
+ "normPower must be non-negative or -1, but normPower = " + normPower);
+ Preconditions.checkArgument(maxPercentErrorDocs >= 0.0 && maxPercentErrorDocs <= 1.0,
+ "Must be: 0.0 <= maxPercentErrorDocs <= 1.0");
+ this.idField = idField;
+ if (idField != null) {
+ idFieldSelector = new TreeSet<>();
+ idFieldSelector.add(idField);
+ } else {
+      /* No id field was given: the Lucene internal doc id is used instead,
+         which is prone to error if the underlying index changes. */
+ idFieldSelector = null;
+ }
+ }
+
+ @Override
+ protected String getVectorName(int documentIndex) throws IOException {
+ String name;
+ if (idField != null) {
+ name = indexReader.document(documentIndex, idFieldSelector).get(idField);
+ } else {
+ name = String.valueOf(documentIndex);
+ }
+ return name;
+ }
+}
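
A minimal usage sketch for the class above (not part of the commit diff). It assumes,
as the javadoc states, that the iterator yields org.apache.mahout.math.Vector instances,
and it mirrors the wiring used by TestClusterDumper later in this commit. Here
'directory' stands for an existing Lucene index whose "content" field was indexed with
term vectors:

    IndexReader reader = DirectoryReader.open(directory);
    TermInfo termInfo = new CachedTermInfo(reader, "content", 1, 100);
    LuceneIterator docs = new LuceneIterator(reader, "id", "content", termInfo,
        new TFIDF(), LuceneIterable.NO_NORMALIZING);
    while (docs.hasNext()) {
      Vector vector = docs.next();  // one vector per document, named by its "id" field
      // ... consume vector ...
    }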

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java
new file mode 100644
index 0000000..5830ccc
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java
@@ -0,0 +1,64 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.vectors.lucene;
+
+import org.apache.lucene.util.BytesRef;
+import org.apache.mahout.math.RandomAccessSparseVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.utils.vectors.TermEntry;
+import org.apache.mahout.utils.vectors.TermInfo;
+import org.apache.mahout.vectorizer.Weight;
+
+
+/**
+ * Not thread-safe
+ */
+public class TFDFMapper {
+
+ private Vector vector;
+
+ private final Weight weight;
+ private long numTerms;
+ private final TermInfo termInfo;
+ private String field;
+ private final int numDocs;
+
+ public TFDFMapper(int numDocs, Weight weight, TermInfo termInfo) {
+ this.weight = weight;
+ this.termInfo = termInfo;
+ this.numDocs = numDocs;
+ }
+
+ public void setExpectations(String field, long numTerms) {
+ this.field = field;
+ vector = new RandomAccessSparseVector(termInfo.totalTerms(field));
+ this.numTerms = numTerms;
+ }
+
+ public void map(BytesRef term, int frequency) {
+ TermEntry entry = termInfo.getTermEntry(field, term.utf8ToString());
+ if (entry != null) {
+ vector.setQuick(entry.getTermIdx(), weight.calculate(frequency, entry.getDocFreq(), (int)numTerms, numDocs));
+ }
+ }
+
+ public Vector getVector() {
+ return this.vector;
+ }
+
+}
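
A sketch of the call sequence this API implies, with hypothetical inputs ('reader',
'termInfo' and 'numTermsInDocument' are assumed to exist); the class comment above says
it is not thread-safe, so use one mapper per consumer:

    // Per document: declare the field and its term count, feed (term, frequency)
    // pairs, then read back the weighted document vector.
    TFDFMapper mapper = new TFDFMapper(reader.numDocs(), new TFIDF(), termInfo);
    mapper.setExpectations("content", numTermsInDocument);
    mapper.map(new BytesRef("fox"), 2);  // the term "fox" occurs twice in this document
    Vector docVector = mapper.getVector();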

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/TermInfoClusterInOut.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/TermInfoClusterInOut.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/TermInfoClusterInOut.java
new file mode 100644
index 0000000..b0311c7
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/TermInfoClusterInOut.java
@@ -0,0 +1,81 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.vectors.lucene;
+
+import org.apache.mahout.common.RandomUtils;
+
+class TermInfoClusterInOut implements Comparable<TermInfoClusterInOut> {
+
+ private final String term;
+ private final int inClusterDF;
+ private final int outClusterDF;
+ private final double logLikelihoodRatio;
+
+ TermInfoClusterInOut(String term, int inClusterDF, int outClusterDF, double logLikelihoodRatio) {
+ this.term = term;
+ this.inClusterDF = inClusterDF;
+ this.outClusterDF = outClusterDF;
+ this.logLikelihoodRatio = logLikelihoodRatio;
+ }
+
+ @Override
+ public int hashCode() {
+ return term.hashCode() ^ inClusterDF ^ outClusterDF ^ RandomUtils.hashDouble(logLikelihoodRatio);
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (!(o instanceof TermInfoClusterInOut)) {
+ return false;
+ }
+ TermInfoClusterInOut other = (TermInfoClusterInOut) o;
+ return term.equals(other.getTerm())
+ && inClusterDF == other.getInClusterDF()
+ && outClusterDF == other.getOutClusterDF()
+ && logLikelihoodRatio == other.getLogLikelihoodRatio();
+ }
+
+ @Override
+ public int compareTo(TermInfoClusterInOut that) {
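+    // Order by descending log-likelihood ratio; ties broken alphabetically by term.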
+ int res = Double.compare(that.logLikelihoodRatio, logLikelihoodRatio);
+ if (res == 0) {
+ res = term.compareTo(that.term);
+ }
+ return res;
+ }
+
+ public int getInClusterDiff() {
+ return this.inClusterDF - this.outClusterDF;
+ }
+
+ String getTerm() {
+ return term;
+ }
+
+ int getInClusterDF() {
+ return inClusterDF;
+ }
+
+ int getOutClusterDF() {
+ return outClusterDF;
+ }
+
+ double getLogLikelihoodRatio() {
+ return logLikelihoodRatio;
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/MySQLJDBCInMemoryItemSimilarityTest.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/test/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/MySQLJDBCInMemoryItemSimilarityTest.java b/community/mahout-mr/integration/src/test/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/MySQLJDBCInMemoryItemSimilarityTest.java
new file mode 100644
index 0000000..463a45f
--- /dev/null
+++ b/community/mahout-mr/integration/src/test/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/MySQLJDBCInMemoryItemSimilarityTest.java
@@ -0,0 +1,79 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity.jdbc;
+
+import org.apache.mahout.cf.taste.impl.TasteTestCase;
+import org.apache.mahout.cf.taste.similarity.ItemSimilarity;
+import org.easymock.EasyMock;
+import org.junit.Test;
+
+import javax.sql.DataSource;
+import java.sql.Connection;
+import java.sql.PreparedStatement;
+import java.sql.ResultSet;
+
+public class MySQLJDBCInMemoryItemSimilarityTest extends TasteTestCase {
+
+ @Test
+ public void testMemoryLoad() throws Exception {
+
+ DataSource dataSource = EasyMock.createMock(DataSource.class);
+ Connection connection = EasyMock.createMock(Connection.class);
+ PreparedStatement statement = EasyMock.createMock(PreparedStatement.class);
+ ResultSet resultSet = EasyMock.createMock(ResultSet.class);
+
+ EasyMock.expect(dataSource.getConnection()).andReturn(connection);
+ EasyMock.expect(connection.prepareStatement(MySQLJDBCInMemoryItemSimilarity.DEFAULT_GET_ALL_ITEMSIMILARITIES_SQL,
+ ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY)).andReturn(statement);
+ statement.setFetchDirection(ResultSet.FETCH_FORWARD);
+ EasyMock.expect(statement.executeQuery()).andReturn(resultSet);
+
+ EasyMock.expect(resultSet.next()).andReturn(true);
+
+ EasyMock.expect(resultSet.getLong(1)).andReturn(1L);
+ EasyMock.expect(resultSet.getLong(2)).andReturn(2L);
+ EasyMock.expect(resultSet.getDouble(3)).andReturn(0.5);
+ EasyMock.expect(resultSet.next()).andReturn(true);
+
+ EasyMock.expect(resultSet.getLong(1)).andReturn(1L);
+ EasyMock.expect(resultSet.getLong(2)).andReturn(3L);
+ EasyMock.expect(resultSet.getDouble(3)).andReturn(0.4);
+ EasyMock.expect(resultSet.next()).andReturn(true);
+
+ EasyMock.expect(resultSet.getLong(1)).andReturn(3L);
+ EasyMock.expect(resultSet.getLong(2)).andReturn(4L);
+ EasyMock.expect(resultSet.getDouble(3)).andReturn(0.1);
+
+ EasyMock.expect(resultSet.next()).andReturn(false);
+
+ resultSet.close();
+ statement.close();
+ connection.close();
+
+ EasyMock.replay(dataSource, connection, statement, resultSet);
+
+ ItemSimilarity similarity = new MySQLJDBCInMemoryItemSimilarity(dataSource);
+
+ assertEquals(0.5, similarity.itemSimilarity(1L, 2L), EPSILON);
+ assertEquals(0.4, similarity.itemSimilarity(1L, 3L), EPSILON);
+ assertEquals(0.1, similarity.itemSimilarity(3L, 4L), EPSILON);
+ assertTrue(Double.isNaN(similarity.itemSimilarity(1L, 4L)));
+
+ EasyMock.verify(dataSource, connection, statement, resultSet);
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java b/community/mahout-mr/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
new file mode 100644
index 0000000..01d46fc
--- /dev/null
+++ b/community/mahout-mr/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
@@ -0,0 +1,236 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.FieldType;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexOptions;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver;
+import org.apache.mahout.clustering.kmeans.KMeansDriver;
+import org.apache.mahout.clustering.kmeans.RandomSeedGenerator;
+import org.apache.mahout.common.MahoutTestCase;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
+import org.apache.mahout.math.NamedVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.utils.clustering.ClusterDumper;
+import org.apache.mahout.utils.vectors.TermEntry;
+import org.apache.mahout.utils.vectors.TermInfo;
+import org.apache.mahout.utils.vectors.lucene.CachedTermInfo;
+import org.apache.mahout.utils.vectors.lucene.LuceneIterable;
+import org.apache.mahout.vectorizer.TFIDF;
+import org.apache.mahout.vectorizer.Weight;
+import org.junit.Before;
+import org.junit.Test;
+
+public final class TestClusterDumper extends MahoutTestCase {
+
+ private static final String[] DOCS = {
+ "The quick red fox jumped over the lazy brown dogs.",
+ "The quick brown fox jumped over the lazy red dogs.",
+ "The quick red cat jumped over the lazy brown dogs.",
+ "The quick brown cat jumped over the lazy red dogs.",
+ "Mary had a little lamb whose fleece was white as snow.",
+ "Mary had a little goat whose fleece was white as snow.",
+ "Mary had a little lamb whose fleece was black as tar.",
+ "Dick had a little goat whose fleece was white as snow.",
+ "Moby Dick is a story of a whale and a man obsessed.",
+ "Moby Bob is a story of a walrus and a man obsessed.",
+ "Moby Dick is a story of a whale and a crazy man.",
+ "The robber wore a black fleece jacket and a baseball cap.",
+ "The robber wore a red fleece jacket and a baseball cap.",
+ "The robber wore a white fleece jacket and a baseball cap.",
+ "The English Springer Spaniel is the best of all dogs."};
+
+ private List<VectorWritable> sampleData;
+
+ private String[] termDictionary;
+
+ @Override
+ @Before
+ public void setUp() throws Exception {
+ super.setUp();
+ Configuration conf = getConfiguration();
+ FileSystem fs = FileSystem.get(conf);
+ // Create test data
+ getSampleData(DOCS);
+ ClusteringTestUtils.writePointsToFile(sampleData, true,
+ getTestTempFilePath("testdata/file1"), fs, conf);
+ }
+
+ private void getSampleData(String[] docs2) throws IOException {
+ sampleData = new ArrayList<>();
+ RAMDirectory directory = new RAMDirectory();
+ try (IndexWriter writer = new IndexWriter(directory,
+ new IndexWriterConfig(new StandardAnalyzer()))){
+ for (int i = 0; i < docs2.length; i++) {
+ Document doc = new Document();
+ Field id = new StringField("id", "doc_" + i, Field.Store.YES);
+ doc.add(id);
+ // Store both position and offset information
+ FieldType fieldType = new FieldType();
+ fieldType.setStored(false);
+ fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+ fieldType.setTokenized(true);
+ fieldType.setStoreTermVectors(true);
+ fieldType.setStoreTermVectorPositions(true);
+ fieldType.setStoreTermVectorOffsets(true);
+ fieldType.freeze();
+ Field text = new Field("content", docs2[i], fieldType);
+ doc.add(text);
+ writer.addDocument(doc);
+ }
+ }
+
+ IndexReader reader = DirectoryReader.open(directory);
+
+ Weight weight = new TFIDF();
+ TermInfo termInfo = new CachedTermInfo(reader, "content", 1, 100);
+
+ int numTerms = 0;
+ for (Iterator<TermEntry> it = termInfo.getAllEntries(); it.hasNext();) {
+ it.next();
+ numTerms++;
+ }
+ termDictionary = new String[numTerms];
+ int i = 0;
+ for (Iterator<TermEntry> it = termInfo.getAllEntries(); it.hasNext();) {
+ String term = it.next().getTerm();
+ termDictionary[i] = term;
+ System.out.println(i + " " + term);
+ i++;
+ }
+ Iterable<Vector> iterable = new LuceneIterable(reader, "id", "content",
+ termInfo,weight);
+
+ i = 0;
+ for (Vector vector : iterable) {
+ assertNotNull(vector);
+ NamedVector namedVector;
+ if (vector instanceof NamedVector) {
+ // rename it for testing purposes
+ namedVector = new NamedVector(((NamedVector) vector).getDelegate(),
+ "P(" + i + ')');
+
+ } else {
+ namedVector = new NamedVector(vector, "P(" + i + ')');
+ }
+ System.out.println(AbstractCluster.formatVector(namedVector,
+ termDictionary));
+ sampleData.add(new VectorWritable(namedVector));
+ i++;
+ }
+ }
+
+ /**
+ * Return the path to the final iteration's clusters
+ */
+ private static Path finalClusterPath(Configuration conf, Path output,
+ int maxIterations) throws IOException {
+ FileSystem fs = FileSystem.get(conf);
+ for (int i = maxIterations; i >= 0; i--) {
+ Path clusters = new Path(output, "clusters-" + i + "-final");
+ if (fs.exists(clusters)) {
+ return clusters;
+ }
+ }
+ return null;
+ }
+
+ @Test
+ public void testKmeans() throws Exception {
+ DistanceMeasure measure = new EuclideanDistanceMeasure();
+ Path input = getTestTempFilePath("input");
+ Path output = getTestTempDirPath("output");
+ Path initialPoints = new Path(output, Cluster.CLUSTERS_DIR + '0' + Cluster.FINAL_ITERATION_SUFFIX);
+ Configuration conf = getConfiguration();
+ FileSystem fs = FileSystem.get(conf);
+ // Write test data to file
+ ClusteringTestUtils.writePointsToFile(sampleData, input, fs, conf);
+ // Select initial centroids
+ RandomSeedGenerator.buildRandom(conf, input, initialPoints, 8, measure, 1L);
+ // Run k-means
+ Path kMeansOutput = new Path(output, "kmeans");
+ KMeansDriver.run(conf, getTestTempDirPath("testdata"), initialPoints, kMeansOutput, 0.001, 10, true, 0.0, false);
+ // Print out clusters
+ ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf,
+ output, 10), new Path(kMeansOutput, "clusteredPoints"));
+ clusterDumper.printClusters(termDictionary);
+ }
+
+ @Test
+ public void testJsonClusterDumper() throws Exception {
+ DistanceMeasure measure = new EuclideanDistanceMeasure();
+ Path input = getTestTempFilePath("input");
+ Path output = getTestTempDirPath("output");
+ Path initialPoints = new Path(output, Cluster.CLUSTERS_DIR + '0' + Cluster.FINAL_ITERATION_SUFFIX);
+ Configuration conf = getConfiguration();
+ FileSystem fs = FileSystem.get(conf);
+ // Write test data to file
+ ClusteringTestUtils.writePointsToFile(sampleData, input, fs, conf);
+ // Select initial centroids
+ RandomSeedGenerator.buildRandom(conf, input, initialPoints, 8, measure, 1L);
+ // Run k-means
+ Path kmeansOutput = new Path(output, "kmeans");
+ KMeansDriver.run(conf, getTestTempDirPath("testdata"), initialPoints, kmeansOutput, 0.001, 10, true, 0.0, false);
+ // Print out clusters
+ ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf,
+ output, 10), new Path(kmeansOutput, "clusteredPoints"));
+ clusterDumper.setOutputFormat(ClusterDumper.OUTPUT_FORMAT.JSON);
+ clusterDumper.printClusters(termDictionary);
+ }
+
+ @Test
+ public void testFuzzyKmeans() throws Exception {
+ DistanceMeasure measure = new EuclideanDistanceMeasure();
+ Path input = getTestTempFilePath("input");
+ Path output = getTestTempDirPath("output");
+ Path initialPoints = new Path(output, Cluster.CLUSTERS_DIR + '0' + Cluster.FINAL_ITERATION_SUFFIX);
+ Configuration conf = getConfiguration();
+ FileSystem fs = FileSystem.get(conf);
+ // Write test data to file
+ ClusteringTestUtils.writePointsToFile(sampleData, input, fs, conf);
+ // Select initial centroids
+ RandomSeedGenerator.buildRandom(conf, input, initialPoints, 8, measure, 1L);
+ // Run fuzzy k-means
+ Path kMeansOutput = new Path(output, "kmeans");
+ FuzzyKMeansDriver.run(conf, getTestTempDirPath("testdata"), initialPoints, kMeansOutput, 0.001, 10, 1.1f, true,
+ true, 0, true);
+ // run ClusterDumper
+ ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf,
+ output, 10), new Path(kMeansOutput, "clusteredPoints"));
+ clusterDumper.printClusters(termDictionary);
+ }
+}

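Once clustering has finished, the dump step above reduces to three ClusterDumper calls. A minimal sketch, assuming a completed k-means run and using only the API exercised by these tests (the paths and dictionary below are hypothetical placeholders):

    Path clusters = new Path("output/kmeans/clusters-10-final");      // hypothetical
    Path clusteredPoints = new Path("output/kmeans/clusteredPoints"); // hypothetical
    String[] termDictionary = {"fox", "lamb"};                        // hypothetical dictionary
    ClusterDumper dumper = new ClusterDumper(clusters, clusteredPoints);
    dumper.setOutputFormat(ClusterDumper.OUTPUT_FORMAT.JSON); // JSON, as in testJsonClusterDumper
    dumper.printClusters(termDictionary); // maps term indices back to readable terms
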
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java b/community/mahout-mr/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java
new file mode 100644
index 0000000..8a226a0
--- /dev/null
+++ b/community/mahout-mr/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java
@@ -0,0 +1,321 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.mahout.clustering.canopy.Canopy;
+import org.apache.mahout.clustering.canopy.CanopyDriver;
+import org.apache.mahout.clustering.evaluation.ClusterEvaluator;
+import org.apache.mahout.clustering.evaluation.RepresentativePointsDriver;
+import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver;
+import org.apache.mahout.clustering.kmeans.KMeansDriver;
+import org.apache.mahout.clustering.kmeans.TestKmeansClustering;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.MahoutTestCase;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.VectorWritable;
+import org.junit.Before;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+
+public final class TestClusterEvaluator extends MahoutTestCase {
+
+ private static final double[][] REFERENCE = { {1, 1}, {2, 1}, {1, 2}, {2, 2}, {3, 3}, {4, 4}, {5, 4}, {4, 5}, {5, 5}};
+
+ private List<VectorWritable> referenceData = Lists.newArrayList();
+
+ private final List<VectorWritable> sampleData = Lists.newArrayList();
+
+ private Map<Integer,List<VectorWritable>> representativePoints;
+
+ private List<Cluster> clusters;
+
+ private static final Logger log = LoggerFactory.getLogger(TestClusterEvaluator.class);
+
+ private Configuration conf;
+
+ private FileSystem fs;
+
+ private Path testdata;
+
+ private Path output;
+
+ @Override
+ @Before
+ public void setUp() throws Exception {
+ super.setUp();
+ conf = getConfiguration();
+ fs = FileSystem.get(conf);
+ testdata = getTestTempDirPath("testdata");
+ output = getTestTempDirPath("output");
+ // Create small reference data set
+ referenceData = TestKmeansClustering.getPointsWritable(REFERENCE);
+ // generate larger test data set for the clustering tests to chew on
+ generateSamples();
+ }
+
+ /**
+ * Generate random samples and add them to the sampleData
+ *
+ * @param num
+ * int number of samples to generate
+ * @param mx
+ * double x-value of the sample mean
+ * @param my
+ * double y-value of the sample mean
+ * @param sd
+ * double standard deviation of the samples
+ */
+ private void generateSamples(int num, double mx, double my, double sd) {
+ log.info("Generating {} samples m=[{}, {}] sd={}", num, mx, my, sd);
+ for (int i = 0; i < num; i++) {
+ sampleData.add(new VectorWritable(new DenseVector(new double[] {UncommonDistributions.rNorm(mx, sd),
+ UncommonDistributions.rNorm(my, sd)})));
+ }
+ }
+
+ private void generateSamples() {
+ generateSamples(500, 1, 1, 3);
+ generateSamples(300, 1, 0, 0.5);
+ generateSamples(300, 0, 2, 0.1);
+ }
+
+ private void printRepPoints(int numIterations) {
+ RepresentativePointsDriver.printRepresentativePoints(output, numIterations);
+ }
+
+ /**
+ * Initialize synthetic data using 4 clusters dC units from the origin, each having 5 representative points: the center itself plus 4 points offset dP from it
+ *
+ * @param dC
+ * a double cluster center offset
+ * @param dP
+ * a double representative point offset
+ * @param measure
+ * the DistanceMeasure
+ */
+ private void initData(double dC, double dP, DistanceMeasure measure) {
+ clusters = Lists.newArrayList();
+ clusters.add(new Canopy(new DenseVector(new double[] {-dC, -dC}), 1, measure));
+ clusters.add(new Canopy(new DenseVector(new double[] {-dC, dC}), 3, measure));
+ clusters.add(new Canopy(new DenseVector(new double[] {dC, dC}), 5, measure));
+ clusters.add(new Canopy(new DenseVector(new double[] {dC, -dC}), 7, measure));
+ representativePoints = Maps.newHashMap();
+ for (Cluster cluster : clusters) {
+ List<VectorWritable> points = Lists.newArrayList();
+ representativePoints.put(cluster.getId(), points);
+ points.add(new VectorWritable(cluster.getCenter().clone()));
+ points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {dP, dP}))));
+ points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {dP, -dP}))));
+ points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {-dP, -dP}))));
+ points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {-dP, dP}))));
+ }
+ }
+
+ @Test
+ public void testRepresentativePoints() throws Exception {
+ ClusteringTestUtils.writePointsToFile(referenceData, new Path(testdata, "file1"), fs, conf);
+ DistanceMeasure measure = new EuclideanDistanceMeasure();
+ Configuration conf = getConfiguration();
+ // run using MR reference point calculation
+ CanopyDriver.run(conf, testdata, output, measure, 3.1, 1.1, true, 0.0, true);
+ int numIterations = 2;
+ Path clustersIn = new Path(output, "clusters-0-final");
+ RepresentativePointsDriver.run(conf, clustersIn, new Path(output, "clusteredPoints"), output, measure,
+ numIterations, false);
+ printRepPoints(numIterations);
+ ClusterEvaluator evaluatorMR = new ClusterEvaluator(conf, clustersIn);
+ // now run again using sequential reference point calculation
+ HadoopUtil.delete(conf, output);
+ CanopyDriver.run(conf, testdata, output, measure, 3.1, 1.1, true, 0.0, true);
+ RepresentativePointsDriver.run(conf, clustersIn, new Path(output, "clusteredPoints"), output, measure,
+ numIterations, true);
+ printRepPoints(numIterations);
+ ClusterEvaluator evaluatorSeq = new ClusterEvaluator(conf, clustersIn);
+ // compare results
+ assertEquals("InterCluster Density", evaluatorMR.interClusterDensity(), evaluatorSeq.interClusterDensity(), EPSILON);
+ assertEquals("IntraCluster Density", evaluatorMR.intraClusterDensity(), evaluatorSeq.intraClusterDensity(), EPSILON);
+ }
+
+ @Test
+ public void testCluster0() throws IOException {
+ ClusteringTestUtils.writePointsToFile(referenceData, new Path(testdata, "file1"), fs, conf);
+ DistanceMeasure measure = new EuclideanDistanceMeasure();
+ initData(1, 0.25, measure);
+ ClusterEvaluator evaluator = new ClusterEvaluator(representativePoints, clusters, measure);
+ assertEquals("inter cluster density", 0.33333333333333315, evaluator.interClusterDensity(), EPSILON);
+ assertEquals("intra cluster density", 0.3656854249492381, evaluator.intraClusterDensity(), EPSILON);
+ }
+
+ @Test
+ public void testCluster1() throws IOException {
+ ClusteringTestUtils.writePointsToFile(referenceData, new Path(testdata, "file1"), fs, conf);
+ DistanceMeasure measure = new EuclideanDistanceMeasure();
+ initData(1, 0.5, measure);
+ ClusterEvaluator evaluator = new ClusterEvaluator(representativePoints, clusters, measure);
+ assertEquals("inter cluster density", 0.33333333333333315, evaluator.interClusterDensity(), EPSILON);
+ assertEquals("intra cluster density", 0.3656854249492381, evaluator.intraClusterDensity(), EPSILON);
+ }
+
+ @Test
+ public void testCluster2() throws IOException {
+ ClusteringTestUtils.writePointsToFile(referenceData, new Path(testdata, "file1"), fs, conf);
+ DistanceMeasure measure = new EuclideanDistanceMeasure();
+ initData(1, 0.75, measure);
+ ClusterEvaluator evaluator = new ClusterEvaluator(representativePoints, clusters, measure);
+ assertEquals("inter cluster density", 0.33333333333333315, evaluator.interClusterDensity(), EPSILON);
+ assertEquals("intra cluster density", 0.3656854249492381, evaluator.intraClusterDensity(), EPSILON);
+ }
+
+ /**
+ * Adding an empty cluster should modify the inter-cluster density but not change the intra-cluster density, as that
+ * cluster would have NaN as its intra-cluster density and NaN values are ignored by the evaluator.
+ *
+ * @throws IOException
+ */
+ @Test
+ public void testEmptyCluster() throws IOException {
+ ClusteringTestUtils.writePointsToFile(referenceData, new Path(testdata, "file1"), fs, conf);
+ DistanceMeasure measure = new EuclideanDistanceMeasure();
+ initData(1, 0.25, measure);
+ Canopy cluster = new Canopy(new DenseVector(new double[] {10, 10}), 19, measure);
+ clusters.add(cluster);
+ List<VectorWritable> points = Lists.newArrayList();
+ representativePoints.put(cluster.getId(), points);
+ ClusterEvaluator evaluator = new ClusterEvaluator(representativePoints, clusters, measure);
+ assertEquals("inter cluster density", 0.371534146934532, evaluator.interClusterDensity(), EPSILON);
+ assertEquals("intra cluster density", 0.3656854249492381, evaluator.intraClusterDensity(), EPSILON);
+ }
+
+ /**
+ * Adding a single-valued cluster should modify the inter-cluster density but not change the intra-cluster density, as
+ * that cluster would have NaN as its intra-cluster density and NaN values are ignored by the evaluator.
+ *
+ * @throws IOException
+ */
+ @Test
+ public void testSingleValueCluster() throws IOException {
+ ClusteringTestUtils.writePointsToFile(referenceData, new Path(testdata, "file1"), fs, conf);
+ DistanceMeasure measure = new EuclideanDistanceMeasure();
+ initData(1, 0.25, measure);
+ Canopy cluster = new Canopy(new DenseVector(new double[] {0, 0}), 19, measure);
+ clusters.add(cluster);
+ List<VectorWritable> points = Lists.newArrayList();
+ points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {1, 1}))));
+ representativePoints.put(cluster.getId(), points);
+ ClusterEvaluator evaluator = new ClusterEvaluator(representativePoints, clusters, measure);
+ assertEquals("inter cluster density", 0.3656854249492381, evaluator.interClusterDensity(), EPSILON);
+ assertEquals("intra cluster density", 0.3656854249492381, evaluator.intraClusterDensity(), EPSILON);
+ }
+
+ /**
+ * Representative points extraction will duplicate the cluster center if the cluster has no assigned points. These
+ * clusters are included in the inter-cluster density but their NaN intra-density values are ignored by the evaluator.
+ *
+ * @throws IOException
+ */
+ @Test
+ public void testAllSameValueCluster() throws IOException {
+ ClusteringTestUtils.writePointsToFile(referenceData, new Path(testdata, "file1"), fs, conf);
+ DistanceMeasure measure = new EuclideanDistanceMeasure();
+ initData(1, 0.25, measure);
+ Canopy cluster = new Canopy(new DenseVector(new double[] {0, 0}), 19, measure);
+ clusters.add(cluster);
+ List<VectorWritable> points = Lists.newArrayList();
+ points.add(new VectorWritable(cluster.getCenter()));
+ points.add(new VectorWritable(cluster.getCenter()));
+ points.add(new VectorWritable(cluster.getCenter()));
+ representativePoints.put(cluster.getId(), points);
+ ClusterEvaluator evaluator = new ClusterEvaluator(representativePoints, clusters, measure);
+ assertEquals("inter cluster density", 0.3656854249492381, evaluator.interClusterDensity(), EPSILON);
+ assertEquals("intra cluster density", 0.3656854249492381, evaluator.intraClusterDensity(), EPSILON);
+ }
+
+ @Test
+ public void testCanopy() throws Exception {
+ ClusteringTestUtils.writePointsToFile(sampleData, new Path(testdata, "file1"), fs, conf);
+ DistanceMeasure measure = new EuclideanDistanceMeasure();
+ Configuration conf = getConfiguration();
+ CanopyDriver.run(conf, testdata, output, measure, 3.1, 1.1, true, 0.0, true);
+ int numIterations = 10;
+ Path clustersIn = new Path(output, "clusters-0-final");
+ RepresentativePointsDriver.run(conf, clustersIn, new Path(output, "clusteredPoints"), output, measure,
+ numIterations, true);
+ //printRepPoints(numIterations);
+ ClusterEvaluator evaluator = new ClusterEvaluator(conf, clustersIn);
+ // now print out the Results
+ System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
+ System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
+ }
+
+ @Test
+ public void testKmeans() throws Exception {
+ ClusteringTestUtils.writePointsToFile(sampleData, new Path(testdata, "file1"), fs, conf);
+ DistanceMeasure measure = new EuclideanDistanceMeasure();
+ // now run the Canopy job to prime kMeans canopies
+ Configuration conf = getConfiguration();
+ CanopyDriver.run(conf, testdata, output, measure, 3.1, 1.1, false, 0.0, true);
+ // now run the KMeans job
+ Path kmeansOutput = new Path(output, "kmeans");
+ KMeansDriver.run(testdata, new Path(output, "clusters-0-final"), kmeansOutput, 0.001, 10, true, 0.0, true);
+ int numIterations = 10;
+ Path clustersIn = new Path(kmeansOutput, "clusters-2");
+ RepresentativePointsDriver.run(conf, clustersIn, new Path(kmeansOutput, "clusteredPoints"), kmeansOutput, measure,
+ numIterations, true);
+ RepresentativePointsDriver.printRepresentativePoints(kmeansOutput, numIterations);
+ ClusterEvaluator evaluator = new ClusterEvaluator(conf, clustersIn);
+ // now print out the Results
+ System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
+ System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
+ }
+
+ @Test
+ public void testFuzzyKmeans() throws Exception {
+ ClusteringTestUtils.writePointsToFile(sampleData, new Path(testdata, "file1"), fs, conf);
+ DistanceMeasure measure = new EuclideanDistanceMeasure();
+ // now run the Canopy job to prime kMeans canopies
+ Configuration conf = getConfiguration();
+ CanopyDriver.run(conf, testdata, output, measure, 3.1, 1.1, false, 0.0, true);
+ Path fuzzyKMeansOutput = new Path(output, "fuzzyk");
+ // now run the FuzzyKMeans job
+ FuzzyKMeansDriver.run(testdata, new Path(output, "clusters-0-final"), fuzzyKMeansOutput, 0.001, 10, 2,
+ true, true, 0, true);
+ int numIterations = 10;
+ Path clustersIn = new Path(fuzzyKMeansOutput, "clusters-4");
+ RepresentativePointsDriver.run(conf, clustersIn, new Path(fuzzyKMeansOutput, "clusteredPoints"), fuzzyKMeansOutput,
+ measure, numIterations, true);
+ RepresentativePointsDriver.printRepresentativePoints(fuzzyKMeansOutput, numIterations);
+ ClusterEvaluator evaluator = new ClusterEvaluator(conf, clustersIn);
+ // now print out the Results
+ System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
+ System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
+ }
+
+}

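The in-memory ClusterEvaluator constructor used by testCluster0 through testCluster2 makes it possible to probe the density metrics without running any driver. A minimal sketch, assuming the same imports as the test above (the cluster geometry is invented for illustration):

    DistanceMeasure measure = new EuclideanDistanceMeasure();
    List<Cluster> clusters = Lists.newArrayList();
    clusters.add(new Canopy(new DenseVector(new double[] {-1, -1}), 1, measure));
    clusters.add(new Canopy(new DenseVector(new double[] {1, 1}), 3, measure));
    Map<Integer,List<VectorWritable>> repPoints = Maps.newHashMap();
    for (Cluster c : clusters) {
      List<VectorWritable> pts = Lists.newArrayList();
      pts.add(new VectorWritable(c.getCenter().clone()));
      pts.add(new VectorWritable(c.getCenter().plus(new DenseVector(new double[] {0.25, 0.25}))));
      repPoints.put(c.getId(), pts);
    }
    ClusterEvaluator evaluator = new ClusterEvaluator(repPoints, clusters, measure);
    System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
    System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
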
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java b/community/mahout-mr/integration/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java
new file mode 100644
index 0000000..597ed01
--- /dev/null
+++ b/community/mahout-mr/integration/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java
@@ -0,0 +1,326 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.cdbw;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.ClusteringTestUtils;
+import org.apache.mahout.clustering.UncommonDistributions;
+import org.apache.mahout.clustering.canopy.Canopy;
+import org.apache.mahout.clustering.canopy.CanopyDriver;
+import org.apache.mahout.clustering.evaluation.RepresentativePointsDriver;
+import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver;
+import org.apache.mahout.clustering.kmeans.KMeansDriver;
+import org.apache.mahout.clustering.kmeans.TestKmeansClustering;
+import org.apache.mahout.common.MahoutTestCase;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.junit.Before;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public final class TestCDbwEvaluator extends MahoutTestCase {
+
+ private static final double[][] REFERENCE = { {1, 1}, {2, 1}, {1, 2}, {2, 2}, {3, 3}, {4, 4}, {5, 4}, {4, 5}, {5, 5}};
+
+ private static final Logger log = LoggerFactory.getLogger(TestCDbwEvaluator.class);
+
+ private Map<Integer,List<VectorWritable>> representativePoints;
+
+ private List<Cluster> clusters;
+
+ private Configuration conf;
+
+ private FileSystem fs;
+
+ private final Collection<VectorWritable> sampleData = new ArrayList<>();
+
+ private List<VectorWritable> referenceData = new ArrayList<>();
+
+ private Path testdata;
+
+ private Path output;
+
+ @Override
+ @Before
+ public void setUp() throws Exception {
+ super.setUp();
+ conf = getConfiguration();
+ fs = FileSystem.get(conf);
+ testdata = getTestTempDirPath("testdata");
+ output = getTestTempDirPath("output");
+ // Create small reference data set
+ referenceData = TestKmeansClustering.getPointsWritable(REFERENCE);
+ // generate larger test data set for the clustering tests to chew on
+ generateSamples();
+ }
+
+ /**
+ * Initialize synthetic data using 4 clusters dC units from the origin, each having 5 representative points: the center itself plus 4 points offset dP from it
+ *
+ * @param dC
+ * a double cluster center offset
+ * @param dP
+ * a double representative point offset
+ * @param measure
+ * the DistanceMeasure
+ */
+ private void initData(double dC, double dP, DistanceMeasure measure) {
+ clusters = new ArrayList<>();
+ clusters.add(new Canopy(new DenseVector(new double[] {-dC, -dC}), 1, measure));
+ clusters.add(new Canopy(new DenseVector(new double[] {-dC, dC}), 3, measure));
+ clusters.add(new Canopy(new DenseVector(new double[] {dC, dC}), 5, measure));
+ clusters.add(new Canopy(new DenseVector(new double[] {dC, -dC}), 7, measure));
+ representativePoints = new HashMap<>();
+ for (Cluster cluster : clusters) {
+ List<VectorWritable> points = new ArrayList<>();
+ representativePoints.put(cluster.getId(), points);
+ points.add(new VectorWritable(cluster.getCenter().clone()));
+ points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {dP, dP}))));
+ points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {dP, -dP}))));
+ points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {-dP, -dP}))));
+ points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {-dP, dP}))));
+ }
+ }
+
+ /**
+ * Generate random samples and add them to the sampleData
+ *
+ * @param num
+ * int number of samples to generate
+ * @param mx
+ * double x-value of the sample mean
+ * @param my
+ * double y-value of the sample mean
+ * @param sd
+ * double standard deviation of the samples
+ */
+ private void generateSamples(int num, double mx, double my, double sd) {
+ log.info("Generating {} samples m=[{}, {}] sd={}", num, mx, my, sd);
+ for (int i = 0; i < num; i++) {
+ sampleData.add(new VectorWritable(new DenseVector(new double[] {UncommonDistributions.rNorm(mx, sd),
+ UncommonDistributions.rNorm(my, sd)})));
+ }
+ }
+
+ private void generateSamples() {
+ generateSamples(500, 1, 1, 3);
+ generateSamples(300, 1, 0, 0.5);
+ generateSamples(300, 0, 2, 0.1);
+ }
+
+ @Test
+ public void testCDbw0() throws IOException {
+ ClusteringTestUtils.writePointsToFile(referenceData, getTestTempFilePath("testdata/file1"), fs, conf);
+ DistanceMeasure measure = new EuclideanDistanceMeasure();
+ initData(1, 0.25, measure);
+ CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
+ System.out.println("CDbw = " + evaluator.getCDbw());
+ System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
+ System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
+ System.out.println("Separation = " + evaluator.separation());
+ }
+
+ @Test
+ public void testCDbw1() throws IOException {
+ ClusteringTestUtils.writePointsToFile(referenceData, getTestTempFilePath("testdata/file1"), fs, conf);
+ DistanceMeasure measure = new EuclideanDistanceMeasure();
+ initData(1, 0.5, measure);
+ CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
+ System.out.println("CDbw = " + evaluator.getCDbw());
+ System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
+ System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
+ System.out.println("Separation = " + evaluator.separation());
+ }
+
+ @Test
+ public void testCDbw2() throws IOException {
+ ClusteringTestUtils.writePointsToFile(referenceData, getTestTempFilePath("testdata/file1"), fs, conf);
+ DistanceMeasure measure = new EuclideanDistanceMeasure();
+ initData(1, 0.75, measure);
+ CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
+ System.out.println("CDbw = " + evaluator.getCDbw());
+ System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
+ System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
+ System.out.println("Separation = " + evaluator.separation());
+ }
+
+ @Test
+ public void testEmptyCluster() throws IOException {
+ ClusteringTestUtils.writePointsToFile(referenceData, getTestTempFilePath("testdata/file1"), fs, conf);
+ DistanceMeasure measure = new EuclideanDistanceMeasure();
+ initData(1, 0.25, measure);
+ Canopy cluster = new Canopy(new DenseVector(new double[] {10, 10}), 19, measure);
+ clusters.add(cluster);
+ List<VectorWritable> points = new ArrayList<>();
+ representativePoints.put(cluster.getId(), points);
+ CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
+ System.out.println("CDbw = " + evaluator.getCDbw());
+ System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
+ System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
+ System.out.println("Separation = " + evaluator.separation());
+ }
+
+ @Test
+ public void testSingleValueCluster() throws IOException {
+ ClusteringTestUtils.writePointsToFile(referenceData, getTestTempFilePath("testdata/file1"), fs, conf);
+ DistanceMeasure measure = new EuclideanDistanceMeasure();
+ initData(1, 0.25, measure);
+ Canopy cluster = new Canopy(new DenseVector(new double[] {0, 0}), 19, measure);
+ clusters.add(cluster);
+ List<VectorWritable> points = new ArrayList<>();
+ points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {1, 1}))));
+ representativePoints.put(cluster.getId(), points);
+ CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
+ System.out.println("CDbw = " + evaluator.getCDbw());
+ System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
+ System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
+ System.out.println("Separation = " + evaluator.separation());
+ }
+
+ /**
+ * Representative points extraction will duplicate the cluster center if the cluster has no assigned points. These
+ * clusters should be ignored, like the empty clusters above.
+ *
+ * @throws IOException
+ */
+ @Test
+ public void testAllSameValueCluster() throws IOException {
+ ClusteringTestUtils.writePointsToFile(referenceData, getTestTempFilePath("testdata/file1"), fs, conf);
+ DistanceMeasure measure = new EuclideanDistanceMeasure();
+ initData(1, 0.25, measure);
+ Canopy cluster = new Canopy(new DenseVector(new double[] {0, 0}), 19, measure);
+ clusters.add(cluster);
+ List<VectorWritable> points = new ArrayList<>();
+ points.add(new VectorWritable(cluster.getCenter()));
+ points.add(new VectorWritable(cluster.getCenter()));
+ points.add(new VectorWritable(cluster.getCenter()));
+ representativePoints.put(cluster.getId(), points);
+ CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
+ System.out.println("CDbw = " + evaluator.getCDbw());
+ System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
+ System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
+ System.out.println("Separation = " + evaluator.separation());
+ }
+
+ /**
+ * Clustering can produce very tight clusters that can cause the standard-deviation calculation to fail. These
+ * clusters should be processed correctly.
+ *
+ * @throws IOException
+ */
+ @Test
+ public void testAlmostSameValueCluster() throws IOException {
+ ClusteringTestUtils.writePointsToFile(referenceData, getTestTempFilePath("testdata/file1"), fs, conf);
+ DistanceMeasure measure = new EuclideanDistanceMeasure();
+ initData(1, 0.25, measure);
+ Canopy cluster = new Canopy(new DenseVector(new double[] {0, 0}), 19, measure);
+ clusters.add(cluster);
+ List<VectorWritable> points = new ArrayList<>();
+ Vector delta = new DenseVector(new double[] {0, Double.MIN_NORMAL});
+ points.add(new VectorWritable(delta.clone()));
+ points.add(new VectorWritable(delta.clone()));
+ points.add(new VectorWritable(delta.clone()));
+ points.add(new VectorWritable(delta.clone()));
+ points.add(new VectorWritable(delta.clone()));
+ representativePoints.put(cluster.getId(), points);
+ CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
+ System.out.println("CDbw = " + evaluator.getCDbw());
+ System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
+ System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
+ System.out.println("Separation = " + evaluator.separation());
+ }
+
+ @Test
+ public void testCanopy() throws Exception {
+ ClusteringTestUtils.writePointsToFile(sampleData, getTestTempFilePath("testdata/file1"), fs, conf);
+ DistanceMeasure measure = new EuclideanDistanceMeasure();
+ CanopyDriver.run(getConfiguration(), testdata, output, measure, 3.1, 2.1, true, 0.0, true);
+ int numIterations = 10;
+ Path clustersIn = new Path(output, "clusters-0-final");
+ RepresentativePointsDriver.run(conf, clustersIn, new Path(output, "clusteredPoints"), output, measure,
+ numIterations, true);
+ CDbwEvaluator evaluator = new CDbwEvaluator(conf, clustersIn);
+ // now print out the Results
+ System.out.println("Canopy CDbw = " + evaluator.getCDbw());
+ System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
+ System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
+ System.out.println("Separation = " + evaluator.separation());
+ }
+
+ @Test
+ public void testKmeans() throws Exception {
+ ClusteringTestUtils.writePointsToFile(sampleData, getTestTempFilePath("testdata/file1"), fs, conf);
+ DistanceMeasure measure = new EuclideanDistanceMeasure();
+ // now run the Canopy job to prime kMeans canopies
+ CanopyDriver.run(getConfiguration(), testdata, output, measure, 3.1, 2.1, false, 0.0, true);
+ // now run the KMeans job
+ Path kmeansOutput = new Path(output, "kmeans");
+ KMeansDriver.run(testdata, new Path(output, "clusters-0-final"), kmeansOutput, 0.001, 10, true, 0.0, true);
+ int numIterations = 10;
+ Path clustersIn = new Path(kmeansOutput, "clusters-10-final");
+ RepresentativePointsDriver.run(conf, clustersIn, new Path(kmeansOutput, "clusteredPoints"), kmeansOutput, measure,
+ numIterations, true);
+ CDbwEvaluator evaluator = new CDbwEvaluator(conf, clustersIn);
+ RepresentativePointsDriver.printRepresentativePoints(kmeansOutput, numIterations);
+ // now print out the Results
+ System.out.println("K-Means CDbw = " + evaluator.getCDbw());
+ System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
+ System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
+ System.out.println("Separation = " + evaluator.separation());
+ }
+
+ @Test
+ public void testFuzzyKmeans() throws Exception {
+ ClusteringTestUtils.writePointsToFile(sampleData, getTestTempFilePath("testdata/file1"), fs, conf);
+ DistanceMeasure measure = new EuclideanDistanceMeasure();
+ // now run the Canopy job to prime kMeans canopies
+ CanopyDriver.run(getConfiguration(), testdata, output, measure, 3.1, 2.1, false, 0.0, true);
+ Path fuzzyKMeansOutput = new Path(output, "fuzzyk");
+ // now run the FuzzyKMeans job
+ FuzzyKMeansDriver.run(testdata, new Path(output, "clusters-0-final"), fuzzyKMeansOutput, 0.001, 10, 2,
+ true, true, 0, true);
+ int numIterations = 10;
+ Path clustersIn = new Path(fuzzyKMeansOutput, "clusters-4");
+ RepresentativePointsDriver.run(conf, clustersIn, new Path(fuzzyKMeansOutput, "clusteredPoints"), fuzzyKMeansOutput,
+ measure, numIterations, true);
+ CDbwEvaluator evaluator = new CDbwEvaluator(conf, clustersIn);
+ RepresentativePointsDriver.printRepresentativePoints(fuzzyKMeansOutput, numIterations);
+ // now print out the Results
+ System.out.println("Fuzzy K-Means CDbw = " + evaluator.getCDbw());
+ System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
+ System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
+ System.out.println("Separation = " + evaluator.separation());
+ }
+
+}

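CDbwEvaluator mirrors ClusterEvaluator but also exposes the combined CDbw validity index and a separation metric. A minimal sketch of the SequenceFile-backed constructor used by testCanopy and testKmeans above, assuming a finished clustering run followed by RepresentativePointsDriver (the path is a hypothetical placeholder):

    Configuration conf = new Configuration();
    Path clustersIn = new Path("output/clusters-0-final"); // hypothetical
    CDbwEvaluator evaluator = new CDbwEvaluator(conf, clustersIn);
    System.out.println("CDbw = " + evaluator.getCDbw());
    System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
    System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
    System.out.println("Separation = " + evaluator.separation());
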
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/MailArchivesClusteringAnalyzerTest.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/MailArchivesClusteringAnalyzerTest.java b/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/MailArchivesClusteringAnalyzerTest.java
new file mode 100644
index 0000000..ba73c82
--- /dev/null
+++ b/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/MailArchivesClusteringAnalyzerTest.java
@@ -0,0 +1,66 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.text;
+
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.mahout.common.MahoutTestCase;
+import org.junit.Test;
+
+/**
+ * Unit tests for the MailArchivesClusteringAnalyzer text analyzer.
+ */
+public class MailArchivesClusteringAnalyzerTest extends MahoutTestCase {
+
+ @Test
+ public void testAnalysis() throws Exception {
+ Analyzer analyzer = new MailArchivesClusteringAnalyzer();
+
+ String text = "A test message\n"
+ + "atokenthatistoolongtobeusefulforclustertextanalysis\n"
+ + "Mahout is a scalable, machine-learning LIBRARY\n"
+ + "we've added some additional stopwords such as html, mailto, regards\t"
+ + "apache_hadoop provides the foundation for scalability\n"
+ + "www.nabble.com general-***@incubator.apache.org\n"
+ + "public void int protected package";
+ Reader reader = new StringReader(text);
+
+ // if you change the text above, then you may need to change this as well
+ // order matters too
+ String[] expectedTokens = {
+ "test", "mahout", "scalabl", "machin", "learn", "librari", "weve", "ad",
+ "stopword", "apache_hadoop","provid", "foundat", "scalabl"
+ };
+
+ TokenStream tokenStream = analyzer.tokenStream("test", reader);
+ assertNotNull(tokenStream);
+ tokenStream.reset();
+ CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
+ int e = 0;
+ while (tokenStream.incrementToken()) {
+ assertTrue("More tokens emitted than expected", e < expectedTokens.length);
+ assertEquals(expectedTokens[e++], termAtt.toString());
+ }
+ assertEquals(expectedTokens.length, e);
+ tokenStream.end();
+ tokenStream.close();
+ }
+}

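The reset/incrementToken/end/close sequence in the test above is the general Lucene contract for consuming any TokenStream, not something specific to this analyzer. A condensed sketch of the pattern, assuming an Analyzer instance like the one above (field name and input text are arbitrary):

    Analyzer analyzer = new MailArchivesClusteringAnalyzer();
    try (TokenStream ts = analyzer.tokenStream("test", new StringReader("some sample text"))) {
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      ts.reset();                            // required before the first incrementToken()
      while (ts.incrementToken()) {
        System.out.println(term.toString()); // one filtered, stemmed token per call
      }
      ts.end();                              // finalize offsets before close()
    }
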
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromMailArchivesTest.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromMailArchivesTest.java b/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromMailArchivesTest.java
new file mode 100644
index 0000000..ef2b8a6
--- /dev/null
+++ b/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromMailArchivesTest.java
@@ -0,0 +1,240 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.text;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.util.zip.GZIPOutputStream;
+
+import org.apache.commons.lang3.SystemUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Text;
+import org.apache.mahout.common.MahoutTestCase;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterator;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Test case for the SequenceFilesFromMailArchives command-line application.
+ */
+public final class SequenceFilesFromMailArchivesTest extends MahoutTestCase {
+
+ private File inputDir;
+
+ /**
+ * Create the input directory and the gzipped test mail archives needed for
+ * testing the SequenceFilesFromMailArchives application.
+ */
+ @Override
+ @Before
+ public void setUp() throws Exception {
+ super.setUp();
+ inputDir = getTestTempDir("mail-archives-in");
+
+ // write test mail messages to a gzipped file in a nested directory
+ File subDir = new File(inputDir, "subdir");
+ subDir.mkdir();
+ File gzFile = new File(subDir, "mail-messages.gz");
+ try (GZIPOutputStream gzOut = new GZIPOutputStream(new FileOutputStream(gzFile))) {
+ gzOut.write(testMailMessages.getBytes("UTF-8"));
+ gzOut.finish();
+ }
+
+ File subDir2 = new File(subDir, "subsubdir");
+ subDir2.mkdir();
+ File gzFile2 = new File(subDir2, "mail-messages-2.gz");
+ try (GZIPOutputStream gzOut = new GZIPOutputStream(new FileOutputStream(gzFile2))) {
+ gzOut.write(testMailMessages.getBytes("UTF-8"));
+ gzOut.finish();
+ }
+ }
+
+ @Test
+ public void testSequential() throws Exception {
+
+ File outputDir = this.getTestTempDir("mail-archives-out");
+
+ String[] args = {
+ "--input", inputDir.getAbsolutePath(),
+ "--output", outputDir.getAbsolutePath(),
+ "--charset", "UTF-8",
+ "--keyPrefix", "TEST",
+ "--method", "sequential",
+ "--body", "--subject", "--separator", ""
+ };
+
+ // run the application's main method
+ SequenceFilesFromMailArchives.main(args);
+
+ // app should create a single SequenceFile named "chunk-0" in the output dir
+ File expectedChunkFile = new File(outputDir, "chunk-0");
+ String expectedChunkPath = expectedChunkFile.getAbsolutePath();
+ Assert.assertTrue("Expected chunk file " + expectedChunkPath + " not found!", expectedChunkFile.isFile());
+
+ Configuration conf = getConfiguration();
+ SequenceFileIterator<Text, Text> iterator = new SequenceFileIterator<>(new Path(expectedChunkPath), true, conf);
+ Assert.assertTrue("First key/value pair not found!", iterator.hasNext());
+ Pair<Text, Text> record = iterator.next();
+
+ File parentFile = new File(new File(new File("TEST"), "subdir"), "mail-messages.gz");
+ Assert.assertEquals(new File(parentFile, testVars[0][0]).toString(), record.getFirst().toString());
+ Assert.assertEquals(testVars[0][1] + testVars[0][2], record.getSecond().toString());
+
+ Assert.assertTrue("Second key/value pair not found!", iterator.hasNext());
+
+ record = iterator.next();
+ Assert.assertEquals(new File(parentFile, testVars[1][0]).toString(), record.getFirst().toString());
+ Assert.assertEquals(testVars[1][1] + testVars[1][2], record.getSecond().toString());
+
+ Assert.assertTrue("Third key/value pair not found!", iterator.hasNext());
+ record = iterator.next();
+ File parentFileSubSubDir = new File(new File(new File(new File("TEST"), "subdir"), "subsubdir"), "mail-messages-2.gz");
+ Assert.assertEquals(new File(parentFileSubSubDir, testVars[0][0]).toString(), record.getFirst().toString());
+ Assert.assertEquals(testVars[0][1] + testVars[0][2], record.getSecond().toString());
+
+ Assert.assertTrue("Second key/value pair not found!", iterator.hasNext());
+ record = iterator.next();
+ Assert.assertEquals(new File(parentFileSubSubDir, testVars[1][0]).toString(), record.getFirst().toString());
+ Assert.assertEquals(testVars[1][1] + testVars[1][2], record.getSecond().toString());
+
+ Assert.assertFalse("Only two key/value pairs expected!", iterator.hasNext());
+ }
+
+ @Test
+ public void testMapReduce() throws Exception {
+
+ Path tmpDir = getTestTempDirPath();
+ Path mrOutputDir = new Path(tmpDir, "mail-archives-out-mr");
+ Configuration configuration = getConfiguration();
+ FileSystem fs = FileSystem.get(configuration);
+
+ File expectedInputFile = new File(inputDir.toString());
+
+ String[] args = {
+ "-Dhadoop.tmp.dir=" + configuration.get("hadoop.tmp.dir"),
+ "--input", expectedInputFile.getAbsolutePath(),
+ "--output", mrOutputDir.toString(),
+ "--charset", "UTF-8",
+ "--keyPrefix", "TEST",
+ "--method", "mapreduce",
+ "--body", "--subject", "--separator", ""
+ };
+
+ // run the application's main method
+ SequenceFilesFromMailArchives.main(args);
+
+ // the job should create a single SequenceFile named "part-m-00000" in the output dir
+ FileStatus[] fileStatuses = fs.listStatus(mrOutputDir.suffix("/part-m-00000"));
+ assertEquals(1, fileStatuses.length); // only one
+ assertEquals("part-m-00000", fileStatuses[0].getPath().getName());
+ SequenceFileIterator<Text, Text> iterator =
+ new SequenceFileIterator<>(mrOutputDir.suffix("/part-m-00000"), true, configuration);
+
+ Assert.assertTrue("First key/value pair not found!", iterator.hasNext());
+ Pair<Text, Text> record = iterator.next();
+
+ File parentFileSubSubDir = new File(new File(new File(new File("TEST"), "subdir"), "subsubdir"), "mail-messages-2.gz");
+
+ String actual = record.getFirst().toString();
+ if (SystemUtils.IS_OS_WINDOWS) {
+ actual = actual.replace("/", "\\");
+ }
+ Assert.assertEquals(new File(parentFileSubSubDir, testVars[0][0]).toString(), actual);
+ Assert.assertEquals(testVars[0][1] + testVars[0][2], record.getSecond().toString());
+ Assert.assertTrue("Second key/value pair not found!", iterator.hasNext());
+
+ record = iterator.next();
+ actual = record.getFirst().toString();
+ if (SystemUtils.IS_OS_WINDOWS) {
+ actual = actual.replace("/", "\\");
+ }
+ Assert.assertEquals(new File(parentFileSubSubDir, testVars[1][0]).toString(), actual);
+ Assert.assertEquals(testVars[1][1] + testVars[1][2], record.getSecond().toString());
+
+ // test other file
+ File parentFile = new File(new File(new File("TEST"), "subdir"), "mail-messages.gz");
+ Assert.assertTrue("Third key/value pair not found!", iterator.hasNext());
+ record = iterator.next();
+ actual = record.getFirst().toString();
+ if (SystemUtils.IS_OS_WINDOWS) {
+ actual = actual.replace("/", "\\");
+ }
+ Assert.assertEquals(new File(parentFile, testVars[0][0]).toString(), actual);
+ Assert.assertEquals(testVars[0][1] + testVars[0][2], record.getSecond().toString());
+ Assert.assertTrue("Second key/value pair not found!", iterator.hasNext());
+
+ record = iterator.next();
+ actual = record.getFirst().toString();
+ if (SystemUtils.IS_OS_WINDOWS) {
+ actual = actual.replace("/", "\\");
+ }
+ Assert.assertEquals(new File(parentFile, testVars[1][0]).toString(), actual);
+ Assert.assertEquals(testVars[1][1] + testVars[1][2], record.getSecond().toString());
+ Assert.assertFalse("Only four key/value pairs expected!", iterator.hasNext());
+ }
+
+ // Messages extracted and made anonymous from the ASF mail archives
+ private static final String[][] testVars = {
+ new String[] {
+ "***@example.com",
+ "Ant task for JDK1.1 collections build option",
+ "\nThis is just a test message\n--\nTesty McTester\n"
+ },
+ new String[] {
+ "***@example.com",
+ "Problem with build files in several directories",
+ "\nHi all,\nThis is another test message.\nRegards,\nAnother Test\n"
+ }
+ };
+
+ private static final String testMailMessages =
+ "From ***@example.com Mon Jul 24 19:13:53 2000\n"
+ + "Return-Path: <***@example.com>\n"
+ + "Mailing-List: contact ant-user-***@jakarta.apache.org; run by ezmlm\n"
+ + "Delivered-To: mailing list ant-***@jakarta.apache.org\n"
+ + "Received: (qmail 49267 invoked from network); 24 Jul 2000 19:13:53 -0000\n"
+ + "Message-ID: <" + testVars[0][0] + ">\n"
+ + "From: \"Testy McTester\" <***@example.com>\n"
+ + "To: <ant-***@jakarta.apache.org>\n"
+ + "Subject: " + testVars[0][1] + '\n'
+ + "Date: Mon, 24 Jul 2000 12:24:56 -0700\n"
+ + "MIME-Version: 1.0\n"
+ + "Content-Type: text/plain;\n"
+ + " charset=\"Windows-1252\"\n"
+ + "Content-Transfer-Encoding: 7bit\n"
+ + "X-Spam-Rating: locus.apache.org 1.6.2 0/1000/N\n"
+ + testVars[0][2] + '\n'
+ + "From ***@example.com Wed Jul 26 11:32:16 2000\n"
+ + "Return-Path: <***@example.com>\n"
+ + "Mailing-List: contact ant-user-***@jakarta.apache.org; run by ezmlm\n"
+ + "Delivered-To: mailing list ant-***@jakarta.apache.org\n"
+ + "Received: (qmail 73966 invoked from network); 26 Jul 2000 11:32:16 -0000\n"
+ + "User-Agent: Microsoft-Outlook-Express-Macintosh-Edition/5.02.2022\n"
+ + "Date: Wed, 26 Jul 2000 13:32:08 +0200\n"
+ + "Subject: " + testVars[1][1] + '\n'
+ + "From: Another Test <***@example.com>\n"
+ + "To: <ant-***@jakarta.apache.org>\n"
+ + "Message-Id: <" + testVars[1][0] + ">\n"
+ + "Mime-Version: 1.0\n"
+ + "Content-Type: text/plain; charset=\"US-ASCII\"\n"
+ + "Content-Transfer-Encoding: 7bit\n"
+ + "X-Spam-Rating: locus.apache.org 1.6.2 0/1000/N\n"
+ + testVars[1][2];
+}

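SequenceFileIterator, used throughout these tests, is also the simplest way to inspect what the job wrote. A minimal sketch, assuming a chunk file produced by a run like the one above (the path is a hypothetical placeholder):

    Configuration conf = new Configuration();
    Path chunk = new Path("mail-archives-out/chunk-0"); // hypothetical
    try (SequenceFileIterator<Text, Text> it = new SequenceFileIterator<>(chunk, true, conf)) {
      while (it.hasNext()) {
        Pair<Text, Text> rec = it.next();
        System.out.println(rec.getFirst() + " => " + rec.getSecond());
      }
    }
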
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/TestPathFilter.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/TestPathFilter.java b/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/TestPathFilter.java
new file mode 100644
index 0000000..227521a
--- /dev/null
+++ b/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/TestPathFilter.java
@@ -0,0 +1,32 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.text;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathFilter;
+
+/**
+ * Dummy Path Filter for testing the MapReduce version of
+ * SequenceFilesFromDirectory
+ */
+public class TestPathFilter implements PathFilter {
+
+ @Override
+ public boolean accept(Path path) {
+ return path.getName().startsWith("t") || path.getName().startsWith("r") || path.getName().startsWith("f");
+ }
+}

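A filter like this plugs into the standard Hadoop listing API as well as into the --fileFilterClass option exercised by the next test. For instance, assuming a Configuration conf and an existing input directory:

    FileSystem fs = FileSystem.get(conf);
    FileStatus[] accepted = fs.listStatus(new Path("inputDir"), new TestPathFilter()); // hypothetical dir
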
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/TestSequenceFilesFromDirectory.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/TestSequenceFilesFromDirectory.java b/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/TestSequenceFilesFromDirectory.java
new file mode 100644
index 0000000..040c8e4
--- /dev/null
+++ b/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/TestSequenceFilesFromDirectory.java
@@ -0,0 +1,313 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.text;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.commons.io.Charsets;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Text;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.MahoutTestCase;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterator;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public final class TestSequenceFilesFromDirectory extends MahoutTestCase {
+
+ private static final Logger logger = LoggerFactory.getLogger(TestSequenceFilesFromDirectory.class);
+
+ private static final String[][] DATA1 = {
+ {"test1", "This is the first text."},
+ {"test2", "This is the second text."},
+ {"test3", "This is the third text."}
+ };
+
+ private static final String[][] DATA2 = {
+ {"recursive_test1", "This is the first text."},
+ {"recursive_test2", "This is the second text."},
+ {"recursive_test3", "This is the third text."}
+ };
+
+ @Test
+ public void testSequenceFileFromDirectoryBasic() throws Exception {
+ // parameters
+ Configuration configuration = getConfiguration();
+
+ FileSystem fs = FileSystem.get(configuration);
+
+ // create
+ Path tmpDir = this.getTestTempDirPath();
+ Path inputDir = new Path(tmpDir, "inputDir");
+ fs.mkdirs(inputDir);
+
+ Path outputDir = new Path(tmpDir, "outputDir");
+ Path outputDirRecursive = new Path(tmpDir, "outputDirRecursive");
+
+ Path inputDirRecursive = new Path(tmpDir, "inputDirRecur");
+ fs.mkdirs(inputDirRecursive);
+
+ // prepare input files
+ createFilesFromArrays(configuration, inputDir, DATA1);
+
+ SequenceFilesFromDirectory.main(new String[]{
+ "--input", inputDir.toString(),
+ "--output", outputDir.toString(),
+ "--chunkSize", "64",
+ "--charset", Charsets.UTF_8.name(),
+ "--keyPrefix", "UID",
+ "--method", "sequential"});
+
+ // check output chunk files
+ checkChunkFiles(configuration, outputDir, DATA1, "UID");
+
+ createRecursiveDirFilesFromArrays(configuration, inputDirRecursive, DATA2);
+
+ FileStatus fstInputPath = fs.getFileStatus(inputDirRecursive);
+ String dirs = HadoopUtil.buildDirList(fs, fstInputPath);
+
+ System.out.println("\n\n ----- recursive dirs: " + dirs);
+ SequenceFilesFromDirectory.main(new String[]{
+ "--input", inputDirRecursive.toString(),
+ "--output", outputDirRecursive.toString(),
+ "--chunkSize", "64",
+ "--charset", Charsets.UTF_8.name(),
+ "--keyPrefix", "UID",
+ "--method", "sequential"});
+
+ checkRecursiveChunkFiles(configuration, outputDirRecursive, DATA2, "UID");
+ }
+
+ @Test
+ public void testSequenceFileFromDirectoryMapReduce() throws Exception {
+
+ Configuration conf = getConfiguration();
+
+ FileSystem fs = FileSystem.get(conf);
+
+ // create
+ Path tmpDir = this.getTestTempDirPath();
+ Path inputDir = new Path(tmpDir, "inputDir");
+ fs.mkdirs(inputDir);
+
+ Path inputDirRecur = new Path(tmpDir, "inputDirRecur");
+ fs.mkdirs(inputDirRecur);
+
+ Path mrOutputDir = new Path(tmpDir, "mrOutputDir");
+ Path mrOutputDirRecur = new Path(tmpDir, "mrOutputDirRecur");
+
+ createFilesFromArrays(conf, inputDir, DATA1);
+
+ SequenceFilesFromDirectory.main(new String[]{
+ "-Dhadoop.tmp.dir=" + conf.get("hadoop.tmp.dir"),
+ "--input", inputDir.toString(),
+ "--output", mrOutputDir.toString(),
+ "--chunkSize", "64",
+ "--charset", Charsets.UTF_8.name(),
+ "--method", "mapreduce",
+ "--keyPrefix", "UID",
+ "--fileFilterClass", "org.apache.mahout.text.TestPathFilter"
+ });
+
+ checkMRResultFiles(conf, mrOutputDir, DATA1, "UID");
+
+ createRecursiveDirFilesFromArrays(conf, inputDirRecur, DATA2);
+
+ FileStatus fstInputPath = fs.getFileStatus(inputDirRecur);
+ String dirs = HadoopUtil.buildDirList(fs, fstInputPath);
+
+ logger.info("\n\n ---- recursive dirs: {}", dirs);
+
+ SequenceFilesFromDirectory.main(new String[]{
+ "-Dhadoop.tmp.dir=" + conf.get("hadoop.tmp.dir"),
+ "--input", inputDirRecur.toString(),
+ "--output", mrOutputDirRecur.toString(),
+ "--chunkSize", "64",
+ "--charset", Charsets.UTF_8.name(),
+ "--method", "mapreduce",
+ "--keyPrefix", "UID",
+ "--fileFilterClass", "org.apache.mahout.text.TestPathFilter"
+ });
+
+ checkMRResultFilesRecursive(conf, mrOutputDirRecur, DATA2, "UID");
+ }
+
+
+ private static void createFilesFromArrays(Configuration conf, Path inputDir, String[][] data) throws IOException {
+ FileSystem fs = FileSystem.get(conf);
+ for (String[] aData : data) {
+ try (OutputStreamWriter writer =
+ new OutputStreamWriter(fs.create(new Path(inputDir, aData[0])), Charsets.UTF_8)){
+ writer.write(aData[1]);
+ }
+ }
+ }
+
+ private static void createRecursiveDirFilesFromArrays(Configuration configuration, Path inputDir,
+ String[][] data) throws IOException {
+ FileSystem fs = FileSystem.get(configuration);
+
+ logger.info("creativeRecursiveDirFilesFromArrays > based on: {}", inputDir.toString());
+ Path curPath;
+ String currentRecursiveDir = inputDir.toString();
+
+ for (String[] aData : data) {
+ currentRecursiveDir += "/" + aData[0];
+ File subDir = new File(currentRecursiveDir);
+ subDir.mkdir();
+
+ curPath = new Path(subDir.toString(), "file.txt");
+ logger.info("Created file: {}", curPath.toString());
+
+ try (OutputStreamWriter writer = new OutputStreamWriter(fs.create(curPath), Charsets.UTF_8)){
+ writer.write(aData[1]);
+ }
+ }
+ }
+
+ private static void checkChunkFiles(Configuration configuration,
+ Path outputDir,
+ String[][] data,
+ String prefix) throws IOException {
+ FileSystem fs = FileSystem.get(configuration);
+
+ // output exists?
+ FileStatus[] fileStatuses = fs.listStatus(outputDir, PathFilters.logsCRCFilter());
+ assertEquals(1, fileStatuses.length); // only one
+ assertEquals("chunk-0", fileStatuses[0].getPath().getName());
+
+ Map<String, String> fileToData = new HashMap<>();
+ for (String[] aData : data) {
+ fileToData.put(prefix + Path.SEPARATOR + aData[0], aData[1]);
+ }
+
+ // read a chunk to check content
+ try (SequenceFileIterator<Text, Text> iterator =
+ new SequenceFileIterator<>(fileStatuses[0].getPath(), true, configuration)){
+ while (iterator.hasNext()) {
+ Pair<Text, Text> record = iterator.next();
+ String retrievedData = fileToData.get(record.getFirst().toString().trim());
+ assertNotNull(retrievedData);
+ assertEquals(retrievedData, record.getSecond().toString().trim());
+ }
+ }
+ }
+
+ private static void checkRecursiveChunkFiles(Configuration configuration,
+ Path outputDir,
+ String[][] data,
+ String prefix) throws IOException {
+ FileSystem fs = FileSystem.get(configuration);
+
+ System.out.println(" ----------- check_Recursive_ChunkFiles ------------");
+
+ // output exists?
+ FileStatus[] fileStatuses = fs.listStatus(outputDir, PathFilters.logsCRCFilter());
+ assertEquals(1, fileStatuses.length); // only one
+ assertEquals("chunk-0", fileStatuses[0].getPath().getName());
+
+
+ Map<String, String> fileToData = new HashMap<>();
+ String currentPath = prefix;
+ for (String[] aData : data) {
+ currentPath += Path.SEPARATOR + aData[0];
+ fileToData.put(currentPath + Path.SEPARATOR + "file.txt", aData[1]);
+ }
+
+ // read a chunk to check content
+ try (SequenceFileIterator<Text, Text> iterator =
+ new SequenceFileIterator<>(fileStatuses[0].getPath(), true, configuration)) {
+ while (iterator.hasNext()) {
+ Pair<Text, Text> record = iterator.next();
+ String retrievedData = fileToData.get(record.getFirst().toString().trim());
+ System.out.printf("%s >> %s\n", record.getFirst().toString().trim(), record.getSecond().toString().trim());
+
+ assertNotNull(retrievedData);
+ assertEquals(retrievedData, record.getSecond().toString().trim());
+ System.out.printf(">>> k: %s, v: %s\n", record.getFirst().toString(), record.getSecond().toString());
+ }
+ }
+ }
+
+ private static void checkMRResultFiles(Configuration conf, Path outputDir,
+ String[][] data, String prefix) throws IOException {
+ FileSystem fs = FileSystem.get(conf);
+
+ // output exists?
+ FileStatus[] fileStatuses = fs.listStatus(outputDir.suffix("/part-m-00000"), PathFilters.logsCRCFilter());
+ assertEquals(1, fileStatuses.length); // only one
+ assertEquals("part-m-00000", fileStatuses[0].getPath().getName());
+ Map<String, String> fileToData = new HashMap<>();
+ for (String[] aData : data) {
+ System.out.printf("map.put: %s %s\n", prefix + Path.SEPARATOR + aData[0], aData[1]);
+ fileToData.put(prefix + Path.SEPARATOR + aData[0], aData[1]);
+ }
+
+ // read a chunk to check content
+ try (SequenceFileIterator<Text, Text> iterator = new SequenceFileIterator<>(
+ fileStatuses[0].getPath(), true, conf)) {
+ while (iterator.hasNext()) {
+ Pair<Text, Text> record = iterator.next();
+ String retrievedData = fileToData.get(record.getFirst().toString().trim());
+
+ System.out.printf("MR> %s >> %s\n", record.getFirst().toString().trim(), record.getSecond().toString().trim());
+ assertNotNull(retrievedData);
+ assertEquals(retrievedData, record.getSecond().toString().trim());
+ }
+ }
+ }
+
+ private static void checkMRResultFilesRecursive(Configuration configuration, Path outputDir,
+ String[][] data, String prefix) throws IOException {
+ FileSystem fs = FileSystem.get(configuration);
+
+ // output exists?
+ FileStatus[] fileStatuses = fs.listStatus(outputDir.suffix("/part-m-00000"), PathFilters.logsCRCFilter());
+ assertEquals(1, fileStatuses.length); // only one
+ assertEquals("part-m-00000", fileStatuses[0].getPath().getName());
+ Map<String, String> fileToData = new HashMap<>();
+ String currentPath = prefix;
+
+ for (String[] aData : data) {
+ currentPath += Path.SEPARATOR + aData[0];
+ fileToData.put(currentPath + Path.SEPARATOR + "file.txt", aData[1]);
+ }
+
+ // read a chunk to check content
+ try (SequenceFileIterator<Text, Text> iterator = new SequenceFileIterator<>(
+ fileStatuses[0].getPath(), true, configuration)){
+ while (iterator.hasNext()) {
+ Pair<Text, Text> record = iterator.next();
+ System.out.printf("MR-Recur > Trying to check: %s\n", record.getFirst().toString().trim());
+ String retrievedData = fileToData.get(record.getFirst().toString().trim());
+ assertNotNull(retrievedData);
+ assertEquals(retrievedData, record.getSecond().toString().trim());
+ }
+ }
+ }
+}
r***@apache.org
2018-06-27 14:52:10 UTC
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java
new file mode 100644
index 0000000..180a1e1
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java
@@ -0,0 +1,155 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.vectors.arff;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.nio.charset.Charset;
+import java.text.DateFormat;
+import java.text.SimpleDateFormat;
+import java.util.Iterator;
+import java.util.Locale;
+
+import com.google.common.io.Files;
+import org.apache.commons.io.Charsets;
+import org.apache.mahout.math.Vector;
+
+/**
+ * Read in ARFF (http://www.cs.waikato.ac.nz/~ml/weka/arff.html) and create {@link Vector}s
+ * <p/>
+ * Attribute type handling:
+ * <ul>
+ * <li>Numeric -> As is</li>
+ * <li>Nominal -> ordinal(value), e.g. @attribute lumber {'\'(-inf-0.5]\'','\'(0.5-inf)\''}
+ * will convert -inf-0.5 -> 0, and 0.5-inf -> 1</li>
+ * <li>Dates -> Convert to time as a long</li>
+ * <li>Strings -> Create a map of String -> long</li>
+ * </ul>
+ * NOTE: This class does not set the label bindings on every vector. If you want the label
+ * bindings, call {@link MapBackedARFFModel#getLabelBindings()}, as they are the same for every vector.
+ */
+public class ARFFVectorIterable implements Iterable<Vector> {
+
+ private final BufferedReader buff;
+ private final ARFFModel model;
+
+ public ARFFVectorIterable(File file, ARFFModel model) throws IOException {
+ this(file, Charsets.UTF_8, model);
+ }
+
+ public ARFFVectorIterable(File file, Charset encoding, ARFFModel model) throws IOException {
+ this(Files.newReader(file, encoding), model);
+ }
+
+ public ARFFVectorIterable(String arff, ARFFModel model) throws IOException {
+ this(new StringReader(arff), model);
+ }
+
+ public ARFFVectorIterable(Reader reader, ARFFModel model) throws IOException {
+ if (reader instanceof BufferedReader) {
+ buff = (BufferedReader) reader;
+ } else {
+ buff = new BufferedReader(reader);
+ }
+ //grab the attributes, then start the iterator at the first line of data
+ this.model = model;
+
+ int labelNumber = 0;
+ String line;
+ while ((line = buff.readLine()) != null) {
+ line = line.trim();
+ if (!line.startsWith(ARFFModel.ARFF_COMMENT) && !line.isEmpty()) {
+ Integer labelNumInt = labelNumber;
+ String[] lineParts = line.split("[\\s\\t]+", 2);
+
+ // is it a relation name?
+ if (lineParts[0].equalsIgnoreCase(ARFFModel.RELATION)) {
+ model.setRelation(ARFFType.removeQuotes(lineParts[1]));
+ }
+ // or an attribute
+ else if (lineParts[0].equalsIgnoreCase(ARFFModel.ATTRIBUTE)) {
+ String label;
+ ARFFType type;
+
+ // split the name of the attribute and its description
+ String[] attrParts = lineParts[1].split("[\\s\\t]+", 2);
+          if (attrParts.length < 2) {
+            throw new UnsupportedOperationException("No type for attribute found: " + lineParts[1]);
+          }
+
+ // label is attribute name
+ label = ARFFType.removeQuotes(attrParts[0].toLowerCase());
+ if (attrParts[1].equalsIgnoreCase(ARFFType.NUMERIC.getIndicator())) {
+ type = ARFFType.NUMERIC;
+ } else if (attrParts[1].equalsIgnoreCase(ARFFType.INTEGER.getIndicator())) {
+ type = ARFFType.INTEGER;
+ } else if (attrParts[1].equalsIgnoreCase(ARFFType.REAL.getIndicator())) {
+ type = ARFFType.REAL;
+ } else if (attrParts[1].equalsIgnoreCase(ARFFType.STRING.getIndicator())) {
+ type = ARFFType.STRING;
+ } else if (attrParts[1].toLowerCase().startsWith(ARFFType.NOMINAL.getIndicator())) {
+ type = ARFFType.NOMINAL;
+ // nominal example:
+ // @ATTRIBUTE class {Iris-setosa,'Iris versicolor',Iris-virginica}
+ String[] classes = ARFFIterator.splitCSV(attrParts[1].substring(1, attrParts[1].length() - 1));
+ for (int i = 0; i < classes.length; i++) {
+ model.addNominal(label, ARFFType.removeQuotes(classes[i]), i + 1);
+ }
+ } else if (attrParts[1].toLowerCase().startsWith(ARFFType.DATE.getIndicator())) {
+ type = ARFFType.DATE;
+ //TODO: DateFormatter map
+ DateFormat format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", Locale.ENGLISH);
+ String formStr = attrParts[1].substring(ARFFType.DATE.getIndicator().length()).trim();
+ if (!formStr.isEmpty()) {
+ if (formStr.startsWith("\"")) {
+ formStr = formStr.substring(1, formStr.length() - 1);
+ }
+ format = new SimpleDateFormat(formStr, Locale.ENGLISH);
+ }
+ model.addDateFormat(labelNumInt, format);
+ //@attribute <name> date [<date-format>]
+ } else {
+ throw new UnsupportedOperationException("Invalid attribute: " + attrParts[1]);
+ }
+ model.addLabel(label, labelNumInt);
+ model.addType(labelNumInt, type);
+ labelNumber++;
+ } else if (lineParts[0].equalsIgnoreCase(ARFFModel.DATA)) {
+          break; // end of the header; iteration begins at the first data row
+ }
+ }
+ }
+
+ }
+
+ @Override
+ public Iterator<Vector> iterator() {
+ return new ARFFIterator(buff, model);
+ }
+
+ /**
+ * Returns info about the ARFF content that was parsed.
+ *
+ * @return the model
+ */
+ public ARFFModel getModel() {
+ return model;
+ }
+}
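
For orientation, a minimal usage sketch of ARFFVectorIterable (not part of this commit); the inline ARFF snippet and class name are illustrative:

    import org.apache.mahout.math.Vector;
    import org.apache.mahout.utils.vectors.arff.ARFFModel;
    import org.apache.mahout.utils.vectors.arff.ARFFVectorIterable;
    import org.apache.mahout.utils.vectors.arff.MapBackedARFFModel;

    public class ArffReadExample {
      public static void main(String[] args) throws Exception {
        // Hypothetical two-attribute ARFF document: one numeric, one nominal.
        String arff = "@RELATION demo\n"
            + "@ATTRIBUTE width NUMERIC\n"
            + "@ATTRIBUTE class {a,b}\n"
            + "@DATA\n"
            + "1.5,a\n"
            + "2.5,b\n";
        ARFFModel model = new MapBackedARFFModel();
        // The constructor parses the header; iteration yields one Vector per data row.
        for (Vector v : new ARFFVectorIterable(arff, model)) {
          System.out.println(v.asFormatString());
        }
        // Label bindings are identical for every vector, so read them once from the model.
        System.out.println(model.getLabelBindings());
      }
    }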

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/arff/Driver.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/arff/Driver.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/arff/Driver.java
new file mode 100644
index 0000000..ccecbb1
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/arff/Driver.java
@@ -0,0 +1,263 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.vectors.arff;
+
+import java.io.File;
+import java.io.FilenameFilter;
+import java.io.IOException;
+import java.io.Writer;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Set;
+
+import com.google.common.io.Files;
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.OptionException;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.commons.io.Charsets;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.mahout.common.CommandLineUtil;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.utils.vectors.io.SequenceFileVectorWriter;
+import org.apache.mahout.utils.vectors.io.VectorWriter;
+import org.codehaus.jackson.map.ObjectMapper;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public final class Driver {
+
+ private static final Logger log = LoggerFactory.getLogger(Driver.class);
+
+ /** used for JSON serialization/deserialization */
+ private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+ private Driver() {
+ }
+
+ public static void main(String[] args) throws IOException {
+ DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
+ ArgumentBuilder abuilder = new ArgumentBuilder();
+ GroupBuilder gbuilder = new GroupBuilder();
+
+ Option inputOpt = obuilder
+ .withLongName("input")
+ .withRequired(true)
+ .withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create())
+ .withDescription(
+ "The file or directory containing the ARFF files. If it is a directory, all .arff files will be converted")
+ .withShortName("d").create();
+
+ Option outputOpt = obuilder.withLongName("output").withRequired(true).withArgument(
+ abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The output directory. Files will have the same name as the input, but with the extension .mvc")
+ .withShortName("o").create();
+
+ Option maxOpt = obuilder.withLongName("max").withRequired(false).withArgument(
+ abuilder.withName("max").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The maximum number of vectors to output. If not specified, then it will loop over all docs")
+ .withShortName("m").create();
+
+ Option dictOutOpt = obuilder.withLongName("dictOut").withRequired(true).withArgument(
+ abuilder.withName("dictOut").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The file to output the label bindings").withShortName("t").create();
+
+ Option jsonDictonaryOpt = obuilder.withLongName("json-dictonary").withRequired(false)
+ .withDescription("Write dictonary in JSON format").withShortName("j").create();
+
+ Option delimiterOpt = obuilder.withLongName("delimiter").withRequired(false).withArgument(
+ abuilder.withName("delimiter").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The delimiter for outputing the dictionary").withShortName("l").create();
+
+ Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
+ .create();
+ Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(maxOpt)
+ .withOption(helpOpt).withOption(dictOutOpt).withOption(jsonDictonaryOpt).withOption(delimiterOpt)
+ .create();
+
+ try {
+ Parser parser = new Parser();
+ parser.setGroup(group);
+ CommandLine cmdLine = parser.parse(args);
+
+      if (cmdLine.hasOption(helpOpt)) {
+        CommandLineUtil.printHelp(group);
+ return;
+ }
+      if (cmdLine.hasOption(inputOpt)) { // ARFF input: a single file or a directory of .arff files
+ File input = new File(cmdLine.getValue(inputOpt).toString());
+ long maxDocs = Long.MAX_VALUE;
+ if (cmdLine.hasOption(maxOpt)) {
+ maxDocs = Long.parseLong(cmdLine.getValue(maxOpt).toString());
+ }
+ if (maxDocs < 0) {
+ throw new IllegalArgumentException("maxDocs must be >= 0");
+ }
+ String outDir = cmdLine.getValue(outputOpt).toString();
+ log.info("Output Dir: {}", outDir);
+
+ String delimiter = cmdLine.hasOption(delimiterOpt) ? cmdLine.getValue(delimiterOpt).toString() : "\t";
+ File dictOut = new File(cmdLine.getValue(dictOutOpt).toString());
+ boolean jsonDictonary = cmdLine.hasOption(jsonDictonaryOpt);
+ ARFFModel model = new MapBackedARFFModel();
+ if (input.exists() && input.isDirectory()) {
+ File[] files = input.listFiles(new FilenameFilter() {
+ @Override
+ public boolean accept(File file, String name) {
+ return name.endsWith(".arff");
+ }
+ });
+
+ for (File file : files) {
+ writeFile(outDir, file, maxDocs, model, dictOut, delimiter, jsonDictonary);
+ }
+ } else {
+ writeFile(outDir, input, maxDocs, model, dictOut, delimiter, jsonDictonary);
+ }
+ }
+
+ } catch (OptionException e) {
+ log.error("Exception", e);
+ CommandLineUtil.printHelp(group);
+ }
+ }
+
+ protected static void writeLabelBindings(File dictOut, ARFFModel arffModel, String delimiter, boolean jsonDictonary)
+ throws IOException {
+ try (Writer writer = Files.newWriterSupplier(dictOut, Charsets.UTF_8, true).getOutput()) {
+ if (jsonDictonary) {
+ writeLabelBindingsJSON(writer, arffModel);
+ } else {
+ writeLabelBindings(writer, arffModel, delimiter);
+ }
+ }
+ }
+
+ protected static void writeLabelBindingsJSON(Writer writer, ARFFModel arffModel) throws IOException {
+
+    // Turn the map of labels into a list ordered by appearance
+ List<Entry<String, Integer>> attributes = new ArrayList<>();
+ attributes.addAll(arffModel.getLabelBindings().entrySet());
+ Collections.sort(attributes, new Comparator<Map.Entry<String, Integer>>() {
+ @Override
+ public int compare(Entry<String, Integer> t, Entry<String, Integer> t1) {
+ return t.getValue().compareTo(t1.getValue());
+ }
+ });
+
+ // write a map for each object
+ List<Map<String, Object>> jsonObjects = new LinkedList<>();
+ for (int i = 0; i < attributes.size(); i++) {
+
+ Entry<String, Integer> modelRepresentation = attributes.get(i);
+ Map<String, Object> jsonRepresentation = new HashMap<>();
+ jsonObjects.add(jsonRepresentation);
+ // the last one is the class label
+ jsonRepresentation.put("label", i < (attributes.size() - 1) ? String.valueOf(false) : String.valueOf(true));
+ String attribute = modelRepresentation.getKey();
+ jsonRepresentation.put("attribute", attribute);
+ Map<String, Integer> nominalValues = arffModel.getNominalMap().get(attribute);
+
+ if (nominalValues != null) {
+        String[] values = nominalValues.keySet().toArray(new String[0]);
+
+ jsonRepresentation.put("values", values);
+ jsonRepresentation.put("type", "categorical");
+ } else {
+ jsonRepresentation.put("type", "numerical");
+ }
+ }
+ writer.write(OBJECT_MAPPER.writeValueAsString(jsonObjects));
+ }
+
+ protected static void writeLabelBindings(Writer writer, ARFFModel arffModel, String delimiter) throws IOException {
+
+ Map<String, Integer> labels = arffModel.getLabelBindings();
+ writer.write("Label bindings for Relation " + arffModel.getRelation() + '\n');
+ for (Map.Entry<String, Integer> entry : labels.entrySet()) {
+ writer.write(entry.getKey());
+ writer.write(delimiter);
+ writer.write(String.valueOf(entry.getValue()));
+ writer.write('\n');
+ }
+ writer.write('\n');
+ writer.write("Values for nominal attributes\n");
+ // emit allowed values for NOMINAL/categorical/enumerated attributes
+ Map<String, Map<String, Integer>> nominalMap = arffModel.getNominalMap();
+ // how many nominal attributes
+ writer.write(String.valueOf(nominalMap.size()) + "\n");
+
+ for (Entry<String, Map<String, Integer>> entry : nominalMap.entrySet()) {
+ // the label of this attribute
+ writer.write(entry.getKey() + "\n");
+ Set<Entry<String, Integer>> attributeValues = entry.getValue().entrySet();
+ // how many values does this attribute have
+ writer.write(attributeValues.size() + "\n");
+ for (Map.Entry<String, Integer> value : attributeValues) {
+ // the value and the value index
+ writer.write(String.format("%s%s%s\n", value.getKey(), delimiter, value.getValue().toString()));
+ }
+ }
+ }
+
+ protected static void writeFile(String outDir,
+ File file,
+ long maxDocs,
+ ARFFModel arffModel,
+ File dictOut,
+ String delimiter,
+ boolean jsonDictonary) throws IOException {
+ log.info("Converting File: {}", file);
+ ARFFModel model = new MapBackedARFFModel(arffModel.getWords(), arffModel.getWordCount() + 1, arffModel
+ .getNominalMap());
+    Iterable<Vector> iterable = new ARFFVectorIterable(file, model);
+ String outFile = outDir + '/' + file.getName() + ".mvc";
+
+ try (VectorWriter vectorWriter = getSeqFileWriter(outFile)) {
+      long numDocs = vectorWriter.write(iterable, maxDocs);
+ writeLabelBindings(dictOut, model, delimiter, jsonDictonary);
+ log.info("Wrote: {} vectors", numDocs);
+ }
+ }
+
+ private static VectorWriter getSeqFileWriter(String outFile) throws IOException {
+ Path path = new Path(outFile);
+ Configuration conf = new Configuration();
+ FileSystem fs = FileSystem.get(conf);
+ SequenceFile.Writer seqWriter = SequenceFile.createWriter(fs, conf, path, LongWritable.class,
+ VectorWritable.class);
+ return new SequenceFileVectorWriter(seqWriter);
+ }
+
+}
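
Going by the options registered in main() above, a typical invocation could look like the following; the paths and the launcher script are placeholders, and the long option keeps the "json-dictonary" spelling used in the code:

    $ bin/mahout org.apache.mahout.utils.vectors.arff.Driver \
        --input /data/arff \
        --output /data/vectors \
        --dictOut /data/vectors/dictionary.txt \
        --json-dictonary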

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java
new file mode 100644
index 0000000..e911b1a
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java
@@ -0,0 +1,282 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.vectors.arff;
+
+import java.text.DateFormat;
+import java.text.NumberFormat;
+import java.text.ParseException;
+import java.text.ParsePosition;
+import java.text.SimpleDateFormat;
+import java.util.Collections;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.Locale;
+import java.util.Map;
+import java.util.regex.Pattern;
+
+/**
+ * Holds ARFF model information in {@link Map}s.
+ */
+public class MapBackedARFFModel implements ARFFModel {
+
+ private static final Pattern QUOTE_PATTERN = Pattern.compile("\"");
+
+ private long wordCount = 1;
+
+ private String relation;
+
+ private final Map<String,Integer> labelBindings;
+ private final Map<Integer,String> idxLabel;
+ private final Map<Integer,ARFFType> typeMap; // key is the vector index, value is the type
+ private final Map<Integer,DateFormat> dateMap;
+ private final Map<String,Map<String,Integer>> nominalMap;
+ private final Map<String,Long> words;
+
+ public MapBackedARFFModel() {
+ this(new HashMap<String,Long>(), 1, new HashMap<String,Map<String,Integer>>());
+ }
+
+ public MapBackedARFFModel(Map<String,Long> words, long wordCount, Map<String,Map<String,Integer>> nominalMap) {
+ this.words = words;
+ this.wordCount = wordCount;
+ labelBindings = new HashMap<>();
+ idxLabel = new HashMap<>();
+ typeMap = new HashMap<>();
+ dateMap = new HashMap<>();
+ this.nominalMap = nominalMap;
+
+ }
+
+ @Override
+ public String getRelation() {
+ return relation;
+ }
+
+ @Override
+ public void setRelation(String relation) {
+ this.relation = relation;
+ }
+
+ /**
+ * Convert a piece of String data at a specific spot into a value
+ *
+ * @param data
+ * The data to convert
+ * @param idx
+ * The position in the ARFF data
+ * @return A double representing the data
+ */
+ @Override
+ public double getValue(String data, int idx) {
+ ARFFType type = typeMap.get(idx);
+ if (type == null) {
+ throw new IllegalArgumentException("Attribute type cannot be NULL, attribute index was: " + idx);
+ }
+ data = QUOTE_PATTERN.matcher(data).replaceAll("");
+ data = data.trim();
+ double result;
+ switch (type) {
+ case NUMERIC:
+ case INTEGER:
+ case REAL:
+ result = processNumeric(data);
+ break;
+ case DATE:
+ result = processDate(data, idx);
+ break;
+ case STRING:
+ // may have quotes
+ result = processString(data);
+ break;
+ case NOMINAL:
+ String label = idxLabel.get(idx);
+ result = processNominal(label, data);
+ break;
+ default:
+ throw new IllegalStateException("Unknown type: " + type);
+ }
+ return result;
+ }
+
+ protected double processNominal(String label, String data) {
+ double result;
+ Map<String,Integer> classes = nominalMap.get(label);
+ if (classes != null) {
+ Integer ord = classes.get(ARFFType.removeQuotes(data));
+ if (ord != null) {
+ result = ord;
+ } else {
+ throw new IllegalStateException("Invalid nominal: " + data + " for label: " + label);
+ }
+ } else {
+ throw new IllegalArgumentException("Invalid nominal label: " + label + " Data: " + data);
+ }
+
+ return result;
+ }
+
+ // Not sure how scalable this is going to be
+ protected double processString(String data) {
+ data = QUOTE_PATTERN.matcher(data).replaceAll("");
+    // map it to a long
+ Long theLong = words.get(data);
+ if (theLong == null) {
+ theLong = wordCount++;
+ words.put(data, theLong);
+ }
+ return theLong;
+ }
+
+ protected static double processNumeric(String data) {
+ if (isNumeric(data)) {
+ return Double.parseDouble(data);
+ }
+ return Double.NaN;
+ }
+
+ public static boolean isNumeric(String str) {
+ NumberFormat formatter = NumberFormat.getInstance();
+ ParsePosition parsePosition = new ParsePosition(0);
+ formatter.parse(str, parsePosition);
+ return str.length() == parsePosition.getIndex();
+ }
+
+ protected double processDate(String data, int idx) {
+ DateFormat format = dateMap.get(idx);
+ if (format == null) {
+ format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", Locale.ENGLISH);
+ }
+ double result;
+ try {
+ Date date = format.parse(data);
+      result = date.getTime(); // note: converting long millis to double can lose precision for very large values
+ } catch (ParseException e) {
+ throw new IllegalArgumentException(e);
+ }
+ return result;
+ }
+
+ /**
+ * The vector attributes (labels in Mahout speak), unmodifiable
+ *
+ * @return the map
+ */
+ @Override
+ public Map<String,Integer> getLabelBindings() {
+ return Collections.unmodifiableMap(labelBindings);
+ }
+
+ /**
+ * The map of types encountered
+ *
+ * @return the map
+ */
+ public Map<Integer,ARFFType> getTypeMap() {
+ return Collections.unmodifiableMap(typeMap);
+ }
+
+ /**
+ * Map of Date formatters used
+ *
+ * @return the map
+ */
+ public Map<Integer,DateFormat> getDateMap() {
+ return Collections.unmodifiableMap(dateMap);
+ }
+
+ /**
+ * Map nominals to ids. Should only be modified by calling {@link ARFFModel#addNominal(String, String, int)}
+ *
+ * @return the map
+ */
+ @Override
+ public Map<String,Map<String,Integer>> getNominalMap() {
+ return nominalMap;
+ }
+
+ /**
+   * Map of words to the long id used for those words; returned as-is, so callers should treat it as read-only
+ *
+ * @return The map
+ */
+ @Override
+ public Map<String,Long> getWords() {
+ return words;
+ }
+
+ @Override
+ public Integer getNominalValue(String label, String nominal) {
+ return nominalMap.get(label).get(nominal);
+ }
+
+ @Override
+ public void addNominal(String label, String nominal, int idx) {
+ Map<String,Integer> noms = nominalMap.get(label);
+ if (noms == null) {
+ noms = new HashMap<>();
+ nominalMap.put(label, noms);
+ }
+ noms.put(nominal, idx);
+ }
+
+ @Override
+ public DateFormat getDateFormat(Integer idx) {
+ return dateMap.get(idx);
+ }
+
+ @Override
+ public void addDateFormat(Integer idx, DateFormat format) {
+ dateMap.put(idx, format);
+ }
+
+ @Override
+ public Integer getLabelIndex(String label) {
+ return labelBindings.get(label);
+ }
+
+ @Override
+ public void addLabel(String label, Integer idx) {
+ labelBindings.put(label, idx);
+ idxLabel.put(idx, label);
+ }
+
+ @Override
+ public ARFFType getARFFType(Integer idx) {
+ return typeMap.get(idx);
+ }
+
+ @Override
+ public void addType(Integer idx, ARFFType type) {
+ typeMap.put(idx, type);
+ }
+
+ /**
+ * The count of the number of words seen
+ *
+ * @return the count
+ */
+ @Override
+ public long getWordCount() {
+ return wordCount;
+ }
+
+ @Override
+ public int getLabelSize() {
+ return labelBindings.size();
+ }
+}
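
A short sketch (not in the commit, assumed to live in the same package) of how getValue() resolves raw strings through the type, nominal, and date maps populated above; the attribute names and indices are illustrative:

    import java.text.SimpleDateFormat;
    import java.util.Locale;

    public class ModelValueExample {
      public static void main(String[] args) {
        MapBackedARFFModel model = new MapBackedARFFModel();

        model.addLabel("class", 0);
        model.addType(0, ARFFType.NOMINAL);
        model.addNominal("class", "Iris-setosa", 1);
        // NOMINAL data resolves through the nominal map: prints 1.0
        System.out.println(model.getValue("Iris-setosa", 0));

        model.addLabel("when", 1);
        model.addType(1, ARFFType.DATE);
        model.addDateFormat(1, new SimpleDateFormat("yyyy-MM-dd", Locale.ENGLISH));
        // DATE data parses to epoch milliseconds, returned as a double
        System.out.println(model.getValue("2018-06-27", 1));
      }
    }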

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterator.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterator.java
new file mode 100644
index 0000000..3c583fd
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterator.java
@@ -0,0 +1,69 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.vectors.csv;
+
+import java.io.IOException;
+import java.io.Reader;
+
+import com.google.common.collect.AbstractIterator;
+import org.apache.commons.csv.CSVParser;
+import org.apache.commons.csv.CSVStrategy;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.Vector;
+
+/**
+ * Iterates over a CSV file, producing one {@link org.apache.mahout.math.Vector} per line.
+ * <br/>
+ * The Iterator returned throws {@link UnsupportedOperationException} for the {@link java.util.Iterator#remove()}
+ * method.
+ * <p/>
+ * Assumes DenseVector for now, but in the future may have the option of mapping columns to sparse format
+ * <p/>
+ * The Iterator is not thread-safe.
+ */
+public class CSVVectorIterator extends AbstractIterator<Vector> {
+
+ private final CSVParser parser;
+
+ public CSVVectorIterator(Reader reader) {
+ parser = new CSVParser(reader);
+ }
+
+ public CSVVectorIterator(Reader reader, CSVStrategy strategy) {
+ parser = new CSVParser(reader, strategy);
+ }
+
+ @Override
+ protected Vector computeNext() {
+ String[] line;
+ try {
+ line = parser.getLine();
+ } catch (IOException e) {
+ throw new IllegalStateException(e);
+ }
+ if (line == null) {
+ return endOfData();
+ }
+ Vector result = new DenseVector(line.length);
+ for (int i = 0; i < line.length; i++) {
+ result.setQuick(i, Double.parseDouble(line[i]));
+ }
+ return result;
+ }
+
+}
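
A minimal usage sketch for the iterator above; the inline CSV and class name are placeholders:

    import java.io.StringReader;
    import org.apache.mahout.math.Vector;
    import org.apache.mahout.utils.vectors.csv.CSVVectorIterator;

    public class CsvReadExample {
      public static void main(String[] args) {
        String csv = "1.0,2.0,3.0\n4.0,5.0,6.0\n";
        CSVVectorIterator vectors = new CSVVectorIterator(new StringReader(csv));
        // Each CSV line is parsed into a DenseVector of doubles.
        while (vectors.hasNext()) {
          Vector v = vectors.next();
          System.out.println(v);
        }
      }
    }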

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/io/DelimitedTermInfoWriter.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/io/DelimitedTermInfoWriter.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/io/DelimitedTermInfoWriter.java
new file mode 100644
index 0000000..b5f9f2b
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/io/DelimitedTermInfoWriter.java
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.vectors.io;
+
+import java.io.IOException;
+import java.io.Writer;
+import java.util.Iterator;
+
+import com.google.common.io.Closeables;
+import org.apache.mahout.utils.vectors.TermEntry;
+import org.apache.mahout.utils.vectors.TermInfo;
+
+/**
+ * Write {@link TermInfo} to a {@link Writer} in a textual, delimited format with header.
+ */
+public class DelimitedTermInfoWriter implements TermInfoWriter {
+
+ private final Writer writer;
+ private final String delimiter;
+ private final String field;
+
+ public DelimitedTermInfoWriter(Writer writer, String delimiter, String field) {
+ this.writer = writer;
+ this.delimiter = delimiter;
+ this.field = field;
+ }
+
+ @Override
+ public void write(TermInfo ti) throws IOException {
+
+ Iterator<TermEntry> entIter = ti.getAllEntries();
+ try {
+ writer.write(String.valueOf(ti.totalTerms(field)));
+ writer.write('\n');
+ writer.write("#term" + delimiter + "doc freq" + delimiter + "idx");
+ writer.write('\n');
+ while (entIter.hasNext()) {
+ TermEntry entry = entIter.next();
+ writer.write(entry.getTerm());
+ writer.write(delimiter);
+ writer.write(String.valueOf(entry.getDocFreq()));
+ writer.write(delimiter);
+ writer.write(String.valueOf(entry.getTermIdx()));
+ writer.write('\n');
+ }
+ } finally {
+ Closeables.close(writer, false);
+ }
+ }
+
+ /**
+   * Does NOT close the underlying writer; {@link #write(TermInfo)} closes it once writing completes
+ */
+ @Override
+ public void close() {
+
+ }
+}
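
For reference, given a tab delimiter and two hypothetical terms, write() above emits the term count, the header row, and then one term/docFreq/index row per entry, shaped like this:

    2
    #term	doc freq	idx
    apple	10	0
    banana	7	1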

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java
new file mode 100644
index 0000000..0d763a1
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java
@@ -0,0 +1,75 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.vectors.io;
+
+import java.io.IOException;
+
+import com.google.common.io.Closeables;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+
+
+/**
+ * Writes out Vectors to a SequenceFile.
+ *
+ * Closes the writer when done
+ */
+public class SequenceFileVectorWriter implements VectorWriter {
+ private final SequenceFile.Writer writer;
+ private long recNum = 0;
+ public SequenceFileVectorWriter(SequenceFile.Writer writer) {
+ this.writer = writer;
+ }
+
+ @Override
+ public long write(Iterable<Vector> iterable, long maxDocs) throws IOException {
+
+ for (Vector point : iterable) {
+ if (recNum >= maxDocs) {
+ break;
+ }
+ if (point != null) {
+ writer.append(new LongWritable(recNum++), new VectorWritable(point));
+ }
+
+ }
+ return recNum;
+ }
+
+ @Override
+ public void write(Vector vector) throws IOException {
+ writer.append(new LongWritable(recNum++), new VectorWritable(vector));
+
+ }
+
+ @Override
+ public long write(Iterable<Vector> iterable) throws IOException {
+ return write(iterable, Long.MAX_VALUE);
+ }
+
+ @Override
+ public void close() throws IOException {
+ Closeables.close(writer, false);
+ }
+
+ public SequenceFile.Writer getWriter() {
+ return writer;
+ }
+}
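
A construction sketch mirroring Driver.getSeqFileWriter() above; the output path is a placeholder:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.SequenceFile;
    import org.apache.mahout.math.DenseVector;
    import org.apache.mahout.math.VectorWritable;
    import org.apache.mahout.utils.vectors.io.SequenceFileVectorWriter;

    public class SeqWriteExample {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path path = new Path("/tmp/vectors.seq"); // placeholder
        SequenceFile.Writer seq = SequenceFile.createWriter(fs, conf, path,
            LongWritable.class, VectorWritable.class);
        // close() on the vector writer also closes the underlying SequenceFile.Writer
        try (SequenceFileVectorWriter writer = new SequenceFileVectorWriter(seq)) {
          writer.write(new DenseVector(new double[] {1.0, 2.0}));
        }
      }
    }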

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/io/TermInfoWriter.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/io/TermInfoWriter.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/io/TermInfoWriter.java
new file mode 100644
index 0000000..e165b45
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/io/TermInfoWriter.java
@@ -0,0 +1,29 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.vectors.io;
+
+import java.io.Closeable;
+import java.io.IOException;
+
+import org.apache.mahout.utils.vectors.TermInfo;
+
+public interface TermInfoWriter extends Closeable {
+
+ void write(TermInfo ti) throws IOException;
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/io/TextualVectorWriter.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/io/TextualVectorWriter.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/io/TextualVectorWriter.java
new file mode 100644
index 0000000..cc27d1d
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/io/TextualVectorWriter.java
@@ -0,0 +1,70 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.vectors.io;
+
+import java.io.IOException;
+import java.io.Writer;
+
+import com.google.common.io.Closeables;
+import org.apache.mahout.math.Vector;
+
+/**
+ * Write out the vectors to any {@link Writer} using {@link Vector#asFormatString()},
+ * one per line by default.
+ */
+public class TextualVectorWriter implements VectorWriter {
+
+ private final Writer writer;
+
+ public TextualVectorWriter(Writer writer) {
+ this.writer = writer;
+ }
+
+ protected Writer getWriter() {
+ return writer;
+ }
+
+ @Override
+ public long write(Iterable<Vector> iterable) throws IOException {
+ return write(iterable, Long.MAX_VALUE);
+ }
+
+ @Override
+ public long write(Iterable<Vector> iterable, long maxDocs) throws IOException {
+ long result = 0;
+ for (Vector vector : iterable) {
+ if (result >= maxDocs) {
+ break;
+ }
+ write(vector);
+ result++;
+ }
+ return result;
+ }
+
+ @Override
+ public void write(Vector vector) throws IOException {
+ writer.write(vector.asFormatString());
+ writer.write('\n');
+ }
+
+ @Override
+ public void close() throws IOException {
+ Closeables.close(writer, false);
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java
new file mode 100644
index 0000000..923e270
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java
@@ -0,0 +1,52 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.vectors.io;
+
+import java.io.Closeable;
+import java.io.IOException;
+
+import org.apache.mahout.math.Vector;
+
+public interface VectorWriter extends Closeable {
+ /**
+ * Write all values in the Iterable to the output
+ * @param iterable The {@link Iterable} to loop over
+ * @return the number of docs written
+ * @throws IOException if there was a problem writing
+ *
+ */
+ long write(Iterable<Vector> iterable) throws IOException;
+
+ /**
+ * Write out a vector
+ *
+ * @param vector The {@link org.apache.mahout.math.Vector} to write
+ * @throws IOException
+ */
+ void write(Vector vector) throws IOException;
+
+ /**
+ * Write the first {@code maxDocs} to the output.
+ * @param iterable The {@link Iterable} to loop over
+ * @param maxDocs the maximum number of docs to write
+ * @return The number of docs written
+ * @throws IOException if there was a problem writing
+ */
+ long write(Iterable<Vector> iterable, long maxDocs) throws IOException;
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/AbstractLuceneIterator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/AbstractLuceneIterator.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/AbstractLuceneIterator.java
new file mode 100644
index 0000000..ff61a70
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/AbstractLuceneIterator.java
@@ -0,0 +1,140 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.vectors.lucene;
+
+import com.google.common.collect.AbstractIterator;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.util.BytesRef;
+import org.apache.mahout.math.NamedVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.utils.Bump125;
+import org.apache.mahout.utils.vectors.TermInfo;
+import org.apache.mahout.vectorizer.Weight;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+
+/**
+ * Iterate over a Lucene index, extracting term vectors.
+ * Subclasses define how much information to retrieve from the Lucene index.
+ */
+public abstract class AbstractLuceneIterator extends AbstractIterator<Vector> {
+  private static final Logger log = LoggerFactory.getLogger(AbstractLuceneIterator.class);
+ protected final IndexReader indexReader;
+ protected final String field;
+ protected final TermInfo terminfo;
+ protected final double normPower;
+ protected final Weight weight;
+ protected final Bump125 bump = new Bump125();
+ protected int nextDocId;
+ protected int maxErrorDocs;
+ protected int numErrorDocs;
+ protected long nextLogRecord = bump.increment();
+ protected int skippedErrorMessages;
+
+ public AbstractLuceneIterator(TermInfo terminfo, double normPower, IndexReader indexReader, Weight weight,
+ double maxPercentErrorDocs, String field) {
+ this.terminfo = terminfo;
+ this.normPower = normPower;
+ this.indexReader = indexReader;
+
+ this.weight = weight;
+ this.nextDocId = 0;
+ this.maxErrorDocs = (int) (maxPercentErrorDocs * indexReader.numDocs());
+ this.field = field;
+ }
+
+ /**
+ * Given the document name, derive a name for the vector. This may involve
+ * reading the document from Lucene and setting up any other state that the
+ * subclass wants. This will be called once for each document that the
+ * iterator processes.
+ * @param documentIndex the lucene document index.
+ * @return the name to store in the vector.
+ */
+ protected abstract String getVectorName(int documentIndex) throws IOException;
+
+ @Override
+ protected Vector computeNext() {
+ try {
+ int doc;
+ Terms termFreqVector;
+ String name;
+
+ do {
+ doc = this.nextDocId;
+ nextDocId++;
+
+ if (doc >= indexReader.maxDoc()) {
+ return endOfData();
+ }
+
+ termFreqVector = indexReader.getTermVector(doc, field);
+ name = getVectorName(doc);
+
+ if (termFreqVector == null) {
+ numErrorDocs++;
+ if (numErrorDocs >= maxErrorDocs) {
+ log.error("There are too many documents that do not have a term vector for {}", field);
+ throw new IllegalStateException("There are too many documents that do not have a term vector for "
+ + field);
+ }
+ if (numErrorDocs >= nextLogRecord) {
+ if (skippedErrorMessages == 0) {
+ log.warn("{} does not have a term vector for {}", name, field);
+ } else {
+ log.warn("{} documents do not have a term vector for {}", numErrorDocs, field);
+ }
+ nextLogRecord = bump.increment();
+ skippedErrorMessages = 0;
+ } else {
+ skippedErrorMessages++;
+ }
+ }
+ } while (termFreqVector == null);
+
+ // The loop exits with termFreqVector and name set.
+
+ TermsEnum te = termFreqVector.iterator();
+ BytesRef term;
+ TFDFMapper mapper = new TFDFMapper(indexReader.numDocs(), weight, this.terminfo);
+ mapper.setExpectations(field, termFreqVector.size());
+ while ((term = te.next()) != null) {
+ mapper.map(term, (int) te.totalTermFreq());
+ }
+ Vector result = mapper.getVector();
+ if (result == null) {
+ // TODO is this right? last version would produce null in the iteration in this case, though it
+ // seems like that may not be desirable
+ return null;
+ }
+
+ if (normPower == LuceneIterable.NO_NORMALIZING) {
+ result = new NamedVector(result, name);
+ } else {
+ result = new NamedVector(result.normalize(normPower), name);
+ }
+ return result;
+ } catch (IOException ioe) {
+ throw new IllegalStateException(ioe);
+ }
+ }
+}
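
To illustrate the subclass contract, a minimal hypothetical implementation; the real LuceneIterator derives the vector name from a stored id field instead:

    import java.io.IOException;
    import org.apache.lucene.index.IndexReader;
    import org.apache.mahout.utils.vectors.TermInfo;
    import org.apache.mahout.utils.vectors.lucene.AbstractLuceneIterator;
    import org.apache.mahout.vectorizer.Weight;

    public class DocIdLuceneIterator extends AbstractLuceneIterator {

      public DocIdLuceneIterator(TermInfo terminfo, double normPower, IndexReader reader,
                                 Weight weight, double maxPercentErrorDocs, String field) {
        super(terminfo, normPower, reader, weight, maxPercentErrorDocs, field);
      }

      @Override
      protected String getVectorName(int documentIndex) throws IOException {
        // Simplest possible naming scheme: the Lucene doc id itself.
        return String.valueOf(documentIndex);
      }
    }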

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java
new file mode 100644
index 0000000..0b59ed6
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java
@@ -0,0 +1,79 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.vectors.lucene;
+
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.LinkedHashMap;
+import java.util.Map;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.MultiFields;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.util.BytesRef;
+import org.apache.mahout.utils.vectors.TermEntry;
+import org.apache.mahout.utils.vectors.TermInfo;
+
+
+/**
+ * Caches TermEntries from a single field. Materializes all values in the TermEnum to memory (much like FieldCache)
+ */
+public class CachedTermInfo implements TermInfo {
+
+ private final Map<String, TermEntry> termEntries;
+ private final String field;
+
+ public CachedTermInfo(IndexReader reader, String field, int minDf, int maxDfPercent) throws IOException {
+ this.field = field;
+ Terms t = MultiFields.getTerms(reader, field);
+ TermsEnum te = t.iterator();
+
+ int numDocs = reader.numDocs();
+ double percent = numDocs * maxDfPercent / 100.0;
+    // a LinkedHashMap keeps terms in the order the TermsEnum returned them
+ termEntries = new LinkedHashMap<>();
+ int count = 0;
+ BytesRef text;
+ while ((text = te.next()) != null) {
+ int df = te.docFreq();
+ if (df >= minDf && df <= percent) {
+ TermEntry entry = new TermEntry(text.utf8ToString(), count++, df);
+ termEntries.put(entry.getTerm(), entry);
+ }
+ }
+ }
+
+ @Override
+ public int totalTerms(String field) {
+ return termEntries.size();
+ }
+
+ @Override
+ public TermEntry getTermEntry(String field, String term) {
+ if (!this.field.equals(field)) {
+ return null;
+ }
+ return termEntries.get(term);
+ }
+
+ @Override
+ public Iterator<TermEntry> getAllEntries() {
+ return termEntries.values().iterator();
+ }
+}
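
A usage sketch pairing the cache with a Lucene reader; the index path and field name are assumptions:

    import java.nio.file.Paths;
    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.mahout.utils.vectors.lucene.CachedTermInfo;

    public class TermInfoExample {
      public static void main(String[] args) throws Exception {
        try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/tmp/index")))) {
          // keep terms that occur in at least 2 docs and in at most 90% of all docs
          CachedTermInfo info = new CachedTermInfo(reader, "body", 2, 90);
          System.out.println(info.totalTerms("body") + " terms cached");
        }
      }
    }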

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java
new file mode 100644
index 0000000..b2568e7
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java
@@ -0,0 +1,381 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.vectors.lucene;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.nio.file.Paths;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.LinkedHashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeSet;
+
+import com.google.common.io.Closeables;
+import com.google.common.io.Files;
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.OptionException;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.commons.io.Charsets;
+import org.apache.hadoop.fs.Path;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.MultiFields;
+import org.apache.lucene.index.PostingsEnum;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.FixedBitSet;
+import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable;
+import org.apache.mahout.common.CommandLineUtil;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.math.NamedVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.stats.LogLikelihood;
+import org.apache.mahout.utils.clustering.ClusterDumper;
+import org.apache.mahout.utils.vectors.TermEntry;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Get labels for the cluster using Log Likelihood Ratio (LLR).
+ * <p/>
+ * "The most useful way to think of this (LLR) is as the percentage of in-cluster documents that have the
+ * feature (term) versus the percentage out, keeping in mind that both percentages are uncertain since we have
+ * only a sample of all possible documents." - Ted Dunning
+ * <p/>
+ * More about LLR can be found at http://tdunning.blogspot.com/2008/03/surprise-and-coincidence.html
+ */
+public class ClusterLabels {
+
+ private static final Logger log = LoggerFactory.getLogger(ClusterLabels.class);
+
+ public static final int DEFAULT_MIN_IDS = 50;
+ public static final int DEFAULT_MAX_LABELS = 25;
+
+ private final String indexDir;
+ private final String contentField;
+ private String idField;
+ private final Map<Integer, List<WeightedPropertyVectorWritable>> clusterIdToPoints;
+ private String output;
+ private final int minNumIds;
+ private final int maxLabels;
+
+ public ClusterLabels(Path seqFileDir,
+ Path pointsDir,
+ String indexDir,
+ String contentField,
+ int minNumIds,
+ int maxLabels) {
+ this.indexDir = indexDir;
+ this.contentField = contentField;
+ this.minNumIds = minNumIds;
+ this.maxLabels = maxLabels;
+ ClusterDumper clusterDumper = new ClusterDumper(seqFileDir, pointsDir);
+ this.clusterIdToPoints = clusterDumper.getClusterIdToPoints();
+ }
+
+ public void getLabels() throws IOException {
+
+ try (Writer writer = (this.output == null) ?
+ new OutputStreamWriter(System.out, Charsets.UTF_8) : Files.newWriter(new File(this.output), Charsets.UTF_8)){
+ for (Map.Entry<Integer, List<WeightedPropertyVectorWritable>> integerListEntry : clusterIdToPoints.entrySet()) {
+ List<WeightedPropertyVectorWritable> wpvws = integerListEntry.getValue();
+ List<TermInfoClusterInOut> termInfos = getClusterLabels(integerListEntry.getKey(), wpvws);
+ if (termInfos != null) {
+ writer.write('\n');
+ writer.write("Top labels for Cluster ");
+ writer.write(String.valueOf(integerListEntry.getKey()));
+ writer.write(" containing ");
+ writer.write(String.valueOf(wpvws.size()));
+ writer.write(" vectors");
+ writer.write('\n');
+ writer.write("Term \t\t LLR \t\t In-ClusterDF \t\t Out-ClusterDF ");
+ writer.write('\n');
+ for (TermInfoClusterInOut termInfo : termInfos) {
+ writer.write(termInfo.getTerm());
+ writer.write("\t\t");
+ writer.write(String.valueOf(termInfo.getLogLikelihoodRatio()));
+ writer.write("\t\t");
+ writer.write(String.valueOf(termInfo.getInClusterDF()));
+ writer.write("\t\t");
+ writer.write(String.valueOf(termInfo.getOutClusterDF()));
+ writer.write('\n');
+ }
+ }
+ }
+ }
+ }
+
+ /**
+ * Get the list of labels, sorted by best score.
+ */
+  protected List<TermInfoClusterInOut> getClusterLabels(Integer clusterId,
+                                                        Collection<WeightedPropertyVectorWritable> wpvws) throws IOException {
+
+    if (wpvws.size() < minNumIds) {
+      log.info("Skipping small cluster {} with size: {}", clusterId, wpvws.size());
+      return null;
+    }
+
+    log.info("Processing Cluster {} with {} documents", clusterId, wpvws.size());
+ Directory dir = FSDirectory.open(Paths.get(this.indexDir));
+ IndexReader reader = DirectoryReader.open(dir);
+
+
+ log.info("# of documents in the index {}", reader.numDocs());
+
+ Collection<String> idSet = new HashSet<>();
+ for (WeightedPropertyVectorWritable wpvw : wpvws) {
+ Vector vector = wpvw.getVector();
+ if (vector instanceof NamedVector) {
+ idSet.add(((NamedVector) vector).getName());
+ }
+ }
+
+ int numDocs = reader.numDocs();
+
+ FixedBitSet clusterDocBitset = getClusterDocBitset(reader, idSet, this.idField);
+
+ log.info("Populating term infos from the index");
+
+    /*
+     * This code is similar to that of CachedTermInfo, with one major change: how the document frequency
+     * is obtained.
+     *
+     * The document frequency reported by the TermsEnum reflects the entire index, but a label's document
+     * frequency should count only the in-cluster documents. To get the in-cluster frequency, we walk the
+     * postings of each term, record which live documents contain it, and intersect that set with the
+     * cluster's document bitset; the cardinality of the intersection is the in-cluster document frequency.
+     */
+ Terms t = MultiFields.getTerms(reader, contentField);
+ TermsEnum te = t.iterator();
+ Map<String, TermEntry> termEntryMap = new LinkedHashMap<>();
+ Bits liveDocs = MultiFields.getLiveDocs(reader); //WARNING: returns null if there are no deletions
+
+
+ int count = 0;
+ BytesRef term;
+ while ((term = te.next()) != null) {
+ FixedBitSet termBitset = new FixedBitSet(reader.maxDoc());
+ PostingsEnum docsEnum = MultiFields.getTermDocsEnum(reader, contentField, term);
+ int docID;
+ while ((docID = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
+        // liveDocs is null when the index has no deletions; otherwise skip deleted documents.
+        if (liveDocs == null || liveDocs.get(docID)) {
+          termBitset.set(docID);
+        }
+ }
+ // AND the term's bitset with cluster doc bitset to get the term's in-cluster frequency.
+ // This modifies the termBitset, but that's fine as we are not using it anywhere else.
+ termBitset.and(clusterDocBitset);
+      int inclusterDF = termBitset.cardinality();
+
+ TermEntry entry = new TermEntry(term.utf8ToString(), count++, inclusterDF);
+ termEntryMap.put(entry.getTerm(), entry);
+
+ }
+
+ List<TermInfoClusterInOut> clusteredTermInfo = new LinkedList<>();
+
+ int clusterSize = wpvws.size();
+
+ for (TermEntry termEntry : termEntryMap.values()) {
+
+      int corpusDF = reader.docFreq(new Term(this.contentField, termEntry.getTerm()));
+ int outDF = corpusDF - termEntry.getDocFreq();
+ int inDF = termEntry.getDocFreq();
+ double logLikelihoodRatio = scoreDocumentFrequencies(inDF, outDF, clusterSize, numDocs);
+ TermInfoClusterInOut termInfoCluster =
+ new TermInfoClusterInOut(termEntry.getTerm(), inDF, outDF, logLikelihoodRatio);
+ clusteredTermInfo.add(termInfoCluster);
+ }
+
+ Collections.sort(clusteredTermInfo);
+ // Cleanup
+ Closeables.close(reader, true);
+ termEntryMap.clear();
+
+ return clusteredTermInfo.subList(0, Math.min(clusteredTermInfo.size(), maxLabels));
+ }
+
+ private static FixedBitSet getClusterDocBitset(IndexReader reader,
+ Collection<String> idSet,
+ String idField) throws IOException {
+ int numDocs = reader.numDocs();
+
+ FixedBitSet bitset = new FixedBitSet(numDocs);
+
+ Set<String> idFieldSelector = null;
+ if (idField != null) {
+ idFieldSelector = new TreeSet<>();
+ idFieldSelector.add(idField);
+ }
+
+
+ for (int i = 0; i < numDocs; i++) {
+ String id;
+ // Use Lucene's internal ID if idField is not specified. Else, get it from the document.
+ if (idField == null) {
+ id = Integer.toString(i);
+ } else {
+ id = reader.document(i, idFieldSelector).get(idField);
+ }
+ if (idSet.contains(id)) {
+ bitset.set(i);
+ }
+ }
+    log.info("Created bitset for in-cluster documents: {}", bitset.cardinality());
+ return bitset;
+ }
+
+ private static double scoreDocumentFrequencies(long inDF, long outDF, long clusterSize, long corpusSize) {
+ long k12 = clusterSize - inDF;
+ long k22 = corpusSize - clusterSize - outDF;
+
+ return LogLikelihood.logLikelihoodRatio(inDF, k12, outDF, k22);
+ }
+
+ public String getIdField() {
+ return idField;
+ }
+
+ public void setIdField(String idField) {
+ this.idField = idField;
+ }
+
+ public String getOutput() {
+ return output;
+ }
+
+ public void setOutput(String output) {
+ this.output = output;
+ }
+
+ public static void main(String[] args) {
+
+ DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
+ ArgumentBuilder abuilder = new ArgumentBuilder();
+ GroupBuilder gbuilder = new GroupBuilder();
+
+ Option indexOpt = obuilder.withLongName("dir").withRequired(true).withArgument(
+ abuilder.withName("dir").withMinimum(1).withMaximum(1).create())
+ .withDescription("The Lucene index directory").withShortName("d").create();
+
+ Option outputOpt = obuilder.withLongName("output").withRequired(false).withArgument(
+ abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The output file. If not specified, the result is printed on console.").withShortName("o").create();
+
+ Option fieldOpt = obuilder.withLongName("field").withRequired(true).withArgument(
+ abuilder.withName("field").withMinimum(1).withMaximum(1).create())
+ .withDescription("The content field in the index").withShortName("f").create();
+
+ Option idFieldOpt = obuilder.withLongName("idField").withRequired(false).withArgument(
+ abuilder.withName("idField").withMinimum(1).withMaximum(1).create()).withDescription(
+        "The field for the document ID in the index. If not specified, the Lucene internal doc "
+            + "id is used, which is prone to error if the underlying index changes").withShortName("i").create();
+
+ Option seqOpt = obuilder.withLongName("seqFileDir").withRequired(true).withArgument(
+ abuilder.withName("seqFileDir").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The directory containing Sequence Files for the Clusters").withShortName("s").create();
+
+ Option pointsOpt = obuilder.withLongName("pointsDir").withRequired(true).withArgument(
+ abuilder.withName("pointsDir").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The directory containing points sequence files mapping input vectors to their cluster. ")
+ .withShortName("p").create();
+ Option minClusterSizeOpt = obuilder.withLongName("minClusterSize").withRequired(false).withArgument(
+ abuilder.withName("minClusterSize").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The minimum number of points required in a cluster to print the labels for").withShortName("m").create();
+ Option maxLabelsOpt = obuilder.withLongName("maxLabels").withRequired(false).withArgument(
+ abuilder.withName("maxLabels").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The maximum number of labels to print per cluster").withShortName("x").create();
+ Option helpOpt = DefaultOptionCreator.helpOption();
+
+ Group group = gbuilder.withName("Options").withOption(indexOpt).withOption(idFieldOpt).withOption(outputOpt)
+ .withOption(fieldOpt).withOption(seqOpt).withOption(pointsOpt).withOption(helpOpt)
+ .withOption(maxLabelsOpt).withOption(minClusterSizeOpt).create();
+
+ try {
+ Parser parser = new Parser();
+ parser.setGroup(group);
+ CommandLine cmdLine = parser.parse(args);
+
+ if (cmdLine.hasOption(helpOpt)) {
+ CommandLineUtil.printHelp(group);
+ return;
+ }
+
+ Path seqFileDir = new Path(cmdLine.getValue(seqOpt).toString());
+ Path pointsDir = new Path(cmdLine.getValue(pointsOpt).toString());
+ String indexDir = cmdLine.getValue(indexOpt).toString();
+ String contentField = cmdLine.getValue(fieldOpt).toString();
+
+ String idField = null;
+
+ if (cmdLine.hasOption(idFieldOpt)) {
+ idField = cmdLine.getValue(idFieldOpt).toString();
+ }
+ String output = null;
+ if (cmdLine.hasOption(outputOpt)) {
+ output = cmdLine.getValue(outputOpt).toString();
+ }
+ int maxLabels = DEFAULT_MAX_LABELS;
+ if (cmdLine.hasOption(maxLabelsOpt)) {
+ maxLabels = Integer.parseInt(cmdLine.getValue(maxLabelsOpt).toString());
+ }
+ int minSize = DEFAULT_MIN_IDS;
+ if (cmdLine.hasOption(minClusterSizeOpt)) {
+ minSize = Integer.parseInt(cmdLine.getValue(minClusterSizeOpt).toString());
+ }
+ ClusterLabels clusterLabel = new ClusterLabels(seqFileDir, pointsDir, indexDir, contentField, minSize, maxLabels);
+
+ if (idField != null) {
+ clusterLabel.setIdField(idField);
+ }
+ if (output != null) {
+ clusterLabel.setOutput(output);
+ }
+
+ clusterLabel.getLabels();
+
+ } catch (OptionException e) {
+ log.error("Exception", e);
+ CommandLineUtil.printHelp(group);
+ } catch (IOException e) {
+ log.error("Exception", e);
+ }
+ }
+
+}
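
scoreDocumentFrequencies above fills the standard 2x2 contingency table for
the LLR test. A worked sketch with made-up counts (a 500-document cluster in
a 10,000-document corpus, and a term occurring in 120 in-cluster and 300
out-of-cluster documents):

    long inDF = 120, outDF = 300, clusterSize = 500, corpusSize = 10000;
    long k11 = inDF;                             // in cluster, has the term
    long k12 = clusterSize - inDF;               // in cluster, lacks the term
    long k21 = outDF;                            // out of cluster, has the term
    long k22 = corpusSize - clusterSize - outDF; // out of cluster, lacks the term
    double llr = LogLikelihood.logLikelihoodRatio(k11, k12, k21, k22);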

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java
new file mode 100644
index 0000000..876816f
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java
@@ -0,0 +1,349 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.vectors.lucene;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.Writer;
+import java.nio.file.Paths;
+import java.util.Iterator;
+
+import com.google.common.base.Preconditions;
+import com.google.common.io.Files;
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.OptionException;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.commons.io.Charsets;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.mahout.common.CommandLineUtil;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.utils.vectors.TermEntry;
+import org.apache.mahout.utils.vectors.TermInfo;
+import org.apache.mahout.utils.vectors.io.DelimitedTermInfoWriter;
+import org.apache.mahout.utils.vectors.io.SequenceFileVectorWriter;
+import org.apache.mahout.utils.vectors.io.VectorWriter;
+import org.apache.mahout.vectorizer.TF;
+import org.apache.mahout.vectorizer.TFIDF;
+import org.apache.mahout.vectorizer.Weight;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public final class Driver {
+
+ private static final Logger log = LoggerFactory.getLogger(Driver.class);
+
+ private String luceneDir;
+ private String outFile;
+ private String field;
+ private String idField;
+ private String dictOut;
+ private String seqDictOut = "";
+ private String weightType = "tfidf";
+ private String delimiter = "\t";
+ private double norm = LuceneIterable.NO_NORMALIZING;
+ private long maxDocs = Long.MAX_VALUE;
+ private int minDf = 1;
+ private int maxDFPercent = 99;
+ private double maxPercentErrorDocs = 0.0;
+
+ public void dumpVectors() throws IOException {
+
+ File file = new File(luceneDir);
+ Preconditions.checkArgument(file.isDirectory(),
+ "Lucene directory: " + file.getAbsolutePath()
+ + " does not exist or is not a directory");
+ Preconditions.checkArgument(maxDocs >= 0, "maxDocs must be >= 0");
+ Preconditions.checkArgument(minDf >= 1, "minDf must be >= 1");
+ Preconditions.checkArgument(maxDFPercent <= 99, "maxDFPercent must be <= 99");
+
+ Directory dir = FSDirectory.open(Paths.get(file.getAbsolutePath()));
+ IndexReader reader = DirectoryReader.open(dir);
+
+
+ Weight weight;
+ if ("tf".equalsIgnoreCase(weightType)) {
+ weight = new TF();
+ } else if ("tfidf".equalsIgnoreCase(weightType)) {
+ weight = new TFIDF();
+ } else {
+ throw new IllegalArgumentException("Weight type " + weightType + " is not supported");
+ }
+
+ TermInfo termInfo = new CachedTermInfo(reader, field, minDf, maxDFPercent);
+
+ LuceneIterable iterable;
+ if (norm == LuceneIterable.NO_NORMALIZING) {
+ iterable = new LuceneIterable(reader, idField, field, termInfo, weight, LuceneIterable.NO_NORMALIZING,
+ maxPercentErrorDocs);
+ } else {
+ iterable = new LuceneIterable(reader, idField, field, termInfo, weight, norm, maxPercentErrorDocs);
+ }
+
+ log.info("Output File: {}", outFile);
+
+ try (VectorWriter vectorWriter = getSeqFileWriter(outFile)) {
+ long numDocs = vectorWriter.write(iterable, maxDocs);
+ log.info("Wrote: {} vectors", numDocs);
+ }
+
+ File dictOutFile = new File(dictOut);
+ log.info("Dictionary Output file: {}", dictOutFile);
+ Writer writer = Files.newWriter(dictOutFile, Charsets.UTF_8);
+ try (DelimitedTermInfoWriter tiWriter = new DelimitedTermInfoWriter(writer, delimiter, field)) {
+ tiWriter.write(termInfo);
+ }
+
+ if (!"".equals(seqDictOut)) {
+ log.info("SequenceFile Dictionary Output file: {}", seqDictOut);
+
+ Path path = new Path(seqDictOut);
+ Configuration conf = new Configuration();
+ FileSystem fs = FileSystem.get(conf);
+ try (SequenceFile.Writer seqWriter = SequenceFile.createWriter(fs, conf, path, Text.class, IntWritable.class)) {
+ Text term = new Text();
+ IntWritable termIndex = new IntWritable();
+ Iterator<TermEntry> termEntries = termInfo.getAllEntries();
+ while (termEntries.hasNext()) {
+ TermEntry termEntry = termEntries.next();
+ term.set(termEntry.getTerm());
+ termIndex.set(termEntry.getTermIdx());
+ seqWriter.append(term, termIndex);
+ }
+ }
+ }
+ }
+
+ public static void main(String[] args) throws IOException {
+
+ DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
+ ArgumentBuilder abuilder = new ArgumentBuilder();
+ GroupBuilder gbuilder = new GroupBuilder();
+
+ Option inputOpt = obuilder.withLongName("dir").withRequired(true).withArgument(
+ abuilder.withName("dir").withMinimum(1).withMaximum(1).create())
+ .withDescription("The Lucene directory").withShortName("d").create();
+
+ Option outputOpt = obuilder.withLongName("output").withRequired(true).withArgument(
+ abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription("The output file")
+ .withShortName("o").create();
+
+ Option fieldOpt = obuilder.withLongName("field").withRequired(true).withArgument(
+ abuilder.withName("field").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The field in the index").withShortName("f").create();
+
+ Option idFieldOpt = obuilder.withLongName("idField").withRequired(false).withArgument(
+ abuilder.withName("idField").withMinimum(1).withMaximum(1).create()).withDescription(
+        "The field in the index containing the document ID. If not specified, the Lucene internal doc "
+            + "id is used, which is prone to error if the underlying index changes").create();
+
+ Option dictOutOpt = obuilder.withLongName("dictOut").withRequired(true).withArgument(
+ abuilder.withName("dictOut").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The output of the dictionary").withShortName("t").create();
+
+ Option seqDictOutOpt = obuilder.withLongName("seqDictOut").withRequired(false).withArgument(
+ abuilder.withName("seqDictOut").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The output of the dictionary as sequence file").withShortName("st").create();
+
+ Option weightOpt = obuilder.withLongName("weight").withRequired(false).withArgument(
+ abuilder.withName("weight").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The kind of weight to use. Currently TF or TFIDF").withShortName("w").create();
+
+ Option delimiterOpt = obuilder.withLongName("delimiter").withRequired(false).withArgument(
+ abuilder.withName("delimiter").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The delimiter for outputting the dictionary").withShortName("l").create();
+
+ Option powerOpt = obuilder.withLongName("norm").withRequired(false).withArgument(
+ abuilder.withName("norm").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The norm to use, expressed as either a double or \"INF\" if you want to use the Infinite norm. "
+ + "Must be greater or equal to 0. The default is not to normalize").withShortName("n").create();
+
+ Option maxOpt = obuilder.withLongName("max").withRequired(false).withArgument(
+ abuilder.withName("max").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The maximum number of vectors to output. If not specified, then it will loop over all docs")
+ .withShortName("m").create();
+
+ Option minDFOpt = obuilder.withLongName("minDF").withRequired(false).withArgument(
+ abuilder.withName("minDF").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The minimum document frequency. Default is 1").withShortName("md").create();
+
+ Option maxDFPercentOpt = obuilder.withLongName("maxDFPercent").withRequired(false).withArgument(
+ abuilder.withName("maxDFPercent").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The max percentage of docs for the DF. Can be used to remove really high frequency terms."
+ + " Expressed as an integer between 0 and 100. Default is 99.").withShortName("x").create();
+
+ Option maxPercentErrorDocsOpt = obuilder.withLongName("maxPercentErrorDocs").withRequired(false).withArgument(
+ abuilder.withName("maxPercentErrorDocs").withMinimum(1).withMaximum(1).create()).withDescription(
+        "The max percentage of docs that can have a null term vector. These are noise documents and can occur if the "
+ + "analyzer used strips out all terms in the target field. This percentage is expressed as a value "
+ + "between 0 and 1. The default is 0.").withShortName("err").create();
+
+ Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
+ .create();
+
+ Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(idFieldOpt).withOption(
+ outputOpt).withOption(delimiterOpt).withOption(helpOpt).withOption(fieldOpt).withOption(maxOpt)
+ .withOption(dictOutOpt).withOption(seqDictOutOpt).withOption(powerOpt).withOption(maxDFPercentOpt)
+ .withOption(weightOpt).withOption(minDFOpt).withOption(maxPercentErrorDocsOpt).create();
+
+ try {
+ Parser parser = new Parser();
+ parser.setGroup(group);
+ CommandLine cmdLine = parser.parse(args);
+
+ if (cmdLine.hasOption(helpOpt)) {
+
+ CommandLineUtil.printHelp(group);
+ return;
+ }
+
+ if (cmdLine.hasOption(inputOpt)) { // Lucene case
+ Driver luceneDriver = new Driver();
+ luceneDriver.setLuceneDir(cmdLine.getValue(inputOpt).toString());
+
+ if (cmdLine.hasOption(maxOpt)) {
+ luceneDriver.setMaxDocs(Long.parseLong(cmdLine.getValue(maxOpt).toString()));
+ }
+
+ if (cmdLine.hasOption(weightOpt)) {
+ luceneDriver.setWeightType(cmdLine.getValue(weightOpt).toString());
+ }
+
+ luceneDriver.setField(cmdLine.getValue(fieldOpt).toString());
+
+ if (cmdLine.hasOption(minDFOpt)) {
+ luceneDriver.setMinDf(Integer.parseInt(cmdLine.getValue(minDFOpt).toString()));
+ }
+
+ if (cmdLine.hasOption(maxDFPercentOpt)) {
+ luceneDriver.setMaxDFPercent(Integer.parseInt(cmdLine.getValue(maxDFPercentOpt).toString()));
+ }
+
+ if (cmdLine.hasOption(powerOpt)) {
+ String power = cmdLine.getValue(powerOpt).toString();
+ if ("INF".equals(power)) {
+ luceneDriver.setNorm(Double.POSITIVE_INFINITY);
+ } else {
+ luceneDriver.setNorm(Double.parseDouble(power));
+ }
+ }
+
+ if (cmdLine.hasOption(idFieldOpt)) {
+ luceneDriver.setIdField(cmdLine.getValue(idFieldOpt).toString());
+ }
+
+ if (cmdLine.hasOption(maxPercentErrorDocsOpt)) {
+ luceneDriver.setMaxPercentErrorDocs(Double.parseDouble(cmdLine.getValue(maxPercentErrorDocsOpt).toString()));
+ }
+
+ luceneDriver.setOutFile(cmdLine.getValue(outputOpt).toString());
+
+ luceneDriver.setDelimiter(cmdLine.hasOption(delimiterOpt) ? cmdLine.getValue(delimiterOpt).toString() : "\t");
+
+ luceneDriver.setDictOut(cmdLine.getValue(dictOutOpt).toString());
+
+ if (cmdLine.hasOption(seqDictOutOpt)) {
+ luceneDriver.setSeqDictOut(cmdLine.getValue(seqDictOutOpt).toString());
+ }
+
+ luceneDriver.dumpVectors();
+ }
+ } catch (OptionException e) {
+ log.error("Exception", e);
+ CommandLineUtil.printHelp(group);
+ }
+ }
+
+ private static VectorWriter getSeqFileWriter(String outFile) throws IOException {
+ Path path = new Path(outFile);
+ Configuration conf = new Configuration();
+ FileSystem fs = FileSystem.get(conf);
+ // TODO: Make this parameter driven
+
+ SequenceFile.Writer seqWriter = SequenceFile.createWriter(fs, conf, path, LongWritable.class,
+ VectorWritable.class);
+
+ return new SequenceFileVectorWriter(seqWriter);
+ }
+
+ public void setLuceneDir(String luceneDir) {
+ this.luceneDir = luceneDir;
+ }
+
+ public void setMaxDocs(long maxDocs) {
+ this.maxDocs = maxDocs;
+ }
+
+ public void setWeightType(String weightType) {
+ this.weightType = weightType;
+ }
+
+ public void setField(String field) {
+ this.field = field;
+ }
+
+ public void setMinDf(int minDf) {
+ this.minDf = minDf;
+ }
+
+ public void setMaxDFPercent(int maxDFPercent) {
+ this.maxDFPercent = maxDFPercent;
+ }
+
+ public void setNorm(double norm) {
+ this.norm = norm;
+ }
+
+ public void setIdField(String idField) {
+ this.idField = idField;
+ }
+
+ public void setOutFile(String outFile) {
+ this.outFile = outFile;
+ }
+
+ public void setDelimiter(String delimiter) {
+ this.delimiter = delimiter;
+ }
+
+ public void setDictOut(String dictOut) {
+ this.dictOut = dictOut;
+ }
+
+ public void setSeqDictOut(String seqDictOut) {
+ this.seqDictOut = seqDictOut;
+ }
+
+ public void setMaxPercentErrorDocs(double maxPercentErrorDocs) {
+ this.maxPercentErrorDocs = maxPercentErrorDocs;
+ }
+}
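
A sketch of driving the class programmatically; all paths and field names
below are hypothetical, and the command-line main() remains the supported
entry point:

    Driver driver = new Driver();
    driver.setLuceneDir("/tmp/index");        // must be an existing directory
    driver.setField("body");                  // content field to vectorize
    driver.setIdField("id");                  // optional document-id field
    driver.setWeightType("tfidf");            // or "tf"
    driver.setOutFile("/tmp/vectors.seq");    // SequenceFile<LongWritable,VectorWritable>
    driver.setDictOut("/tmp/dictionary.txt"); // delimited term dictionary
    driver.dumpVectors();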

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java
new file mode 100644
index 0000000..1af0ed0
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java
@@ -0,0 +1,80 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.vectors.lucene;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.utils.vectors.TermInfo;
+import org.apache.mahout.vectorizer.Weight;
+
+import java.util.Iterator;
+
+/**
+ * {@link Iterable} counterpart to {@link LuceneIterator}.
+ */
+public final class LuceneIterable implements Iterable<Vector> {
+
+ public static final double NO_NORMALIZING = -1.0;
+
+ private final IndexReader indexReader;
+ private final String field;
+ private final String idField;
+ private final TermInfo terminfo;
+ private final double normPower;
+ private final double maxPercentErrorDocs;
+ private final Weight weight;
+
+ public LuceneIterable(IndexReader reader, String idField, String field, TermInfo terminfo, Weight weight) {
+ this(reader, idField, field, terminfo, weight, NO_NORMALIZING);
+ }
+
+ public LuceneIterable(IndexReader indexReader, String idField, String field, TermInfo terminfo, Weight weight,
+ double normPower) {
+ this(indexReader, idField, field, terminfo, weight, normPower, 0);
+ }
+
+ /**
+ * Produce a LuceneIterable that can create the Vector plus normalize it.
+ *
+ * @param indexReader {@link org.apache.lucene.index.IndexReader} to read the documents from.
+ * @param idField field containing the id. May be null.
+ * @param field field to use for the Vector
+ * @param normPower the normalization value. Must be nonnegative, or {@link #NO_NORMALIZING}
+ * @param maxPercentErrorDocs the percentage of documents in the lucene index that can have a null term vector
+ */
+ public LuceneIterable(IndexReader indexReader,
+ String idField,
+ String field,
+ TermInfo terminfo,
+ Weight weight,
+ double normPower,
+ double maxPercentErrorDocs) {
+ this.indexReader = indexReader;
+ this.idField = idField;
+ this.field = field;
+ this.terminfo = terminfo;
+ this.normPower = normPower;
+ this.maxPercentErrorDocs = maxPercentErrorDocs;
+ this.weight = weight;
+ }
+
+ @Override
+ public Iterator<Vector> iterator() {
+ return new LuceneIterator(indexReader, idField, field, terminfo, weight, normPower, maxPercentErrorDocs);
+ }
+}
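
A sketch of iterating TF-IDF vectors with the five-argument constructor
(no normalization); the reader, termInfo, and field names follow the
hypothetical setup shown for CachedTermInfo above:

    LuceneIterable vectors = new LuceneIterable(reader, "id", "body", termInfo, new TFIDF());
    for (Vector vector : vectors) {
      System.out.println(vector.getNumNondefaultElements() + " non-zero terms");
    }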
r***@apache.org
2018-06-27 14:52:14 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/clustering/evaluation/ClusterEvaluator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/clustering/evaluation/ClusterEvaluator.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/clustering/evaluation/ClusterEvaluator.java
new file mode 100644
index 0000000..757f38c
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/clustering/evaluation/ClusterEvaluator.java
@@ -0,0 +1,196 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.evaluation;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.iterator.ClusterWritable;
+import org.apache.mahout.common.ClassUtils;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable;
+import org.apache.mahout.math.RandomAccessSparseVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.Vector.Element;
+import org.apache.mahout.math.VectorWritable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.TreeMap;
+
+public class ClusterEvaluator {
+
+ private static final Logger log = LoggerFactory.getLogger(ClusterEvaluator.class);
+
+ private final Map<Integer,List<VectorWritable>> representativePoints;
+
+ private final List<Cluster> clusters;
+
+ private final DistanceMeasure measure;
+
+ /**
+ * For testing only
+ *
+ * @param representativePoints
+ * a Map<Integer,List<VectorWritable>> of representative points keyed by clusterId
+ * @param clusters
+   *          a List<Cluster> of the clusters
+ * @param measure
+ * an appropriate DistanceMeasure
+ */
+ public ClusterEvaluator(Map<Integer,List<VectorWritable>> representativePoints, List<Cluster> clusters,
+ DistanceMeasure measure) {
+ this.representativePoints = representativePoints;
+ this.clusters = clusters;
+ this.measure = measure;
+ }
+
+ /**
+ * Initialize a new instance from job information
+ *
+ * @param conf
+ * a Configuration with appropriate parameters
+ * @param clustersIn
+   *          a Path to the input clusters directory
+ */
+ public ClusterEvaluator(Configuration conf, Path clustersIn) {
+ measure = ClassUtils
+ .instantiateAs(conf.get(RepresentativePointsDriver.DISTANCE_MEASURE_KEY), DistanceMeasure.class);
+ representativePoints = RepresentativePointsMapper.getRepresentativePoints(conf);
+ clusters = loadClusters(conf, clustersIn);
+ }
+
+ /**
+ * Load the clusters from their sequence files
+ *
+ * @param clustersIn
+   *          a Path to the directory containing input cluster files
+ * @return a List<Cluster> of the clusters
+ */
+ private static List<Cluster> loadClusters(Configuration conf, Path clustersIn) {
+ List<Cluster> clusters = new ArrayList<>();
+ for (ClusterWritable clusterWritable : new SequenceFileDirValueIterable<ClusterWritable>(clustersIn, PathType.LIST,
+ PathFilters.logsCRCFilter(), conf)) {
+ Cluster cluster = clusterWritable.getValue();
+ clusters.add(cluster);
+ }
+ return clusters;
+ }
+
+ /**
+ * Computes the inter-cluster density as defined in "Mahout In Action"
+ *
+ * @return the interClusterDensity
+ */
+ public double interClusterDensity() {
+ double max = Double.NEGATIVE_INFINITY;
+ double min = Double.POSITIVE_INFINITY;
+ double sum = 0;
+ int count = 0;
+ Map<Integer,Vector> distances = interClusterDistances();
+ for (Vector row : distances.values()) {
+ for (Element element : row.nonZeroes()) {
+ double d = element.get();
+ min = Math.min(d, min);
+ max = Math.max(d, max);
+ sum += d;
+ count++;
+ }
+ }
+ double density = (sum / count - min) / (max - min);
+ log.info("Scaled Inter-Cluster Density = {}", density);
+ return density;
+ }
+
+ /**
+ * Computes the inter-cluster distances
+ *
+ * @return a Map<Integer, Vector>
+ */
+ public Map<Integer,Vector> interClusterDistances() {
+ Map<Integer,Vector> distances = new TreeMap<>();
+ for (int i = 0; i < clusters.size(); i++) {
+ Cluster clusterI = clusters.get(i);
+ RandomAccessSparseVector row = new RandomAccessSparseVector(Integer.MAX_VALUE);
+ distances.put(clusterI.getId(), row);
+ for (int j = i + 1; j < clusters.size(); j++) {
+ Cluster clusterJ = clusters.get(j);
+ double d = measure.distance(clusterI.getCenter(), clusterJ.getCenter());
+ row.set(clusterJ.getId(), d);
+ }
+ }
+ return distances;
+ }
+
+ /**
+ * Computes the average intra-cluster density as the average of each cluster's intra-cluster density
+ *
+ * @return the average intraClusterDensity
+ */
+ public double intraClusterDensity() {
+ double avgDensity = 0;
+ int count = 0;
+ for (Element elem : intraClusterDensities().nonZeroes()) {
+ double value = elem.get();
+ if (!Double.isNaN(value)) {
+ avgDensity += value;
+ count++;
+ }
+ }
+ avgDensity = clusters.isEmpty() ? 0 : avgDensity / count;
+ log.info("Average Intra-Cluster Density = {}", avgDensity);
+ return avgDensity;
+ }
+
+ /**
+ * Computes the intra-cluster densities for all clusters as the average distance of the representative points from
+ * each other
+ *
+ * @return a Vector of the intraClusterDensity of the representativePoints by clusterId
+ */
+ public Vector intraClusterDensities() {
+ Vector densities = new RandomAccessSparseVector(Integer.MAX_VALUE);
+ for (Cluster cluster : clusters) {
+ int count = 0;
+ double max = Double.NEGATIVE_INFINITY;
+ double min = Double.POSITIVE_INFINITY;
+ double sum = 0;
+ List<VectorWritable> repPoints = representativePoints.get(cluster.getId());
+ for (int i = 0; i < repPoints.size(); i++) {
+ for (int j = i + 1; j < repPoints.size(); j++) {
+ Vector v1 = repPoints.get(i).get();
+ Vector v2 = repPoints.get(j).get();
+ double d = measure.distance(v1, v2);
+ min = Math.min(d, min);
+ max = Math.max(d, max);
+ sum += d;
+ count++;
+ }
+ }
+ double density = (sum / count - min) / (max - min);
+ densities.set(cluster.getId(), density);
+ log.info("Intra-Cluster Density[{}] = {}", cluster.getId(), density);
+ }
+ return densities;
+ }
+}
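
A sketch using the job-based constructor; it assumes RepresentativePointsDriver.run()
has already populated STATE_IN_KEY and DISTANCE_MEASURE_KEY in the Configuration,
and the clusters path is hypothetical:

    Configuration conf = new Configuration();
    // ... RepresentativePointsDriver.run(conf, ...) sets the required keys ...
    ClusterEvaluator evaluator = new ClusterEvaluator(conf, new Path("/tmp/clusters-final"));
    double inter = evaluator.interClusterDensity();  // scaled into [0,1]
    double intra = evaluator.intraClusterDensity();  // average over clusters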

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsDriver.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsDriver.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsDriver.java
new file mode 100644
index 0000000..2fe37ef
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsDriver.java
@@ -0,0 +1,243 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.evaluation;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.clustering.AbstractCluster;
+import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.classify.WeightedVectorWritable;
+import org.apache.mahout.clustering.iterator.ClusterWritable;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.ClassUtils;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterable;
+import org.apache.mahout.math.VectorWritable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public final class RepresentativePointsDriver extends AbstractJob {
+
+ public static final String STATE_IN_KEY = "org.apache.mahout.clustering.stateIn";
+
+ public static final String DISTANCE_MEASURE_KEY = "org.apache.mahout.clustering.measure";
+
+ private static final Logger log = LoggerFactory.getLogger(RepresentativePointsDriver.class);
+
+ private RepresentativePointsDriver() {}
+
+ public static void main(String[] args) throws Exception {
+ ToolRunner.run(new Configuration(), new RepresentativePointsDriver(), args);
+ }
+
+ @Override
+ public int run(String[] args) throws ClassNotFoundException, IOException, InterruptedException {
+ addInputOption();
+ addOutputOption();
+ addOption("clusteredPoints", "cp", "The path to the clustered points", true);
+ addOption(DefaultOptionCreator.distanceMeasureOption().create());
+ addOption(DefaultOptionCreator.maxIterationsOption().create());
+ addOption(DefaultOptionCreator.methodOption().create());
+ if (parseArguments(args) == null) {
+ return -1;
+ }
+
+ Path input = getInputPath();
+ Path output = getOutputPath();
+ String distanceMeasureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
+ int maxIterations = Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION));
+ boolean runSequential = getOption(DefaultOptionCreator.METHOD_OPTION).equalsIgnoreCase(
+ DefaultOptionCreator.SEQUENTIAL_METHOD);
+ DistanceMeasure measure = ClassUtils.instantiateAs(distanceMeasureClass, DistanceMeasure.class);
+ Path clusteredPoints = new Path(getOption("clusteredPoints"));
+ run(getConf(), input, clusteredPoints, output, measure, maxIterations, runSequential);
+ return 0;
+ }
+
+ /**
+ * Utility to print out representative points
+ *
+ * @param output
+ * the Path to the directory containing representativePoints-i folders
+ * @param numIterations
+ * the int number of iterations to print
+ */
+ public static void printRepresentativePoints(Path output, int numIterations) {
+ for (int i = 0; i <= numIterations; i++) {
+ Path out = new Path(output, "representativePoints-" + i);
+ System.out.println("Representative Points for iteration " + i);
+ Configuration conf = new Configuration();
+ for (Pair<IntWritable,VectorWritable> record : new SequenceFileDirIterable<IntWritable,VectorWritable>(out,
+ PathType.LIST, PathFilters.logsCRCFilter(), null, true, conf)) {
+ System.out.println("\tC-" + record.getFirst().get() + ": "
+ + AbstractCluster.formatVector(record.getSecond().get(), null));
+ }
+ }
+ }
+
+ public static void run(Configuration conf, Path clustersIn, Path clusteredPointsIn, Path output,
+ DistanceMeasure measure, int numIterations, boolean runSequential) throws IOException, InterruptedException,
+ ClassNotFoundException {
+ Path stateIn = new Path(output, "representativePoints-0");
+ writeInitialState(stateIn, clustersIn);
+
+ for (int iteration = 0; iteration < numIterations; iteration++) {
+ log.info("Representative Points Iteration {}", iteration);
+ // point the output to a new directory per iteration
+ Path stateOut = new Path(output, "representativePoints-" + (iteration + 1));
+ runIteration(conf, clusteredPointsIn, stateIn, stateOut, measure, runSequential);
+ // now point the input to the old output directory
+ stateIn = stateOut;
+ }
+
+ conf.set(STATE_IN_KEY, stateIn.toString());
+ conf.set(DISTANCE_MEASURE_KEY, measure.getClass().getName());
+ }
+
+ private static void writeInitialState(Path output, Path clustersIn) throws IOException {
+ Configuration conf = new Configuration();
+ FileSystem fs = FileSystem.get(output.toUri(), conf);
+ for (FileStatus dir : fs.globStatus(clustersIn)) {
+ Path inPath = dir.getPath();
+ for (FileStatus part : fs.listStatus(inPath, PathFilters.logsCRCFilter())) {
+ Path inPart = part.getPath();
+ Path path = new Path(output, inPart.getName());
+ try (SequenceFile.Writer writer =
+ new SequenceFile.Writer(fs, conf, path, IntWritable.class, VectorWritable.class)){
+ for (ClusterWritable clusterWritable : new SequenceFileValueIterable<ClusterWritable>(inPart, true, conf)) {
+ Cluster cluster = clusterWritable.getValue();
+ if (log.isDebugEnabled()) {
+ log.debug("C-{}: {}", cluster.getId(), AbstractCluster.formatVector(cluster.getCenter(), null));
+ }
+ writer.append(new IntWritable(cluster.getId()), new VectorWritable(cluster.getCenter()));
+ }
+ }
+ }
+ }
+ }
+
+ private static void runIteration(Configuration conf, Path clusteredPointsIn, Path stateIn, Path stateOut,
+ DistanceMeasure measure, boolean runSequential) throws IOException, InterruptedException, ClassNotFoundException {
+ if (runSequential) {
+ runIterationSeq(conf, clusteredPointsIn, stateIn, stateOut, measure);
+ } else {
+ runIterationMR(conf, clusteredPointsIn, stateIn, stateOut, measure);
+ }
+ }
+
+ /**
+ * Run the job using supplied arguments as a sequential process
+ *
+ * @param conf
+ * the Configuration to use
+ * @param clusteredPointsIn
+ * the directory pathname for input points
+ * @param stateIn
+ * the directory pathname for input state
+ * @param stateOut
+ * the directory pathname for output state
+ * @param measure
+ * the DistanceMeasure to use
+ */
+ private static void runIterationSeq(Configuration conf, Path clusteredPointsIn, Path stateIn, Path stateOut,
+ DistanceMeasure measure) throws IOException {
+
+ Map<Integer,List<VectorWritable>> repPoints = RepresentativePointsMapper.getRepresentativePoints(conf, stateIn);
+ Map<Integer,WeightedVectorWritable> mostDistantPoints = new HashMap<>();
+ FileSystem fs = FileSystem.get(clusteredPointsIn.toUri(), conf);
+ for (Pair<IntWritable,WeightedVectorWritable> record
+ : new SequenceFileDirIterable<IntWritable,WeightedVectorWritable>(clusteredPointsIn, PathType.LIST,
+ PathFilters.logsCRCFilter(), null, true, conf)) {
+ RepresentativePointsMapper.mapPoint(record.getFirst(), record.getSecond(), measure, repPoints, mostDistantPoints);
+ }
+ int part = 0;
+ try (SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, new Path(stateOut, "part-m-" + part++),
+ IntWritable.class, VectorWritable.class)){
+ for (Entry<Integer,List<VectorWritable>> entry : repPoints.entrySet()) {
+ for (VectorWritable vw : entry.getValue()) {
+ writer.append(new IntWritable(entry.getKey()), vw);
+ }
+ }
+ }
+ try (SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, new Path(stateOut, "part-m-" + part++),
+ IntWritable.class, VectorWritable.class)){
+ for (Map.Entry<Integer,WeightedVectorWritable> entry : mostDistantPoints.entrySet()) {
+ writer.append(new IntWritable(entry.getKey()), new VectorWritable(entry.getValue().getVector()));
+ }
+ }
+ }
+
+ /**
+ * Run the job using supplied arguments as a Map/Reduce process
+ *
+ * @param conf
+ * the Configuration to use
+ * @param input
+ * the directory pathname for input points
+ * @param stateIn
+ * the directory pathname for input state
+ * @param stateOut
+ * the directory pathname for output state
+ * @param measure
+ * the DistanceMeasure to use
+ */
+ private static void runIterationMR(Configuration conf, Path input, Path stateIn, Path stateOut,
+ DistanceMeasure measure) throws IOException, InterruptedException, ClassNotFoundException {
+ conf.set(STATE_IN_KEY, stateIn.toString());
+ conf.set(DISTANCE_MEASURE_KEY, measure.getClass().getName());
+ Job job = new Job(conf, "Representative Points Driver running over input: " + input);
+ job.setJarByClass(RepresentativePointsDriver.class);
+ job.setOutputKeyClass(IntWritable.class);
+ job.setOutputValueClass(VectorWritable.class);
+ job.setMapOutputKeyClass(IntWritable.class);
+ job.setMapOutputValueClass(WeightedVectorWritable.class);
+
+ FileInputFormat.setInputPaths(job, input);
+ FileOutputFormat.setOutputPath(job, stateOut);
+
+ job.setMapperClass(RepresentativePointsMapper.class);
+ job.setReducerClass(RepresentativePointsReducer.class);
+ job.setInputFormatClass(SequenceFileInputFormat.class);
+ job.setOutputFormatClass(SequenceFileOutputFormat.class);
+
+ boolean succeeded = job.waitForCompletion(true);
+ if (!succeeded) {
+ throw new IllegalStateException("Job failed!");
+ }
+ }
+}
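
A sketch of invoking the driver sequentially; all paths are hypothetical:

    Configuration conf = new Configuration();
    Path clustersIn = new Path("/tmp/kmeans/clusters-10-final");
    Path clusteredPoints = new Path("/tmp/kmeans/clusteredPoints");
    Path output = new Path("/tmp/repPoints");
    RepresentativePointsDriver.run(conf, clustersIn, clusteredPoints, output,
        new EuclideanDistanceMeasure(), 5, true);
    // Print the representative points written by each of the 5 iterations.
    RepresentativePointsDriver.printRepresentativePoints(output, 5);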

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsMapper.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsMapper.java
new file mode 100644
index 0000000..0ae79ad
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsMapper.java
@@ -0,0 +1,117 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.evaluation;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.clustering.classify.WeightedVectorWritable;
+import org.apache.mahout.common.ClassUtils;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
+import org.apache.mahout.math.VectorWritable;
+
+public class RepresentativePointsMapper
+ extends Mapper<IntWritable, WeightedVectorWritable, IntWritable, WeightedVectorWritable> {
+
+ private Map<Integer, List<VectorWritable>> representativePoints;
+ private final Map<Integer, WeightedVectorWritable> mostDistantPoints = new HashMap<>();
+ private DistanceMeasure measure = new EuclideanDistanceMeasure();
+
+ @Override
+ protected void cleanup(Context context) throws IOException, InterruptedException {
+ for (Map.Entry<Integer, WeightedVectorWritable> entry : mostDistantPoints.entrySet()) {
+ context.write(new IntWritable(entry.getKey()), entry.getValue());
+ }
+ super.cleanup(context);
+ }
+
+ @Override
+ protected void map(IntWritable clusterId, WeightedVectorWritable point, Context context)
+ throws IOException, InterruptedException {
+ mapPoint(clusterId, point, measure, representativePoints, mostDistantPoints);
+ }
+
+ public static void mapPoint(IntWritable clusterId,
+ WeightedVectorWritable point,
+ DistanceMeasure measure,
+ Map<Integer, List<VectorWritable>> representativePoints,
+ Map<Integer, WeightedVectorWritable> mostDistantPoints) {
+ int key = clusterId.get();
+ WeightedVectorWritable currentMDP = mostDistantPoints.get(key);
+
+ List<VectorWritable> repPoints = representativePoints.get(key);
+ double totalDistance = 0.0;
+ if (repPoints != null) {
+ for (VectorWritable refPoint : repPoints) {
+ totalDistance += measure.distance(refPoint.get(), point.getVector());
+ }
+ }
+ if (currentMDP == null || currentMDP.getWeight() < totalDistance) {
+ mostDistantPoints.put(key, new WeightedVectorWritable(totalDistance, point.getVector().clone()));
+ }
+ }
+
+ @Override
+ protected void setup(Context context) throws IOException, InterruptedException {
+ super.setup(context);
+ Configuration conf = context.getConfiguration();
+ measure =
+ ClassUtils.instantiateAs(conf.get(RepresentativePointsDriver.DISTANCE_MEASURE_KEY), DistanceMeasure.class);
+ representativePoints = getRepresentativePoints(conf);
+ }
+
+ public void configure(Map<Integer, List<VectorWritable>> referencePoints, DistanceMeasure measure) {
+ this.representativePoints = referencePoints;
+ this.measure = measure;
+ }
+
+ public static Map<Integer, List<VectorWritable>> getRepresentativePoints(Configuration conf) {
+ String statePath = conf.get(RepresentativePointsDriver.STATE_IN_KEY);
+ return getRepresentativePoints(conf, new Path(statePath));
+ }
+
+ public static Map<Integer, List<VectorWritable>> getRepresentativePoints(Configuration conf, Path statePath) {
+ Map<Integer, List<VectorWritable>> representativePoints = new HashMap<>();
+ for (Pair<IntWritable,VectorWritable> record
+ : new SequenceFileDirIterable<IntWritable,VectorWritable>(statePath,
+ PathType.LIST,
+ PathFilters.logsCRCFilter(),
+ conf)) {
+ int keyValue = record.getFirst().get();
+ List<VectorWritable> repPoints = representativePoints.get(keyValue);
+ if (repPoints == null) {
+ repPoints = new ArrayList<>();
+ representativePoints.put(keyValue, repPoints);
+ }
+ repPoints.add(record.getSecond());
+ }
+ return representativePoints;
+ }
+}
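
A sketch of reading one iteration's state back with the static helper; the
state path is hypothetical:

    Configuration conf = new Configuration();
    Map<Integer, List<VectorWritable>> points = RepresentativePointsMapper
        .getRepresentativePoints(conf, new Path("/tmp/repPoints/representativePoints-5"));
    for (Map.Entry<Integer, List<VectorWritable>> e : points.entrySet()) {
      System.out.println("cluster " + e.getKey() + ": " + e.getValue().size() + " points");
    }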

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsReducer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsReducer.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsReducer.java
new file mode 100644
index 0000000..27ca861
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsReducer.java
@@ -0,0 +1,70 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.evaluation;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.mahout.clustering.classify.WeightedVectorWritable;
+import org.apache.mahout.math.VectorWritable;
+
+public class RepresentativePointsReducer
+ extends Reducer<IntWritable, WeightedVectorWritable, IntWritable, VectorWritable> {
+
+ private Map<Integer, List<VectorWritable>> representativePoints;
+
+ @Override
+ protected void cleanup(Context context) throws IOException, InterruptedException {
+ for (Map.Entry<Integer, List<VectorWritable>> entry : representativePoints.entrySet()) {
+ IntWritable iw = new IntWritable(entry.getKey());
+ for (VectorWritable vw : entry.getValue()) {
+ context.write(iw, vw);
+ }
+ }
+ super.cleanup(context);
+ }
+
+ @Override
+ protected void reduce(IntWritable key, Iterable<WeightedVectorWritable> values, Context context)
+ throws IOException, InterruptedException {
+ // find the most distant point
+ WeightedVectorWritable mdp = null;
+ for (WeightedVectorWritable dpw : values) {
+ if (mdp == null || mdp.getWeight() < dpw.getWeight()) {
+ mdp = new WeightedVectorWritable(dpw.getWeight(), dpw.getVector());
+ }
+ }
+ context.write(new IntWritable(key.get()), new VectorWritable(mdp.getVector()));
+ }
+
+ @Override
+ protected void setup(Context context) throws IOException, InterruptedException {
+ super.setup(context);
+ Configuration conf = context.getConfiguration();
+ representativePoints = RepresentativePointsMapper.getRepresentativePoints(conf);
+ }
+
+ public void configure(Map<Integer, List<VectorWritable>> representativePoints) {
+ this.representativePoints = representativePoints;
+ }
+
+}
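
For readers tracing the moved code: both classes expose public configure(...) hooks that bypass setup(Context), so the pair can be exercised without a running MapReduce job. A minimal test-style sketch, assuming Mahout's EuclideanDistanceMeasure and a hypothetical two-dimensional representative point for cluster 0:

    import java.util.Collections;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;
    import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
    import org.apache.mahout.math.DenseVector;
    import org.apache.mahout.math.VectorWritable;

    public final class RepresentativePointsSmokeTest {
      public static void main(String[] args) {
        // Seed one representative point for cluster 0 (hypothetical data).
        Map<Integer, List<VectorWritable>> reps = new HashMap<>();
        reps.put(0, Collections.singletonList(
            new VectorWritable(new DenseVector(new double[] {0.0, 0.0}))));

        // configure(...) replaces what setup(Context) would read from the job configuration.
        RepresentativePointsMapper mapper = new RepresentativePointsMapper();
        mapper.configure(reps, new EuclideanDistanceMeasure());

        RepresentativePointsReducer reducer = new RepresentativePointsReducer();
        reducer.configure(reps);
      }
    }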

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java
new file mode 100644
index 0000000..392909e
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java
@@ -0,0 +1,229 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.lda;
+
+import com.google.common.io.Closeables;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.PriorityQueue;
+import java.util.Queue;
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.OptionException;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.commons.io.Charsets;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.DoubleWritable;
+import org.apache.mahout.common.CommandLineUtil;
+import org.apache.mahout.common.IntPairWritable;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
+import org.apache.mahout.utils.vectors.VectorHelper;
+
+/**
+ * Class to print out the top K words for each topic.
+ */
+public final class LDAPrintTopics {
+
+ private LDAPrintTopics() { }
+
+ // Expands the queue list to have a Queue for topic K
+ private static void ensureQueueSize(Collection<Queue<Pair<String,Double>>> queues, int k) {
+ for (int i = queues.size(); i <= k; ++i) {
+ queues.add(new PriorityQueue<Pair<String,Double>>());
+ }
+ }
+
+ public static void main(String[] args) throws Exception {
+ DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
+ ArgumentBuilder abuilder = new ArgumentBuilder();
+ GroupBuilder gbuilder = new GroupBuilder();
+
+ Option inputOpt = DefaultOptionCreator.inputOption().create();
+
+ Option dictOpt = obuilder.withLongName("dict").withRequired(true).withArgument(
+ abuilder.withName("dict").withMinimum(1).withMaximum(1).create()).withDescription(
+ "Dictionary to read in, in the same format as one created by "
+ + "org.apache.mahout.utils.vectors.lucene.Driver").withShortName("d").create();
+
+ Option outOpt = DefaultOptionCreator.outputOption().create();
+
+ Option wordOpt = obuilder.withLongName("words").withRequired(false).withArgument(
+ abuilder.withName("words").withMinimum(0).withMaximum(1).withDefault("20").create()).withDescription(
+ "Number of words to print").withShortName("w").create();
+ Option dictTypeOpt = obuilder.withLongName("dictionaryType").withRequired(false).withArgument(
+ abuilder.withName("dictionaryType").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The dictionary file type (text|sequencefile)").withShortName("dt").create();
+ Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
+ .create();
+
+ Group group = gbuilder.withName("Options").withOption(dictOpt).withOption(outOpt).withOption(wordOpt)
+ .withOption(inputOpt).withOption(dictTypeOpt).create();
+ try {
+ Parser parser = new Parser();
+ parser.setGroup(group);
+ CommandLine cmdLine = parser.parse(args);
+
+ if (cmdLine.hasOption(helpOpt)) {
+ CommandLineUtil.printHelp(group);
+ return;
+ }
+
+ String input = cmdLine.getValue(inputOpt).toString();
+ String dictFile = cmdLine.getValue(dictOpt).toString();
+ int numWords = 20;
+ if (cmdLine.hasOption(wordOpt)) {
+ numWords = Integer.parseInt(cmdLine.getValue(wordOpt).toString());
+ }
+ Configuration config = new Configuration();
+
+ String dictionaryType = "text";
+ if (cmdLine.hasOption(dictTypeOpt)) {
+ dictionaryType = cmdLine.getValue(dictTypeOpt).toString();
+ }
+
+ List<String> wordList;
+ if ("text".equals(dictionaryType)) {
+ wordList = Arrays.asList(VectorHelper.loadTermDictionary(new File(dictFile)));
+ } else if ("sequencefile".equals(dictionaryType)) {
+ wordList = Arrays.asList(VectorHelper.loadTermDictionary(config, dictFile));
+ } else {
+ throw new IllegalArgumentException("Invalid dictionary format");
+ }
+
+ List<Queue<Pair<String,Double>>> topWords = topWordsForTopics(input, config, wordList, numWords);
+
+ File output = null;
+ if (cmdLine.hasOption(outOpt)) {
+ output = new File(cmdLine.getValue(outOpt).toString());
+ if (!output.exists() && !output.mkdirs()) {
+ throw new IOException("Could not create directory: " + output);
+ }
+ }
+ printTopWords(topWords, output);
+ } catch (OptionException e) {
+ CommandLineUtil.printHelp(group);
+ throw e;
+ }
+ }
+
+ // Adds the word if the queue is below capacity, or the score is high enough
+ private static void maybeEnqueue(Queue<Pair<String,Double>> q, String word, double score, int numWordsToPrint) {
+ if (q.size() >= numWordsToPrint && score > q.peek().getSecond()) {
+ q.poll();
+ }
+ if (q.size() < numWordsToPrint) {
+ q.add(new Pair<>(word, score));
+ }
+ }
+
+ private static void printTopWords(List<Queue<Pair<String,Double>>> topWords, File outputDir)
+ throws IOException {
+ for (int i = 0; i < topWords.size(); ++i) {
+ Collection<Pair<String,Double>> topK = topWords.get(i);
+ Writer out = null;
+ boolean printingToSystemOut = false;
+ try {
+ if (outputDir != null) {
+ out = new OutputStreamWriter(new FileOutputStream(new File(outputDir, "topic_" + i)), Charsets.UTF_8);
+ } else {
+ out = new OutputStreamWriter(System.out, Charsets.UTF_8);
+ printingToSystemOut = true;
+ out.write("Topic " + i);
+ out.write('\n');
+ out.write("===========");
+ out.write('\n');
+ }
+ List<Pair<String,Double>> topKasList = new ArrayList<>(topK.size());
+ for (Pair<String,Double> wordWithScore : topK) {
+ topKasList.add(wordWithScore);
+ }
+ Collections.sort(topKasList, new Comparator<Pair<String,Double>>() {
+ @Override
+ public int compare(Pair<String,Double> pair1, Pair<String,Double> pair2) {
+ return pair2.getSecond().compareTo(pair1.getSecond());
+ }
+ });
+ for (Pair<String,Double> wordWithScore : topKasList) {
+ out.write(wordWithScore.getFirst() + " [p(" + wordWithScore.getFirst() + "|topic_" + i + ") = "
+ + wordWithScore.getSecond() + ']');
+ out.write('\n');
+ }
+ } finally {
+ if (!printingToSystemOut) {
+ Closeables.close(out, false);
+ } else {
+ out.flush();
+ }
+ }
+ }
+ }
+
+ private static List<Queue<Pair<String,Double>>> topWordsForTopics(String dir,
+ Configuration job,
+ List<String> wordList,
+ int numWordsToPrint) {
+ List<Queue<Pair<String,Double>>> queues = new ArrayList<>();
+ Map<Integer,Double> expSums = new HashMap<>();
+ for (Pair<IntPairWritable,DoubleWritable> record
+ : new SequenceFileDirIterable<IntPairWritable, DoubleWritable>(
+ new Path(dir, "part-*"), PathType.GLOB, null, null, true, job)) {
+ IntPairWritable key = record.getFirst();
+ int topic = key.getFirst();
+ int word = key.getSecond();
+ ensureQueueSize(queues, topic);
+ if (word >= 0 && topic >= 0) {
+ double score = record.getSecond().get();
+ if (expSums.get(topic) == null) {
+ expSums.put(topic, 0.0);
+ }
+ expSums.put(topic, expSums.get(topic) + Math.exp(score));
+ String realWord = wordList.get(word);
+ maybeEnqueue(queues.get(topic), realWord, score, numWordsToPrint);
+ }
+ }
+ for (int i = 0; i < queues.size(); i++) {
+ Queue<Pair<String,Double>> queue = queues.get(i);
+ Queue<Pair<String,Double>> newQueue = new PriorityQueue<>(Math.max(1, queue.size())); // PriorityQueue rejects an initial capacity of 0
+ double norm = expSums.get(i);
+ for (Pair<String,Double> pair : queue) {
+ newQueue.add(new Pair<>(pair.getFirst(), Math.exp(pair.getSecond()) / norm));
+ }
+ queues.set(i, newQueue);
+ }
+ return queues;
+ }
+}
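
The maybeEnqueue helper above implements a bounded min-heap: once the queue holds numWordsToPrint entries, the lowest-scoring head is evicted whenever a better word arrives, leaving the top-K scorers. A standalone sketch of the same pattern (illustrative names, with an explicit comparator on the score):

    import java.util.AbstractMap.SimpleEntry;
    import java.util.Comparator;
    import java.util.Map.Entry;
    import java.util.PriorityQueue;

    final class TopK {
      private final int k;
      private final PriorityQueue<Entry<String, Double>> heap =
          new PriorityQueue<>(Comparator.comparingDouble(Entry::getValue));

      TopK(int k) { this.k = k; }

      void offer(String word, double score) {
        // Evict the current minimum only when the heap is full and the candidate beats it.
        if (heap.size() >= k && score > heap.peek().getValue()) {
          heap.poll();
        }
        // Add only if capacity remains (either there was room, or we just made some).
        if (heap.size() < k) {
          heap.add(new SimpleEntry<>(word, score));
        }
      }
    }

Keeping the heap ordered by score makes the eviction test on peek() an O(1) check of the current minimum.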

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java
new file mode 100644
index 0000000..12ed471
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java
@@ -0,0 +1,164 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.text;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.core.StopFilter;
+import org.apache.lucene.analysis.en.PorterStemFilter;
+import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Custom Lucene Analyzer designed for aggressive feature reduction
+ * for clustering the ASF Mail Archives using an extended set of
+ * stop words, excluding non-alpha-numeric tokens, and porter stemming.
+ */
+public final class MailArchivesClusteringAnalyzer extends StopwordAnalyzerBase {
+ // extended set of stop words composed of common mail terms like "hi",
+ // HTML tags, and Java keywords, as many of the messages in the archives
+ // are subversion check-in notifications
+
+ private static final CharArraySet STOP_SET = new CharArraySet(Arrays.asList(
+ "3d","7bit","a0","about","above","abstract","across","additional","after",
+ "afterwards","again","against","align","all","almost","alone","along",
+ "already","also","although","always","am","among","amongst","amoungst",
+ "amount","an","and","another","any","anybody","anyhow","anyone","anything",
+ "anyway","anywhere","are","arial","around","as","ascii","assert","at",
+ "back","background","base64","bcc","be","became","because","become","becomes",
+ "becoming","been","before","beforehand","behind","being","below","beside",
+ "besides","between","beyond","bgcolor","blank","blockquote","body","boolean",
+ "border","both","br","break","but","by","can","cannot","cant","case","catch",
+ "cc","cellpadding","cellspacing","center","char","charset","cheers","class",
+ "co","color","colspan","com","con","const","continue","could","couldnt",
+ "cry","css","de","dear","default","did","didnt","different","div","do",
+ "does","doesnt","done","dont","double","down","due","during","each","eg",
+ "eight","either","else","elsewhere","empty","encoding","enough","enum",
+ "etc","eu","even","ever","every","everyone","everything","everywhere",
+ "except","extends","face","family","few","ffffff","final","finally","float",
+ "font","for","former","formerly","fri","from","further","get","give","go",
+ "good","got","goto","gt","h1","ha","had","has","hasnt","have","he","head",
+ "height","hello","helvetica","hence","her","here","hereafter","hereby",
+ "herein","hereupon","hers","herself","hi","him","himself","his","how",
+ "however","hr","href","html","http","https","id","ie","if","ill","im",
+ "image","img","implements","import","in","inc","instanceof","int","interface",
+ "into","is","isnt","iso-8859-1","it","its","itself","ive","just","keep",
+ "last","latter","latterly","least","left","less","li","like","long","look",
+ "lt","ltd","mail","mailto","many","margin","may","me","meanwhile","message",
+ "meta","might","mill","mine","mon","more","moreover","most","mostly","mshtml",
+ "mso","much","must","my","myself","name","namely","native","nbsp","need",
+ "neither","never","nevertheless","new","next","nine","no","nobody","none",
+ "noone","nor","not","nothing","now","nowhere","null","of","off","often",
+ "ok","on","once","only","onto","or","org","other","others","otherwise",
+ "our","ours","ourselves","out","over","own","package","pad","per","perhaps",
+ "plain","please","pm","printable","private","protected","public","put",
+ "quot","quote","r1","r2","rather","re","really","regards","reply","return",
+ "right","said","same","sans","sat","say","saying","see","seem","seemed",
+ "seeming","seems","serif","serious","several","she","short","should","show",
+ "side","since","sincere","six","sixty","size","so","solid","some","somehow",
+ "someone","something","sometime","sometimes","somewhere","span","src",
+ "static","still","strictfp","string","strong","style","stylesheet","subject",
+ "such","sun","super","sure","switch","synchronized","table","take","target",
+ "td","text","th","than","thanks","that","the","their","them","themselves",
+ "then","thence","there","thereafter","thereby","therefore","therein","thereupon",
+ "these","they","thick","thin","think","third","this","those","though",
+ "three","through","throughout","throw","throws","thru","thu","thus","tm",
+ "to","together","too","top","toward","towards","tr","transfer","transient",
+ "try","tue","type","ul","un","under","unsubscribe","until","up","upon",
+ "us","use","used","uses","using","valign","verdana","very","via","void",
+ "volatile","want","was","we","wed","weight","well","were","what","whatever",
+ "when","whence","whenever","where","whereafter","whereas","whereby","wherein",
+ "whereupon","wherever","whether","which","while","whither","who","whoever",
+ "whole","whom","whose","why","width","will","with","within","without",
+ "wont","would","wrote","www","yes","yet","you","your","yours","yourself",
+ "yourselves"
+ ), false);
+
+ // Regex used to exclude non-alpha-numeric tokens. The shared Matcher is
+ // reset per token, which assumes single-threaded analysis.
+ private static final Pattern ALPHA_NUMERIC = Pattern.compile("^[a-z][a-z0-9_]+$");
+ private static final Matcher MATCHER = ALPHA_NUMERIC.matcher("");
+
+ public MailArchivesClusteringAnalyzer() {
+ super(STOP_SET);
+ }
+
+ public MailArchivesClusteringAnalyzer(CharArraySet stopSet) {
+ super(stopSet);
+ }
+
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer tokenizer = new StandardTokenizer();
+ TokenStream result = new StandardFilter(tokenizer);
+ result = new LowerCaseFilter(result);
+ result = new ASCIIFoldingFilter(result);
+ result = new AlphaNumericMaxLengthFilter(result);
+ result = new StopFilter(result, STOP_SET);
+ result = new PorterStemFilter(result);
+ return new TokenStreamComponents(tokenizer, result);
+ }
+
+ /**
+ * Matches alpha-numeric tokens between 2 and 28 chars long.
+ */
+ static class AlphaNumericMaxLengthFilter extends TokenFilter {
+ private final CharTermAttribute termAtt;
+ private final char[] output = new char[28];
+
+ AlphaNumericMaxLengthFilter(TokenStream in) {
+ super(in);
+ termAtt = addAttribute(CharTermAttribute.class);
+ }
+
+ @Override
+ public final boolean incrementToken() throws IOException {
+ // return the first alpha-numeric token between 2 and 28 chars long
+ while (input.incrementToken()) {
+ int length = termAtt.length();
+ if (length >= 2 && length <= 28) {
+ char[] buf = termAtt.buffer();
+ int at = 0;
+ for (int c = 0; c < length; c++) {
+ char ch = buf[c];
+ if (ch != '\'') {
+ output[at++] = ch;
+ }
+ }
+ String term = new String(output, 0, at);
+ MATCHER.reset(term);
+ if (MATCHER.matches() && !term.startsWith("a0")) {
+ termAtt.setEmpty();
+ termAtt.append(term);
+ return true;
+ }
+ }
+ }
+ return false;
+ }
+ }
+}
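
As with any Lucene Analyzer, the filter chain built in createComponents() can be inspected directly via the standard TokenStream consumption idiom. A minimal sketch (hypothetical field name and sample text) printing the terms that survive folding, stop-word removal and stemming:

    import java.io.IOException;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    public final class AnalyzerDemo {
      public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new MailArchivesClusteringAnalyzer();
             TokenStream ts = analyzer.tokenStream("body", "Fixed the failing builds again")) {
          CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
          ts.reset();                      // mandatory before the first incrementToken()
          while (ts.incrementToken()) {
            System.out.println(term.toString());
          }
          ts.end();                        // mandatory after the last token
        }
      }
    }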

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/MultipleTextFileInputFormat.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/MultipleTextFileInputFormat.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/MultipleTextFileInputFormat.java
new file mode 100644
index 0000000..44df006
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/MultipleTextFileInputFormat.java
@@ -0,0 +1,46 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.text;
+
+import java.io.IOException;
+
+import org.apache.hadoop.io.BytesWritable;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.RecordReader;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader;
+import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
+
+/**
+ * Combines a large number of text files into a single input format, reading
+ * each file whole as one record via the WholeFileRecordReader class.
+ */
+public class MultipleTextFileInputFormat extends CombineFileInputFormat<IntWritable, BytesWritable> {
+
+ @Override
+ public RecordReader<IntWritable, BytesWritable> createRecordReader(InputSplit inputSplit,
+ TaskAttemptContext taskAttemptContext)
+ throws IOException {
+ return new CombineFileRecordReader<>((CombineFileSplit) inputSplit,
+ taskAttemptContext, WholeFileRecordReader.class);
+ }
+}
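
A minimal sketch of how a driver wires this input format (hypothetical job name; input/output paths omitted), mirroring the runMapReduce() setup in SequenceFilesFromDirectory further down:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

    public final class CombineJobSetup {
      public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "combine-small-files");
        job.setInputFormatClass(MultipleTextFileInputFormat.class);
        // Without a max split size, CombineFileInputFormat may pack everything
        // into a single split; the cap bounds how many bytes one mapper sees.
        FileInputFormat.setMaxInputSplitSize(job, 64L * 1024 * 1024);
      }
    }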

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/PrefixAdditionFilter.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/PrefixAdditionFilter.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/PrefixAdditionFilter.java
new file mode 100644
index 0000000..37ebc44
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/PrefixAdditionFilter.java
@@ -0,0 +1,67 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.text;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.mahout.common.iterator.FileLineIterable;
+import org.apache.mahout.utils.io.ChunkedWriter;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+import java.util.Map;
+
+/**
+ * Default filter for parsing text files into sequence file entries, keying each
+ * document by the configured prefix plus its path relative to the input directory.
+ */
+public final class PrefixAdditionFilter extends SequenceFilesFromDirectoryFilter {
+
+ public PrefixAdditionFilter(Configuration conf,
+ String keyPrefix,
+ Map<String, String> options,
+ ChunkedWriter writer,
+ Charset charset,
+ FileSystem fs) {
+ super(conf, keyPrefix, options, writer, charset, fs);
+ }
+
+ @Override
+ protected void process(FileStatus fst, Path current) throws IOException {
+ FileSystem fs = getFs();
+ ChunkedWriter writer = getWriter();
+ if (fst.isDir()) {
+ String dirPath = getPrefix() + Path.SEPARATOR + current.getName() + Path.SEPARATOR + fst.getPath().getName();
+ fs.listStatus(fst.getPath(),
+ new PrefixAdditionFilter(getConf(), dirPath, getOptions(), writer, getCharset(), fs));
+ } else {
+ try (InputStream in = fs.open(fst.getPath())){
+ StringBuilder file = new StringBuilder();
+ for (String aFit : new FileLineIterable(in, getCharset(), false)) {
+ file.append(aFit).append('\n');
+ }
+ String name = current.getName().equals(fst.getPath().getName())
+ ? current.getName()
+ : current.getName() + Path.SEPARATOR + fst.getPath().getName();
+ writer.write(getPrefix() + Path.SEPARATOR + name, file.toString());
+ }
+ }
+ }
+}
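
Note the visitor trick inherited from SequenceFilesFromDirectoryFilter: accept() always returns false, so fs.listStatus(path, filter) selects nothing, and conversion happens entirely as a side effect of process(). A minimal sketch of kicking off the walk (hypothetical input root and prefix; conf, options and writer assumed to be set up as in runSequential() below):

    import java.io.IOException;
    import java.nio.charset.Charset;
    import java.util.Map;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.mahout.utils.io.ChunkedWriter;

    public final class WalkSketch {
      static void convert(Configuration conf, Map<String, String> options, ChunkedWriter writer)
          throws IOException {
        FileSystem fs = FileSystem.get(conf);
        // The returned FileStatus[] is empty; the filter does the work as it visits each child.
        fs.listStatus(new Path("/input/docs"),
            new PrefixAdditionFilter(conf, "archive", options, writer, Charset.forName("UTF-8"), fs));
      }
    }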

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java
new file mode 100644
index 0000000..311ab8d
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java
@@ -0,0 +1,214 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.text;
+
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.commons.lang3.StringUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathFilter;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.ClassUtils;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.utils.io.ChunkedWriter;
+
+/**
+ * Converts a directory of text documents into SequenceFiles of specified chunkSize. This class takes in a
+ * parent directory containing sub folders of text documents and recursively reads the files and creates the
+ * {@link org.apache.hadoop.io.SequenceFile}s of docid => content. The docid is set as the relative path of the
+ * document from the parent directory prepended with a specified prefix. You can also specify the input encoding
+ * of the text files. The content of the output SequenceFiles are encoded as UTF-8 text.
+ */
+public class SequenceFilesFromDirectory extends AbstractJob {
+
+ private static final String PREFIX_ADDITION_FILTER = PrefixAdditionFilter.class.getName();
+
+ private static final String[] CHUNK_SIZE_OPTION = {"chunkSize", "chunk"};
+ public static final String[] FILE_FILTER_CLASS_OPTION = {"fileFilterClass", "filter"};
+ private static final String[] CHARSET_OPTION = {"charset", "c"};
+
+ private static final int MAX_JOB_SPLIT_LOCATIONS = 1000000;
+
+ public static final String[] KEY_PREFIX_OPTION = {"keyPrefix", "prefix"};
+ public static final String BASE_INPUT_PATH = "baseinputpath";
+
+ public static void main(String[] args) throws Exception {
+ ToolRunner.run(new SequenceFilesFromDirectory(), args);
+ }
+
+ /*
+ * Callback invoked by ToolRunner once the generic MapReduce parameters have been processed.
+ */
+ @Override
+ public int run(String[] args) throws Exception {
+ addOptions();
+ addOption(DefaultOptionCreator.methodOption().create());
+ addOption(DefaultOptionCreator.overwriteOption().create());
+
+ if (parseArguments(args) == null) {
+ return -1;
+ }
+
+ Map<String, String> options = parseOptions();
+ Path output = getOutputPath();
+ if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
+ HadoopUtil.delete(getConf(), output);
+ }
+
+ if (getOption(DefaultOptionCreator.METHOD_OPTION,
+ DefaultOptionCreator.MAPREDUCE_METHOD).equals(DefaultOptionCreator.SEQUENTIAL_METHOD)) {
+ runSequential(getConf(), getInputPath(), output, options);
+ } else {
+ runMapReduce(getInputPath(), output);
+ }
+
+ return 0;
+ }
+
+ private int runSequential(Configuration conf, Path input, Path output, Map<String, String> options)
+ throws IOException, InterruptedException, NoSuchMethodException {
+ // Running sequentially
+ Charset charset = Charset.forName(getOption(CHARSET_OPTION[0]));
+ String keyPrefix = getOption(KEY_PREFIX_OPTION[0]);
+ FileSystem fs = FileSystem.get(input.toUri(), conf);
+
+ try (ChunkedWriter writer = new ChunkedWriter(conf, Integer.parseInt(options.get(CHUNK_SIZE_OPTION[0])), output)) {
+ SequenceFilesFromDirectoryFilter pathFilter;
+ String fileFilterClassName = options.get(FILE_FILTER_CLASS_OPTION[0]);
+ if (PrefixAdditionFilter.class.getName().equals(fileFilterClassName)) {
+ pathFilter = new PrefixAdditionFilter(conf, keyPrefix, options, writer, charset, fs);
+ } else {
+ pathFilter = ClassUtils.instantiateAs(fileFilterClassName, SequenceFilesFromDirectoryFilter.class,
+ new Class[] {Configuration.class, String.class, Map.class, ChunkedWriter.class, Charset.class, FileSystem.class},
+ new Object[] {conf, keyPrefix, options, writer, charset, fs});
+ }
+ fs.listStatus(input, pathFilter);
+ }
+ return 0;
+ }
+
+ private int runMapReduce(Path input, Path output) throws IOException, ClassNotFoundException, InterruptedException {
+
+ int chunkSizeInMB = 64;
+ if (hasOption(CHUNK_SIZE_OPTION[0])) {
+ chunkSizeInMB = Integer.parseInt(getOption(CHUNK_SIZE_OPTION[0]));
+ }
+
+ String keyPrefix = null;
+ if (hasOption(KEY_PREFIX_OPTION[0])) {
+ keyPrefix = getOption(KEY_PREFIX_OPTION[0]);
+ }
+
+ String fileFilterClassName = null;
+ if (hasOption(FILE_FILTER_CLASS_OPTION[0])) {
+ fileFilterClassName = getOption(FILE_FILTER_CLASS_OPTION[0]);
+ }
+
+ PathFilter pathFilter = null;
+ // Prefix addition is presently handled in the mapper and, unlike in runSequential(),
+ // need not be done via a PathFilter
+ if (!StringUtils.isBlank(fileFilterClassName) && !PrefixAdditionFilter.class.getName().equals(fileFilterClassName)) {
+ try {
+ pathFilter = (PathFilter) Class.forName(fileFilterClassName).newInstance();
+ } catch (InstantiationException | IllegalAccessException e) {
+ throw new IllegalStateException(e);
+ }
+ }
+
+ // Prepare Job for submission.
+ Job job = prepareJob(input, output, MultipleTextFileInputFormat.class,
+ SequenceFilesFromDirectoryMapper.class, Text.class, Text.class,
+ SequenceFileOutputFormat.class, "SequenceFilesFromDirectory");
+
+ Configuration jobConfig = job.getConfiguration();
+ jobConfig.set(KEY_PREFIX_OPTION[0], keyPrefix);
+ jobConfig.set(FILE_FILTER_CLASS_OPTION[0], fileFilterClassName);
+
+ FileSystem fs = FileSystem.get(jobConfig);
+ FileStatus fsFileStatus = fs.getFileStatus(input);
+
+ String inputDirList;
+ if (pathFilter != null) {
+ inputDirList = HadoopUtil.buildDirList(fs, fsFileStatus, pathFilter);
+ } else {
+ inputDirList = HadoopUtil.buildDirList(fs, fsFileStatus);
+ }
+
+ jobConfig.set(BASE_INPUT_PATH, input.toString());
+
+ long chunkSizeInBytes = chunkSizeInMB * 1024L * 1024L; // long arithmetic avoids int overflow
+
+ // set the max split locations, otherwise we get nasty debug stuff
+ jobConfig.set("mapreduce.job.max.split.locations", String.valueOf(MAX_JOB_SPLIT_LOCATIONS));
+
+ FileInputFormat.setInputPaths(job, inputDirList);
+ // need to set this to a multiple of the block size, or no split happens
+ FileInputFormat.setMaxInputSplitSize(job, chunkSizeInBytes);
+ FileOutputFormat.setCompressOutput(job, true);
+
+ boolean succeeded = job.waitForCompletion(true);
+ if (!succeeded) {
+ return -1;
+ }
+ return 0;
+ }
+
+ /**
+ * Override this method in order to add additional options to the command line of the SequenceFileFromDirectory job.
+ * Do not forget to call super() otherwise all standard options (input/output dirs etc) will not be available.
+ */
+ protected void addOptions() {
+ addInputOption();
+ addOutputOption();
+ addOption(DefaultOptionCreator.overwriteOption().create());
+ addOption(DefaultOptionCreator.methodOption().create());
+ addOption(CHUNK_SIZE_OPTION[0], CHUNK_SIZE_OPTION[1], "The chunkSize in MegaBytes. Defaults to 64", "64");
+ addOption(FILE_FILTER_CLASS_OPTION[0], FILE_FILTER_CLASS_OPTION[1],
+ "The name of the class to use for file parsing. Default: " + PREFIX_ADDITION_FILTER, PREFIX_ADDITION_FILTER);
+ addOption(KEY_PREFIX_OPTION[0], KEY_PREFIX_OPTION[1], "The prefix to be prepended to the key", "");
+ addOption(CHARSET_OPTION[0], CHARSET_OPTION[1],
+ "The name of the character encoding of the input files. Default to UTF-8", "UTF-8");
+ }
+
+ /**
+ * Override this method in order to parse your additional options from the command line. Do not forget to call
+ * super() otherwise standard options (input/output dirs etc) will not be available.
+ *
+ * @return Map of options
+ */
+ protected Map<String, String> parseOptions() {
+ Map<String, String> options = new HashMap<>();
+ options.put(CHUNK_SIZE_OPTION[0], getOption(CHUNK_SIZE_OPTION[0]));
+ options.put(FILE_FILTER_CLASS_OPTION[0], getOption(FILE_FILTER_CLASS_OPTION[0]));
+ options.put(CHARSET_OPTION[0], getOption(CHARSET_OPTION[0]));
+ return options;
+ }
+}
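
A minimal driver sketch (hypothetical paths), assuming the standard short flags created by DefaultOptionCreator and the options registered in addOptions() above:

    import org.apache.hadoop.util.ToolRunner;

    public final class ConvertDocs {
      public static void main(String[] args) throws Exception {
        ToolRunner.run(new SequenceFilesFromDirectory(), new String[] {
            "-i", "/tmp/docs",       // input directory (hypothetical)
            "-o", "/tmp/seqfiles",   // output directory (hypothetical)
            "-ow",                   // overwrite existing output
            "-xm", "sequential",     // method: sequential or mapreduce
            "-chunk", "64"           // chunk size in MB
        });
      }
    }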

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectoryFilter.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectoryFilter.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectoryFilter.java
new file mode 100644
index 0000000..6e4bd64
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectoryFilter.java
@@ -0,0 +1,99 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.text;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathFilter;
+import org.apache.mahout.utils.io.ChunkedWriter;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.util.Map;
+
+/**
+ * Extend this class if you wish to add your own parsing logic to SequenceFilesFromDirectory.
+ */
+public abstract class SequenceFilesFromDirectoryFilter implements PathFilter {
+ private static final Logger log = LoggerFactory.getLogger(SequenceFilesFromDirectoryFilter.class);
+
+ private final String prefix;
+ private final ChunkedWriter writer;
+ private final Charset charset;
+ private final FileSystem fs;
+ private final Map<String, String> options;
+ private final Configuration conf;
+
+ protected SequenceFilesFromDirectoryFilter(Configuration conf,
+ String keyPrefix,
+ Map<String, String> options,
+ ChunkedWriter writer,
+ Charset charset,
+ FileSystem fs) {
+ this.prefix = keyPrefix;
+ this.writer = writer;
+ this.charset = charset;
+ this.fs = fs;
+ this.options = options;
+ this.conf = conf;
+ }
+
+ protected final String getPrefix() {
+ return prefix;
+ }
+
+ protected final ChunkedWriter getWriter() {
+ return writer;
+ }
+
+ protected final Charset getCharset() {
+ return charset;
+ }
+
+ protected final FileSystem getFs() {
+ return fs;
+ }
+
+ protected final Map<String, String> getOptions() {
+ return options;
+ }
+
+ protected final Configuration getConf() {
+ return conf;
+ }
+
+ @Override
+ public final boolean accept(Path current) {
+ log.debug("CURRENT: {}", current.getName());
+ try {
+ for (FileStatus fst : fs.listStatus(current)) {
+ log.debug("CHILD: {}", fst.getPath().getName());
+ process(fst, current);
+ }
+ } catch (IOException ioe) {
+ throw new IllegalStateException(ioe);
+ }
+ return false;
+ }
+
+ protected abstract void process(FileStatus in, Path current) throws IOException;
+}
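
Since runSequential() in SequenceFilesFromDirectory instantiates custom filters reflectively through the six-argument constructor shown above, a subclass only needs that constructor plus process(). An illustrative (hypothetical) filter that uppercases file contents, with directory recursion omitted for brevity:

    import java.io.IOException;
    import java.io.InputStream;
    import java.nio.charset.Charset;
    import java.util.Map;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileStatus;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.mahout.common.iterator.FileLineIterable;
    import org.apache.mahout.utils.io.ChunkedWriter;

    public final class UpperCaseFilter extends SequenceFilesFromDirectoryFilter {
      public UpperCaseFilter(Configuration conf, String keyPrefix, Map<String, String> options,
                             ChunkedWriter writer, Charset charset, FileSystem fs) {
        super(conf, keyPrefix, options, writer, charset, fs);
      }

      @Override
      protected void process(FileStatus fst, Path current) throws IOException {
        if (fst.isDir()) {
          return; // directory recursion omitted for brevity; see PrefixAdditionFilter
        }
        try (InputStream in = getFs().open(fst.getPath())) {
          StringBuilder content = new StringBuilder();
          for (String line : new FileLineIterable(in, getCharset(), false)) {
            content.append(line.toUpperCase()).append('\n');
          }
          getWriter().write(getPrefix() + Path.SEPARATOR + fst.getPath().getName(),
              content.toString());
        }
      }
    }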

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectoryMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectoryMapper.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectoryMapper.java
new file mode 100644
index 0000000..40df3c2
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectoryMapper.java
@@ -0,0 +1,61 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.text;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.BytesWritable;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
+import org.apache.mahout.common.HadoopUtil;
+
+import static org.apache.mahout.text.SequenceFilesFromDirectory.KEY_PREFIX_OPTION;
+
+/**
+ * Map class for SequenceFilesFromDirectory MR job
+ */
+public class SequenceFilesFromDirectoryMapper extends Mapper<IntWritable, BytesWritable, Text, Text> {
+
+ private String keyPrefix;
+ private Text fileValue = new Text();
+
+ @Override
+ protected void setup(Context context) throws IOException, InterruptedException {
+ super.setup(context);
+ this.keyPrefix = context.getConfiguration().get(KEY_PREFIX_OPTION[0], "");
+ }
+
+ public void map(IntWritable key, BytesWritable value, Context context)
+ throws IOException, InterruptedException {
+
+ Configuration configuration = context.getConfiguration();
+ Path filePath = ((CombineFileSplit) context.getInputSplit()).getPath(key.get());
+ String relativeFilePath = HadoopUtil.calcRelativeFilePath(configuration, filePath);
+
+ String filename = this.keyPrefix.length() > 0 ?
+ this.keyPrefix + Path.SEPARATOR + relativeFilePath :
+ Path.SEPARATOR + relativeFilePath;
+
+ fileValue.set(value.getBytes(), 0, value.getLength()); // use getLength(): the backing array may be larger than the valid bytes
+ context.write(new Text(filename), fileValue);
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromMailArchives.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromMailArchives.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromMailArchives.java
new file mode 100644
index 0000000..c17cc12
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromMailArchives.java
@@ -0,0 +1,369 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.text;
+
+import org.apache.commons.io.DirectoryWalker;
+import org.apache.commons.io.comparator.CompositeFileComparator;
+import org.apache.commons.io.comparator.DirectoryFileComparator;
+import org.apache.commons.io.comparator.PathFileComparator;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.utils.email.MailOptions;
+import org.apache.mahout.utils.email.MailProcessor;
+import org.apache.mahout.utils.io.ChunkedWriter;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.util.ArrayDeque;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.Deque;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Pattern;
+
+/**
+ * Converts a directory of gzipped mail archives into SequenceFiles of specified
+ * chunkSize. This class is similar to {@link SequenceFilesFromDirectory} except
+ * it uses block-compressed {@link org.apache.hadoop.io.SequenceFile}s and parses out the subject and
+ * body text of each mail message into a separate key/value pair.
+ */
+public final class SequenceFilesFromMailArchives extends AbstractJob {
+
+ private static final Logger log = LoggerFactory.getLogger(SequenceFilesFromMailArchives.class);
+
+ public static final String[] CHUNK_SIZE_OPTION = {"chunkSize", "chunk"};
+ public static final String[] KEY_PREFIX_OPTION = {"keyPrefix", "prefix"};
+ public static final String[] CHARSET_OPTION = {"charset", "c"};
+ public static final String[] SUBJECT_OPTION = {"subject", "s"};
+ public static final String[] TO_OPTION = {"to", "to"};
+ public static final String[] FROM_OPTION = {"from", "from"};
+ public static final String[] REFERENCES_OPTION = {"references", "refs"};
+ public static final String[] BODY_OPTION = {"body", "b"};
+ public static final String[] STRIP_QUOTED_OPTION = {"stripQuoted", "q"};
+ public static final String[] QUOTED_REGEX_OPTION = {"quotedRegex", "regex"};
+ public static final String[] SEPARATOR_OPTION = {"separator", "sep"};
+ public static final String[] BODY_SEPARATOR_OPTION = {"bodySeparator", "bodySep"};
+ public static final String BASE_INPUT_PATH = "baseinputpath";
+
+ private static final int MAX_JOB_SPLIT_LOCATIONS = 1000000;
+
+ public void createSequenceFiles(MailOptions options) throws IOException {
+ try (ChunkedWriter writer =
+ new ChunkedWriter(getConf(), options.getChunkSize(), new Path(options.getOutputDir()))){
+ MailProcessor processor = new MailProcessor(options, options.getPrefix(), writer);
+ if (options.getInput().isDirectory()) {
+ PrefixAdditionDirectoryWalker walker = new PrefixAdditionDirectoryWalker(processor, writer);
+ walker.walk(options.getInput());
+ log.info("Parsed {} messages from {}", walker.getMessageCount(), options.getInput().getAbsolutePath());
+ } else {
+ long start = System.currentTimeMillis();
+ long cnt = processor.parseMboxLineByLine(options.getInput());
+ long finish = System.currentTimeMillis();
+ log.info("Parsed {} messages from {} in time: {}", cnt, options.getInput().getAbsolutePath(), finish - start);
+ }
+ }
+ }
+
+ private static class PrefixAdditionDirectoryWalker extends DirectoryWalker<Object> {
+
+ @SuppressWarnings("unchecked")
+ private static final Comparator<File> FILE_COMPARATOR = new CompositeFileComparator(
+ DirectoryFileComparator.DIRECTORY_REVERSE, PathFileComparator.PATH_COMPARATOR);
+
+ private final Deque<MailProcessor> processors = new ArrayDeque<>();
+ private final ChunkedWriter writer;
+ private final Deque<Long> messageCounts = new ArrayDeque<>();
+
+ public PrefixAdditionDirectoryWalker(MailProcessor processor, ChunkedWriter writer) {
+ processors.addFirst(processor);
+ this.writer = writer;
+ messageCounts.addFirst(0L);
+ }
+
+ public void walk(File startDirectory) throws IOException {
+ super.walk(startDirectory, null);
+ }
+
+ public long getMessageCount() {
+ return messageCounts.getFirst();
+ }
+
+ @Override
+ protected void handleDirectoryStart(File current, int depth, Collection<Object> results) throws IOException {
+ if (depth > 0) {
+ log.info("At {}", current.getAbsolutePath());
+ MailProcessor processor = processors.getFirst();
+ MailProcessor subDirProcessor = new MailProcessor(processor.getOptions(), processor.getPrefix()
+ + File.separator + current.getName(), writer);
+ processors.push(subDirProcessor);
+ messageCounts.push(0L);
+ }
+ }
+
+ @Override
+ protected File[] filterDirectoryContents(File directory, int depth, File[] files) throws IOException {
+ Arrays.sort(files, FILE_COMPARATOR);
+ return files;
+ }
+
+ @Override
+ protected void handleFile(File current, int depth, Collection<Object> results) throws IOException {
+ MailProcessor processor = processors.getFirst();
+ long currentDirMessageCount = messageCounts.pop();
+ try {
+ currentDirMessageCount += processor.parseMboxLineByLine(current);
+ } catch (IOException e) {
+ throw new IllegalStateException("Error processing " + current, e);
+ }
+ messageCounts.push(currentDirMessageCount);
+ }
+
+ @Override
+ protected void handleDirectoryEnd(File current, int depth, Collection<Object> results) throws IOException {
+ if (depth > 0) {
+ final long currentDirMessageCount = messageCounts.pop();
+ log.info("Parsed {} messages from directory {}", currentDirMessageCount, current.getAbsolutePath());
+
+ processors.pop();
+
+ // aggregate message counts
+ long parentDirMessageCount = messageCounts.pop();
+ parentDirMessageCount += currentDirMessageCount;
+ messageCounts.push(parentDirMessageCount);
+ }
+ }
+ }
+
+ public static void main(String[] args) throws Exception {
+ ToolRunner.run(new Configuration(), new SequenceFilesFromMailArchives(), args);
+ }
+
+ @Override
+ public int run(String[] args) throws Exception {
+ addInputOption();
+ addOutputOption();
+ addOption(DefaultOptionCreator.methodOption().create());
+
+ addOption(CHUNK_SIZE_OPTION[0], CHUNK_SIZE_OPTION[1], "The chunkSize in MegaBytes. Defaults to 64", "64");
+ addOption(KEY_PREFIX_OPTION[0], KEY_PREFIX_OPTION[1], "The prefix to be prepended to the key", "");
+ addOption(CHARSET_OPTION[0], CHARSET_OPTION[1],
+ "The name of the character encoding of the input files. Default to UTF-8", "UTF-8");
+ addFlag(SUBJECT_OPTION[0], SUBJECT_OPTION[1], "Include the Mail subject as part of the text. Default is false");
+ addFlag(TO_OPTION[0], TO_OPTION[1], "Include the to field in the text. Default is false");
+ addFlag(FROM_OPTION[0], FROM_OPTION[1], "Include the from field in the text. Default is false");
+ addFlag(REFERENCES_OPTION[0], REFERENCES_OPTION[1],
+ "Include the references field in the text. Default is false");
+ addFlag(BODY_OPTION[0], BODY_OPTION[1], "Include the body in the output. Default is false");
+ addFlag(STRIP_QUOTED_OPTION[0], STRIP_QUOTED_OPTION[1],
+ "Strip (remove) quoted email text in the body. Default is false");
+ addOption(QUOTED_REGEX_OPTION[0], QUOTED_REGEX_OPTION[1],
+ "Specify the regex that identifies quoted text. "
+ + "Default is to look for > or | at the beginning of the line.");
+ addOption(SEPARATOR_OPTION[0], SEPARATOR_OPTION[1],
+ "The separator to use between metadata items (to, from, etc.). Default is \\n", "\n");
+ addOption(BODY_SEPARATOR_OPTION[0], BODY_SEPARATOR_OPTION[1],
+ "The separator to use between lines in the body. Default is \\n. "
+ + "Useful to change if you wish to have the message be on one line", "\n");
+
+ addOption(DefaultOptionCreator.helpOption());
+ Map<String, List<String>> parsedArgs = parseArguments(args);
+ if (parsedArgs == null) {
+ return -1;
+ }
+ File input = getInputFile();
+ String outputDir = getOutputPath().toString();
+
+ int chunkSize = 64;
+ if (hasOption(CHUNK_SIZE_OPTION[0])) {
+ chunkSize = Integer.parseInt(getOption(CHUNK_SIZE_OPTION[0]));
+ }
+
+ String prefix = "";
+ if (hasOption(KEY_PREFIX_OPTION[0])) {
+ prefix = getOption(KEY_PREFIX_OPTION[0]);
+ }
+
+ Charset charset = Charset.forName(getOption(CHARSET_OPTION[0]));
+ MailOptions options = new MailOptions();
+ options.setInput(input);
+ options.setOutputDir(outputDir);
+ options.setPrefix(prefix);
+ options.setChunkSize(chunkSize);
+ options.setCharset(charset);
+
+ List<Pattern> patterns = new ArrayList<>(5);
+ // patternOrder is used downstream so that we know what order the text fields
+ // appear in, instead of encoding the order in the string itself, which
+ // would require extra processing to strip it before feature selection.
+ Map<String, Integer> patternOrder = new HashMap<>();
+ int order = 0;
+ if (hasOption(FROM_OPTION[0])) {
+ patterns.add(MailProcessor.FROM_PREFIX);
+ patternOrder.put(MailOptions.FROM, order++);
+ }
+ if (hasOption(TO_OPTION[0])) {
+ patterns.add(MailProcessor.TO_PREFIX);
+ patternOrder.put(MailOptions.TO, order++);
+ }
+ if (hasOption(REFERENCES_OPTION[0])) {
+ patterns.add(MailProcessor.REFS_PREFIX);
+ patternOrder.put(MailOptions.REFS, order++);
+ }
+ if (hasOption(SUBJECT_OPTION[0])) {
+ patterns.add(MailProcessor.SUBJECT_PREFIX);
+ patternOrder.put(MailOptions.SUBJECT, order++); // post-increment, consistent with the options above
+ }
+ options.setStripQuotedText(hasOption(STRIP_QUOTED_OPTION[0]));
+
+ options.setPatternsToMatch(patterns.toArray(new Pattern[patterns.size()]));
+ options.setPatternOrder(patternOrder);
+ options.setIncludeBody(hasOption(BODY_OPTION[0]));
+
+ if (hasOption(SEPARATOR_OPTION[0])) {
+ options.setSeparator(getOption(SEPARATOR_OPTION[0]));
+ } else {
+ options.setSeparator("\n");
+ }
+
+ if (hasOption(BODY_SEPARATOR_OPTION[0])) {
+ options.setBodySeparator(getOption(BODY_SEPARATOR_OPTION[0]));
+ }
+
+ if (hasOption(QUOTED_REGEX_OPTION[0])) {
+ options.setQuotedTextPattern(Pattern.compile(getOption(QUOTED_REGEX_OPTION[0])));
+ }
+
+ if (getOption(DefaultOptionCreator.METHOD_OPTION,
+ DefaultOptionCreator.MAPREDUCE_METHOD).equals(DefaultOptionCreator.SEQUENTIAL_METHOD)) {
+ runSequential(options);
+ } else {
+ runMapReduce(getInputPath(), getOutputPath());
+ }
+
+ return 0;
+ }
+
+ private int runSequential(MailOptions options)
+ throws IOException, InterruptedException, NoSuchMethodException {
+
+ long start = System.currentTimeMillis();
+ createSequenceFiles(options);
+ long finish = System.currentTimeMillis();
+ log.info("Conversion took {}ms", finish - start);
+
+ return 0;
+ }
+
+ private int runMapReduce(Path input, Path output) throws IOException, InterruptedException, ClassNotFoundException {
+
+ Job job = prepareJob(input, output, MultipleTextFileInputFormat.class, SequenceFilesFromMailArchivesMapper.class,
+ Text.class, Text.class, SequenceFileOutputFormat.class, "SequenceFilesFromMailArchives");
+
+ Configuration jobConfig = job.getConfiguration();
+
+ if (hasOption(KEY_PREFIX_OPTION[0])) {
+ jobConfig.set(KEY_PREFIX_OPTION[1], getOption(KEY_PREFIX_OPTION[0]));
+ }
+
+ int chunkSize = 0;
+ if (hasOption(CHUNK_SIZE_OPTION[0])) {
+ chunkSize = Integer.parseInt(getOption(CHUNK_SIZE_OPTION[0]));
+ jobConfig.set(CHUNK_SIZE_OPTION[0], String.valueOf(chunkSize));
+ }
+
+ Charset charset;
+ if (hasOption(CHARSET_OPTION[0])) {
+ charset = Charset.forName(getOption(CHARSET_OPTION[0]));
+ jobConfig.set(CHARSET_OPTION[0], charset.displayName());
+ }
+
+ if (hasOption(FROM_OPTION[0])) {
+ jobConfig.set(FROM_OPTION[1], "true");
+ }
+
+ if (hasOption(TO_OPTION[0])) {
+ jobConfig.set(TO_OPTION[1], "true");
+ }
+
+ if (hasOption(REFERENCES_OPTION[0])) {
+ jobConfig.set(REFERENCES_OPTION[1], "true");
+ }
+
+ if (hasOption(SUBJECT_OPTION[0])) {
+ jobConfig.set(SUBJECT_OPTION[1], "true");
+ }
+
+ if (hasOption(QUOTED_REGEX_OPTION[0])) {
+ jobConfig.set(QUOTED_REGEX_OPTION[1], Pattern.compile(getOption(QUOTED_REGEX_OPTION[0])).toString());
+ }
+
+ if (hasOption(SEPARATOR_OPTION[0])) {
+ jobConfig.set(SEPARATOR_OPTION[1], getOption(SEPARATOR_OPTION[0]));
+ } else {
+ jobConfig.set(SEPARATOR_OPTION[1], "\n");
+ }
+
+ if (hasOption(BODY_OPTION[0])) {
+ jobConfig.set(BODY_OPTION[1], "true");
+ } else {
+ jobConfig.set(BODY_OPTION[1], "false");
+ }
+
+ if (hasOption(BODY_SEPARATOR_OPTION[0])) {
+ jobConfig.set(BODY_SEPARATOR_OPTION[1], getOption(BODY_SEPARATOR_OPTION[0]));
+ } else {
+ jobConfig.set(BODY_SEPARATOR_OPTION[1], "\n");
+ }
+
+ FileSystem fs = FileSystem.get(jobConfig);
+ FileStatus fsFileStatus = fs.getFileStatus(inputPath);
+
+ jobConfig.set(BASE_INPUT_PATH, inputPath.toString());
+ String inputDirList = HadoopUtil.buildDirList(fs, fsFileStatus);
+ FileInputFormat.setInputPaths(job, inputDirList);
+
+ long chunkSizeInBytes = chunkSize * 1024L * 1024L; // long arithmetic avoids int overflow
+ // need to set this to a multiple of the block size, or no split happens
+ FileInputFormat.setMaxInputSplitSize(job, chunkSizeInBytes);
+
+ // set the max split locations, otherwise we get nasty debug stuff
+ jobConfig.set("mapreduce.job.max.split.locations", String.valueOf(MAX_JOB_SPLIT_LOCATIONS));
+
+ boolean succeeded = job.waitForCompletion(true);
+ if (!succeeded) {
+ return -1;
+ }
+ return 0;
+ }
+}
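
The run() method above is mostly CLI plumbing around createSequenceFiles(). A minimal sketch (hypothetical paths) of driving the sequential conversion programmatically with the same MailOptions setters:

    import java.io.File;
    import java.nio.charset.Charset;
    import java.util.Collections;
    import java.util.regex.Pattern;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.mahout.utils.email.MailOptions;
    import org.apache.mahout.utils.email.MailProcessor;

    public final class MailConversionDemo {
      public static void main(String[] args) throws Exception {
        MailOptions opts = new MailOptions();
        opts.setInput(new File("/tmp/asf-mail-archives"));   // hypothetical input
        opts.setOutputDir("/tmp/mail-seqfiles");             // hypothetical output
        opts.setPrefix("asf");
        opts.setChunkSize(64);
        opts.setCharset(Charset.forName("UTF-8"));
        opts.setSeparator("\n");
        opts.setPatternsToMatch(new Pattern[] {MailProcessor.SUBJECT_PREFIX});
        opts.setPatternOrder(Collections.singletonMap(MailOptions.SUBJECT, 0));
        opts.setIncludeBody(true);

        SequenceFilesFromMailArchives converter = new SequenceFilesFromMailArchives();
        converter.setConf(new Configuration()); // createSequenceFiles() reads getConf()
        converter.createSequenceFiles(opts);
      }
    }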
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/email/MailProcessor.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/email/MailProcessor.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/email/MailProcessor.java
new file mode 100644
index 0000000..7db836f
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/email/MailProcessor.java
@@ -0,0 +1,183 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.email;
+
+import org.apache.mahout.common.iterator.FileLineIterable;
+import org.apache.mahout.utils.io.ChunkedWriter;
+import org.apache.mahout.utils.io.ChunkedWrapper;
+import org.apache.mahout.utils.io.IOWriterWrapper;
+import org.apache.mahout.utils.io.WrappedWriter;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.Writer;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Converts an mbox mail archive into a group of Hadoop sequence files of roughly equal size. The archive may
+ * optionally be gzipped or zipped.
+ * @see org.apache.mahout.text.SequenceFilesFromMailArchives
+ */
+public class MailProcessor {
+
+ private static final Pattern MESSAGE_START = Pattern.compile("^From \\S+@\\S.*\\d{4}$", Pattern.CASE_INSENSITIVE);
+ private static final Pattern MESSAGE_ID_PREFIX = Pattern.compile("^message-id: <(.*)>$", Pattern.CASE_INSENSITIVE);
+ // regular expressions used to parse individual messages
+ public static final Pattern SUBJECT_PREFIX = Pattern.compile("^subject: (.*)$", Pattern.CASE_INSENSITIVE);
+ // the sender must contain at least one non-whitespace character
+ public static final Pattern FROM_PREFIX = Pattern.compile("^from: (\\S.*)$", Pattern.CASE_INSENSITIVE);
+ public static final Pattern REFS_PREFIX = Pattern.compile("^references: (.*)$", Pattern.CASE_INSENSITIVE);
+ public static final Pattern TO_PREFIX = Pattern.compile("^to: (.*)$", Pattern.CASE_INSENSITIVE);
+
+ private final String prefix;
+ private final MailOptions options;
+ private final WrappedWriter writer;
+
+ private static final Logger log = LoggerFactory.getLogger(MailProcessor.class);
+
+ /**
+ * Creates a {@code MailProcessor} that does not write to sequence files, but to a single text file.
+ * This constructor is for debugging and testing purposes.
+ */
+ public MailProcessor(MailOptions options, String prefix, Writer writer) {
+ this.writer = new IOWriterWrapper(writer);
+ this.options = options;
+ this.prefix = prefix;
+ }
+
+ /**
+ * This is the main constructor of {@code MailProcessor}.
+ */
+ public MailProcessor(MailOptions options, String prefix, ChunkedWriter writer) {
+ this.writer = new ChunkedWrapper(writer);
+ this.options = options;
+ this.prefix = prefix;
+ }
+
+ /**
+ * Parses one complete mail archive, writing output to the {@code writer} constructor parameter.
+ * @param mboxFile mail archive to parse
+ * @return number of parsed mails
+ * @throws IOException if the mbox file cannot be read or the output cannot be written
+ */
+ public long parseMboxLineByLine(File mboxFile) throws IOException {
+ long messageCount = 0;
+ try {
+ StringBuilder contents = new StringBuilder();
+ // temporary buffers used while parsing each message
+ StringBuilder body = new StringBuilder();
+ Matcher messageIdMatcher = MESSAGE_ID_PREFIX.matcher("");
+ Matcher messageBoundaryMatcher = MESSAGE_START.matcher("");
+ String[] patternResults = new String[options.getPatternsToMatch().length];
+ Matcher[] matchers = new Matcher[options.getPatternsToMatch().length];
+ for (int i = 0; i < matchers.length; i++) {
+ matchers[i] = options.getPatternsToMatch()[i].matcher("");
+ }
+
+ String messageId = null;
+ boolean inBody = false;
+ Pattern quotedTextPattern = options.getQuotedTextPattern();
+ for (String nextLine : new FileLineIterable(mboxFile, options.getCharset(), false)) {
+ if (options.isStripQuotedText() && quotedTextPattern.matcher(nextLine).find()) {
+ continue;
+ }
+ for (int i = 0; i < matchers.length; i++) {
+ Matcher matcher = matchers[i];
+ matcher.reset(nextLine);
+ if (matcher.matches()) {
+ patternResults[i] = matcher.group(1);
+ }
+ }
+
+ // only start appending body content after we've seen a message ID
+ if (messageId != null) {
+ // first, see if we hit the end of the message
+ messageBoundaryMatcher.reset(nextLine);
+ if (messageBoundaryMatcher.matches()) {
+ // done parsing this message ... write it out
+ String key = generateKey(mboxFile, prefix, messageId);
+ //if this ordering changes, then also change FromEmailToDictionaryMapper
+ writeContent(options.getSeparator(), contents, body, patternResults);
+ writer.write(key, contents.toString());
+ contents.setLength(0); // reset the buffer
+ body.setLength(0);
+
+ messageId = null;
+ inBody = false;
+ } else {
+ if (inBody && options.isIncludeBody()) {
+ if (!nextLine.isEmpty()) {
+ body.append(nextLine).append(options.getBodySeparator());
+ }
+ } else {
+ // first empty line we see after reading the message Id
+ // indicates that we are in the body ...
+ inBody = nextLine.isEmpty();
+ }
+ }
+ } else {
+ if (nextLine.length() > 14) {
+ messageIdMatcher.reset(nextLine);
+ if (messageIdMatcher.matches()) {
+ messageId = messageIdMatcher.group(1);
+ ++messageCount;
+ }
+ }
+ }
+ }
+ // write the last message in the file if available
+ if (messageId != null) {
+ String key = generateKey(mboxFile, prefix, messageId);
+ writeContent(options.getSeparator(), contents, body, patternResults);
+ writer.write(key, contents.toString());
+ contents.setLength(0); // reset the buffer
+ }
+ } catch (FileNotFoundException e) {
+ // Skip file.
+ log.warn("Unable to process non-existing file", e);
+ }
+ // TODO: report exceptions and continue
+ return messageCount;
+ }
+
+ protected static String generateKey(File mboxFile, String prefix, String messageId) {
+ return prefix + File.separator + mboxFile.getName() + File.separator + messageId;
+ }
+
+ public String getPrefix() {
+ return prefix;
+ }
+
+ public MailOptions getOptions() {
+ return options;
+ }
+
+ private static void writeContent(String separator, StringBuilder contents, CharSequence body, String[] matches) {
+ for (String match : matches) {
+ if (match != null) {
+ contents.append(match).append(separator);
+ } else {
+ contents.append(separator);
+ }
+ }
+ contents.append('\n').append(body);
+ }
+}
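
A hedged sketch of driving MailProcessor directly against a single mbox file, using the debugging constructor above. MailOptions is not part of this hunk, so the no-arg constructor and setter names below (setCharset, setSeparator, setPatternsToMatch) are assumptions about its API rather than confirmed signatures.

import java.io.File;
import java.io.StringWriter;
import java.nio.charset.StandardCharsets;
import java.util.regex.Pattern;
import org.apache.mahout.utils.email.MailOptions;
import org.apache.mahout.utils.email.MailProcessor;

public class MailProcessorDemo {
  public static void main(String[] args) throws Exception {
    MailOptions options = new MailOptions();     // assumed no-arg constructor
    options.setCharset(StandardCharsets.UTF_8);  // assumed setter
    options.setSeparator("\n");                  // assumed setter
    // capture the subject and from headers, in the order writeContent() emits them
    options.setPatternsToMatch(new Pattern[] {   // assumed setter
        MailProcessor.SUBJECT_PREFIX, MailProcessor.FROM_PREFIX});

    StringWriter out = new StringWriter();
    MailProcessor processor = new MailProcessor(options, "demo", out);
    long count = processor.parseMboxLineByLine(new File("/tmp/sample.mbox"));
    System.out.println(count + " messages parsed:\n" + out);
  }
}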

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/io/ChunkedWrapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/io/ChunkedWrapper.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/io/ChunkedWrapper.java
new file mode 100644
index 0000000..473e86a
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/io/ChunkedWrapper.java
@@ -0,0 +1,42 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.io;
+
+import java.io.IOException;
+
+/**
+ * {@link ChunkedWriter} based implementation of the {@link WrappedWriter} interface.
+ */
+public class ChunkedWrapper implements WrappedWriter {
+
+ private final ChunkedWriter writer;
+
+ public ChunkedWrapper(ChunkedWriter writer) {
+ this.writer = writer;
+ }
+
+ @Override
+ public void write(String key, String value) throws IOException {
+ writer.write(key, value);
+ }
+
+ @Override
+ public void close() throws IOException {
+ writer.close();
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/io/ChunkedWriter.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/io/ChunkedWriter.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/io/ChunkedWriter.java
new file mode 100644
index 0000000..66cf15f
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/io/ChunkedWriter.java
@@ -0,0 +1,86 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.utils.io;
+
+import com.google.common.io.Closeables;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+
+import java.io.Closeable;
+import java.io.IOException;
+
+/**
+ * Writes data split across multiple Hadoop sequence files of approximately equal size. The data must consist
+ * of key-value pairs, both of String type. All sequence files are created in the same
+ * directory and named "chunk-0", "chunk-1", etc.
+ */
+public final class ChunkedWriter implements Closeable {
+
+ private final int maxChunkSizeInBytes;
+ private final Path output;
+ private SequenceFile.Writer writer;
+ private int currentChunkID;
+ private int currentChunkSize;
+ private final FileSystem fs;
+ private final Configuration conf;
+
+ /**
+ * @param conf needed by Hadoop to know what filesystem implementation to use.
+ * @param chunkSizeInMB approximate size of each file, in Megabytes.
+ * @param output directory where the sequence files will be created.
+ * @throws IOException
+ */
+ public ChunkedWriter(Configuration conf, int chunkSizeInMB, Path output) throws IOException {
+ this.output = output;
+ this.conf = conf;
+ // cap the chunk size so that chunkSizeInMB * 1024 * 1024 stays below Integer.MAX_VALUE
+ if (chunkSizeInMB > 1984) {
+ chunkSizeInMB = 1984;
+ }
+ maxChunkSizeInBytes = chunkSizeInMB * 1024 * 1024;
+ fs = FileSystem.get(output.toUri(), conf);
+ currentChunkID = 0;
+ writer = new SequenceFile.Writer(fs, conf, getPath(currentChunkID), Text.class, Text.class);
+ }
+
+ private Path getPath(int chunkID) {
+ return new Path(output, "chunk-" + chunkID);
+ }
+
+ /** Writes a new key-value pair, creating a new sequence file if necessary.*/
+ public void write(String key, String value) throws IOException {
+ if (currentChunkSize > maxChunkSizeInBytes) {
+ Closeables.close(writer, false);
+ currentChunkID++;
+ writer = new SequenceFile.Writer(fs, conf, getPath(currentChunkID), Text.class, Text.class);
+ currentChunkSize = 0;
+ }
+
+ Text keyT = new Text(key);
+ Text valueT = new Text(value);
+ currentChunkSize += keyT.getBytes().length + valueT.getBytes().length; // approximate: ignores sequence-file record overhead
+ writer.append(keyT, valueT);
+ }
+
+ @Override
+ public void close() throws IOException {
+ Closeables.close(writer, false);
+ }
+}
+
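
For illustration, a minimal sketch of ChunkedWriter against a local filesystem, grounded in the constructor and write() above; the path and sizes are arbitrary.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.mahout.utils.io.ChunkedWriter;

public class ChunkedWriterDemo {
  public static void main(String[] args) throws Exception {
    // 64 MB chunks under /tmp/chunks; the writer rolls over to chunk-1, chunk-2, ... as needed
    try (ChunkedWriter writer = new ChunkedWriter(new Configuration(), 64, new Path("/tmp/chunks"))) {
      for (int i = 0; i < 1000; i++) {
        writer.write("key-" + i, "value-" + i);
      }
    }
  }
}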

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/io/IOWriterWrapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/io/IOWriterWrapper.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/io/IOWriterWrapper.java
new file mode 100644
index 0000000..b7c3d42
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/io/IOWriterWrapper.java
@@ -0,0 +1,45 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.io;
+
+import java.io.IOException;
+import java.io.Writer;
+/**
+ * Implementation of the {@link WrappedWriter} interface based on {@link java.io.Writer}.
+ */
+public class IOWriterWrapper implements WrappedWriter {
+
+ private final Writer writer;
+
+ public IOWriterWrapper(Writer writer) {
+ this.writer = writer;
+ }
+
+ /** Writes a new key and value, separated by a single space. The value must end with a
+ * newline or some other delimiter, as one is not automatically added by this method.
+ */
+ @Override
+ public void write(String key, String value) throws IOException {
+ writer.write(key + ' ' + value);
+ }
+
+ @Override
+ public void close() throws IOException {
+ writer.close();
+ }
+}
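
A small sketch of the Writer-backed wrapper; note that the caller supplies the trailing newline, per the Javadoc above.

import java.io.StringWriter;
import org.apache.mahout.utils.io.IOWriterWrapper;
import org.apache.mahout.utils.io.WrappedWriter;

public class IOWriterWrapperDemo {
  public static void main(String[] args) throws Exception {
    StringWriter sink = new StringWriter();
    try (WrappedWriter writer = new IOWriterWrapper(sink)) {
      writer.write("doc1", "first line\n");  // the value carries its own delimiter
      writer.write("doc2", "second line\n");
    }
    System.out.print(sink); // "doc1 first line" then "doc2 second line"
  }
}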

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/io/WrappedWriter.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/io/WrappedWriter.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/io/WrappedWriter.java
new file mode 100644
index 0000000..b9900e9
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/io/WrappedWriter.java
@@ -0,0 +1,31 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.io;
+
+import java.io.Closeable;
+import java.io.IOException;
+
+/**
+ * Convenience interface for wrapping either a java.io.Writer or a SequenceFile.Writer behind one basic write/close contract
+ */
+public interface WrappedWriter extends Closeable {
+
+ /** Writes a new key-value pair.*/
+ void write(String key, String value) throws IOException;
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilter.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilter.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilter.java
new file mode 100644
index 0000000..964c8cc
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilter.java
@@ -0,0 +1,78 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.mahout.utils.nlp.collocations.llr;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.CharsetEncoder;
+import java.nio.charset.CodingErrorAction;
+
+import org.apache.commons.io.Charsets;
+import org.apache.hadoop.util.bloom.Filter;
+import org.apache.hadoop.util.bloom.Key;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+/**
+ * Emits tokens based on bloom filter membership.
+ */
+public final class BloomTokenFilter extends TokenFilter {
+
+ private final Filter filter;
+ private final CharTermAttribute termAtt;
+ private final CharsetEncoder encoder;
+ private final Key key;
+ private final boolean keepMembers;
+
+ /**
+ * @param filter tokens will be checked for membership in this bloom filter
+ * @param in the token stream to read
+ * @param keepMembers whether to keep members of the bloom filter. If true, this works like
+ * a whitelist: members found in the filter are kept and all others are
+ * dropped. If false, it works like a stoplist: members found in the
+ * filter are dropped and all others are kept.
+ */
+ public BloomTokenFilter(Filter filter, boolean keepMembers, TokenStream in) {
+ super(in);
+ this.filter = filter;
+ this.keepMembers = keepMembers;
+ this.key = new Key();
+ this.termAtt = addAttribute(CharTermAttribute.class);
+ this.encoder = Charsets.UTF_8.newEncoder().
+ onMalformedInput(CodingErrorAction.REPORT).
+ onUnmappableCharacter(CodingErrorAction.REPORT);
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ while (input.incrementToken()) {
+ ByteBuffer bytes = encoder.encode(CharBuffer.wrap(termAtt.buffer(), 0, termAtt.length()));
+ key.set(java.util.Arrays.copyOf(bytes.array(), bytes.limit()), 1.0f); // use only the encoded prefix; the backing array may be longer
+ boolean member = filter.membershipTest(key);
+ if ((keepMembers && member) || (!keepMembers && !member)) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+}
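
A hedged sketch of using the filter as a whitelist. The Hadoop BloomFilter construction (vector size, hash count, Hash.MURMUR_HASH) and Lucene's no-arg WhitespaceTokenizer with setReader() are assumptions about the library versions on the classpath.

import java.io.StringReader;
import java.nio.charset.StandardCharsets;
import org.apache.hadoop.util.bloom.BloomFilter;
import org.apache.hadoop.util.bloom.Key;
import org.apache.hadoop.util.hash.Hash;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.mahout.utils.nlp.collocations.llr.BloomTokenFilter;

public class BloomTokenFilterDemo {
  public static void main(String[] args) throws Exception {
    BloomFilter filter = new BloomFilter(100000, 5, Hash.MURMUR_HASH);
    filter.add(new Key("mahout".getBytes(StandardCharsets.UTF_8)));

    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("apache mahout hadoop"));
    try (TokenStream ts = new BloomTokenFilter(filter, true, tokenizer)) {
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        System.out.println(term); // keepMembers=true: only "mahout" survives
      }
      ts.end();
    }
  }
}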

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java
new file mode 100644
index 0000000..4585a0a
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.regex;
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.mahout.common.lucene.TokenStreamIterator;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class AnalyzerTransformer implements RegexTransformer {
+
+ private Analyzer analyzer;
+ private String fieldName = "text";
+
+ private static final Logger log = LoggerFactory.getLogger(AnalyzerTransformer.class);
+
+ public AnalyzerTransformer() {
+ this(new StandardAnalyzer(), "text");
+ }
+
+ public AnalyzerTransformer(Analyzer analyzer) {
+ this(analyzer, "text");
+ }
+
+ public AnalyzerTransformer(Analyzer analyzer, String fieldName) {
+ this.analyzer = analyzer;
+ this.fieldName = fieldName;
+ }
+
+ @Override
+ public String transformMatch(String match) {
+ StringBuilder result = new StringBuilder();
+ try (TokenStream ts = analyzer.tokenStream(fieldName, new StringReader(match))) {
+ ts.addAttribute(CharTermAttribute.class);
+ ts.reset();
+ TokenStreamIterator iter = new TokenStreamIterator(ts);
+ while (iter.hasNext()) {
+ result.append(iter.next()).append(' ');
+ }
+ ts.end();
+ } catch (IOException e) {
+ throw new IllegalStateException(e);
+ }
+ return result.toString();
+ }
+
+ public Analyzer getAnalyzer() {
+ return analyzer;
+ }
+
+ public void setAnalyzer(Analyzer analyzer) {
+ this.analyzer = analyzer;
+ }
+}
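
A quick sketch of the transformer on its own. The default StandardAnalyzer lower-cases and may drop stop words, so the exact output depends on the Lucene version and analyzer chosen.

import org.apache.mahout.utils.regex.AnalyzerTransformer;

public class AnalyzerTransformerDemo {
  public static void main(String[] args) {
    AnalyzerTransformer transformer = new AnalyzerTransformer(); // StandardAnalyzer on field "text"
    // tokens come back joined by single spaces, e.g. "quick brown fox "
    System.out.println(transformer.transformMatch("The QUICK brown fox"));
  }
}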

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/regex/ChainTransformer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/regex/ChainTransformer.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/regex/ChainTransformer.java
new file mode 100644
index 0000000..d3e8e06
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/regex/ChainTransformer.java
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.regex;
+
+import com.google.common.collect.Lists;
+
+import java.util.List;
+
+/**
+ * Chains together several {@link org.apache.mahout.utils.regex.RegexTransformer}s and applies them to the match
+ * in succession.
+ */
+public class ChainTransformer implements RegexTransformer {
+
+ private List<RegexTransformer> chain = Lists.newArrayList();
+
+ public ChainTransformer() {
+ }
+
+ public ChainTransformer(List<RegexTransformer> chain) {
+ this.chain = chain;
+ }
+
+ @Override
+ public String transformMatch(String match) {
+ String result = match;
+ for (RegexTransformer transformer : chain) {
+ result = transformer.transformMatch(result);
+ }
+ return result;
+ }
+
+ public List<RegexTransformer> getChain() {
+ return chain;
+ }
+
+ public void setChain(List<RegexTransformer> chain) {
+ this.chain = chain;
+ }
+}
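
For example, URL-decoding a match and then normalizing it through an analyzer can be composed like this (both transformer classes appear elsewhere in this commit):

import java.util.Arrays;
import org.apache.mahout.utils.regex.AnalyzerTransformer;
import org.apache.mahout.utils.regex.ChainTransformer;
import org.apache.mahout.utils.regex.URLDecodeTransformer;

public class ChainTransformerDemo {
  public static void main(String[] args) {
    ChainTransformer chain = new ChainTransformer(Arrays.asList(
        new URLDecodeTransformer(),    // "%20" -> " "
        new AnalyzerTransformer()));   // tokenize/normalize the decoded text
    System.out.println(chain.transformMatch("hello%20WORLD"));
  }
}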

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/regex/FPGFormatter.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/regex/FPGFormatter.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/regex/FPGFormatter.java
new file mode 100644
index 0000000..a0f296d
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/regex/FPGFormatter.java
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.regex;
+
+import java.util.regex.Pattern;
+
+/**
+ * Prefixes the output with a tab and collapses each run of non-word characters into a single "|" separator.
+ */
+public class FPGFormatter implements RegexFormatter {
+
+ private static final Pattern WHITESPACE = Pattern.compile("\\W+");
+
+ @Override
+ public String format(String toFormat) {
+ return '\t' + WHITESPACE.matcher(toFormat).replaceAll("|");
+ }
+
+}
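
Concretely, given the pattern above:

import org.apache.mahout.utils.regex.FPGFormatter;

public class FPGFormatterDemo {
  public static void main(String[] args) {
    // runs of non-word characters collapse to "|", and the result gains a leading tab
    System.out.println(new FPGFormatter().format("parallel FP-growth")); // "\tparallel|FP|growth"
  }
}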

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/regex/IdentityFormatter.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/regex/IdentityFormatter.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/regex/IdentityFormatter.java
new file mode 100644
index 0000000..5c1177c
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/regex/IdentityFormatter.java
@@ -0,0 +1,26 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.regex;
+
+public class IdentityFormatter implements RegexFormatter {
+
+ @Override
+ public String format(String toFormat) {
+ return toFormat;
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/regex/IdentityTransformer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/regex/IdentityTransformer.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/regex/IdentityTransformer.java
new file mode 100644
index 0000000..aea695d
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/regex/IdentityTransformer.java
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.regex;
+
+/**
+ * No-op
+ */
+public final class IdentityTransformer implements RegexTransformer {
+
+ @Override
+ public String transformMatch(String match) {
+ return match;
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/regex/RegexConverterDriver.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/regex/RegexConverterDriver.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/regex/RegexConverterDriver.java
new file mode 100644
index 0000000..53be239
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/regex/RegexConverterDriver.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.regex;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+
+/**
+ * Experimental
+ */
+public class RegexConverterDriver extends AbstractJob {
+
+ @Override
+ public int run(String[] args) throws Exception {
+ addInputOption();
+ addOutputOption();
+ addOption(DefaultOptionCreator.overwriteOption().create());
+ addOption("regex", "regex",
+ "The regular expression to use", true);
+ addOption("groupsToKeep", "g",
+ "The number of the capturing groups to keep", false);
+ addOption("transformerClass", "t",
+ "The optional class specifying the Regex Transformer", false);
+ addOption("formatterClass", "t",
+ "The optional class specifying the Regex Formatter", false);
+ addOption(DefaultOptionCreator.analyzerOption().create());
+
+ if (parseArguments(args) == null) {
+ return -1;
+ }
+
+ Configuration conf = getConf();
+ // TODO: how to deal with command-line escaping?
+ conf.set(RegexMapper.REGEX, getOption("regex"));
+ String gtk = getOption("groupsToKeep");
+ if (gtk != null) {
+ conf.set(RegexMapper.GROUP_MATCHERS, gtk);
+ }
+ String trans = getOption("transformerClass");
+ if (trans != null) {
+ if ("url".equalsIgnoreCase(trans)) {
+ trans = URLDecodeTransformer.class.getName();
+ }
+ conf.set(RegexMapper.TRANSFORMER_CLASS, trans);
+ }
+ String formatter = getOption("formatterClass");
+ if (formatter != null) {
+ if ("fpg".equalsIgnoreCase(formatter)) {
+ formatter = FPGFormatter.class.getName();
+ }
+ conf.set(RegexMapper.FORMATTER_CLASS, formatter);
+ }
+ Path input = getInputPath();
+ Path output = getOutputPath();
+ if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
+ HadoopUtil.delete(getConf(), output);
+ }
+ Class<? extends Analyzer> analyzerClass = getAnalyzerClassFromOption();
+ if (analyzerClass != null) {
+ conf.set(RegexMapper.ANALYZER_NAME, analyzerClass.getName());
+ }
+ Job job = prepareJob(input, output,
+ TextInputFormat.class,
+ RegexMapper.class,
+ LongWritable.class,
+ Text.class,
+ TextOutputFormat.class);
+ boolean succeeded = job.waitForCompletion(true);
+ return succeeded ? 0 : -1;
+ }
+
+ public static void main(String[] args) throws Exception {
+ ToolRunner.run(new RegexConverterDriver(), args);
+ }
+
+}
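
A hedged invocation sketch mirroring the main() above: --input/--output/--overwrite come from AbstractJob and DefaultOptionCreator, the remaining flags match the addOption() calls in run(), and the regex itself is only an example.

import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.utils.regex.RegexConverterDriver;

public class RegexConverterDemo {
  public static void main(String[] args) throws Exception {
    ToolRunner.run(new RegexConverterDriver(), new String[] {
        "--input", "/logs/raw",
        "--output", "/logs/extracted",
        "--overwrite",
        "--regex", "GET (\\S+) HTTP",  // example pattern: capture request paths
        "--groupsToKeep", "1",
        "--transformerClass", "url"    // shorthand expanded to URLDecodeTransformer in run()
    });
  }
}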

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/regex/RegexFormatter.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/regex/RegexFormatter.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/regex/RegexFormatter.java
new file mode 100644
index 0000000..8ef837b
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/regex/RegexFormatter.java
@@ -0,0 +1,24 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.regex;
+
+public interface RegexFormatter {
+
+ String format(String toFormat);
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/regex/RegexMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/regex/RegexMapper.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/regex/RegexMapper.java
new file mode 100644
index 0000000..04cacaa
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/regex/RegexMapper.java
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.regex;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Pattern;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.mahout.common.ClassUtils;
+
+public class RegexMapper extends Mapper<LongWritable, Text, LongWritable, Text> {
+
+ public static final String REGEX = "regex";
+ public static final String GROUP_MATCHERS = "regex.groups";
+ public static final String TRANSFORMER_CLASS = "transformer.class";
+ public static final String FORMATTER_CLASS = "formatter.class";
+
+ private Pattern regex;
+ private List<Integer> groupsToKeep;
+ private RegexTransformer transformer = RegexUtils.IDENTITY_TRANSFORMER;
+ private RegexFormatter formatter = RegexUtils.IDENTITY_FORMATTER;
+ public static final String ANALYZER_NAME = "analyzerName";
+
+
+ @Override
+ protected void setup(Context context) throws IOException, InterruptedException {
+ groupsToKeep = new ArrayList<>();
+ Configuration config = context.getConfiguration();
+ String regexStr = config.get(REGEX);
+ regex = Pattern.compile(regexStr);
+ String[] groups = config.getStrings(GROUP_MATCHERS);
+ if (groups != null) {
+ for (String group : groups) {
+ groupsToKeep.add(Integer.parseInt(group));
+ }
+ }
+
+ transformer = ClassUtils.instantiateAs(config.get(TRANSFORMER_CLASS, IdentityTransformer.class.getName()),
+ RegexTransformer.class);
+ String analyzerName = config.get(ANALYZER_NAME);
+ if (analyzerName != null && transformer instanceof AnalyzerTransformer) {
+ Analyzer analyzer = ClassUtils.instantiateAs(analyzerName, Analyzer.class);
+ ((AnalyzerTransformer)transformer).setAnalyzer(analyzer);
+ }
+
+ formatter = ClassUtils.instantiateAs(config.get(FORMATTER_CLASS, IdentityFormatter.class.getName()),
+ RegexFormatter.class);
+ }
+
+
+ @Override
+ protected void map(LongWritable key, Text text, Context context) throws IOException, InterruptedException {
+ String result = RegexUtils.extract(text.toString(), regex, groupsToKeep, " ", transformer);
+ if (!result.isEmpty()) {
+ String format = formatter.format(result);
+ context.write(key, new Text(format));
+ }
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/regex/RegexTransformer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/regex/RegexTransformer.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/regex/RegexTransformer.java
new file mode 100644
index 0000000..adbc98f
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/regex/RegexTransformer.java
@@ -0,0 +1,27 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.regex;
+
+/**
+ * Transforms the match of a regular expression.
+ */
+public interface RegexTransformer {
+
+ String transformMatch(String match);
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/regex/RegexUtils.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/regex/RegexUtils.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/regex/RegexUtils.java
new file mode 100644
index 0000000..5e32b99
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/regex/RegexUtils.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.regex;
+
+import java.util.Collection;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+public final class RegexUtils {
+
+ public static final RegexTransformer IDENTITY_TRANSFORMER = new IdentityTransformer();
+ public static final RegexFormatter IDENTITY_FORMATTER = new IdentityFormatter();
+
+ private RegexUtils() {
+ }
+
+ public static String extract(CharSequence line, Pattern pattern, Collection<Integer> groupsToKeep,
+ String separator, RegexTransformer transformer) {
+ StringBuilder bldr = new StringBuilder();
+ extract(line, bldr, pattern, groupsToKeep, separator, transformer);
+ return bldr.toString();
+ }
+
+ public static void extract(CharSequence line, StringBuilder outputBuffer,
+ Pattern pattern, Collection<Integer> groupsToKeep, String separator,
+ RegexTransformer transformer) {
+ if (transformer == null) {
+ transformer = IDENTITY_TRANSFORMER;
+ }
+ Matcher matcher = pattern.matcher(line);
+ String match;
+ if (groupsToKeep.isEmpty()) {
+ while (matcher.find()) {
+ match = matcher.group();
+ if (match != null) {
+ outputBuffer.append(transformer.transformMatch(match)).append(separator);
+ }
+ }
+ } else {
+ while (matcher.find()) {
+ for (Integer groupNum : groupsToKeep) {
+ match = matcher.group(groupNum);
+ if (match != null) {
+ outputBuffer.append(transformer.transformMatch(match)).append(separator);
+ }
+ }
+ }
+ }
+ //trim off the last separator, which is always there
+ if (outputBuffer.length() > 0) {
+ outputBuffer.setLength(outputBuffer.length() - separator.length());
+ }
+ }
+}
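
A self-contained example of extract(), grounded directly in the method above: keep capture group 1 from every match, joined by a single space.

import java.util.Collections;
import java.util.regex.Pattern;
import org.apache.mahout.utils.regex.RegexUtils;

public class RegexUtilsDemo {
  public static void main(String[] args) {
    Pattern mailbox = Pattern.compile("(\\w+)@(\\w+)");
    String names = RegexUtils.extract("bob@example alice@test", mailbox,
        Collections.singletonList(1), " ", RegexUtils.IDENTITY_TRANSFORMER);
    System.out.println(names); // "bob alice" -- the trailing separator is trimmed
  }
}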

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/regex/URLDecodeTransformer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/regex/URLDecodeTransformer.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/regex/URLDecodeTransformer.java
new file mode 100644
index 0000000..3eb7fc0
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/regex/URLDecodeTransformer.java
@@ -0,0 +1,43 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.regex;
+
+import java.io.UnsupportedEncodingException;
+import java.net.URLDecoder;
+
+public final class URLDecodeTransformer implements RegexTransformer {
+
+ private final String enc;
+
+ public URLDecodeTransformer() {
+ enc = "UTF-8";
+ }
+
+ public URLDecodeTransformer(String encoding) {
+ this.enc = encoding;
+ }
+
+ @Override
+ public String transformMatch(String match) {
+ try {
+ return URLDecoder.decode(match, enc);
+ } catch (UnsupportedEncodingException e) {
+ throw new IllegalStateException(e);
+ }
+ }
+}
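
And a one-liner for the decoder itself:

import org.apache.mahout.utils.regex.URLDecodeTransformer;

public class URLDecodeDemo {
  public static void main(String[] args) {
    System.out.println(new URLDecodeTransformer().transformMatch("a%20b%2Fc")); // "a b/c"
  }
}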

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/RowIdJob.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/RowIdJob.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/RowIdJob.java
new file mode 100644
index 0000000..13d61b8
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/RowIdJob.java
@@ -0,0 +1,99 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.vectors;
+
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
+import org.apache.mahout.math.VectorWritable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Converts a vector representation of documents into a {@code document x terms} matrix.
+ * The input data is in {@code SequenceFile<Text,VectorWritable>} format (as generated by
+ * {@link org.apache.mahout.vectorizer.SparseVectorsFromSequenceFiles SparseVectorsFromSequenceFiles}
+ * or by {@link org.apache.mahout.vectorizer.EncodedVectorsFromSequenceFiles EncodedVectorsFromSequenceFiles})
+ * and generates the following two files as output:
+ * <ul><li>A file called "matrix" of format {@code SequenceFile<IntWritable,VectorWritable>}.</li>
+ * <li>A file called "docIndex" of format {@code SequenceFile<IntWritable,Text>}.</li></ul>
+ * The input file can be regenerated by joining the two output files on the generated int key.
+ * In other words, {@code RowIdJob} replaces the textual document ids with integers.
+ * The original textual document ids can still be retrieved from the "docIndex".
+ */
+public class RowIdJob extends AbstractJob {
+ private static final Logger log = LoggerFactory.getLogger(RowIdJob.class);
+
+ @Override
+ public int run(String[] args) throws Exception {
+
+ addInputOption();
+ addOutputOption();
+
+ Map<String, List<String>> parsedArgs = parseArguments(args);
+ if (parsedArgs == null) {
+ return -1;
+ }
+
+ Configuration conf = getConf();
+ FileSystem fs = FileSystem.get(conf);
+
+ Path outputPath = getOutputPath();
+ Path indexPath = new Path(outputPath, "docIndex");
+ Path matrixPath = new Path(outputPath, "matrix");
+
+ try (SequenceFile.Writer indexWriter = SequenceFile.createWriter(fs, conf, indexPath,
+ IntWritable.class, Text.class);
+ SequenceFile.Writer matrixWriter = SequenceFile.createWriter(fs, conf, matrixPath, IntWritable.class,
+ VectorWritable.class)) {
+ IntWritable docId = new IntWritable();
+ int i = 0;
+ int numCols = 0;
+ for (Pair<Text, VectorWritable> record
+ : new SequenceFileDirIterable<Text, VectorWritable>(getInputPath(), PathType.LIST, PathFilters.logsCRCFilter(),
+ null, true, conf)) {
+ VectorWritable value = record.getSecond();
+ docId.set(i);
+ indexWriter.append(docId, record.getFirst());
+ matrixWriter.append(docId, value);
+ i++;
+ numCols = value.get().size();
+ }
+
+ log.info("Wrote out matrix with {} rows and {} columns to {}", i, numCols, matrixPath);
+ return 0;
+ }
+ }
+
+ public static void main(String[] args) throws Exception {
+ ToolRunner.run(new RowIdJob(), args);
+ }
+
+}
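
A hedged sketch of running RowIdJob: --input/--output are the standard AbstractJob options added in run(), and the paths are placeholders.

import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.utils.vectors.RowIdJob;

public class RowIdJobDemo {
  public static void main(String[] args) throws Exception {
    // writes <output>/matrix and <output>/docIndex as described in the class Javadoc
    ToolRunner.run(new RowIdJob(), new String[] {
        "--input", "/vectors/tfidf-vectors",
        "--output", "/vectors/rowid"
    });
  }
}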

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/TermEntry.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/TermEntry.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/TermEntry.java
new file mode 100644
index 0000000..d74803f
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/TermEntry.java
@@ -0,0 +1,46 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.vectors;
+
+/**
+ * Each entry in a {@link TermInfo} dictionary. Contains information about a term.
+ */
+public class TermEntry {
+
+ private final String term;
+ private final int termIdx;
+ private final int docFreq;
+
+ public TermEntry(String term, int termIdx, int docFreq) {
+ this.term = term;
+ this.termIdx = termIdx;
+ this.docFreq = docFreq;
+ }
+
+ public String getTerm() {
+ return term;
+ }
+
+ public int getTermIdx() {
+ return termIdx;
+ }
+
+ public int getDocFreq() {
+ return docFreq;
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/TermInfo.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/TermInfo.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/TermInfo.java
new file mode 100644
index 0000000..4fb36a3
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/TermInfo.java
@@ -0,0 +1,33 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.vectors;
+
+import java.util.Iterator;
+
+/**
+ * Contains the term dictionary information associated with a vectorized collection of text documents.
+ */
+public interface TermInfo {
+
+ int totalTerms(String field);
+
+ TermEntry getTermEntry(String field, String term);
+
+ Iterator<TermEntry> getAllEntries();
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
new file mode 100644
index 0000000..e1c3fbc
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
@@ -0,0 +1,266 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.vectors;
+
+import com.google.common.collect.Sets;
+import com.google.common.io.Closeables;
+import com.google.common.io.Files;
+import org.apache.commons.io.Charsets;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.FileUtil;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
+import org.apache.mahout.math.NamedVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.util.Iterator;
+import java.util.Set;
+
+/**
+ * Can read in a {@link org.apache.hadoop.io.SequenceFile} of {@link Vector}s and dump
+ * out the results using {@link Vector#asFormatString()} to either the console or to a
+ * file.
+ */
+public final class VectorDumper extends AbstractJob {
+
+ private static final Logger log = LoggerFactory.getLogger(VectorDumper.class);
+
+ private VectorDumper() {
+ }
+
+ @Override
+ public int run(String[] args) throws Exception {
+ addInputOption();
+ addOutputOption();
+ addOption("useKey", "u", "If the Key is a vector than dump that instead");
+ addOption("printKey", "p", "Print out the key as well, delimited by tab (or the value if useKey is true");
+ addOption("dictionary", "d", "The dictionary file.", false);
+ addOption("dictionaryType", "dt", "The dictionary file type (text|seqfile)", false);
+ addOption("csv", "c", "Output the Vector as CSV. Otherwise it substitutes in the terms for vector cell entries");
+ addOption("namesAsComments", "n", "If using CSV output, optionally add a comment line for each NamedVector "
+ + "(if the vector is one) printing out the name");
+ addOption("nameOnly", "N", "Use the name as the value for each NamedVector (skip other vectors)");
+ addOption("sortVectors", "sort", "Sort output key/value pairs of the vector entries in abs magnitude "
+ + "descending order");
+ addOption("quiet", "q", "Print only file contents");
+ addOption("sizeOnly", "sz", "Dump only the size of the vector");
+ addOption("numItems", "ni", "Output at most <n> vecors", false);
+ addOption("vectorSize", "vs", "Truncate vectors to <vs> length when dumping (most useful when in"
+ + " conjunction with -sort", false);
+ addOption(buildOption("filter", "fi", "Only dump out those vectors whose name matches the filter."
+ + " Multiple items may be specified by repeating the argument.", true, 1, Integer.MAX_VALUE, false, null));
+
+ if (parseArguments(args, false, true) == null) {
+ return -1;
+ }
+
+ Path[] pathArr;
+ Configuration conf = new Configuration();
+ FileSystem fs = FileSystem.get(conf);
+ Path input = getInputPath();
+ FileStatus fileStatus = fs.getFileStatus(input);
+ if (fileStatus.isDir()) {
+ pathArr = FileUtil.stat2Paths(fs.listStatus(input, PathFilters.logsCRCFilter()));
+ } else {
+ FileStatus[] inputPaths = fs.globStatus(input);
+ pathArr = new Path[inputPaths.length];
+ int i = 0;
+ for (FileStatus fstatus : inputPaths) {
+ pathArr[i++] = fstatus.getPath();
+ }
+ }
+
+
+ String dictionaryType = getOption("dictionaryType", "text");
+
+ boolean sortVectors = hasOption("sortVectors");
+ boolean quiet = hasOption("quiet");
+ if (!quiet) {
+ log.info("Sort? {}", sortVectors);
+ }
+
+ String[] dictionary = null;
+ if (hasOption("dictionary")) {
+ String dictFile = getOption("dictionary");
+ switch (dictionaryType) {
+ case "text":
+ dictionary = VectorHelper.loadTermDictionary(new File(dictFile));
+ break;
+ case "sequencefile":
+ dictionary = VectorHelper.loadTermDictionary(conf, dictFile);
+ break;
+ default:
+ //TODO: support Lucene's FST as a dictionary type
+ throw new IOException("Invalid dictionary type: " + dictionaryType);
+ }
+ }
+
+ Set<String> filters;
+ if (hasOption("filter")) {
+ filters = Sets.newHashSet(getOptions("filter"));
+ } else {
+ filters = null;
+ }
+
+ boolean useCSV = hasOption("csv");
+
+ boolean sizeOnly = hasOption("sizeOnly");
+ boolean nameOnly = hasOption("nameOnly");
+ boolean namesAsComments = hasOption("namesAsComments");
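+ // note: no "vectorAsKey" option is registered above, so this flag will normally remain false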
+ boolean transposeKeyValue = hasOption("vectorAsKey");
+ Writer writer;
+ boolean shouldClose;
+ File output = getOutputFile();
+ if (output != null) {
+ shouldClose = true;
+ log.info("Output file: {}", output);
+ Files.createParentDirs(output);
+ writer = Files.newWriter(output, Charsets.UTF_8);
+ } else {
+ shouldClose = false;
+ writer = new OutputStreamWriter(System.out, Charsets.UTF_8);
+ }
+ try {
+ boolean printKey = hasOption("printKey");
+ if (useCSV && dictionary != null) {
+ writer.write("#");
+ for (int j = 0; j < dictionary.length; j++) {
+ writer.write(dictionary[j]);
+ if (j < dictionary.length - 1) {
+ writer.write(',');
+ }
+ }
+ writer.write('\n');
+ }
+ Long numItems = null;
+ if (hasOption("numItems")) {
+ numItems = Long.parseLong(getOption("numItems"));
+ if (!quiet) {
+ writer.append("#Max Items to dump: ").append(String.valueOf(numItems)).append('\n');
+ }
+ }
+ int maxIndexesPerVector = hasOption("vectorSize")
+ ? Integer.parseInt(getOption("vectorSize"))
+ : Integer.MAX_VALUE;
+ long itemCount = 0;
+ int fileCount = 0;
+ for (Path path : pathArr) {
+ if (numItems != null && numItems <= itemCount) {
+ break;
+ }
+ if (!quiet) {
+ log.info("Processing file '{}' ({}/{})", path, ++fileCount, pathArr.length);
+ }
+ SequenceFileIterable<Writable, Writable> iterable = new SequenceFileIterable<>(path, true, conf);
+ Iterator<Pair<Writable, Writable>> iterator = iterable.iterator();
+ long i = 0;
+ while (iterator.hasNext() && (numItems == null || itemCount < numItems)) {
+ Pair<Writable, Writable> record = iterator.next();
+ Writable keyWritable = record.getFirst();
+ Writable valueWritable = record.getSecond();
+ if (printKey) {
+ Writable notTheVectorWritable = transposeKeyValue ? valueWritable : keyWritable;
+ writer.write(notTheVectorWritable.toString());
+ writer.write('\t');
+ }
+ Vector vector;
+ try {
+ vector = ((VectorWritable)
+ (transposeKeyValue ? keyWritable : valueWritable)).get();
+ } catch (ClassCastException e) {
+ if ((transposeKeyValue ? keyWritable : valueWritable)
+ instanceof WeightedPropertyVectorWritable) {
+ vector =
+ ((WeightedPropertyVectorWritable)
+ (transposeKeyValue ? keyWritable : valueWritable)).getVector();
+ } else {
+ throw e;
+ }
+ }
+ if (filters == null
+ || !(vector instanceof NamedVector)
+ || filters.contains(((NamedVector) vector).getName())) {
+ if (sizeOnly) {
+ if (vector instanceof NamedVector) {
+ writer.write(((NamedVector) vector).getName());
+ writer.write(":");
+ } else {
+ writer.write(String.valueOf(i++));
+ writer.write(":");
+ }
+ writer.write(String.valueOf(vector.size()));
+ writer.write('\n');
+ } else if (nameOnly) {
+ if (vector instanceof NamedVector) {
+ writer.write(((NamedVector) vector).getName());
+ writer.write('\n');
+ }
+ } else {
+ String fmtStr;
+ if (useCSV) {
+ fmtStr = VectorHelper.vectorToCSVString(vector, namesAsComments);
+ } else {
+ fmtStr = VectorHelper.vectorToJson(vector, dictionary, maxIndexesPerVector,
+ sortVectors);
+ }
+ writer.write(fmtStr);
+ writer.write('\n');
+ }
+ itemCount++;
+ }
+ }
+ }
+ writer.flush();
+ } finally {
+ if (shouldClose) {
+ Closeables.close(writer, false);
+ }
+ }
+
+ return 0;
+ }
+
+ public static void main(String[] args) throws Exception {
+ ToolRunner.run(new Configuration(), new VectorDumper(), args);
+ }
+
+}
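
A usage sketch for the dumper above; every path is a placeholder, and -i/-o are the standard
input/output options registered by AbstractJob:

// Hypothetical invocation, not part of this commit.
public class DumpVectorsExample {
  public static void main(String[] args) throws Exception {
    org.apache.mahout.utils.vectors.VectorDumper.main(new String[] {
        "-i", "/tmp/tfidf-vectors/part-r-00000", // placeholder input SequenceFile of vectors
        "-o", "/tmp/vectors.txt",                // placeholder dump destination
        "-d", "/tmp/dictionary.file-0",          // placeholder term dictionary
        "-dt", "sequencefile",
        "-p"                                     // also print each key
    });
  }
}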

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
new file mode 100644
index 0000000..66c3fb6
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
@@ -0,0 +1,256 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.vectors;
+
+import com.google.common.base.Function;
+import com.google.common.collect.Collections2;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.lucene.util.PriorityQueue;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.iterator.FileLineIterator;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
+import org.apache.mahout.math.NamedVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.Vector.Element;
+import org.apache.mahout.math.map.OpenObjectIntHashMap;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.List;
+import java.util.regex.Pattern;
+
+/** Static utility methods related to vectors. */
+public final class VectorHelper {
+
+ private static final Pattern TAB_PATTERN = Pattern.compile("\t");
+
+
+ private VectorHelper() {
+ }
+
+ public static String vectorToCSVString(Vector vector, boolean namesAsComments) throws IOException {
+ Appendable bldr = new StringBuilder(2048);
+ vectorToCSVString(vector, namesAsComments, bldr);
+ return bldr.toString();
+ }
+
+ public static String buildJson(Iterable<Pair<String, Double>> iterable) {
+ return buildJson(iterable, new StringBuilder(2048));
+ }
+
+ public static String buildJson(Iterable<Pair<String, Double>> iterable, StringBuilder bldr) {
+ bldr.append('{');
+ for (Pair<String, Double> p : iterable) {
+ bldr.append(p.getFirst());
+ bldr.append(':');
+ bldr.append(p.getSecond());
+ bldr.append(',');
+ }
+ if (bldr.length() > 1) {
+ bldr.setCharAt(bldr.length() - 1, '}');
+ }
+ return bldr.toString();
+ }
+
+ public static List<Pair<Integer, Double>> topEntries(Vector vector, int maxEntries) {
+
+ // Number of nonzero elements in the input vector
+ int sizeOfNonZeroElementsInVector = vector.getNumNonZeroElements();
+
+ // If sizeOfNonZeroElementsInVector < maxEntries, cap maxEntries at that size;
+ // otherwise the call to queue.pop() returns a Pair(null, null) and the subsequent call
+ // to pair.getFirst() throws a NullPointerException
+ if (sizeOfNonZeroElementsInVector < maxEntries) {
+ maxEntries = sizeOfNonZeroElementsInVector;
+ }
+
+ PriorityQueue<Pair<Integer, Double>> queue = new TDoublePQ<>(-1, maxEntries);
+ for (Element e : vector.nonZeroes()) {
+ queue.insertWithOverflow(Pair.of(e.index(), e.get()));
+ }
+ List<Pair<Integer, Double>> entries = new ArrayList<>();
+ Pair<Integer, Double> pair;
+ while ((pair = queue.pop()) != null) {
+ if (pair.getFirst() > -1) {
+ entries.add(pair);
+ }
+ }
+ Collections.sort(entries, new Comparator<Pair<Integer, Double>>() {
+ @Override
+ public int compare(Pair<Integer, Double> a, Pair<Integer, Double> b) {
+ return b.getSecond().compareTo(a.getSecond());
+ }
+ });
+ return entries;
+ }
+
+ public static List<Pair<Integer, Double>> firstEntries(Vector vector, int maxEntries) {
+ List<Pair<Integer, Double>> entries = new ArrayList<>();
+ Iterator<Vector.Element> it = vector.nonZeroes().iterator();
+ int i = 0;
+ while (it.hasNext() && i++ < maxEntries) {
+ Vector.Element e = it.next();
+ entries.add(Pair.of(e.index(), e.get()));
+ }
+ return entries;
+ }
+
+ public static List<Pair<String, Double>> toWeightedTerms(Collection<Pair<Integer, Double>> entries,
+ final String[] dictionary) {
+ if (dictionary != null) {
+ return new ArrayList<>(Collections2.transform(entries,
+ new Function<Pair<Integer, Double>, Pair<String, Double>>() {
+ @Override
+ public Pair<String, Double> apply(Pair<Integer, Double> p) {
+ return Pair.of(dictionary[p.getFirst()], p.getSecond());
+ }
+ }));
+ } else {
+ return new ArrayList<>(Collections2.transform(entries,
+ new Function<Pair<Integer, Double>, Pair<String, Double>>() {
+ @Override
+ public Pair<String, Double> apply(Pair<Integer, Double> p) {
+ return Pair.of(Integer.toString(p.getFirst()), p.getSecond());
+ }
+ }));
+ }
+ }
+
+ public static String vectorToJson(Vector vector, String[] dictionary, int maxEntries, boolean sort) {
+ return buildJson(toWeightedTerms(sort
+ ? topEntries(vector, maxEntries)
+ : firstEntries(vector, maxEntries), dictionary));
+ }
+
+ public static void vectorToCSVString(Vector vector,
+ boolean namesAsComments,
+ Appendable bldr) throws IOException {
+ if (namesAsComments && vector instanceof NamedVector) {
+ bldr.append('#').append(((NamedVector) vector).getName()).append('\n');
+ }
+ Iterator<Vector.Element> iter = vector.all().iterator();
+ boolean first = true;
+ while (iter.hasNext()) {
+ if (first) {
+ first = false;
+ } else {
+ bldr.append(',');
+ }
+ Vector.Element elt = iter.next();
+ bldr.append(String.valueOf(elt.get()));
+ }
+ bldr.append('\n');
+ }
+
+ /**
+ * Read in a dictionary file. Format is:
+ * <p/>
+ * <pre>
+ * term DocFreq Index
+ * </pre>
+ */
+ public static String[] loadTermDictionary(File dictFile) throws IOException {
+ try (InputStream in = new FileInputStream(dictFile)) {
+ return loadTermDictionary(in);
+ }
+ }
+
+ /**
+ * Read a dictionary in {@link org.apache.hadoop.io.SequenceFile} generated by
+ * {@link org.apache.mahout.vectorizer.DictionaryVectorizer}
+ *
+ * @param filePattern {@code <PATH TO DICTIONARY>/dictionary.file-*}
+ */
+ public static String[] loadTermDictionary(Configuration conf, String filePattern) {
+ OpenObjectIntHashMap<String> dict = new OpenObjectIntHashMap<>();
+ int maxIndexValue = 0;
+ for (Pair<Text, IntWritable> record
+ : new SequenceFileDirIterable<Text, IntWritable>(new Path(filePattern), PathType.GLOB, null, null, true,
+ conf)) {
+ dict.put(record.getFirst().toString(), record.getSecond().get());
+ if (record.getSecond().get() > maxIndexValue) {
+ maxIndexValue = record.getSecond().get();
+ }
+ }
+ // Set dictionary size to greater of (maxIndexValue + 1, dict.size())
+ int maxDictionarySize = maxIndexValue + 1 > dict.size() ? maxIndexValue + 1 : dict.size();
+ String[] dictionary = new String[maxDictionarySize];
+ for (String feature : dict.keys()) {
+ dictionary[dict.get(feature)] = feature;
+ }
+ return dictionary;
+ }
+
+ /**
+ * Read in a dictionary file. Format is: First line is the number of entries
+ * <p/>
+ * <pre>
+ * term DocFreq Index
+ * </pre>
+ */
+ private static String[] loadTermDictionary(InputStream is) throws IOException {
+ FileLineIterator it = new FileLineIterator(is);
+
+ int numEntries = Integer.parseInt(it.next());
+ String[] result = new String[numEntries];
+
+ while (it.hasNext()) {
+ String line = it.next();
+ if (line.startsWith("#")) {
+ continue;
+ }
+ String[] tokens = TAB_PATTERN.split(line);
+ if (tokens.length < 3) {
+ continue;
+ }
+ int index = Integer.parseInt(tokens[2]); // tokens[1] is the doc freq
+ result[index] = tokens[0];
+ }
+ return result;
+ }
+
+ private static final class TDoublePQ<T> extends PriorityQueue<Pair<T, Double>> {
+ private final T sentinel;
+
+ private TDoublePQ(T sentinel, int size) {
+ super(size);
+ this.sentinel = sentinel;
+ }
+
+ @Override
+ protected boolean lessThan(Pair<T, Double> a, Pair<T, Double> b) {
+ return a.getSecond().compareTo(b.getSecond()) < 0;
+ }
+
+ @Override
+ protected Pair<T, Double> getSentinelObject() {
+ return Pair.of(sentinel, Double.NEGATIVE_INFINITY);
+ }
+ }
+}
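
As a small self-contained check of the JSON formatting above (the dictionary and weights are
made-up illustration data):

import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.utils.vectors.VectorHelper;

// Hypothetical example, not part of this commit.
public class VectorHelperExample {
  public static void main(String[] args) {
    Vector v = new RandomAccessSparseVector(4);
    v.setQuick(0, 0.1);
    v.setQuick(2, 0.9);
    v.setQuick(3, 0.5);
    String[] dictionary = {"alpha", "beta", "gamma", "delta"};
    // sort=true keeps the maxEntries highest-weighted entries, largest first
    System.out.println(VectorHelper.vectorToJson(v, dictionary, 2, true));
    // expected shape: {gamma:0.9,delta:0.5}
  }
}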

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFIterator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFIterator.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFIterator.java
new file mode 100644
index 0000000..f2632a4
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFIterator.java
@@ -0,0 +1,144 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.vectors.arff;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import com.google.common.collect.AbstractIterator;
+import com.google.common.io.Closeables;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.RandomAccessSparseVector;
+import org.apache.mahout.math.Vector;
+
+final class ARFFIterator extends AbstractIterator<Vector> {
+
+ // This pattern ensures that a comma inside a quoted string is not treated as a split point.
+ // Ex: "Arizona" , "0:08 PM, PDT" , 110 is split with "0:08 PM, PDT" kept as a single token
+ private static final Pattern WORDS_WITHOUT_SPARSE = Pattern.compile("([\\w[^{]])*");
+ private static final Pattern DATA_PATTERN = Pattern.compile("^\\"+ARFFModel.ARFF_SPARSE+"(.*)\\"+ARFFModel.ARFF_SPARSE_END+"$");
+
+ private final BufferedReader reader;
+ private final ARFFModel model;
+
+ ARFFIterator(BufferedReader reader, ARFFModel model) {
+ this.reader = reader;
+ this.model = model;
+ }
+
+ @Override
+ protected Vector computeNext() {
+ String line;
+ try {
+ while ((line = reader.readLine()) != null) {
+ line = line.trim();
+ if (!line.isEmpty() && !line.startsWith(ARFFModel.ARFF_COMMENT)) {
+ break;
+ }
+ }
+ } catch (IOException ioe) {
+ throw new IllegalStateException(ioe);
+ }
+ if (line == null) {
+ try {
+ Closeables.close(reader, true);
+ } catch (IOException e) {
+ throw new IllegalStateException(e);
+ }
+ return endOfData();
+ }
+ Vector result;
+ Matcher contents = DATA_PATTERN.matcher(line);
+ if (contents.find()) {
+ line = contents.group(1);
+ String[] splits = splitCSV(line);
+ result = new RandomAccessSparseVector(model.getLabelSize());
+ for (String split : splits) {
+ int idIndex = split.indexOf(' ');
+ int idx = Integer.parseInt(split.substring(0, idIndex).trim());
+ String data = split.substring(idIndex).trim();
+ if (!"?".equals(data)) {
+ result.setQuick(idx, model.getValue(data, idx));
+ }
+ }
+ } else {
+ result = new DenseVector(model.getLabelSize());
+ String[] splits = splitCSV(line);
+ for (int i = 0; i < splits.length; i++) {
+ String split = splits[i];
+ split = split.trim();
+ if (WORDS_WITHOUT_SPARSE.matcher(split).matches() && !"?".equals(split)) {
+ result.setQuick(i, model.getValue(split, i));
+ }
+ }
+ }
+ return result;
+ }
+
+ /**
+ * Splits a string on commas, ignoring commas that appear inside quoted or escaped sections.
+ * Both double and single quotes are recognized, because the ARFF format has no exact
+ * definition of quoting.
+ * @param line the line to split
+ * @return the resulting tokens
+ */
+ public static String[] splitCSV(String line) {
+ StringBuilder sb = new StringBuilder(128);
+ List<String> tokens = new ArrayList<>();
+ char escapeChar = '\0';
+ for (int i = 0; i < line.length(); i++) {
+ char c = line.charAt(i);
+ if (c == '\\') {
+ i++;
+ sb.append(line.charAt(i));
+ }
+ else if (c == '"' || c == '\'') {
+ // token is closed
+ if (c == escapeChar) {
+ escapeChar = '\0';
+ }
+ else if (escapeChar == '\0') {
+ escapeChar = c;
+ }
+ sb.append(c);
+ }
+ else if (c == ',') {
+ if (escapeChar == '\0') {
+ tokens.add(sb.toString().trim());
+ sb.setLength(0); // start work on next token
+ }
+ else {
+ sb.append(c);
+ }
+ }
+ else {
+ sb.append(c);
+ }
+ }
+ if (sb.length() > 0) {
+ tokens.add(sb.toString().trim());
+ }
+
+ return tokens.toArray(new String[tokens.size()]);
+ }
+
+}
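
A quick illustration of the quote handling in splitCSV; since the class is package-private,
the example sits in the same package, and the input line is invented:

package org.apache.mahout.utils.vectors.arff;

import java.util.Arrays;

// Hypothetical example, not part of this commit.
public class SplitCsvExample {
  public static void main(String[] args) {
    String line = "\"Arizona\",\"0:08 PM, PDT\",110";
    // the comma inside the quoted timestamp is not a split point
    System.out.println(Arrays.toString(ARFFIterator.splitCSV(line)));
    // prints: ["Arizona", "0:08 PM, PDT", 110]
  }
}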

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFModel.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFModel.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFModel.java
new file mode 100644
index 0000000..fc86997
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFModel.java
@@ -0,0 +1,76 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.vectors.arff;
+
+import java.text.DateFormat;
+import java.util.Map;
+
+/**
+ * An interface representing an ARFF model. Implementations can decide on the best approach
+ * for storing the model: a simple in-memory representation may be fine for smaller files,
+ * while larger ones may require a more sophisticated implementation.
+ */
+public interface ARFFModel {
+ String ARFF_SPARSE = "{"; //indicates the vector is sparse
+ String ARFF_SPARSE_END = "}";
+ String ARFF_COMMENT = "%";
+ String ATTRIBUTE = "@attribute";
+ String DATA = "@data";
+ String RELATION = "@relation";
+
+
+ String getRelation();
+
+ void setRelation(String relation);
+
+ /**
+ * The vector attributes (labels in Mahout speak)
+ * @return the map
+ */
+ Map<String, Integer> getLabelBindings();
+
+ Integer getNominalValue(String label, String nominal);
+
+ void addNominal(String label, String nominal, int idx);
+
+ DateFormat getDateFormat(Integer idx);
+
+ void addDateFormat(Integer idx, DateFormat format);
+
+ Integer getLabelIndex(String label);
+
+ void addLabel(String label, Integer idx);
+
+ ARFFType getARFFType(Integer idx);
+
+ void addType(Integer idx, ARFFType type);
+
+ /**
+ * The count of the number of words seen
+ * @return the count
+ */
+ long getWordCount();
+
+ double getValue(String data, int idx);
+
+ Map<String, Map<String, Integer>> getNominalMap();
+
+ int getLabelSize();
+
+ Map<String, Long> getWords();
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFType.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFType.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFType.java
new file mode 100644
index 0000000..9ba7c31
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFType.java
@@ -0,0 +1,62 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.vectors.arff;
+
+public enum ARFFType {
+
+ NUMERIC("numeric"),
+ INTEGER("integer"),
+ REAL("real"),
+ NOMINAL("{"),
+ DATE("date"),
+ STRING("string");
+
+ private final String indicator;
+
+ ARFFType(String indicator) {
+ this.indicator = indicator;
+ }
+
+ public String getIndicator() {
+ return indicator;
+ }
+
+ public String getLabel(String line) {
+ int idx = line.lastIndexOf(indicator);
+ return removeQuotes(line.substring(ARFFModel.ATTRIBUTE.length(), idx));
+ }
+
+ /**
+ * Remove quotes and leading/trailing whitespace from a single- or double-quoted string
+ * @param str the string to remove quotes from
+ * @return the string without quotes
+ */
+ public static String removeQuotes(String str) {
+ String cleaned = str;
+ if (cleaned != null) {
+ cleaned = cleaned.trim();
+ boolean isQuoted = cleaned.length() > 1
+ && (cleaned.startsWith("\"") && cleaned.endsWith("\"")
+ || cleaned.startsWith("'") && cleaned.endsWith("'"));
+ if (isQuoted) {
+ cleaned = cleaned.substring(1, cleaned.length() - 1);
+ }
+ }
+ return cleaned;
+ }
+}
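
A brief illustration of removeQuotes and getLabel; the attribute line is invented:

import org.apache.mahout.utils.vectors.arff.ARFFType;

// Hypothetical example, not part of this commit.
public class RemoveQuotesExample {
  public static void main(String[] args) {
    System.out.println(ARFFType.removeQuotes(" 'page count' "));
    // prints: page count
    // getLabel strips the @attribute prefix and the trailing type indicator
    System.out.println(ARFFType.NUMERIC.getLabel("@attribute 'page count' numeric"));
    // prints: page count
  }
}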
r***@apache.org
2018-06-27 14:52:12 UTC
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/SplitInput.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/SplitInput.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/SplitInput.java
new file mode 100644
index 0000000..6178f80
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/SplitInput.java
@@ -0,0 +1,673 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.nio.charset.Charset;
+import java.util.BitSet;
+
+import com.google.common.base.Preconditions;
+import org.apache.commons.cli2.OptionException;
+import org.apache.commons.io.Charsets;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.CommandLineUtil;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterator;
+import org.apache.mahout.math.jet.random.sampling.RandomSampler;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * A utility for splitting files into training and test sets in order to perform
+ * cross-validation. It accepts the input format used by the Bayes classifiers,
+ * anything else that has one item per line, or SequenceFiles (key/value).
+ * <p/>
+ * <p/>
+ * This class can be used to split directories of files or individual files into
+ * training and test sets using a number of different methods.
+ * <p/>
+ * When executed via {@link #splitDirectory(Path)} or {@link #splitFile(Path)},
+ * the lines read from one or more input files are written to files of the same
+ * name into the directories specified by the
+ * {@link #setTestOutputDirectory(Path)} and
+ * {@link #setTrainingOutputDirectory(Path)} methods.
+ * <p/>
+ * The composition of the test set is determined using one of the following
+ * approaches:
+ * <ul>
+ * <li>A contiguous set of items can be chosen from the input file(s) using the
+ * {@link #setTestSplitSize(int)} or {@link #setTestSplitPct(int)} methods.
+ * {@link #setTestSplitSize(int)} allocates a fixed number of items, while
+ * {@link #setTestSplitPct(int)} allocates a percentage of the original input,
+ * rounded up to the nearest integer. {@link #setSplitLocation(int)} is used to
+ * control the position in the input from which the test data is extracted and
+ * is described further below.</li>
+ * <li>A random sampling of items can be chosen from the input file(s) using
+ * the {@link #setTestRandomSelectionSize(int)} or
+ * {@link #setTestRandomSelectionPct(int)} methods, each choosing a fixed test
+ * set size or percentage of the input set size as described above. The
+ * {@link RandomSampler} class from {@code mahout-math} is used to create a sample
+ * of the appropriate size.</li>
+ * </ul>
+ * <p/>
+ * Any one of the methods above can be used to control the size of the test set.
+ * If multiple methods are called, a runtime exception will be thrown at
+ * execution time.
+ * <p/>
+ * The {@link #setSplitLocation(int)} method is passed an integer from 0 to 100
+ * (inclusive) which is translated into the position of the start of the test
+ * data within the input file.
+ * <p/>
+ * Given:
+ * <ul>
+ * <li>an input file of 1500 lines</li>
+ * <li>a desired test data size of 10 percent</li>
+ * </ul>
+ * <p/>
+ * <ul>
+ * <li>A split location of 0 will cause the first 150 items appearing in the
+ * input set to be written to the test set.</li>
+ * <li>A split location of 25 will cause items 375-525 to be written to the test
+ * set.</li>
+ * <li>A split location of 100 will cause the last 150 items in the input to be
+ * written to the test set</li>
+ * </ul>
+ * The start of the split will be adjusted downwards where necessary to ensure
+ * that the desired test set size is allocated. Split location has no effect if
+ * random sampling is employed.
+ */
+public class SplitInput extends AbstractJob {
+
+ private static final Logger log = LoggerFactory.getLogger(SplitInput.class);
+
+ private int testSplitSize = -1;
+ private int testSplitPct = -1;
+ private int splitLocation = 100;
+ private int testRandomSelectionSize = -1;
+ private int testRandomSelectionPct = -1;
+ private int keepPct = 100;
+ private Charset charset = Charsets.UTF_8;
+ private boolean useSequence;
+ private boolean useMapRed;
+
+ private Path inputDirectory;
+ private Path trainingOutputDirectory;
+ private Path testOutputDirectory;
+ private Path mapRedOutputDirectory;
+
+ private SplitCallback callback;
+
+ @Override
+ public int run(String[] args) throws Exception {
+
+ if (parseArgs(args)) {
+ splitDirectory();
+ }
+ return 0;
+ }
+
+ public static void main(String[] args) throws Exception {
+ ToolRunner.run(new Configuration(), new SplitInput(), args);
+ }
+
+ /**
+ * Configure this instance based on the command-line arguments contained within provided array.
+ * Calls {@link #validate()} to ensure consistency of configuration.
+ *
+ * @return true if the arguments were parsed successfully and execution should proceed.
+ * @throws Exception if there is a problem parsing the command-line arguments or the particular
+ * combination would violate class invariants.
+ */
+ private boolean parseArgs(String[] args) throws Exception {
+
+ addInputOption();
+ addOption("trainingOutput", "tr", "The training data output directory", false);
+ addOption("testOutput", "te", "The test data output directory", false);
+ addOption("testSplitSize", "ss", "The number of documents held back as test data for each category", false);
+ addOption("testSplitPct", "sp", "The % of documents held back as test data for each category", false);
+ addOption("splitLocation", "sl", "Location for start of test data expressed as a percentage of the input file "
+ + "size (0=start, 50=middle, 100=end", false);
+ addOption("randomSelectionSize", "rs", "The number of items to be randomly selected as test data ", false);
+ addOption("randomSelectionPct", "rp", "Percentage of items to be randomly selected as test data when using "
+ + "mapreduce mode", false);
+ addOption("charset", "c", "The name of the character encoding of the input files (not needed if using "
+ + "SequenceFiles)", false);
+ addOption(buildOption("sequenceFiles", "seq", "Set if the input files are sequence files. Default is false",
+ false, false, "false"));
+ addOption(DefaultOptionCreator.methodOption().create());
+ addOption(DefaultOptionCreator.overwriteOption().create());
+ //TODO: extend this to sequential mode
+ addOption("keepPct", "k", "The percentage of total data to keep in map-reduce mode, the rest will be ignored. "
+ + "Default is 100%", false);
+ addOption("mapRedOutputDir", "mro", "Output directory for map reduce jobs", false);
+
+ if (parseArguments(args) == null) {
+ return false;
+ }
+
+ try {
+ inputDirectory = getInputPath();
+
+ useMapRed = getOption(DefaultOptionCreator.METHOD_OPTION).equalsIgnoreCase(DefaultOptionCreator.MAPREDUCE_METHOD);
+
+ if (useMapRed) {
+ if (!hasOption("randomSelectionPct")) {
+ throw new OptionException(getCLIOption("randomSelectionPct"),
+ "must set randomSelectionPct when mapRed option is used");
+ }
+ if (!hasOption("mapRedOutputDir")) {
+ throw new OptionException(getCLIOption("mapRedOutputDir"),
+ "mapRedOutputDir must be set when mapRed option is used");
+ }
+ mapRedOutputDirectory = new Path(getOption("mapRedOutputDir"));
+ if (hasOption("keepPct")) {
+ keepPct = Integer.parseInt(getOption("keepPct"));
+ }
+ if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
+ HadoopUtil.delete(getConf(), mapRedOutputDirectory);
+ }
+ } else {
+ if (!hasOption("trainingOutput")
+ || !hasOption("testOutput")) {
+ throw new OptionException(getCLIOption("trainingOutput"),
+ "trainingOutput and testOutput must be set if mapRed option is not used");
+ }
+ if (!hasOption("testSplitSize")
+ && !hasOption("testSplitPct")
+ && !hasOption("randomSelectionPct")
+ && !hasOption("randomSelectionSize")) {
+ throw new OptionException(getCLIOption("testSplitSize"),
+ "must set one of test split size/percentage or randomSelectionSize/percentage");
+ }
+
+ trainingOutputDirectory = new Path(getOption("trainingOutput"));
+ testOutputDirectory = new Path(getOption("testOutput"));
+ FileSystem fs = trainingOutputDirectory.getFileSystem(getConf());
+ if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
+ HadoopUtil.delete(fs.getConf(), trainingOutputDirectory);
+ HadoopUtil.delete(fs.getConf(), testOutputDirectory);
+ }
+ fs.mkdirs(trainingOutputDirectory);
+ fs.mkdirs(testOutputDirectory);
+ }
+
+ if (hasOption("charset")) {
+ charset = Charset.forName(getOption("charset"));
+ }
+
+ if (hasOption("testSplitSize") && hasOption("testSplitPct")) {
+ throw new OptionException(getCLIOption("testSplitPct"), "must have either split size or split percentage "
+ + "option, not BOTH");
+ }
+
+ if (hasOption("testSplitSize")) {
+ setTestSplitSize(Integer.parseInt(getOption("testSplitSize")));
+ }
+
+ if (hasOption("testSplitPct")) {
+ setTestSplitPct(Integer.parseInt(getOption("testSplitPct")));
+ }
+
+ if (hasOption("splitLocation")) {
+ setSplitLocation(Integer.parseInt(getOption("splitLocation")));
+ }
+
+ if (hasOption("randomSelectionSize")) {
+ setTestRandomSelectionSize(Integer.parseInt(getOption("randomSelectionSize")));
+ }
+
+ if (hasOption("randomSelectionPct")) {
+ setTestRandomSelectionPct(Integer.parseInt(getOption("randomSelectionPct")));
+ }
+
+ useSequence = hasOption("sequenceFiles");
+
+ } catch (OptionException e) {
+ log.error("Command-line option Exception", e);
+ CommandLineUtil.printHelp(getGroup());
+ return false;
+ }
+
+ validate();
+ return true;
+ }
+
+ /**
+ * Perform a split on directory specified by {@link #setInputDirectory(Path)} by calling {@link #splitFile(Path)}
+ * on each file found within that directory.
+ */
+ public void splitDirectory() throws IOException, ClassNotFoundException, InterruptedException {
+ this.splitDirectory(inputDirectory);
+ }
+
+ /**
+ * Perform a split on the specified directory by calling {@link #splitFile(Path)} on each file found within that
+ * directory.
+ */
+ public void splitDirectory(Path inputDir) throws IOException, ClassNotFoundException, InterruptedException {
+ Configuration conf = getConf();
+ splitDirectory(conf, inputDir);
+ }
+
+ /** See also {@link #splitDirectory(Path)}. */
+ public void splitDirectory(Configuration conf, Path inputDir)
+ throws IOException, ClassNotFoundException, InterruptedException {
+ FileSystem fs = inputDir.getFileSystem(conf);
+ if (fs.getFileStatus(inputDir) == null) {
+ throw new IOException(inputDir + " does not exist");
+ }
+ if (!fs.getFileStatus(inputDir).isDir()) {
+ throw new IOException(inputDir + " is not a directory");
+ }
+
+ if (useMapRed) {
+ SplitInputJob.run(conf, inputDir, mapRedOutputDirectory,
+ keepPct, testRandomSelectionPct);
+ } else {
+ // input dir contains one file per category.
+ FileStatus[] fileStats = fs.listStatus(inputDir, PathFilters.logsCRCFilter());
+ for (FileStatus inputFile : fileStats) {
+ if (!inputFile.isDir()) {
+ splitFile(inputFile.getPath());
+ }
+ }
+ }
+ }
+
+ /**
+ * Perform a split on the specified input file. Results will be written to files of the same name in the specified
+ * training and test output directories. The {@link #validate()} method is called prior to executing the split.
+ */
+ public void splitFile(Path inputFile) throws IOException {
+ Configuration conf = getConf();
+ FileSystem fs = inputFile.getFileSystem(conf);
+ if (fs.getFileStatus(inputFile) == null) {
+ throw new IOException(inputFile + " does not exist");
+ }
+ if (fs.getFileStatus(inputFile).isDir()) {
+ throw new IOException(inputFile + " is a directory");
+ }
+
+ validate();
+
+ Path testOutputFile = new Path(testOutputDirectory, inputFile.getName());
+ Path trainingOutputFile = new Path(trainingOutputDirectory, inputFile.getName());
+
+ int lineCount = countLines(fs, inputFile, charset);
+
+ log.info("{} has {} lines", inputFile.getName(), lineCount);
+
+ int testSplitStart = 0;
+ int testSplitSize = this.testSplitSize; // don't modify state
+ BitSet randomSel = null;
+
+ if (testRandomSelectionPct > 0 || testRandomSelectionSize > 0) {
+ testSplitSize = this.testRandomSelectionSize;
+
+ if (testRandomSelectionPct > 0) {
+ testSplitSize = Math.round(lineCount * testRandomSelectionPct / 100.0f);
+ }
+ log.info("{} test split size is {} based on random selection percentage {}",
+ inputFile.getName(), testSplitSize, testRandomSelectionPct);
+ long[] ridx = new long[testSplitSize];
+ RandomSampler.sample(testSplitSize, lineCount - 1, testSplitSize, 0, ridx, 0, RandomUtils.getRandom());
+ randomSel = new BitSet(lineCount);
+ for (long idx : ridx) {
+ randomSel.set((int) idx + 1);
+ }
+ } else {
+ if (testSplitPct > 0) { // calculate split size based on percentage
+ testSplitSize = Math.round(lineCount * testSplitPct / 100.0f);
+ log.info("{} test split size is {} based on percentage {}",
+ inputFile.getName(), testSplitSize, testSplitPct);
+ } else {
+ log.info("{} test split size is {}", inputFile.getName(), testSplitSize);
+ }
+
+ if (splitLocation > 0) { // calculate start of split based on percentage
+ testSplitStart = Math.round(lineCount * splitLocation / 100.0f);
+ if (lineCount - testSplitStart < testSplitSize) {
+ // adjust split start downwards based on split size.
+ testSplitStart = lineCount - testSplitSize;
+ }
+ log.info("{} test split start is {} based on split location {}",
+ inputFile.getName(), testSplitStart, splitLocation);
+ }
+
+ if (testSplitStart < 0) {
+ throw new IllegalArgumentException("test split size for " + inputFile + " is too large, it would produce an "
+ + "empty training set from the initial set of " + lineCount + " examples");
+ } else if (lineCount - testSplitSize < testSplitSize) {
+ log.warn("Test set size for {} may be too large, {} is larger than the number of "
+ + "lines remaining in the training set: {}",
+ inputFile, testSplitSize, lineCount - testSplitSize);
+ }
+ }
+ int trainCount = 0;
+ int testCount = 0;
+ if (!useSequence) {
+ try (BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(inputFile), charset));
+ Writer trainingWriter = new OutputStreamWriter(fs.create(trainingOutputFile), charset);
+ Writer testWriter = new OutputStreamWriter(fs.create(testOutputFile), charset)){
+
+ String line;
+ int pos = 0;
+ while ((line = reader.readLine()) != null) {
+ pos++;
+
+ Writer writer;
+ if (testRandomSelectionPct > 0) { // Randomly choose
+ writer = randomSel.get(pos) ? testWriter : trainingWriter;
+ } else { // Choose based on location
+ writer = pos > testSplitStart ? testWriter : trainingWriter;
+ }
+
+ if (writer == testWriter) {
+ if (testCount >= testSplitSize) {
+ writer = trainingWriter;
+ } else {
+ testCount++;
+ }
+ }
+ if (writer == trainingWriter) {
+ trainCount++;
+ }
+ writer.write(line);
+ writer.write('\n');
+ }
+
+ }
+ } else {
+ try (SequenceFileIterator<Writable, Writable> iterator =
+ new SequenceFileIterator<>(inputFile, false, fs.getConf());
+ SequenceFile.Writer trainingWriter = SequenceFile.createWriter(fs, fs.getConf(), trainingOutputFile,
+ iterator.getKeyClass(), iterator.getValueClass());
+ SequenceFile.Writer testWriter = SequenceFile.createWriter(fs, fs.getConf(), testOutputFile,
+ iterator.getKeyClass(), iterator.getValueClass())) {
+
+ int pos = 0;
+ while (iterator.hasNext()) {
+ pos++;
+ SequenceFile.Writer writer;
+ if (testRandomSelectionPct > 0) { // Randomly choose
+ writer = randomSel.get(pos) ? testWriter : trainingWriter;
+ } else { // Choose based on location
+ writer = pos > testSplitStart ? testWriter : trainingWriter;
+ }
+
+ if (writer == testWriter) {
+ if (testCount >= testSplitSize) {
+ writer = trainingWriter;
+ } else {
+ testCount++;
+ }
+ }
+ if (writer == trainingWriter) {
+ trainCount++;
+ }
+ Pair<Writable, Writable> pair = iterator.next();
+ writer.append(pair.getFirst(), pair.getSecond());
+ }
+
+ }
+ }
+ log.info("file: {}, input: {} train: {}, test: {} starting at {}",
+ inputFile.getName(), lineCount, trainCount, testCount, testSplitStart);
+
+ // notify the callback, if any, that this file's split is complete
+ if (callback != null) {
+ callback.splitComplete(inputFile, lineCount, trainCount, testCount, testSplitStart);
+ }
+ }
+
+ public int getTestSplitSize() {
+ return testSplitSize;
+ }
+
+ public void setTestSplitSize(int testSplitSize) {
+ this.testSplitSize = testSplitSize;
+ }
+
+ public int getTestSplitPct() {
+ return testSplitPct;
+ }
+
+ /**
+ * Sets the percentage of the input data to allocate to the test split
+ *
+ * @param testSplitPct a value between 0 and 100 inclusive.
+ */
+ public void setTestSplitPct(int testSplitPct) {
+ this.testSplitPct = testSplitPct;
+ }
+
+ /**
+ * Sets the percentage of the input data to keep in a map reduce split input job
+ *
+ * @param keepPct a value between 0 and 100 inclusive.
+ */
+ public void setKeepPct(int keepPct) {
+ this.keepPct = keepPct;
+ }
+
+ /**
+ * Set to true to use map reduce to split the input
+ *
+ * @param useMapRed a boolean to indicate whether map reduce should be used
+ */
+ public void setUseMapRed(boolean useMapRed) {
+ this.useMapRed = useMapRed;
+ }
+
+ public void setMapRedOutputDirectory(Path mapRedOutputDirectory) {
+ this.mapRedOutputDirectory = mapRedOutputDirectory;
+ }
+
+ public int getSplitLocation() {
+ return splitLocation;
+ }
+
+ /**
+ * Set the location of the start of the test/training data split. Expressed as percentage of lines, for example
+ * 0 indicates that the test data should be taken from the start of the file, 100 indicates that the test data
+ * should be taken from the end of the input file, while 25 indicates that the test data should be taken from the
+ * first quarter of the file.
+ * <p/>
+ * This option is only relevant in cases where random selection is not employed
+ *
+ * @param splitLocation a value between 0 and 100 inclusive.
+ */
+ public void setSplitLocation(int splitLocation) {
+ this.splitLocation = splitLocation;
+ }
+
+ public Charset getCharset() {
+ return charset;
+ }
+
+ /**
+ * Set the charset used to read and write files
+ */
+ public void setCharset(Charset charset) {
+ this.charset = charset;
+ }
+
+ public Path getInputDirectory() {
+ return inputDirectory;
+ }
+
+ /**
+ * Set the directory from which input data will be read when the {@link #splitDirectory()} method is invoked
+ */
+ public void setInputDirectory(Path inputDir) {
+ this.inputDirectory = inputDir;
+ }
+
+ public Path getTrainingOutputDirectory() {
+ return trainingOutputDirectory;
+ }
+
+ /**
+ * Set the directory to which training data will be written.
+ */
+ public void setTrainingOutputDirectory(Path trainingOutputDir) {
+ this.trainingOutputDirectory = trainingOutputDir;
+ }
+
+ public Path getTestOutputDirectory() {
+ return testOutputDirectory;
+ }
+
+ /**
+ * Set the directory to which test data will be written.
+ */
+ public void setTestOutputDirectory(Path testOutputDir) {
+ this.testOutputDirectory = testOutputDir;
+ }
+
+ public SplitCallback getCallback() {
+ return callback;
+ }
+
+ /**
+ * Sets the callback used to inform the caller that an input file has been successfully split
+ */
+ public void setCallback(SplitCallback callback) {
+ this.callback = callback;
+ }
+
+ public int getTestRandomSelectionSize() {
+ return testRandomSelectionSize;
+ }
+
+ /**
+ * Sets the number of random input samples that will be saved to the test set.
+ */
+ public void setTestRandomSelectionSize(int testRandomSelectionSize) {
+ this.testRandomSelectionSize = testRandomSelectionSize;
+ }
+
+ public int getTestRandomSelectionPct() {
+
+ return testRandomSelectionPct;
+ }
+
+ /**
+ * Sets the number of random input samples that will be saved to the test set, as a percentage of the size of the
+ * input set.
+ *
+ * @param randomSelectionPct a value between 0 and 100 inclusive.
+ */
+ public void setTestRandomSelectionPct(int randomSelectionPct) {
+ this.testRandomSelectionPct = randomSelectionPct;
+ }
+
+ /**
+ * Validates that the current instance is in a consistent state
+ *
+ * @throws IllegalArgumentException if settings violate class invariants.
+ * @throws IOException if output directories do not exist or are not directories.
+ */
+ public void validate() throws IOException {
+ Preconditions.checkArgument(testSplitSize >= 1 || testSplitSize == -1,
+ "Invalid testSplitSize: " + testSplitSize + ". Must be: testSplitSize >= 1 or testSplitSize = -1");
+ Preconditions.checkArgument(splitLocation >= 0 && splitLocation <= 100 || splitLocation == -1,
+ "Invalid splitLocation percentage: " + splitLocation + ". Must be: 0 <= splitLocation <= 100 or splitLocation = -1");
+ Preconditions.checkArgument(testSplitPct >= 0 && testSplitPct <= 100 || testSplitPct == -1,
+ "Invalid testSplitPct percentage: " + testSplitPct + ". Must be: 0 <= testSplitPct <= 100 or testSplitPct = -1");
+ Preconditions.checkArgument(testRandomSelectionPct >= 0 && testRandomSelectionPct <= 100
+ || testRandomSelectionPct == -1,"Invalid testRandomSelectionPct percentage: " + testRandomSelectionPct +
+ ". Must be: 0 <= testRandomSelectionPct <= 100 or testRandomSelectionPct = -1");
+
+ Preconditions.checkArgument(trainingOutputDirectory != null || useMapRed,
+ "No training output directory was specified");
+ Preconditions.checkArgument(testOutputDirectory != null || useMapRed, "No test output directory was specified");
+
+ // only one of the following may be set, one must be set.
+ int count = 0;
+ if (testSplitSize > 0) {
+ count++;
+ }
+ if (testSplitPct > 0) {
+ count++;
+ }
+ if (testRandomSelectionSize > 0) {
+ count++;
+ }
+ if (testRandomSelectionPct > 0) {
+ count++;
+ }
+
+ Preconditions.checkArgument(count == 1, "Exactly one of testSplitSize, testSplitPct, testRandomSelectionSize, "
+ + "testRandomSelectionPct should be set");
+
+ if (!useMapRed) {
+ Configuration conf = getConf();
+ FileSystem fs = trainingOutputDirectory.getFileSystem(conf);
+ FileStatus trainingOutputDirStatus = fs.getFileStatus(trainingOutputDirectory);
+ Preconditions.checkArgument(trainingOutputDirStatus != null && trainingOutputDirStatus.isDir(),
+ "%s is not a directory", trainingOutputDirectory);
+ FileStatus testOutputDirStatus = fs.getFileStatus(testOutputDirectory);
+ Preconditions.checkArgument(testOutputDirStatus != null && testOutputDirStatus.isDir(),
+ "%s is not a directory", testOutputDirectory);
+ }
+ }
+
+ /**
+ * Count the lines in the file specified as returned by {@code BufferedReader.readLine()}
+ *
+ * @param inputFile the file whose lines will be counted
+ * @param charset the charset of the file to read
+ * @return the number of lines in the input file.
+ * @throws IOException if there is a problem opening or reading the file.
+ */
+ public static int countLines(FileSystem fs, Path inputFile, Charset charset) throws IOException {
+ int lineCount = 0;
+ try (BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(inputFile), charset))){
+ while (reader.readLine() != null) {
+ lineCount++;
+ }
+ }
+ return lineCount;
+ }
+
+ /**
+ * Used to pass information back to a caller once a file has been split without the need for a data object
+ */
+ public interface SplitCallback {
+ void splitComplete(Path inputFile, int lineCount, int trainCount, int testCount, int testSplitStart);
+ }
+
+}
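
The class can also be driven programmatically, as sketched below; the paths are placeholders,
and the output directories are assumed to already exist since validate() checks them:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.mahout.utils.SplitInput;

// Hypothetical sketch, not part of this commit.
public class SplitInputExample {
  public static void main(String[] args) throws Exception {
    SplitInput splitter = new SplitInput();
    splitter.setConf(new Configuration());
    splitter.setTrainingOutputDirectory(new Path("/tmp/train")); // placeholder
    splitter.setTestOutputDirectory(new Path("/tmp/test"));      // placeholder
    splitter.setTestSplitPct(10);  // hold back 10% of each file as test data
    splitter.setSplitLocation(25); // take the test slice from the second quarter
    splitter.splitDirectory(new Path("/tmp/input"));             // placeholder
  }
}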

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/SplitInputJob.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/SplitInputJob.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/SplitInputJob.java
new file mode 100644
index 0000000..4a1ff86
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/SplitInputJob.java
@@ -0,0 +1,213 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils;
+
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.Random;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.io.WritableComparator;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterator;
+
+/**
+ * Class which implements a map reduce version of SplitInput.
+ * This class takes a SequenceFile input, e.g. a set of training data
+ * for a learning algorithm, downsamples it, applies a random
+ * permutation and splits it into test and training sets
+ */
+public final class SplitInputJob {
+
+ private static final String DOWNSAMPLING_FACTOR = "SplitInputJob.downsamplingFactor";
+ private static final String RANDOM_SELECTION_PCT = "SplitInputJob.randomSelectionPct";
+ private static final String TRAINING_TAG = "training";
+ private static final String TEST_TAG = "test";
+
+ private SplitInputJob() {}
+
+ /**
+ * Run job to downsample, randomly permute and split data into test and
+ * training sets. This job takes a SequenceFile as input and outputs two
+ * SequenceFiles test-r-00000 and training-r-00000 which contain the test and
+ * training sets respectively
+ *
+ * @param initialConf
+ * Initial configuration
+ * @param inputPath
+ * path to input data SequenceFile
+ * @param outputPath
+ * path for output data SequenceFiles
+ * @param keepPct
+ * percentage of key value pairs in input to keep. The rest are
+ * discarded
+ * @param randomSelectionPercent
+ * percentage of key value pairs to allocate to test set. Remainder
+ * are allocated to training set
+ */
+ @SuppressWarnings("rawtypes")
+ public static void run(Configuration initialConf, Path inputPath,
+ Path outputPath, int keepPct, float randomSelectionPercent)
+ throws IOException, ClassNotFoundException, InterruptedException {
+
+ int downsamplingFactor = (int) (100.0 / keepPct);
+ initialConf.setInt(DOWNSAMPLING_FACTOR, downsamplingFactor);
+ initialConf.setFloat(RANDOM_SELECTION_PCT, randomSelectionPercent);
+
+ // Determine class of keys and values
+ FileSystem fs = FileSystem.get(initialConf);
+
+ SequenceFileDirIterator<? extends WritableComparable, Writable> iterator =
+ new SequenceFileDirIterator<>(inputPath,
+ PathType.LIST, PathFilters.partFilter(), null, false, fs.getConf());
+ Class<? extends WritableComparable> keyClass;
+ Class<? extends Writable> valueClass;
+ if (iterator.hasNext()) {
+ Pair<? extends WritableComparable, Writable> pair = iterator.next();
+ keyClass = pair.getFirst().getClass();
+ valueClass = pair.getSecond().getClass();
+ } else {
+ throw new IllegalStateException("Couldn't determine class of the input values");
+ }
+
+ Job job = new Job(new Configuration(initialConf));
+
+ MultipleOutputs.addNamedOutput(job, TRAINING_TAG, SequenceFileOutputFormat.class, keyClass, valueClass);
+ MultipleOutputs.addNamedOutput(job, TEST_TAG, SequenceFileOutputFormat.class, keyClass, valueClass);
+ job.setJarByClass(SplitInputJob.class);
+ FileInputFormat.addInputPath(job, inputPath);
+ FileOutputFormat.setOutputPath(job, outputPath);
+ job.setNumReduceTasks(1);
+ job.setInputFormatClass(SequenceFileInputFormat.class);
+ job.setOutputFormatClass(SequenceFileOutputFormat.class);
+ job.setMapperClass(SplitInputMapper.class);
+ job.setReducerClass(SplitInputReducer.class);
+ job.setSortComparatorClass(SplitInputComparator.class);
+ job.setOutputKeyClass(keyClass);
+ job.setOutputValueClass(valueClass);
+ job.submit();
+ boolean succeeded = job.waitForCompletion(true);
+ if (!succeeded) {
+ throw new IllegalStateException("Job failed!");
+ }
+ }
+
+ /** Mapper which downsamples the input by downsamplingFactor */
+ public static class SplitInputMapper extends
+ Mapper<WritableComparable<?>, Writable, WritableComparable<?>, Writable> {
+
+ private int downsamplingFactor;
+
+ @Override
+ public void setup(Context ctx) {
+ downsamplingFactor = ctx.getConfiguration().getInt(DOWNSAMPLING_FACTOR, 1);
+ }
+
+ /** Only run map() for one out of every downsamplingFactor inputs */
+ @Override
+ public void run(Context context) throws IOException, InterruptedException {
+ setup(context);
+ int i = 0;
+ while (context.nextKeyValue()) {
+ if (i % downsamplingFactor == 0) {
+ map(context.getCurrentKey(), context.getCurrentValue(), context);
+ }
+ i++;
+ }
+ cleanup(context);
+ }
+
+ }
+
+ /** Reducer which uses MultipleOutputs to randomly allocate key value pairs between test and training outputs */
+ public static class SplitInputReducer extends
+ Reducer<WritableComparable<?>, Writable, WritableComparable<?>, Writable> {
+
+ private MultipleOutputs multipleOutputs;
+ private final Random rnd = RandomUtils.getRandom();
+ private float randomSelectionPercent;
+
+ @Override
+ protected void setup(Context ctx) throws IOException {
+ randomSelectionPercent = ctx.getConfiguration().getFloat(RANDOM_SELECTION_PCT, 0);
+ multipleOutputs = new MultipleOutputs(ctx);
+ }
+
+ /**
+ * Randomly allocate key value pairs between test and training sets.
+ * randomSelectionPercent of the pairs will go to the test set.
+ */
+ @Override
+ protected void reduce(WritableComparable<?> key, Iterable<Writable> values,
+ Context context) throws IOException, InterruptedException {
+ for (Writable value : values) {
+ if (rnd.nextInt(100) < randomSelectionPercent) {
+ multipleOutputs.write(TEST_TAG, key, value);
+ } else {
+ multipleOutputs.write(TRAINING_TAG, key, value);
+ }
+ }
+
+ }
+
+ @Override
+ protected void cleanup(Context context) throws IOException {
+ try {
+ multipleOutputs.close();
+ } catch (InterruptedException e) {
+ throw new IOException(e);
+ }
+ }
+
+ }
+
+ /** Randomly permute key value pairs */
+ public static class SplitInputComparator extends WritableComparator implements Serializable {
+
+ private final Random rnd = RandomUtils.getRandom();
+
+ protected SplitInputComparator() {
+ super(WritableComparable.class);
+ }
+
+ @Override
+ public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
+ if (rnd.nextBoolean()) {
+ return 1;
+ } else {
+ return -1;
+ }
+ }
+ }
+
+}
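
For reference, a minimal driver sketch for the job above; the class name, paths and percentages are illustrative, not from this commit:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;

    public class SplitDriver {
      public static void main(String[] args) throws Exception {
        // Keep every 10th key-value pair (keepPct = 10), then send 25% of the
        // survivors to test-r-00000 and the rest to training-r-00000.
        SplitInputJob.run(new Configuration(),
            new Path("data/vectors"),   // SequenceFile input (illustrative)
            new Path("data/split"),     // output directory (illustrative)
            10, 25.0f);
      }
    }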

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/clustering/AbstractClusterWriter.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/clustering/AbstractClusterWriter.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/clustering/AbstractClusterWriter.java
new file mode 100644
index 0000000..ac884d0
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/clustering/AbstractClusterWriter.java
@@ -0,0 +1,160 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.clustering;
+
+import java.io.IOException;
+import java.io.Writer;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.lang3.StringUtils;
+import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable;
+import org.apache.mahout.clustering.iterator.ClusterWritable;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.math.Vector;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.collect.Lists;
+
+/**
+ * Base class for implementing ClusterWriter
+ */
+public abstract class AbstractClusterWriter implements ClusterWriter {
+
+ private static final Logger log = LoggerFactory.getLogger(AbstractClusterWriter.class);
+
+ protected final Writer writer;
+ protected final Map<Integer, List<WeightedPropertyVectorWritable>> clusterIdToPoints;
+ protected final DistanceMeasure measure;
+
+ /**
+ *
+ * @param writer The underlying {@link java.io.Writer} to use
+ * @param clusterIdToPoints The map between cluster ids {@link org.apache.mahout.clustering.Cluster#getId()} and the
+ * points in the cluster
+ * @param measure The {@link org.apache.mahout.common.distance.DistanceMeasure} used to calculate the distance.
+ * Some writers may wish to use it for calculating weights for display. May be null.
+ */
+ protected AbstractClusterWriter(Writer writer, Map<Integer, List<WeightedPropertyVectorWritable>> clusterIdToPoints,
+ DistanceMeasure measure) {
+ this.writer = writer;
+ this.clusterIdToPoints = clusterIdToPoints;
+ this.measure = measure;
+ }
+
+ protected Writer getWriter() {
+ return writer;
+ }
+
+ protected Map<Integer, List<WeightedPropertyVectorWritable>> getClusterIdToPoints() {
+ return clusterIdToPoints;
+ }
+
+ public static String getTopFeatures(Vector vector, String[] dictionary, int numTerms) {
+
+ StringBuilder sb = new StringBuilder(100);
+
+ for (Pair<String, Double> item : getTopPairs(vector, dictionary, numTerms)) {
+ String term = item.getFirst();
+ sb.append("\n\t\t");
+ sb.append(StringUtils.rightPad(term, 40));
+ sb.append("=>");
+ sb.append(StringUtils.leftPad(item.getSecond().toString(), 20));
+ }
+ return sb.toString();
+ }
+
+ public static String getTopTerms(Vector vector, String[] dictionary, int numTerms) {
+
+ StringBuilder sb = new StringBuilder(100);
+
+ for (Pair<String, Double> item : getTopPairs(vector, dictionary, numTerms)) {
+ String term = item.getFirst();
+ sb.append(term).append('_');
+ }
+ sb.deleteCharAt(sb.length() - 1);
+ return sb.toString();
+ }
+
+ @Override
+ public long write(Iterable<ClusterWritable> iterable) throws IOException {
+ return write(iterable, Long.MAX_VALUE);
+ }
+
+ @Override
+ public void close() throws IOException {
+ writer.close();
+ }
+
+ @Override
+ public long write(Iterable<ClusterWritable> iterable, long maxDocs) throws IOException {
+ long result = 0;
+ Iterator<ClusterWritable> iterator = iterable.iterator();
+ while (result < maxDocs && iterator.hasNext()) {
+ write(iterator.next());
+ result++;
+ }
+ return result;
+ }
+
+ private static Collection<Pair<String, Double>> getTopPairs(Vector vector, String[] dictionary, int numTerms) {
+ List<TermIndexWeight> vectorTerms = Lists.newArrayList();
+
+ for (Vector.Element elt : vector.nonZeroes()) {
+ vectorTerms.add(new TermIndexWeight(elt.index(), elt.get()));
+ }
+
+ // Sort results in reverse order (ie weight in descending order)
+ Collections.sort(vectorTerms, new Comparator<TermIndexWeight>() {
+ @Override
+ public int compare(TermIndexWeight one, TermIndexWeight two) {
+ return Double.compare(two.weight, one.weight);
+ }
+ });
+
+ Collection<Pair<String, Double>> topTerms = Lists.newLinkedList();
+
+ for (int i = 0; i < vectorTerms.size() && i < numTerms; i++) {
+ int index = vectorTerms.get(i).index;
+ String dictTerm = dictionary[index];
+ if (dictTerm == null) {
+ log.error("Dictionary entry missing for {}", index);
+ continue;
+ }
+ topTerms.add(new Pair<>(dictTerm, vectorTerms.get(i).weight));
+ }
+
+ return topTerms;
+ }
+
+ private static class TermIndexWeight {
+ private final int index;
+ private final double weight;
+
+ TermIndexWeight(int index, double weight) {
+ this.index = index;
+ this.weight = weight;
+ }
+ }
+}
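
The static helpers getTopFeatures and getTopTerms above rank a vector's non-zero weights against a dictionary. A minimal sketch of calling them; the vector and dictionary are made up for illustration:

    import org.apache.mahout.math.DenseVector;
    import org.apache.mahout.math.Vector;

    Vector centroid = new DenseVector(new double[] {0.0, 0.42, 0.13, 0.91});
    String[] dictionary = {"alpha", "beta", "gamma", "delta"};
    // Prints the two heaviest terms, delta (0.91) then beta (0.42)
    System.out.println(AbstractClusterWriter.getTopFeatures(centroid, dictionary, 2));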

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/clustering/CSVClusterWriter.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/clustering/CSVClusterWriter.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/clustering/CSVClusterWriter.java
new file mode 100644
index 0000000..7269016
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/clustering/CSVClusterWriter.java
@@ -0,0 +1,69 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.clustering;
+
+import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable;
+import org.apache.mahout.clustering.iterator.ClusterWritable;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.math.NamedVector;
+import org.apache.mahout.math.Vector;
+
+import java.io.IOException;
+import java.io.Writer;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Pattern;
+
+/**
+ * Format is adjacency style as put forth at http://gephi.org/users/supported-graph-formats/csv-format/: the centroid
+ * is the first element and the rest of the row are the points in that cluster.
+ */
+public class CSVClusterWriter extends AbstractClusterWriter {
+
+ private static final Pattern VEC_PATTERN = Pattern.compile("\\{|\\:|\\,|\\}");
+
+ public CSVClusterWriter(Writer writer, Map<Integer, List<WeightedPropertyVectorWritable>> clusterIdToPoints,
+ DistanceMeasure measure) {
+ super(writer, clusterIdToPoints, measure);
+ }
+
+ @Override
+ public void write(ClusterWritable clusterWritable) throws IOException {
+ StringBuilder line = new StringBuilder();
+ Cluster cluster = clusterWritable.getValue();
+ line.append(cluster.getId());
+ List<WeightedPropertyVectorWritable> points = getClusterIdToPoints().get(cluster.getId());
+ if (points != null) {
+ for (WeightedPropertyVectorWritable point : points) {
+ Vector theVec = point.getVector();
+ line.append(',');
+ if (theVec instanceof NamedVector) {
+ line.append(((NamedVector)theVec).getName());
+ } else {
+ String vecStr = theVec.asFormatString();
+ //do some basic manipulations for display
+ vecStr = VEC_PATTERN.matcher(vecStr).replaceAll("_");
+ line.append(vecStr);
+ }
+ }
+ getWriter().append(line).append("\n");
+ }
+ }
+}
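
Given the write() above, a cluster with id 100 whose points are NamedVectors named docA and docB (illustrative values) comes out as one adjacency row:

    100,docA,docB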

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java
new file mode 100644
index 0000000..75b5ded
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java
@@ -0,0 +1,328 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.clustering;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.TreeMap;
+
+import com.google.common.io.Closeables;
+import com.google.common.io.Files;
+import org.apache.commons.io.Charsets;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.mahout.clustering.cdbw.CDbwEvaluator;
+import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable;
+import org.apache.mahout.clustering.evaluation.ClusterEvaluator;
+import org.apache.mahout.clustering.evaluation.RepresentativePointsDriver;
+import org.apache.mahout.clustering.iterator.ClusterWritable;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.ClassUtils;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable;
+import org.apache.mahout.utils.vectors.VectorHelper;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public final class ClusterDumper extends AbstractJob {
+
+ public static final String SAMPLE_POINTS = "samplePoints";
+ DistanceMeasure measure;
+
+ public enum OUTPUT_FORMAT {
+ TEXT,
+ CSV,
+ GRAPH_ML,
+ JSON,
+ }
+
+ public static final String DICTIONARY_TYPE_OPTION = "dictionaryType";
+ public static final String DICTIONARY_OPTION = "dictionary";
+ public static final String POINTS_DIR_OPTION = "pointsDir";
+ public static final String NUM_WORDS_OPTION = "numWords";
+ public static final String SUBSTRING_OPTION = "substring";
+ public static final String EVALUATE_CLUSTERS = "evaluate";
+
+ public static final String OUTPUT_FORMAT_OPT = "outputFormat";
+
+ private static final Logger log = LoggerFactory.getLogger(ClusterDumper.class);
+ private Path seqFileDir;
+ private Path pointsDir;
+ private long maxPointsPerCluster = Long.MAX_VALUE;
+ private String termDictionary;
+ private String dictionaryFormat;
+ private int subString = Integer.MAX_VALUE;
+ private int numTopFeatures = 10;
+ private Map<Integer, List<WeightedPropertyVectorWritable>> clusterIdToPoints;
+ private OUTPUT_FORMAT outputFormat = OUTPUT_FORMAT.TEXT;
+ private boolean runEvaluation;
+
+ public ClusterDumper(Path seqFileDir, Path pointsDir) {
+ this.seqFileDir = seqFileDir;
+ this.pointsDir = pointsDir;
+ init();
+ }
+
+ public ClusterDumper() {
+ setConf(new Configuration());
+ }
+
+ public static void main(String[] args) throws Exception {
+ new ClusterDumper().run(args);
+ }
+
+ @Override
+ public int run(String[] args) throws Exception {
+ addInputOption();
+ addOutputOption();
+ addOption(OUTPUT_FORMAT_OPT, "of", "The optional output format for the results. Options: TEXT, CSV, JSON or GRAPH_ML",
+ "TEXT");
+ addOption(SUBSTRING_OPTION, "b", "The number of chars of the asFormatString() to print");
+ addOption(NUM_WORDS_OPTION, "n", "The number of top terms to print");
+ addOption(POINTS_DIR_OPTION, "p",
+ "The directory containing points sequence files mapping input vectors to their cluster. "
+ + "If specified, then the program will output the points associated with a cluster");
+ addOption(SAMPLE_POINTS, "sp", "Specifies the maximum number of points to include _per_ cluster. The default "
+ + "is to include all points");
+ addOption(DICTIONARY_OPTION, "d", "The dictionary file");
+ addOption(DICTIONARY_TYPE_OPTION, "dt", "The dictionary file type (text|sequencefile)", "text");
+ addOption(buildOption(EVALUATE_CLUSTERS, "e", "Run ClusterEvaluator and CDbwEvaluator over the input. "
+ + "The output will be appended to the rest of the output at the end.", false, false, null));
+ addOption(DefaultOptionCreator.distanceMeasureOption().create());
+
+ // output is optional; prints to System.out by default
+ if (parseArguments(args, false, true) == null) {
+ return -1;
+ }
+
+ seqFileDir = getInputPath();
+ if (hasOption(POINTS_DIR_OPTION)) {
+ pointsDir = new Path(getOption(POINTS_DIR_OPTION));
+ }
+ outputFile = getOutputFile();
+ if (hasOption(SUBSTRING_OPTION)) {
+ int sub = Integer.parseInt(getOption(SUBSTRING_OPTION));
+ if (sub >= 0) {
+ subString = sub;
+ }
+ }
+ termDictionary = getOption(DICTIONARY_OPTION);
+ dictionaryFormat = getOption(DICTIONARY_TYPE_OPTION);
+ if (hasOption(NUM_WORDS_OPTION)) {
+ numTopFeatures = Integer.parseInt(getOption(NUM_WORDS_OPTION));
+ }
+ if (hasOption(OUTPUT_FORMAT_OPT)) {
+ outputFormat = OUTPUT_FORMAT.valueOf(getOption(OUTPUT_FORMAT_OPT));
+ }
+ if (hasOption(SAMPLE_POINTS)) {
+ maxPointsPerCluster = Long.parseLong(getOption(SAMPLE_POINTS));
+ } else {
+ maxPointsPerCluster = Long.MAX_VALUE;
+ }
+ runEvaluation = hasOption(EVALUATE_CLUSTERS);
+ String distanceMeasureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
+ measure = ClassUtils.instantiateAs(distanceMeasureClass, DistanceMeasure.class);
+
+ init();
+ printClusters(null);
+ return 0;
+ }
+
+ public void printClusters(String[] dictionary) throws Exception {
+ Configuration conf = new Configuration();
+
+ if (this.termDictionary != null) {
+ if ("text".equals(dictionaryFormat)) {
+ dictionary = VectorHelper.loadTermDictionary(new File(this.termDictionary));
+ } else if ("sequencefile".equals(dictionaryFormat)) {
+ dictionary = VectorHelper.loadTermDictionary(conf, this.termDictionary);
+ } else {
+ throw new IllegalArgumentException("Invalid dictionary format");
+ }
+ }
+
+ Writer writer;
+ boolean shouldClose;
+ if (this.outputFile == null) {
+ shouldClose = false;
+ writer = new OutputStreamWriter(System.out, Charsets.UTF_8);
+ } else {
+ shouldClose = true;
+ if (outputFile.getPath().startsWith("s3n://")) {
+ Path p = outputPath;
+ FileSystem fs = FileSystem.get(p.toUri(), conf);
+ writer = new OutputStreamWriter(fs.create(p), Charsets.UTF_8);
+ } else {
+ Files.createParentDirs(outputFile);
+ writer = Files.newWriter(this.outputFile, Charsets.UTF_8);
+ }
+ }
+ ClusterWriter clusterWriter = createClusterWriter(writer, dictionary);
+ try {
+ long numWritten = clusterWriter.write(new SequenceFileDirValueIterable<ClusterWritable>(new Path(seqFileDir,
+ "part-*"), PathType.GLOB, conf));
+
+ writer.flush();
+ if (runEvaluation) {
+ HadoopUtil.delete(conf, new Path("tmp/representative"));
+ int numIters = 5;
+ RepresentativePointsDriver.main(new String[]{
+ "--input", seqFileDir.toString(),
+ "--output", "tmp/representative",
+ "--clusteredPoints", pointsDir.toString(),
+ "--distanceMeasure", measure.getClass().getName(),
+ "--maxIter", String.valueOf(numIters)
+ });
+ conf.set(RepresentativePointsDriver.DISTANCE_MEASURE_KEY, measure.getClass().getName());
+ conf.set(RepresentativePointsDriver.STATE_IN_KEY, "tmp/representative/representativePoints-" + numIters);
+ ClusterEvaluator ce = new ClusterEvaluator(conf, seqFileDir);
+ writer.append("\n");
+ writer.append("Inter-Cluster Density: ").append(String.valueOf(ce.interClusterDensity())).append("\n");
+ writer.append("Intra-Cluster Density: ").append(String.valueOf(ce.intraClusterDensity())).append("\n");
+ CDbwEvaluator cdbw = new CDbwEvaluator(conf, seqFileDir);
+ writer.append("CDbw Inter-Cluster Density: ").append(String.valueOf(cdbw.interClusterDensity())).append("\n");
+ writer.append("CDbw Intra-Cluster Density: ").append(String.valueOf(cdbw.intraClusterDensity())).append("\n");
+ writer.append("CDbw Separation: ").append(String.valueOf(cdbw.separation())).append("\n");
+ writer.flush();
+ }
+ log.info("Wrote {} clusters", numWritten);
+ } finally {
+ if (shouldClose) {
+ Closeables.close(clusterWriter, false);
+ } else {
+ if (clusterWriter instanceof GraphMLClusterWriter) {
+ clusterWriter.close();
+ }
+ }
+ }
+ }
+
+ ClusterWriter createClusterWriter(Writer writer, String[] dictionary) throws IOException {
+ ClusterWriter result;
+
+ switch (outputFormat) {
+ case TEXT:
+ result = new ClusterDumperWriter(writer, clusterIdToPoints, measure, numTopFeatures, dictionary, subString);
+ break;
+ case CSV:
+ result = new CSVClusterWriter(writer, clusterIdToPoints, measure);
+ break;
+ case GRAPH_ML:
+ result = new GraphMLClusterWriter(writer, clusterIdToPoints, measure, numTopFeatures, dictionary, subString);
+ break;
+ case JSON:
+ result = new JsonClusterWriter(writer, clusterIdToPoints, measure, numTopFeatures, dictionary);
+ break;
+ default:
+ throw new IllegalStateException("Unknown outputformat: " + outputFormat);
+ }
+ return result;
+ }
+
+ /**
+ * Convenience function to set the output format during testing.
+ */
+ public void setOutputFormat(OUTPUT_FORMAT of) {
+ outputFormat = of;
+ }
+
+ private void init() {
+ if (this.pointsDir != null) {
+ Configuration conf = new Configuration();
+ // read in the points
+ clusterIdToPoints = readPoints(this.pointsDir, maxPointsPerCluster, conf);
+ } else {
+ clusterIdToPoints = Collections.emptyMap();
+ }
+ }
+
+
+ public int getSubString() {
+ return subString;
+ }
+
+ public void setSubString(int subString) {
+ this.subString = subString;
+ }
+
+ public Map<Integer, List<WeightedPropertyVectorWritable>> getClusterIdToPoints() {
+ return clusterIdToPoints;
+ }
+
+ public String getTermDictionary() {
+ return termDictionary;
+ }
+
+ public void setTermDictionary(String termDictionary, String dictionaryType) {
+ this.termDictionary = termDictionary;
+ this.dictionaryFormat = dictionaryType;
+ }
+
+ public void setNumTopFeatures(int num) {
+ this.numTopFeatures = num;
+ }
+
+ public int getNumTopFeatures() {
+ return this.numTopFeatures;
+ }
+
+ public long getMaxPointsPerCluster() {
+ return maxPointsPerCluster;
+ }
+
+ public void setMaxPointsPerCluster(long maxPointsPerCluster) {
+ this.maxPointsPerCluster = maxPointsPerCluster;
+ }
+
+ public static Map<Integer, List<WeightedPropertyVectorWritable>> readPoints(Path pointsPathDir,
+ long maxPointsPerCluster,
+ Configuration conf) {
+ Map<Integer, List<WeightedPropertyVectorWritable>> result = new TreeMap<>();
+ for (Pair<IntWritable, WeightedPropertyVectorWritable> record
+ : new SequenceFileDirIterable<IntWritable, WeightedPropertyVectorWritable>(pointsPathDir, PathType.LIST,
+ PathFilters.logsCRCFilter(), conf)) {
+ // key is the cluster id as an int; the value is the weighted vector
+ // for a point in that cluster
+ int keyValue = record.getFirst().get();
+ List<WeightedPropertyVectorWritable> pointList = result.get(keyValue);
+ if (pointList == null) {
+ pointList = new ArrayList<>();
+ result.put(keyValue, pointList);
+ }
+ if (pointList.size() < maxPointsPerCluster) {
+ pointList.add(record.getSecond());
+ }
+ }
+ return result;
+ }
+}
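
A typical command-line invocation of the dumper above, assuming the standard mahout launcher script; all paths are illustrative:

    mahout clusterdump \
      -i kmeans/clusters-10-final \
      -p kmeans/clusteredPoints \
      -d dictionary.file-0 -dt sequencefile \
      -n 20 -of TEXT \
      -o clusteranalysis.txt

Omitting -o prints the report to System.out, and adding -e appends the ClusterEvaluator and CDbwEvaluator densities to the end of the output.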

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumperWriter.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumperWriter.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumperWriter.java
new file mode 100644
index 0000000..31858c4
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumperWriter.java
@@ -0,0 +1,100 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.clustering;
+
+import org.apache.hadoop.io.Text;
+import org.apache.mahout.clustering.AbstractCluster;
+import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable;
+import org.apache.mahout.clustering.iterator.ClusterWritable;
+import org.apache.mahout.common.distance.DistanceMeasure;
+
+import java.io.IOException;
+import java.io.Writer;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Implements a {@link ClusterWriter} that outputs in the format used by ClusterDumper in Mahout 0.5
+ */
+public class ClusterDumperWriter extends AbstractClusterWriter {
+
+ private final int subString;
+ private final String[] dictionary;
+ private final int numTopFeatures;
+
+ public ClusterDumperWriter(Writer writer, Map<Integer,List<WeightedPropertyVectorWritable>> clusterIdToPoints,
+ DistanceMeasure measure, int numTopFeatures, String[] dictionary, int subString) {
+ super(writer, clusterIdToPoints, measure);
+ this.numTopFeatures = numTopFeatures;
+ this.dictionary = dictionary;
+ this.subString = subString;
+ }
+
+ @Override
+ public void write(ClusterWritable clusterWritable) throws IOException {
+ Cluster cluster = clusterWritable.getValue();
+ String fmtStr = cluster.asFormatString(dictionary);
+ Writer writer = getWriter();
+ if (subString > 0 && fmtStr.length() > subString) {
+ writer.write(':');
+ writer.write(fmtStr, 0, Math.min(subString, fmtStr.length()));
+ } else {
+ writer.write(fmtStr);
+ }
+
+ writer.write('\n');
+
+ if (dictionary != null) {
+ String topTerms = getTopFeatures(clusterWritable.getValue().getCenter(), dictionary, numTopFeatures);
+ writer.write("\tTop Terms: ");
+ writer.write(topTerms);
+ writer.write('\n');
+ }
+
+ Map<Integer,List<WeightedPropertyVectorWritable>> clusterIdToPoints = getClusterIdToPoints();
+ List<WeightedPropertyVectorWritable> points = clusterIdToPoints.get(clusterWritable.getValue().getId());
+ if (points != null) {
+ writer.write("\tWeight : [props - optional]: Point:\n\t");
+ for (Iterator<WeightedPropertyVectorWritable> iterator = points.iterator(); iterator.hasNext();) {
+ WeightedPropertyVectorWritable point = iterator.next();
+ writer.write(String.valueOf(point.getWeight()));
+ Map<Text,Text> map = point.getProperties();
+ // map can be null because empty maps, once written, are read back as null
+ writer.write(" : [");
+ if (map != null) {
+ for (Map.Entry<Text,Text> entry : map.entrySet()) {
+ writer.write(entry.getKey().toString());
+ writer.write("=");
+ writer.write(entry.getValue().toString());
+ }
+ }
+ writer.write("]");
+
+ writer.write(": ");
+
+ writer.write(AbstractCluster.formatVector(point.getVector(), dictionary));
+ if (iterator.hasNext()) {
+ writer.write("\n\t");
+ }
+ }
+ writer.write('\n');
+ }
+ }
+}
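
A sketch of driving this writer directly rather than through ClusterDumper; clusterIdToPoints, measure, dictionary and clusters stand in for values a caller would already have:

    Writer out = new OutputStreamWriter(System.out, "UTF-8");
    ClusterWriter dumper = new ClusterDumperWriter(out, clusterIdToPoints,
        measure, 10 /* top terms */, dictionary, 100 /* substring cap */);
    dumper.write(clusters);   // Iterable<ClusterWritable>
    dumper.close();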

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterWriter.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterWriter.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterWriter.java
new file mode 100644
index 0000000..70f8f6f
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterWriter.java
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.clustering;
+
+import java.io.Closeable;
+import java.io.IOException;
+
+import org.apache.mahout.clustering.iterator.ClusterWritable;
+
+/**
+ * Writes out clusters
+ */
+public interface ClusterWriter extends Closeable {
+
+ /**
+ * Write all values in the Iterable to the output
+ *
+ * @param iterable The {@link Iterable} to loop over
+ * @return the number of docs written
+ * @throws java.io.IOException if there was a problem writing
+ */
+ long write(Iterable<ClusterWritable> iterable) throws IOException;
+
+ /**
+ * Write out a Cluster
+ */
+ void write(ClusterWritable clusterWritable) throws IOException;
+
+ /**
+ * Write the first {@code maxDocs} to the output.
+ *
+ * @param iterable The {@link Iterable} to loop over
+ * @param maxDocs the maximum number of docs to write
+ * @return The number of docs written
+ * @throws IOException if there was a problem writing
+ */
+ long write(Iterable<ClusterWritable> iterable, long maxDocs) throws IOException;
+}
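
A minimal implementation sketch of the interface above, counting clusters instead of formatting them; the maxDocs overload mirrors the delegation used by AbstractClusterWriter:

    import java.io.IOException;
    import org.apache.mahout.clustering.iterator.ClusterWritable;

    public class CountingClusterWriter implements ClusterWriter {
      private long count;

      @Override
      public void write(ClusterWritable clusterWritable) {
        count++;   // a real writer would format and emit the cluster here
      }

      @Override
      public long write(Iterable<ClusterWritable> iterable) throws IOException {
        return write(iterable, Long.MAX_VALUE);
      }

      @Override
      public long write(Iterable<ClusterWritable> iterable, long maxDocs) throws IOException {
        long written = 0;
        for (ClusterWritable cw : iterable) {
          if (written >= maxDocs) {
            break;
          }
          write(cw);
          written++;
        }
        return written;
      }

      @Override
      public void close() {
        // nothing to release
      }
    }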

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/clustering/GraphMLClusterWriter.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/clustering/GraphMLClusterWriter.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/clustering/GraphMLClusterWriter.java
new file mode 100644
index 0000000..25e8f3b
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/clustering/GraphMLClusterWriter.java
@@ -0,0 +1,216 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.clustering;
+
+import java.io.IOException;
+import java.io.Writer;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+import java.util.regex.Pattern;
+
+import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable;
+import org.apache.mahout.clustering.classify.WeightedVectorWritable;
+import org.apache.mahout.clustering.iterator.ClusterWritable;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.common.StringUtils;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.math.NamedVector;
+import org.apache.mahout.math.Vector;
+
+/**
+ * GraphML -- see http://gephi.org/users/supported-graph-formats/graphml-format/
+ */
+public class GraphMLClusterWriter extends AbstractClusterWriter {
+
+ private static final Pattern VEC_PATTERN = Pattern.compile("\\{|\\:|\\,|\\}");
+ private final Map<Integer, Color> colors = new HashMap<>();
+ private Color lastClusterColor;
+ private float lastX;
+ private float lastY;
+ private Random random;
+ private int posStep;
+ private final String[] dictionary;
+ private final int numTopFeatures;
+ private final int subString;
+
+ public GraphMLClusterWriter(Writer writer, Map<Integer, List<WeightedPropertyVectorWritable>> clusterIdToPoints,
+ DistanceMeasure measure, int numTopFeatures, String[] dictionary, int subString)
+ throws IOException {
+ super(writer, clusterIdToPoints, measure);
+ this.dictionary = dictionary;
+ this.numTopFeatures = numTopFeatures;
+ this.subString = subString;
+ init(writer);
+ }
+
+ private void init(Writer writer) throws IOException {
+ writer.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
+ writer.append("<graphml xmlns=\"http://graphml.graphdrawing.org/xmlns\"\n"
+ + "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"\n"
+ + "xsi:schemaLocation=\"http://graphml.graphdrawing.org/xmlns\n"
+ + "http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd\">");
+ //support rgb
+ writer.append("<key attr.name=\"r\" attr.type=\"int\" for=\"node\" id=\"r\"/>\n"
+ + "<key attr.name=\"g\" attr.type=\"int\" for=\"node\" id=\"g\"/>\n"
+ + "<key attr.name=\"b\" attr.type=\"int\" for=\"node\" id=\"b\"/>"
+ + "<key attr.name=\"size\" attr.type=\"int\" for=\"node\" id=\"size\"/>"
+ + "<key attr.name=\"weight\" attr.type=\"float\" for=\"edge\" id=\"weight\"/>"
+ + "<key attr.name=\"x\" attr.type=\"float\" for=\"node\" id=\"x\"/>"
+ + "<key attr.name=\"y\" attr.type=\"float\" for=\"node\" id=\"y\"/>");
+ writer.append("<graph edgedefault=\"undirected\">");
+ lastClusterColor = new Color();
+ posStep = (int) (0.1 * clusterIdToPoints.size()) + 100;
+ random = RandomUtils.getRandom();
+ }
+
+ /*
+ <?xml version="1.0" encoding="UTF-8"?>
+ <graphml xmlns="http://graphml.graphdrawing.org/xmlns"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://graphml.graphdrawing.org/xmlns
+ http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd">
+ <graph id="G" edgedefault="undirected">
+ <node id="n0"/>
+ <node id="n1"/>
+ <edge id="e1" source="n0" target="n1"/>
+ </graph>
+ </graphml>
+ */
+
+ @Override
+ public void write(ClusterWritable clusterWritable) throws IOException {
+ StringBuilder line = new StringBuilder();
+ Cluster cluster = clusterWritable.getValue();
+ Color rgb = getColor(cluster.getId());
+
+ String topTerms = "";
+ if (dictionary != null) {
+ topTerms = getTopTerms(cluster.getCenter(), dictionary, numTopFeatures);
+ }
+ String clusterLabel = String.valueOf(cluster.getId()) + '_' + topTerms;
+ //do some positioning so that items are visible and grouped together
+ //TODO: put in a real layout algorithm
+ float x = lastX + 1000;
+ float y = lastY;
+ if (x > (1000 + posStep)) {
+ y = lastY + 1000;
+ x = 0;
+ }
+
+ line.append(createNode(clusterLabel, rgb, x, y));
+ List<WeightedPropertyVectorWritable> points = clusterIdToPoints.get(cluster.getId());
+ if (points != null) {
+ for (WeightedVectorWritable point : points) {
+ Vector theVec = point.getVector();
+ double distance = 1;
+ if (measure != null) {
+ //scale the distance
+ distance = measure.distance(cluster.getCenter().getLengthSquared(), cluster.getCenter(), theVec) * 500;
+ }
+ String vecStr;
+ int angle = random.nextInt(360); //pick an angle at random and then scale along that angle
+ double angleRads = Math.toRadians(angle);
+
+ float targetX = x + (float) (distance * Math.cos(angleRads));
+ float targetY = y + (float) (distance * Math.sin(angleRads));
+ if (theVec instanceof NamedVector) {
+ vecStr = ((NamedVector) theVec).getName();
+ } else {
+ vecStr = theVec.asFormatString();
+ //do some basic manipulations for display
+ vecStr = VEC_PATTERN.matcher(vecStr).replaceAll("_");
+ }
+ if (subString > 0 && vecStr.length() > subString) {
+ vecStr = vecStr.substring(0, subString);
+ }
+ line.append(createNode(vecStr, rgb, targetX, targetY));
+ line.append(createEdge(clusterLabel, vecStr, distance));
+ }
+ }
+ lastClusterColor = rgb;
+ lastX = x;
+ lastY = y;
+ getWriter().append(line).append("\n");
+ }
+
+ private Color getColor(int clusterId) {
+ Color result = colors.get(clusterId);
+ if (result == null) {
+ result = new Color();
+ //there is probably some better way to color a graph
+ int incR = 0;
+ int incG = 0;
+ int incB = 0;
+ if (lastClusterColor.r + 20 < 256 && lastClusterColor.g + 20 < 256 && lastClusterColor.b + 20 < 256) {
+ incR = 20;
+ incG = 0;
+ incB = 0;
+ } else if (lastClusterColor.r + 20 >= 256 && lastClusterColor.g + 20 < 256 && lastClusterColor.b + 20 < 256) {
+ incG = 20;
+ incB = 0;
+ } else if (lastClusterColor.r + 20 >= 256 && lastClusterColor.g + 20 >= 256 && lastClusterColor.b + 20 < 256) {
+ incB = 20;
+ } else {
+ incR += 3;
+ incG += 3;
+ incB += 3;
+ }
+ result.r = (lastClusterColor.r + incR) % 256;
+ result.g = (lastClusterColor.g + incG) % 256;
+ result.b = (lastClusterColor.b + incB) % 256;
+ colors.put(clusterId, result);
+ }
+ return result;
+ }
+
+ private static String createEdge(String left, String right, double distance) {
+ left = StringUtils.escapeXML(left);
+ right = StringUtils.escapeXML(right);
+ return "<edge id=\"" + left + '_' + right + "\" source=\"" + left + "\" target=\"" + right + "\">"
+ + "<data key=\"weight\">" + distance + "</data></edge>";
+ }
+
+ private static String createNode(String s, Color rgb, float x, float y) {
+ return "<node id=\"" + StringUtils.escapeXML(s) + "\"><data key=\"r\">" + rgb.r
+ + "</data>"
+ + "<data key=\"g\">" + rgb.g
+ + "</data>"
+ + "<data key=\"b\">" + rgb.b
+ + "</data>"
+ + "<data key=\"x\">" + x
+ + "</data>"
+ + "<data key=\"y\">" + y
+ + "</data>"
+ + "</node>";
+ }
+
+ @Override
+ public void close() throws IOException {
+ getWriter().append("</graph>").append("</graphml>");
+ super.close();
+ }
+
+ private static class Color {
+ int r;
+ int g;
+ int b;
+ }
+}
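
Concretely, createNode and createEdge above emit fragments of this shape (ids, colors and coordinates illustrative):

    <node id="docA"><data key="r">20</data><data key="g">0</data><data key="b">0</data><data key="x">512.3</data><data key="y">0.0</data></node>
    <edge id="1_foo_docA" source="1_foo" target="docA"><data key="weight">312.5</data></edge>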

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/clustering/JsonClusterWriter.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/clustering/JsonClusterWriter.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/clustering/JsonClusterWriter.java
new file mode 100644
index 0000000..d564a73
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/clustering/JsonClusterWriter.java
@@ -0,0 +1,188 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.utils.clustering;
+
+import java.io.IOException;
+import java.io.Writer;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Pattern;
+
+import org.apache.mahout.clustering.AbstractCluster;
+import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable;
+import org.apache.mahout.clustering.iterator.ClusterWritable;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.math.NamedVector;
+import org.apache.mahout.math.Vector;
+import org.codehaus.jackson.map.ObjectMapper;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Dumps cluster info as JSON-formatted lines. Heavily inspired by
+ * ClusterDumperWriter.java and CSVClusterWriter.java
+ *
+ */
+public class JsonClusterWriter extends AbstractClusterWriter {
+ private final String[] dictionary;
+ private final int numTopFeatures;
+ private final ObjectMapper jxn;
+
+ private static final Logger log = LoggerFactory.getLogger(JsonClusterWriter.class);
+ private static final Pattern VEC_PATTERN = Pattern.compile("\\{|\\:|\\,|\\}");
+
+ public JsonClusterWriter(Writer writer,
+ Map<Integer, List<WeightedPropertyVectorWritable>> clusterIdToPoints,
+ DistanceMeasure measure, int numTopFeatures, String[] dictionary) {
+ super(writer, clusterIdToPoints, measure);
+ this.numTopFeatures = numTopFeatures;
+ this.dictionary = dictionary;
+ jxn = new ObjectMapper();
+ }
+
+ /**
+ * Generates a HashMap with cluster info and writes it as a single
+ * JSON-formatted line
+ */
+ @Override
+ public void write(ClusterWritable clusterWritable) throws IOException {
+ Map<String, Object> res = new HashMap<>();
+
+ // get top terms
+ if (dictionary != null) {
+ List<Object> topTerms = getTopFeaturesList(clusterWritable.getValue()
+ .getCenter(), dictionary, numTopFeatures);
+ res.put("top_terms", topTerms);
+ } else {
+ res.put("top_terms", new ArrayList<>());
+ }
+
+ // get human-readable cluster representation
+ Cluster cluster = clusterWritable.getValue();
+ res.put("cluster_id", cluster.getId());
+
+ if (dictionary != null) {
+ Map<String,Object> fmtStr = cluster.asJson(dictionary);
+ res.put("cluster", fmtStr);
+
+ // get points
+ List<Object> points = getPoints(cluster, dictionary);
+ res.put("points", points);
+ } else {
+ res.put("cluster", new HashMap<>());
+ res.put("points", new ArrayList<>());
+ }
+
+ // write JSON
+ Writer writer = getWriter();
+ writer.write(jxn.writeValueAsString(res) + "\n");
+ }
+
+ /**
+ * Create a List of HashMaps containing top terms information
+ *
+ * @return {@code List<Object>}
+ */
+ public List<Object> getTopFeaturesList(Vector vector, String[] dictionary,
+ int numTerms) {
+
+ List<TermIndexWeight> vectorTerms = new ArrayList<>();
+
+ for (Vector.Element elt : vector.nonZeroes()) {
+ vectorTerms.add(new TermIndexWeight(elt.index(), elt.get()));
+ }
+
+ // Sort results in reverse order (i.e. weight in descending order)
+ Collections.sort(vectorTerms, new Comparator<TermIndexWeight>() {
+ @Override
+ public int compare(TermIndexWeight one, TermIndexWeight two) {
+ return Double.compare(two.weight, one.weight);
+ }
+ });
+
+ List<Object> topTerms = new ArrayList<>();
+
+ for (int i = 0; i < vectorTerms.size() && i < numTerms; i++) {
+ int index = vectorTerms.get(i).index;
+ String dictTerm = dictionary[index];
+ if (dictTerm == null) {
+ log.error("Dictionary entry missing for {}", index);
+ continue;
+ }
+ Map<String, Object> termEntry = new HashMap<>();
+ termEntry.put(dictTerm, vectorTerms.get(i).weight);
+ topTerms.add(termEntry);
+ }
+
+ return topTerms;
+ }
+
+ /**
+ * Create a List of HashMaps containing Vector point information
+ *
+ * @return {@code List<Object>}
+ */
+ public List<Object> getPoints(Cluster cluster, String[] dictionary) {
+ List<Object> vectorObjs = new ArrayList<>();
+ List<WeightedPropertyVectorWritable> points = getClusterIdToPoints().get(
+ cluster.getId());
+
+ if (points != null) {
+ for (WeightedPropertyVectorWritable point : points) {
+ Map<String, Object> entry = new HashMap<>();
+ Vector theVec = point.getVector();
+ if (theVec instanceof NamedVector) {
+ entry.put("vector_name", ((NamedVector) theVec).getName());
+ } else {
+ String vecStr = theVec.asFormatString();
+ // do some basic manipulations for display
+ vecStr = VEC_PATTERN.matcher(vecStr).replaceAll("_");
+ entry.put("vector_name", vecStr);
+ }
+ entry.put("weight", String.valueOf(point.getWeight()));
+ try {
+ entry.put("point",
+ AbstractCluster.formatVectorAsJson(point.getVector(), dictionary));
+ } catch (IOException e) {
+ log.error("IOException: ", e);
+ }
+ vectorObjs.add(entry);
+ }
+ }
+ return vectorObjs;
+ }
+
+ /**
+ * Convenience class for sorting terms
+ *
+ */
+ private static class TermIndexWeight {
+ private final int index;
+ private final double weight;
+
+ TermIndexWeight(int index, double weight) {
+ this.index = index;
+ this.weight = weight;
+ }
+ }
+
+}
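
Each call to write() above emits one line of this shape (values illustrative; nested cluster and point objects elided):

    {"top_terms":[{"delta":0.91},{"beta":0.42}],"cluster_id":100,"cluster":{...},"points":[{"vector_name":"docA","weight":"1.0","point":{...}}]}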

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/email/MailOptions.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/email/MailOptions.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/email/MailOptions.java
new file mode 100644
index 0000000..54ad43f
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/email/MailOptions.java
@@ -0,0 +1,186 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.email;
+
+import java.io.File;
+import java.nio.charset.Charset;
+import java.util.Map;
+import java.util.regex.Pattern;
+
+/**
+ * Configuration options to be used by {@link MailProcessor}. Includes options controlling the exact output format
+ * and which mail fields are included (body, to, from, subject, etc.)
+ */
+public class MailOptions {
+
+ public static final String FROM = "FROM";
+ public static final String TO = "TO";
+ public static final String REFS = "REFS";
+ public static final String SUBJECT = "SUBJECT";
+ public static final Pattern DEFAULT_QUOTED_TEXT = Pattern.compile("^(\\||>)");
+
+ private boolean stripQuotedText;
+ private File input;
+ private String outputDir;
+ private String prefix;
+ private int chunkSize;
+ private Charset charset;
+ private String separator;
+ private String bodySeparator = "\n";
+ private boolean includeBody;
+ private Pattern[] patternsToMatch;
+ //maps FROM, TO, REFS, SUBJECT, etc. to the order they appear in patternsToMatch. See MailToRecMapper
+ private Map<String, Integer> patternOrder;
+
+ //the regular expression to use for identifying quoted text.
+ private Pattern quotedTextPattern = DEFAULT_QUOTED_TEXT;
+
+ public File getInput() {
+ return input;
+ }
+
+ public void setInput(File input) {
+ this.input = input;
+ }
+
+ public String getOutputDir() {
+ return outputDir;
+ }
+
+ /**
+ * Sets the output directory where sequence files will be written.
+ */
+ public void setOutputDir(String outputDir) {
+ this.outputDir = outputDir;
+ }
+
+ public String getPrefix() {
+ return prefix;
+ }
+
+ /**
+ * Sets the prefix that is combined with the archive name and with message ids to create {@code SequenceFile} keys.
+ * @param prefix the prefix; commonly the name of the directory containing the mail archive
+ */
+ public void setPrefix(String prefix) {
+ this.prefix = prefix;
+ }
+
+ public int getChunkSize() {
+ return chunkSize;
+ }
+
+ /**
+ * Sets the size of each generated sequence file, in megabytes.
+ */
+ public void setChunkSize(int chunkSize) {
+ this.chunkSize = chunkSize;
+ }
+
+ public Charset getCharset() {
+ return charset;
+ }
+
+ /**
+ * Sets the encoding of the input
+ */
+ public void setCharset(Charset charset) {
+ this.charset = charset;
+ }
+
+ public String getSeparator() {
+ return separator;
+ }
+
+ /**
+ * Sets the separator to use in the output between metadata items (to, from, etc.).
+ */
+ public void setSeparator(String separator) {
+ this.separator = separator;
+ }
+
+ public String getBodySeparator() {
+ return bodySeparator;
+ }
+
+ /**
+ * Sets the separator to use in the output between lines in the body; the default is "\n".
+ */
+ public void setBodySeparator(String bodySeparator) {
+ this.bodySeparator = bodySeparator;
+ }
+
+ public boolean isIncludeBody() {
+ return includeBody;
+ }
+
+ /**
+ * Sets whether mail bodies are included in the output
+ */
+ public void setIncludeBody(boolean includeBody) {
+ this.includeBody = includeBody;
+ }
+
+ public Pattern[] getPatternsToMatch() {
+ return patternsToMatch;
+ }
+
+ /**
+ * Sets the list of patterns to be applied in the given order to extract metadata fields (to, from, subject, etc.)
+ * from the input
+ */
+ public void setPatternsToMatch(Pattern[] patternsToMatch) {
+ this.patternsToMatch = patternsToMatch;
+ }
+
+ public Map<String, Integer> getPatternOrder() {
+ return patternOrder;
+ }
+
+ public void setPatternOrder(Map<String, Integer> patternOrder) {
+ this.patternOrder = patternOrder;
+ }
+
+ /**
+ *
+ * @return true if we should strip out quoted email text
+ */
+ public boolean isStripQuotedText() {
+ return stripQuotedText;
+ }
+
+ /**
+ *
+ * Sets whether quoted text, such as lines starting with | or >, is stripped off.
+ */
+ public void setStripQuotedText(boolean stripQuotedText) {
+ this.stripQuotedText = stripQuotedText;
+ }
+
+ public Pattern getQuotedTextPattern() {
+ return quotedTextPattern;
+ }
+
+ /**
+ * Sets the {@link java.util.regex.Pattern} to use to identify lines that are quoted text. Default is | and >
+ * @see #setStripQuotedText(boolean)
+ */
+ public void setQuotedTextPattern(Pattern quotedTextPattern) {
+ this.quotedTextPattern = quotedTextPattern;
+ }
+}
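
A configuration sketch using the setters above; the path and values are illustrative:

    import java.io.File;
    import java.nio.charset.Charset;

    MailOptions options = new MailOptions();
    options.setInput(new File("/path/to/archive"));   // illustrative path
    options.setPrefix("asf-mail");
    options.setChunkSize(64);                         // 64 MB sequence files
    options.setCharset(Charset.forName("UTF-8"));
    options.setSeparator("\n");
    options.setIncludeBody(true);
    options.setStripQuotedText(true);                 // drop lines matching DEFAULT_QUOTED_TEXT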
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromMailArchivesMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromMailArchivesMapper.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromMailArchivesMapper.java
new file mode 100644
index 0000000..203e8fb
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromMailArchivesMapper.java
@@ -0,0 +1,244 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.text;
+
+import com.google.common.base.Joiner;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.BytesWritable;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.iterator.FileLineIterable;
+import org.apache.mahout.utils.email.MailOptions;
+import org.apache.mahout.utils.email.MailProcessor;
+
+import java.io.ByteArrayInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import static org.apache.mahout.text.SequenceFilesFromMailArchives.BODY_OPTION;
+import static org.apache.mahout.text.SequenceFilesFromMailArchives.BODY_SEPARATOR_OPTION;
+import static org.apache.mahout.text.SequenceFilesFromMailArchives.CHARSET_OPTION;
+import static org.apache.mahout.text.SequenceFilesFromMailArchives.CHUNK_SIZE_OPTION;
+import static org.apache.mahout.text.SequenceFilesFromMailArchives.FROM_OPTION;
+import static org.apache.mahout.text.SequenceFilesFromMailArchives.KEY_PREFIX_OPTION;
+import static org.apache.mahout.text.SequenceFilesFromMailArchives.QUOTED_REGEX_OPTION;
+import static org.apache.mahout.text.SequenceFilesFromMailArchives.REFERENCES_OPTION;
+import static org.apache.mahout.text.SequenceFilesFromMailArchives.SEPARATOR_OPTION;
+import static org.apache.mahout.text.SequenceFilesFromMailArchives.STRIP_QUOTED_OPTION;
+import static org.apache.mahout.text.SequenceFilesFromMailArchives.SUBJECT_OPTION;
+import static org.apache.mahout.text.SequenceFilesFromMailArchives.TO_OPTION;
+
+/**
+ * Map Class for the SequenceFilesFromMailArchives job
+ */
+public class SequenceFilesFromMailArchivesMapper extends Mapper<IntWritable, BytesWritable, Text, Text> {
+
+ private Text outKey = new Text();
+ private Text outValue = new Text();
+
+ private static final Pattern MESSAGE_START = Pattern.compile(
+ "^From \\S+@\\S.*\\d{4}$", Pattern.CASE_INSENSITIVE);
+ private static final Pattern MESSAGE_ID_PREFIX = Pattern.compile(
+ "^message-id: <(.*)>$", Pattern.CASE_INSENSITIVE);
+
+ private MailOptions options;
+
+ @Override
+ public void setup(Context context) throws IOException, InterruptedException {
+
+ Configuration configuration = context.getConfiguration();
+
+ // absorb all of the options into the MailOptions object
+ this.options = new MailOptions();
+
+ options.setPrefix(configuration.get(KEY_PREFIX_OPTION[1], ""));
+
+ if (!configuration.get(CHUNK_SIZE_OPTION[0], "").equals("")) {
+ options.setChunkSize(configuration.getInt(CHUNK_SIZE_OPTION[0], 64));
+ }
+
+ if (!configuration.get(CHARSET_OPTION[0], "").equals("")) {
+ Charset charset = Charset.forName(configuration.get(CHARSET_OPTION[0], "UTF-8"));
+ options.setCharset(charset);
+ } else {
+ Charset charset = Charset.forName("UTF-8");
+ options.setCharset(charset);
+ }
+
+ List<Pattern> patterns = Lists.newArrayListWithCapacity(5);
+ // patternOrder is used downstream so that we know what order the text is
+ // in, instead of encoding it in the string, which would require more
+ // processing later to strip it out before feature selection.
+ Map<String, Integer> patternOrder = Maps.newHashMap();
+ int order = 0;
+ if (!configuration.get(FROM_OPTION[1], "").equals("")) {
+ patterns.add(MailProcessor.FROM_PREFIX);
+ patternOrder.put(MailOptions.FROM, order++);
+ }
+
+ if (!configuration.get(TO_OPTION[1], "").equals("")) {
+ patterns.add(MailProcessor.TO_PREFIX);
+ patternOrder.put(MailOptions.TO, order++);
+ }
+
+ if (!configuration.get(REFERENCES_OPTION[1], "").equals("")) {
+ patterns.add(MailProcessor.REFS_PREFIX);
+ patternOrder.put(MailOptions.REFS, order++);
+ }
+
+ if (!configuration.get(SUBJECT_OPTION[1], "").equals("")) {
+ patterns.add(MailProcessor.SUBJECT_PREFIX);
+ patternOrder.put(MailOptions.SUBJECT, order++);
+ }
+
+ options.setStripQuotedText(configuration.getBoolean(STRIP_QUOTED_OPTION[1], false));
+
+ options.setPatternsToMatch(patterns.toArray(new Pattern[patterns.size()]));
+ options.setPatternOrder(patternOrder);
+
+ options.setIncludeBody(configuration.getBoolean(BODY_OPTION[1], false));
+
+ options.setSeparator("\n");
+ if (!configuration.get(SEPARATOR_OPTION[1], "").equals("")) {
+ options.setSeparator(configuration.get(SEPARATOR_OPTION[1], ""));
+ }
+ if (!configuration.get(BODY_SEPARATOR_OPTION[1], "").equals("")) {
+ options.setBodySeparator(configuration.get(BODY_SEPARATOR_OPTION[1], ""));
+ }
+ if (!configuration.get(QUOTED_REGEX_OPTION[1], "").equals("")) {
+ options.setQuotedTextPattern(Pattern.compile(configuration.get(QUOTED_REGEX_OPTION[1], "")));
+ }
+
+ }
+
+ public long parseMailboxLineByLine(String filename, InputStream mailBoxInputStream, Context context)
+ throws IOException, InterruptedException {
+ long messageCount = 0;
+ try {
+ StringBuilder contents = new StringBuilder();
+ StringBuilder body = new StringBuilder();
+ Matcher messageIdMatcher = MESSAGE_ID_PREFIX.matcher("");
+ Matcher messageBoundaryMatcher = MESSAGE_START.matcher("");
+ String[] patternResults = new String[options.getPatternsToMatch().length];
+ Matcher[] matches = new Matcher[options.getPatternsToMatch().length];
+ for (int i = 0; i < matches.length; i++) {
+ matches[i] = options.getPatternsToMatch()[i].matcher("");
+ }
+
+ String messageId = null;
+ boolean inBody = false;
+ Pattern quotedTextPattern = options.getQuotedTextPattern();
+
+ for (String nextLine : new FileLineIterable(mailBoxInputStream, options.getCharset(), false, filename)) {
+ if (!options.isStripQuotedText() || !quotedTextPattern.matcher(nextLine).find()) {
+ for (int i = 0; i < matches.length; i++) {
+ Matcher matcher = matches[i];
+ matcher.reset(nextLine);
+ if (matcher.matches()) {
+ patternResults[i] = matcher.group(1);
+ }
+ }
+
+ // only start appending body content after we've seen a message ID
+ if (messageId != null) {
+ // first, see if we hit the end of the message
+ messageBoundaryMatcher.reset(nextLine);
+ if (messageBoundaryMatcher.matches()) {
+ // done parsing this message ... write it out
+ String key = generateKey(filename, options.getPrefix(), messageId);
+ // if this ordering changes, then also change
+ // FromEmailToDictionaryMapper
+ writeContent(options.getSeparator(), contents, body, patternResults);
+
+ this.outKey.set(key);
+ this.outValue.set(contents.toString());
+ context.write(this.outKey, this.outValue);
+ contents.setLength(0); // reset the buffer
+ body.setLength(0);
+ messageId = null;
+ inBody = false;
+ } else {
+ if (inBody && options.isIncludeBody()) {
+ if (!nextLine.isEmpty()) {
+ body.append(nextLine).append(options.getBodySeparator());
+ }
+ } else {
+ // first empty line we see after reading the message Id
+ // indicates that we are in the body ...
+ inBody = nextLine.isEmpty();
+ }
+ }
+ } else {
+ if (nextLine.length() > 14) {
+ messageIdMatcher.reset(nextLine);
+ if (messageIdMatcher.matches()) {
+ messageId = messageIdMatcher.group(1);
+ ++messageCount;
+ }
+ }
+ }
+ }
+ }
+ // write the last message in the file if available
+ if (messageId != null) {
+ String key = generateKey(filename, options.getPrefix(), messageId);
+ writeContent(options.getSeparator(), contents, body, patternResults);
+ this.outKey.set(key);
+ this.outValue.set(contents.toString());
+ context.write(this.outKey, this.outValue);
+ contents.setLength(0); // reset the buffer
+ }
+ } catch (FileNotFoundException ignored) {
+
+ }
+ return messageCount;
+ }
+
+ protected static String generateKey(String mboxFilename, String prefix, String messageId) {
+ return Joiner.on(Path.SEPARATOR).join(Lists.newArrayList(prefix, mboxFilename, messageId).iterator());
+ }
+
+ private static void writeContent(String separator, StringBuilder contents, CharSequence body, String[] matches) {
+ String matchesString = Joiner.on(separator).useForNull("").join(Arrays.asList(matches).iterator());
+ contents.append(matchesString).append(separator).append(body);
+ }
+
+ public void map(IntWritable key, BytesWritable value, Context context)
+ throws IOException, InterruptedException {
+ Configuration configuration = context.getConfiguration();
+ Path filePath = ((CombineFileSplit) context.getInputSplit()).getPath(key.get());
+ String relativeFilePath = HadoopUtil.calcRelativeFilePath(configuration, filePath);
+ ByteArrayInputStream is = new ByteArrayInputStream(value.getBytes());
+ parseMailboxLineByLine(relativeFilePath, is, context);
+ }
+}
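
For reference, generateKey() above joins the prefix, the mbox file's relative
path, and the extracted message ID with the path separator. A minimal sketch of
the resulting key layout (all argument values are hypothetical, not taken from
the patch):

    // Illustrative only: the arguments below are made-up values.
    String key = SequenceFilesFromMailArchivesMapper.generateKey(
        "asf/mahout/dev.mbox",       // relative mbox path (assumed)
        "TEST",                      // key prefix (assumed)
        "12345@example.org");        // extracted Message-ID (assumed)
    // key == "TEST/asf/mahout/dev.mbox/12345@example.org"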

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/TextParagraphSplittingJob.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/TextParagraphSplittingJob.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/TextParagraphSplittingJob.java
new file mode 100644
index 0000000..cacfd22
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/TextParagraphSplittingJob.java
@@ -0,0 +1,73 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.text;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.common.AbstractJob;
+
+import java.io.IOException;
+
+public class TextParagraphSplittingJob extends AbstractJob {
+
+ @Override
+ public int run(String[] strings) throws Exception {
+ Configuration originalConf = getConf();
+ Job job = prepareJob(new Path(originalConf.get("mapred.input.dir")),
+ new Path(originalConf.get("mapred.output.dir")),
+ SequenceFileInputFormat.class,
+ SplitMap.class,
+ Text.class,
+ Text.class,
+ Reducer.class,
+ Text.class,
+ Text.class,
+ SequenceFileOutputFormat.class);
+ job.setNumReduceTasks(0);
+ boolean succeeded = job.waitForCompletion(true);
+ return succeeded ? 0 : -1;
+ }
+
+ public static class SplitMap extends Mapper<Text,Text,Text,Text> {
+
+ @Override
+ protected void map(Text key, Text text, Context context) throws IOException, InterruptedException {
+ Text outText = new Text();
+ int loc = 0;
+ while (loc >= 0 && loc < text.getLength()) {
+ int nextLoc = text.find("\n\n", loc + 1);
+ if (nextLoc > 0) {
+ outText.set(text.getBytes(), loc, nextLoc - loc);
+ context.write(key, outText);
+ }
+ loc = nextLoc;
+ }
+ }
+ }
+
+ public static void main(String[] args) throws Exception {
+ ToolRunner.run(new TextParagraphSplittingJob(), args);
+ }
+}
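
SplitMap above emits one record per blank-line-delimited paragraph, but only
when another "\n\n" delimiter is found ahead; any text after the final
delimiter is silently dropped. A small standalone sketch of that behavior
(hypothetical input, assuming Hadoop's Text is on the classpath):

    Text t = new Text("A.\n\nB.\n\nC.");
    int loc = 0;
    while (loc >= 0 && loc < t.getLength()) {
      int next = t.find("\n\n", loc + 1);
      if (next > 0) {
        Text out = new Text();
        out.set(t.getBytes(), loc, next - loc);
        System.out.println(out);   // prints "A." and then "\n\nB."
      }
      loc = next;
    }
    // "C." is never emitted: find() returns -1 and the loop exits.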

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/WholeFileRecordReader.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/WholeFileRecordReader.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/WholeFileRecordReader.java
new file mode 100644
index 0000000..b8441b7
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/WholeFileRecordReader.java
@@ -0,0 +1,125 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.text;
+
+import java.io.IOException;
+
+import org.apache.commons.lang3.StringUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathFilter;
+import org.apache.hadoop.io.BytesWritable;
+import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.RecordReader;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
+import org.apache.hadoop.mapreduce.lib.input.FileSplit;
+
+import static org.apache.mahout.text.SequenceFilesFromDirectory.FILE_FILTER_CLASS_OPTION;
+
+/**
+ * RecordReader used with the MultipleTextFileInputFormat class to read full files as
+ * k/v pairs and groups of files as single input splits.
+ */
+public class WholeFileRecordReader extends RecordReader<IntWritable, BytesWritable> {
+
+ private FileSplit fileSplit;
+ private boolean processed = false;
+ private Configuration configuration;
+ private BytesWritable value = new BytesWritable();
+ private IntWritable index;
+ private String fileFilterClassName = null;
+ private PathFilter pathFilter = null;
+
+ public WholeFileRecordReader(CombineFileSplit fileSplit, TaskAttemptContext taskAttemptContext, Integer idx)
+ throws IOException {
+ this.fileSplit = new FileSplit(fileSplit.getPath(idx), fileSplit.getOffset(idx),
+ fileSplit.getLength(idx), fileSplit.getLocations());
+ this.configuration = taskAttemptContext.getConfiguration();
+ this.index = new IntWritable(idx);
+ this.fileFilterClassName = this.configuration.get(FILE_FILTER_CLASS_OPTION[0]);
+ }
+
+ @Override
+ public IntWritable getCurrentKey() {
+ return index;
+ }
+
+ @Override
+ public BytesWritable getCurrentValue() {
+ return value;
+ }
+
+ @Override
+ public float getProgress() throws IOException {
+ return processed ? 1.0f : 0.0f;
+ }
+
+ @Override
+ public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext)
+ throws IOException, InterruptedException {
+ if (!StringUtils.isBlank(fileFilterClassName) &&
+ !PrefixAdditionFilter.class.getName().equals(fileFilterClassName)) {
+ try {
+ pathFilter = (PathFilter) Class.forName(fileFilterClassName).newInstance();
+ } catch (ClassNotFoundException | InstantiationException | IllegalAccessException e) {
+ throw new IllegalStateException(e);
+ }
+ }
+ }
+
+ @Override
+ public boolean nextKeyValue() throws IOException {
+ if (!processed) {
+ byte[] contents = new byte[(int) fileSplit.getLength()];
+ Path file = fileSplit.getPath();
+ FileSystem fs = file.getFileSystem(this.configuration);
+
+ if (!fs.isFile(file)) {
+ return false;
+ }
+
+ FileStatus[] fileStatuses;
+ if (pathFilter != null) {
+ fileStatuses = fs.listStatus(file, pathFilter);
+ } else {
+ fileStatuses = fs.listStatus(file);
+ }
+
+ if (fileStatuses.length == 1) {
+ try (FSDataInputStream in = fs.open(fileStatuses[0].getPath())) {
+ IOUtils.readFully(in, contents, 0, contents.length);
+ value.setCapacity(contents.length);
+ value.set(contents, 0, contents.length);
+ }
+ processed = true;
+ return true;
+ }
+ }
+ return false;
+ }
+
+ @Override
+ public void close() throws IOException {
+ }
+}
\ No newline at end of file
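
The (CombineFileSplit, TaskAttemptContext, Integer) constructor above matches
the reflection contract expected by Hadoop's CombineFileRecordReader, which is
presumably how MultipleTextFileInputFormat (not part of this hunk) wires the
reader up. A hedged sketch of such an input format:

    import java.io.IOException;
    import org.apache.hadoop.io.BytesWritable;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.mapreduce.InputSplit;
    import org.apache.hadoop.mapreduce.RecordReader;
    import org.apache.hadoop.mapreduce.TaskAttemptContext;
    import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;
    import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader;
    import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;

    // Sketch only: the real MultipleTextFileInputFormat may differ.
    public class MultipleTextFileInputFormatSketch
        extends CombineFileInputFormat<IntWritable, BytesWritable> {
      @Override
      public RecordReader<IntWritable, BytesWritable> createRecordReader(
          InputSplit split, TaskAttemptContext context) throws IOException {
        // One WholeFileRecordReader is instantiated per file in the combined split.
        return new CombineFileRecordReader<>(
            (CombineFileSplit) split, context, WholeFileRecordReader.class);
      }
    }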

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/WikipediaToSequenceFile.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/WikipediaToSequenceFile.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/WikipediaToSequenceFile.java
new file mode 100644
index 0000000..bed4640
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/WikipediaToSequenceFile.java
@@ -0,0 +1,210 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.text;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Locale;
+import java.util.Set;
+
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.OptionException;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.DefaultStringifier;
+import org.apache.hadoop.io.Stringifier;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.hadoop.util.GenericsUtil;
+import org.apache.mahout.common.CommandLineUtil;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.common.iterator.FileLineIterable;
+import org.apache.mahout.text.wikipedia.WikipediaMapper;
+import org.apache.mahout.text.wikipedia.XmlInputFormat;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Create and run the Wikipedia-to-SequenceFile conversion.
+ */
+public final class WikipediaToSequenceFile {
+
+ private static final Logger log = LoggerFactory.getLogger(WikipediaToSequenceFile.class);
+
+ private WikipediaToSequenceFile() { }
+
+ /**
+ * Takes in two arguments:
+ * <ol>
+ * <li>The input {@link org.apache.hadoop.fs.Path} where the input documents live</li>
+ * <li>The output {@link org.apache.hadoop.fs.Path} where to write the documents as a
+ * {@link org.apache.hadoop.io.SequenceFile}</li>
+ * </ol>
+ */
+ public static void main(String[] args) throws IOException {
+ DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
+ ArgumentBuilder abuilder = new ArgumentBuilder();
+ GroupBuilder gbuilder = new GroupBuilder();
+
+ Option dirInputPathOpt = DefaultOptionCreator.inputOption().create();
+
+ Option dirOutputPathOpt = DefaultOptionCreator.outputOption().create();
+
+ Option categoriesOpt = obuilder.withLongName("categories").withArgument(
+ abuilder.withName("categories").withMinimum(1).withMaximum(1).create()).withDescription(
+ "Location of the categories file. One entry per line. "
+ + "Will be used to make a string match in Wikipedia Category field").withShortName("c").create();
+
+ Option exactMatchOpt = obuilder.withLongName("exactMatch").withDescription(
+ "If set, then the category name must exactly match the "
+ + "entry in the categories file. Default is false").withShortName("e").create();
+
+ Option allOpt = obuilder.withLongName("all")
+ .withDescription("If set, Select all files. Default is false").withShortName("all").create();
+
+ Option removeLabelOpt = obuilder.withLongName("removeLabels")
+ .withDescription("If set, remove [[Category:labels]] from document text after extracting label."
+ + "Default is false").withShortName("rl").create();
+
+ Option helpOpt = DefaultOptionCreator.helpOption();
+
+ Group group = gbuilder.withName("Options").withOption(categoriesOpt).withOption(dirInputPathOpt)
+ .withOption(dirOutputPathOpt).withOption(exactMatchOpt).withOption(allOpt).withOption(helpOpt)
+ .withOption(removeLabelOpt).create();
+
+ Parser parser = new Parser();
+ parser.setGroup(group);
+ parser.setHelpOption(helpOpt);
+ try {
+ CommandLine cmdLine = parser.parse(args);
+ if (cmdLine.hasOption(helpOpt)) {
+ CommandLineUtil.printHelp(group);
+ return;
+ }
+
+ String inputPath = (String) cmdLine.getValue(dirInputPathOpt);
+ String outputPath = (String) cmdLine.getValue(dirOutputPathOpt);
+
+ String catFile = "";
+ if (cmdLine.hasOption(categoriesOpt)) {
+ catFile = (String) cmdLine.getValue(categoriesOpt);
+ }
+
+ boolean all = false;
+ if (cmdLine.hasOption(allOpt)) {
+ all = true;
+ }
+
+ boolean removeLabels = false;
+ if (cmdLine.hasOption(removeLabelOpt)) {
+ removeLabels = true;
+ }
+
+ runJob(inputPath, outputPath, catFile, cmdLine.hasOption(exactMatchOpt), all, removeLabels);
+ } catch (OptionException | InterruptedException | ClassNotFoundException e) {
+ log.error("Exception", e);
+ CommandLineUtil.printHelp(group);
+ }
+ }
+
+ /**
+ * Run the job
+ *
+ * @param input
+ * the input pathname String
+ * @param output
+ * the output pathname String
+ * @param catFile
+ * the file containing the Wikipedia categories
+ * @param exactMatchOnly
+ * if true, then the Wikipedia category must match exactly instead of simply containing the
+ * category string
+ * @param all
+ * if true select all categories
+ * @param removeLabels
+ * if true remove Category labels from document text after extracting.
+ *
+ */
+ public static void runJob(String input,
+ String output,
+ String catFile,
+ boolean exactMatchOnly,
+ boolean all,
+ boolean removeLabels) throws IOException, InterruptedException, ClassNotFoundException {
+ Configuration conf = new Configuration();
+ conf.set("xmlinput.start", "<page>");
+ conf.set("xmlinput.end", "</page>");
+ conf.setBoolean("exact.match.only", exactMatchOnly);
+ conf.setBoolean("all.files", all);
+ conf.setBoolean("remove.labels", removeLabels);
+ conf.set("io.serializations",
+ "org.apache.hadoop.io.serializer.JavaSerialization,"
+ + "org.apache.hadoop.io.serializer.WritableSerialization");
+
+ Set<String> categories = new HashSet<>();
+ if (!catFile.isEmpty()) {
+ for (String line : new FileLineIterable(new File(catFile))) {
+ categories.add(line.trim().toLowerCase(Locale.ENGLISH));
+ }
+ }
+
+ Stringifier<Set<String>> setStringifier =
+ new DefaultStringifier<>(conf, GenericsUtil.getClass(categories));
+
+ String categoriesStr = setStringifier.toString(categories);
+ conf.set("wikipedia.categories", categoriesStr);
+
+ Job job = new Job(conf);
+ log.info("Input: {} Out: {} Categories: {} All Files: {}", input, output, catFile, all);
+ job.setOutputKeyClass(Text.class);
+ job.setOutputValueClass(Text.class);
+ FileInputFormat.setInputPaths(job, new Path(input));
+ Path outPath = new Path(output);
+ FileOutputFormat.setOutputPath(job, outPath);
+ job.setMapperClass(WikipediaMapper.class);
+ job.setInputFormatClass(XmlInputFormat.class);
+ job.setReducerClass(Reducer.class);
+ job.setOutputFormatClass(SequenceFileOutputFormat.class);
+ job.setJarByClass(WikipediaToSequenceFile.class);
+
+ /*
+ * conf.set("mapred.compress.map.output", "true"); conf.set("mapred.map.output.compression.type",
+ * "BLOCK"); conf.set("mapred.output.compress", "true"); conf.set("mapred.output.compression.type",
+ * "BLOCK"); conf.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
+ */
+ HadoopUtil.delete(conf, outPath);
+
+ boolean succeeded = job.waitForCompletion(true);
+ if (!succeeded) {
+ throw new IllegalStateException("Job failed!");
+ }
+
+ }
+}
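
Assuming the standard -i/--input and -o/--output options created by
DefaultOptionCreator, an invocation might look like the following (jar name
and paths are placeholders, not taken from the patch):

    bin/hadoop jar mahout-mr-integration.jar \
      org.apache.mahout.text.WikipediaToSequenceFile \
      -i /wikipedia/chunks -o /wikipedia/seqfiles \
      -c /local/categories.txt -e -rl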

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaAnalyzer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaAnalyzer.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaAnalyzer.java
new file mode 100644
index 0000000..d50323d
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaAnalyzer.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.text.wikipedia;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.core.StopAnalyzer;
+import org.apache.lucene.analysis.core.StopFilter;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer;
+
+
+public class WikipediaAnalyzer extends StopwordAnalyzerBase {
+
+ public WikipediaAnalyzer() {
+ super(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
+ }
+
+ public WikipediaAnalyzer(CharArraySet stopSet) {
+ super(stopSet);
+ }
+
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer tokenizer = new WikipediaTokenizer();
+ TokenStream result = new StandardFilter(tokenizer);
+ result = new LowerCaseFilter(result);
+ result = new StopFilter(result, getStopwordSet());
+ return new TokenStreamComponents(tokenizer, result);
+ }
+}
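
The analyzer can be exercised standalone in the same way
WikipediaDatasetCreatorMapper consumes it further down: it yields lower-cased
tokens with English stop words removed. A minimal sketch (hypothetical input
text, assuming the obvious Lucene imports):

    static void dumpTokens() throws IOException {
      Analyzer analyzer = new WikipediaAnalyzer();
      try (TokenStream stream =
               analyzer.tokenStream("body", new StringReader("The [[Category:Test]] page"))) {
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
          System.out.println(term);  // e.g. "test", "page"; "the" is a stop word
        }
        stream.end();
      }
    }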

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaDatasetCreatorDriver.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaDatasetCreatorDriver.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaDatasetCreatorDriver.java
new file mode 100644
index 0000000..8214407
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaDatasetCreatorDriver.java
@@ -0,0 +1,190 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.text.wikipedia;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Locale;
+import java.util.Set;
+
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.OptionException;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.DefaultStringifier;
+import org.apache.hadoop.io.Stringifier;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
+import org.apache.hadoop.util.GenericsUtil;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.mahout.common.ClassUtils;
+import org.apache.mahout.common.CommandLineUtil;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.common.iterator.FileLineIterable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Create and run the Wikipedia Dataset Creator.
+ */
+public final class WikipediaDatasetCreatorDriver {
+ private static final Logger log = LoggerFactory.getLogger(WikipediaDatasetCreatorDriver.class);
+
+ private WikipediaDatasetCreatorDriver() { }
+
+ /**
+ * Takes in two arguments:
+ * <ol>
+ * <li>The input {@link org.apache.hadoop.fs.Path} where the input documents live</li>
+ * <li>The output {@link org.apache.hadoop.fs.Path} where the generated dataset will be
+ * written</li>
+ * </ol>
+ */
+ public static void main(String[] args) throws IOException, InterruptedException {
+ DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
+ ArgumentBuilder abuilder = new ArgumentBuilder();
+ GroupBuilder gbuilder = new GroupBuilder();
+
+ Option dirInputPathOpt = DefaultOptionCreator.inputOption().create();
+
+ Option dirOutputPathOpt = DefaultOptionCreator.outputOption().create();
+
+ Option categoriesOpt = obuilder.withLongName("categories").withRequired(true).withArgument(
+ abuilder.withName("categories").withMinimum(1).withMaximum(1).create()).withDescription(
+ "Location of the categories file. One entry per line. "
+ + "Will be used to make a string match in Wikipedia Category field").withShortName("c").create();
+
+ Option exactMatchOpt = obuilder.withLongName("exactMatch").withDescription(
+ "If set, then the category name must exactly match the "
+ + "entry in the categories file. Default is false").withShortName("e").create();
+ Option analyzerOpt = obuilder.withLongName("analyzer").withRequired(false).withArgument(
+ abuilder.withName("analyzer").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The analyzer to use, must have a no argument constructor").withShortName("a").create();
+ Option helpOpt = DefaultOptionCreator.helpOption();
+
+ Group group = gbuilder.withName("Options").withOption(categoriesOpt).withOption(dirInputPathOpt)
+ .withOption(dirOutputPathOpt).withOption(exactMatchOpt).withOption(analyzerOpt).withOption(helpOpt)
+ .create();
+
+ Parser parser = new Parser();
+ parser.setGroup(group);
+ try {
+ CommandLine cmdLine = parser.parse(args);
+ if (cmdLine.hasOption(helpOpt)) {
+ CommandLineUtil.printHelp(group);
+ return;
+ }
+
+ String inputPath = (String) cmdLine.getValue(dirInputPathOpt);
+ String outputPath = (String) cmdLine.getValue(dirOutputPathOpt);
+ String catFile = (String) cmdLine.getValue(categoriesOpt);
+ Class<? extends Analyzer> analyzerClass = WikipediaAnalyzer.class;
+ if (cmdLine.hasOption(analyzerOpt)) {
+ String className = cmdLine.getValue(analyzerOpt).toString();
+ analyzerClass = Class.forName(className).asSubclass(Analyzer.class);
+ // try instantiating it, b/c there isn't any point in setting it if
+ // you can't instantiate it
+ ClassUtils.instantiateAs(analyzerClass, Analyzer.class);
+ }
+ runJob(inputPath, outputPath, catFile, cmdLine.hasOption(exactMatchOpt),
+ analyzerClass);
+ } catch (OptionException e) {
+ log.error("Exception", e);
+ CommandLineUtil.printHelp(group);
+ } catch (ClassNotFoundException e) {
+ log.error("Exception", e);
+ CommandLineUtil.printHelp(group);
+ }
+ }
+
+ /**
+ * Run the job
+ *
+ * @param input
+ * the input pathname String
+ * @param output
+ * the output pathname String
+ * @param catFile
+ * the file containing the Wikipedia categories
+ * @param exactMatchOnly
+ * if true, then the Wikipedia category must match exactly instead of simply containing the
+ * category string
+ */
+ public static void runJob(String input,
+ String output,
+ String catFile,
+ boolean exactMatchOnly,
+ Class<? extends Analyzer> analyzerClass)
+ throws IOException, InterruptedException, ClassNotFoundException {
+ Configuration conf = new Configuration();
+ conf.set("key.value.separator.in.input.line", " ");
+ conf.set("xmlinput.start", "<page>");
+ conf.set("xmlinput.end", "</page>");
+ conf.setBoolean("exact.match.only", exactMatchOnly);
+ conf.set("analyzer.class", analyzerClass.getName());
+ conf.set("io.serializations",
+ "org.apache.hadoop.io.serializer.JavaSerialization,"
+ + "org.apache.hadoop.io.serializer.WritableSerialization");
+ // Don't ever forget this. People should keep track of how Hadoop conf
+ // parameters can make or break a piece of code.
+
+ Set<String> categories = new HashSet<>();
+ for (String line : new FileLineIterable(new File(catFile))) {
+ categories.add(line.trim().toLowerCase(Locale.ENGLISH));
+ }
+
+ Stringifier<Set<String>> setStringifier =
+ new DefaultStringifier<>(conf, GenericsUtil.getClass(categories));
+
+ String categoriesStr = setStringifier.toString(categories);
+
+ conf.set("wikipedia.categories", categoriesStr);
+
+ Job job = new Job(conf);
+ log.info("Input: {} Out: {} Categories: {}", input, output, catFile);
+ job.setJarByClass(WikipediaDatasetCreatorDriver.class);
+ job.setOutputKeyClass(Text.class);
+ job.setOutputValueClass(Text.class);
+ job.setMapperClass(WikipediaDatasetCreatorMapper.class);
+ //TODO: job.setNumMapTasks(100);
+ job.setInputFormatClass(XmlInputFormat.class);
+ job.setReducerClass(WikipediaDatasetCreatorReducer.class);
+ job.setOutputFormatClass(TextOutputFormat.class);
+
+ FileInputFormat.setInputPaths(job, new Path(input));
+ Path outPath = new Path(output);
+ FileOutputFormat.setOutputPath(job, outPath);
+ HadoopUtil.delete(conf, outPath);
+
+ boolean succeeded = job.waitForCompletion(true);
+ if (!succeeded) {
+ throw new IllegalStateException("Job failed!");
+ }
+ }
+}
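
Both Wikipedia drivers pass the category set to their mappers through the
Configuration via DefaultStringifier, which is why "io.serializations" must
include JavaSerialization. A round-trip sketch under that assumption
(checked exceptions pushed into the throws clause):

    static Set<String> roundTrip() throws IOException {
      Configuration conf = new Configuration();
      conf.set("io.serializations",
          "org.apache.hadoop.io.serializer.JavaSerialization,"
          + "org.apache.hadoop.io.serializer.WritableSerialization");

      Set<String> categories = new HashSet<>();
      categories.add("machine learning");

      // Serialize the set into a Configuration property...
      DefaultStringifier<Set<String>> stringifier =
          new DefaultStringifier<>(conf, GenericsUtil.getClass(categories));
      conf.set("wikipedia.categories", stringifier.toString(categories));

      // ...and recover it the way the mapper's setup() does.
      return stringifier.fromString(conf.get("wikipedia.categories"));
    }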

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaDatasetCreatorMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaDatasetCreatorMapper.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaDatasetCreatorMapper.java
new file mode 100644
index 0000000..50e5f37
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaDatasetCreatorMapper.java
@@ -0,0 +1,142 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.text.wikipedia;
+
+import com.google.common.io.Closeables;
+import org.apache.commons.lang3.StringEscapeUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.DefaultStringifier;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.util.GenericsUtil;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.mahout.common.ClassUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+/**
+ * Maps over the Wikipedia XML format and outputs all documents having a category listed in the input
+ * category file.
+ *
+ */
+public class WikipediaDatasetCreatorMapper extends Mapper<LongWritable, Text, Text, Text> {
+
+ private static final Logger log = LoggerFactory.getLogger(WikipediaDatasetCreatorMapper.class);
+
+ private static final Pattern SPACE_NON_ALPHA_PATTERN = Pattern.compile("[\\s\\W]");
+ private static final Pattern OPEN_TEXT_TAG_PATTERN = Pattern.compile("<text xml:space=\"preserve\">");
+ private static final Pattern CLOSE_TEXT_TAG_PATTERN = Pattern.compile("</text>");
+
+ private List<String> inputCategories;
+ private List<Pattern> inputCategoryPatterns;
+ private boolean exactMatchOnly;
+ private Analyzer analyzer;
+
+ @Override
+ protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
+ String document = value.toString();
+ document = StringEscapeUtils.unescapeHtml4(CLOSE_TEXT_TAG_PATTERN.matcher(
+ OPEN_TEXT_TAG_PATTERN.matcher(document).replaceFirst("")).replaceAll(""));
+ String catMatch = findMatchingCategory(document);
+ if (!"Unknown".equals(catMatch)) {
+ StringBuilder contents = new StringBuilder(1000);
+ TokenStream stream = analyzer.tokenStream(catMatch, new StringReader(document));
+ CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
+ stream.reset();
+ while (stream.incrementToken()) {
+ contents.append(termAtt.buffer(), 0, termAtt.length()).append(' ');
+ }
+ context.write(
+ new Text(SPACE_NON_ALPHA_PATTERN.matcher(catMatch).replaceAll("_")),
+ new Text(contents.toString()));
+ stream.end();
+ Closeables.close(stream, true);
+ }
+ }
+
+ @Override
+ protected void setup(Context context) throws IOException, InterruptedException {
+ super.setup(context);
+
+ Configuration conf = context.getConfiguration();
+
+ if (inputCategories == null) {
+ Set<String> newCategories = new HashSet<>();
+ DefaultStringifier<Set<String>> setStringifier =
+ new DefaultStringifier<>(conf, GenericsUtil.getClass(newCategories));
+ String categoriesStr = conf.get("wikipedia.categories", setStringifier.toString(newCategories));
+ Set<String> inputCategoriesSet = setStringifier.fromString(categoriesStr);
+ inputCategories = new ArrayList<>(inputCategoriesSet);
+ inputCategoryPatterns = new ArrayList<>(inputCategories.size());
+ for (String inputCategory : inputCategories) {
+ inputCategoryPatterns.add(Pattern.compile(".*\\b" + inputCategory + "\\b.*"));
+ }
+
+ }
+
+ exactMatchOnly = conf.getBoolean("exact.match.only", false);
+
+ if (analyzer == null) {
+ String analyzerStr = conf.get("analyzer.class", WikipediaAnalyzer.class.getName());
+ analyzer = ClassUtils.instantiateAs(analyzerStr, Analyzer.class);
+ }
+
+ log.info("Configure: Input Categories size: {} Exact Match: {} Analyzer: {}",
+ inputCategories.size(), exactMatchOnly, analyzer.getClass().getName());
+ }
+
+ private String findMatchingCategory(String document) {
+ int startIndex = 0;
+ int categoryIndex;
+ while ((categoryIndex = document.indexOf("[[Category:", startIndex)) != -1) {
+ categoryIndex += 11;
+ int endIndex = document.indexOf("]]", categoryIndex);
+ if (endIndex >= document.length() || endIndex < 0) {
+ break;
+ }
+ String category = document.substring(categoryIndex, endIndex).toLowerCase(Locale.ENGLISH).trim();
+ // categories.add(category.toLowerCase());
+ if (exactMatchOnly && inputCategories.contains(category)) {
+ return category;
+ }
+ if (!exactMatchOnly) {
+ for (int i = 0; i < inputCategories.size(); i++) {
+ String inputCategory = inputCategories.get(i);
+ Pattern inputCategoryPattern = inputCategoryPatterns.get(i);
+ if (inputCategoryPattern.matcher(category).matches()) { // inexact match with word boundary.
+ return inputCategory;
+ }
+ }
+ }
+ startIndex = endIndex;
+ }
+ return "Unknown";
+ }
+}
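
The two modes in findMatchingCategory() above behave differently: exact
matching requires the lower-cased category to appear verbatim in the input
list, while inexact matching applies the precompiled ".*\b<category>\b.*"
word-boundary patterns. For example (hypothetical values):

    // A page tagged [[Category:Machine Learning]] yields the extracted,
    // lower-cased category "machine learning".
    Pattern p = Pattern.compile(".*\\blearning\\b.*");
    p.matcher("machine learning").matches();  // true: inexact match on "learning"
    p.matcher("learningcurve").matches();     // false: \b blocks substring hits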

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaDatasetCreatorReducer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaDatasetCreatorReducer.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaDatasetCreatorReducer.java
new file mode 100644
index 0000000..bf921fc
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaDatasetCreatorReducer.java
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.text.wikipedia;
+
+import java.io.IOException;
+
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Reducer;
+
+/**
+ * Can also be used as a local Combiner
+ */
+public class WikipediaDatasetCreatorReducer extends Reducer<Text, Text, Text, Text> {
+
+ @Override
+ protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
+ // Key is label,word, value is the number of times we've seen this label
+ // word per local node. Output is the same
+ for (Text value : values) {
+ context.write(key, value);
+ }
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaMapper.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaMapper.java
new file mode 100644
index 0000000..abd3a04
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaMapper.java
@@ -0,0 +1,179 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.text.wikipedia;
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Locale;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.lang3.StringEscapeUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.DefaultStringifier;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.util.GenericsUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Maps over the Wikipedia XML format and outputs all documents having a category listed in the input
+ * category file.
+ *
+ */
+public class WikipediaMapper extends Mapper<LongWritable, Text, Text, Text> {
+
+ private static final Logger log = LoggerFactory.getLogger(WikipediaMapper.class);
+
+ private static final Pattern SPACE_NON_ALPHA_PATTERN = Pattern.compile("[\\s]");
+
+ private static final String START_DOC = "<text xml:space=\"preserve\">";
+
+ private static final String END_DOC = "</text>";
+
+ private static final Pattern TITLE = Pattern.compile("<title>(.*)<\\/title>");
+
+ private static final String REDIRECT = "<redirect />";
+
+ private Set<String> inputCategories;
+
+ private boolean exactMatchOnly;
+
+ private boolean all;
+
+ private boolean removeLabels;
+
+ @Override
+ protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
+
+ String content = value.toString();
+ if (content.contains(REDIRECT)) {
+ return;
+ }
+ String document;
+ String title;
+ try {
+ document = getDocument(content);
+ title = getTitle(content);
+ } catch (RuntimeException e) {
+ // TODO: reporter.getCounter("Wikipedia", "Parse errors").increment(1);
+ return;
+ }
+
+ String catMatch = findMatchingCategory(document);
+ if (!all) {
+ if ("Unknown".equals(catMatch)) {
+ return;
+ }
+ }
+
+ document = StringEscapeUtils.unescapeHtml4(document);
+ if (removeLabels) {
+ document = removeCategoriesFromText(document);
+ // Reject documents with malformed tags
+ if (document == null) {
+ return;
+ }
+ }
+
+ // write out in Bayes input style: key: /Category/document_name
+ String category = "/" + catMatch.toLowerCase(Locale.ENGLISH) + "/" +
+ SPACE_NON_ALPHA_PATTERN.matcher(title).replaceAll("_");
+
+ context.write(new Text(category), new Text(document));
+ }
+
+ @Override
+ protected void setup(Context context) throws IOException, InterruptedException {
+ super.setup(context);
+ Configuration conf = context.getConfiguration();
+
+ Set<String> newCategories = new HashSet<>();
+ DefaultStringifier<Set<String>> setStringifier =
+ new DefaultStringifier<>(conf, GenericsUtil.getClass(newCategories));
+
+ String categoriesStr = conf.get("wikipedia.categories");
+ inputCategories = setStringifier.fromString(categoriesStr);
+ exactMatchOnly = conf.getBoolean("exact.match.only", false);
+ all = conf.getBoolean("all.files", false);
+ removeLabels = conf.getBoolean("remove.labels", false);
+ log.info("Configure: Input Categories size: {} All: {} Exact Match: {} Remove Labels from Text: {}",
+ inputCategories.size(), all, exactMatchOnly, removeLabels);
+ }
+
+ private static String getDocument(String xml) {
+ int start = xml.indexOf(START_DOC) + START_DOC.length();
+ int end = xml.indexOf(END_DOC, start);
+ return xml.substring(start, end);
+ }
+
+ private static String getTitle(CharSequence xml) {
+ Matcher m = TITLE.matcher(xml);
+ return m.find() ? m.group(1) : "";
+ }
+
+ private String findMatchingCategory(String document) {
+ int startIndex = 0;
+ int categoryIndex;
+ while ((categoryIndex = document.indexOf("[[Category:", startIndex)) != -1) {
+ categoryIndex += 11;
+ int endIndex = document.indexOf("]]", categoryIndex);
+ if (endIndex >= document.length() || endIndex < 0) {
+ break;
+ }
+ String category = document.substring(categoryIndex, endIndex).toLowerCase(Locale.ENGLISH).trim();
+ if (exactMatchOnly && inputCategories.contains(category)) {
+ return category.toLowerCase(Locale.ENGLISH);
+ }
+ if (!exactMatchOnly) {
+ for (String inputCategory : inputCategories) {
+ if (category.contains(inputCategory)) { // we have an inexact match
+ return inputCategory.toLowerCase(Locale.ENGLISH);
+ }
+ }
+ }
+ startIndex = endIndex;
+ }
+ return "Unknown";
+ }
+
+ private String removeCategoriesFromText(String document) {
+ int startIndex = 0;
+ int categoryIndex;
+ try {
+ while ((categoryIndex = document.indexOf("[[Category:", startIndex)) != -1) {
+ int endIndex = document.indexOf("]]", categoryIndex);
+ if (endIndex >= document.length() || endIndex < 0) {
+ break;
+ }
+ document = document.replace(document.substring(categoryIndex, endIndex + 2), "");
+ if (categoryIndex < document.length()) {
+ startIndex = categoryIndex;
+ } else {
+ break;
+ }
+ }
+ } catch (StringIndexOutOfBoundsException e) {
+ return null;
+ }
+ return document;
+ }
+}
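
Output keys follow the Bayes input convention noted in map(): a slash, the
lower-cased matched category, another slash, and the title with whitespace
replaced by underscores. A small sketch (hypothetical article):

    String title = "Naive Bayes classifier";   // assumed page title
    String catMatch = "Statistics";            // assumed matched category
    String key = "/" + catMatch.toLowerCase(Locale.ENGLISH) + "/"
        + title.replaceAll("\\s", "_");
    // key == "/statistics/Naive_Bayes_classifier"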

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaXmlSplitter.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaXmlSplitter.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaXmlSplitter.java
new file mode 100644
index 0000000..fc065fe
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaXmlSplitter.java
@@ -0,0 +1,234 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.text.wikipedia;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.net.URI;
+import java.text.DecimalFormat;
+import java.text.NumberFormat;
+
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.OptionException;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.compress.BZip2Codec;
+import org.apache.hadoop.io.compress.CompressionCodec;
+import org.apache.mahout.common.CommandLineUtil;
+import org.apache.mahout.common.iterator.FileLineIterator;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * <p>The Bayes example package provides some helper classes for training the Naive Bayes classifier
+ * on the Twenty Newsgroups data. See {@code PrepareTwentyNewsgroups}
+ * for details on running the trainer and
+ * formatting the Twenty Newsgroups data properly for the training.</p>
+ *
+ * <p>The easiest way to prepare the data is to use the ant task in core/build.xml:</p>
+ *
+ * <p>{@code ant extract-20news-18828}</p>
+ *
+ * <p>This runs the arg line:</p>
+ *
+ * <p>{@code -p $\{working.dir\}/20news-18828/ -o $\{working.dir\}/20news-18828-collapse -a $\{analyzer\} -c UTF-8}</p>
+ *
+ * <p>To Run the Wikipedia examples (assumes you've built the Mahout Job jar):</p>
+ *
+ * <ol>
+ * <li>Download the Wikipedia Dataset. Use the Ant target: {@code ant enwiki-files}</li>
+ * <li>Chunk the data using the WikipediaXmlSplitter (from the Hadoop home):
+ * {@code bin/hadoop jar $MAHOUT_HOME/target/mahout-examples-0.x
+ * org.apache.mahout.text.wikipedia.WikipediaXmlSplitter
+ * -d $MAHOUT_HOME/examples/temp/enwiki-latest-pages-articles.xml
+ * -o $MAHOUT_HOME/examples/work/wikipedia/chunks/ -c 64}</li>
+ * </ol>
+ */
+public final class WikipediaXmlSplitter {
+
+ private static final Logger log = LoggerFactory.getLogger(WikipediaXmlSplitter.class);
+
+ private WikipediaXmlSplitter() { }
+
+ public static void main(String[] args) throws IOException {
+ DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
+ ArgumentBuilder abuilder = new ArgumentBuilder();
+ GroupBuilder gbuilder = new GroupBuilder();
+
+ Option dumpFileOpt = obuilder.withLongName("dumpFile").withRequired(true).withArgument(
+ abuilder.withName("dumpFile").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The path to the wikipedia dump file (.bz2 or uncompressed)").withShortName("d").create();
+
+ Option outputDirOpt = obuilder.withLongName("outputDir").withRequired(true).withArgument(
+ abuilder.withName("outputDir").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The output directory to place the splits in:\n"
+ + "local files:\n\t/var/data/wikipedia-xml-chunks or\n\tfile:///var/data/wikipedia-xml-chunks\n"
+ + "Hadoop DFS:\n\thdfs://wikipedia-xml-chunks\n"
+ + "AWS S3 (blocks):\n\ts3://bucket-name/wikipedia-xml-chunks\n"
+ + "AWS S3 (native files):\n\ts3n://bucket-name/wikipedia-xml-chunks\n")
+
+ .withShortName("o").create();
+
+ Option s3IdOpt = obuilder.withLongName("s3ID").withRequired(false).withArgument(
+ abuilder.withName("s3Id").withMinimum(1).withMaximum(1).create()).withDescription("Amazon S3 ID key")
+ .withShortName("i").create();
+ Option s3SecretOpt = obuilder.withLongName("s3Secret").withRequired(false).withArgument(
+ abuilder.withName("s3Secret").withMinimum(1).withMaximum(1).create()).withDescription(
+ "Amazon S3 secret key").withShortName("s").create();
+
+ Option chunkSizeOpt = obuilder.withLongName("chunkSize").withRequired(true).withArgument(
+ abuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The Size of the chunk, in megabytes").withShortName("c").create();
+ Option numChunksOpt = obuilder
+ .withLongName("numChunks")
+ .withRequired(false)
+ .withArgument(abuilder.withName("numChunks").withMinimum(1).withMaximum(1).create())
+ .withDescription(
+ "The maximum number of chunks to create. If specified, program will only create a subset of the chunks")
+ .withShortName("n").create();
+ Group group = gbuilder.withName("Options").withOption(dumpFileOpt).withOption(outputDirOpt).withOption(
+ chunkSizeOpt).withOption(numChunksOpt).withOption(s3IdOpt).withOption(s3SecretOpt).create();
+
+ Parser parser = new Parser();
+ parser.setGroup(group);
+ CommandLine cmdLine;
+ try {
+ cmdLine = parser.parse(args);
+ } catch (OptionException e) {
+ log.error("Error while parsing options", e);
+ CommandLineUtil.printHelp(group);
+ return;
+ }
+
+ Configuration conf = new Configuration();
+ String dumpFilePath = (String) cmdLine.getValue(dumpFileOpt);
+ String outputDirPath = (String) cmdLine.getValue(outputDirOpt);
+
+ if (cmdLine.hasOption(s3IdOpt)) {
+ String id = (String) cmdLine.getValue(s3IdOpt);
+ conf.set("fs.s3n.awsAccessKeyId", id);
+ conf.set("fs.s3.awsAccessKeyId", id);
+ }
+ if (cmdLine.hasOption(s3SecretOpt)) {
+ String secret = (String) cmdLine.getValue(s3SecretOpt);
+ conf.set("fs.s3n.awsSecretAccessKey", secret);
+ conf.set("fs.s3.awsSecretAccessKey", secret);
+ }
+ // do not compute crc file when using local FS
+ conf.set("fs.file.impl", "org.apache.hadoop.fs.RawLocalFileSystem");
+ FileSystem fs = FileSystem.get(URI.create(outputDirPath), conf);
+
+ int chunkSize = 1024 * 1024 * Integer.parseInt((String) cmdLine.getValue(chunkSizeOpt));
+
+ int numChunks = Integer.MAX_VALUE;
+ if (cmdLine.hasOption(numChunksOpt)) {
+ numChunks = Integer.parseInt((String) cmdLine.getValue(numChunksOpt));
+ }
+
+ String header = "<mediawiki xmlns=\"http://www.mediawiki.org/xml/export-0.3/\" "
+ + "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
+ + "xsi:schemaLocation=\"http://www.mediawiki.org/xml/export-0.3/ "
+ + "http://www.mediawiki.org/xml/export-0.3.xsd\" " + "version=\"0.3\" "
+ + "xml:lang=\"en\">\n" + " <siteinfo>\n" + "<sitename>Wikipedia</sitename>\n"
+ + " <base>http://en.wikipedia.org/wiki/Main_Page</base>\n"
+ + " <generator>MediaWiki 1.13alpha</generator>\n" + " <case>first-letter</case>\n"
+ + " <namespaces>\n" + " <namespace key=\"-2\">Media</namespace>\n"
+ + " <namespace key=\"-1\">Special</namespace>\n" + " <namespace key=\"0\" />\n"
+ + " <namespace key=\"1\">Talk</namespace>\n"
+ + " <namespace key=\"2\">User</namespace>\n"
+ + " <namespace key=\"3\">User talk</namespace>\n"
+ + " <namespace key=\"4\">Wikipedia</namespace>\n"
+ + " <namespace key=\"5\">Wikipedia talk</namespace>\n"
+ + " <namespace key=\"6\">Image</namespace>\n"
+ + " <namespace key=\"7\">Image talk</namespace>\n"
+ + " <namespace key=\"8\">MediaWiki</namespace>\n"
+ + " <namespace key=\"9\">MediaWiki talk</namespace>\n"
+ + " <namespace key=\"10\">Template</namespace>\n"
+ + " <namespace key=\"11\">Template talk</namespace>\n"
+ + " <namespace key=\"12\">Help</namespace>\n"
+ + " <namespace key=\"13\">Help talk</namespace>\n"
+ + " <namespace key=\"14\">Category</namespace>\n"
+ + " <namespace key=\"15\">Category talk</namespace>\n"
+ + " <namespace key=\"100\">Portal</namespace>\n"
+ + " <namespace key=\"101\">Portal talk</namespace>\n" + " </namespaces>\n"
+ + " </siteinfo>\n";
+
+ StringBuilder content = new StringBuilder();
+ content.append(header);
+ NumberFormat decimalFormatter = new DecimalFormat("0000");
+ File dumpFile = new File(dumpFilePath);
+
+ // If the specified input file does not exist, return immediately
+ if (!dumpFile.exists()) {
+ log.error("Input file path {} doesn't exist", dumpFilePath);
+ return;
+ }
+
+ FileLineIterator it;
+ if (dumpFilePath.endsWith(".bz2")) {
+ // default compression format from http://download.wikimedia.org
+ CompressionCodec codec = new BZip2Codec();
+ it = new FileLineIterator(codec.createInputStream(new FileInputStream(dumpFile)));
+ } else {
+ // assume the user has previously de-compressed the dump file
+ it = new FileLineIterator(dumpFile);
+ }
+ int fileNumber = 0;
+ while (it.hasNext()) {
+ String thisLine = it.next();
+ if (thisLine.trim().startsWith("<page>")) {
+ boolean end = false;
+ while (!thisLine.trim().startsWith("</page>")) {
+ content.append(thisLine).append('\n');
+ if (it.hasNext()) {
+ thisLine = it.next();
+ } else {
+ end = true;
+ break;
+ }
+ }
+ content.append(thisLine).append('\n');
+
+ if (content.length() > chunkSize || end) {
+ content.append("</mediawiki>");
+ fileNumber++;
+ String filename = outputDirPath + "/chunk-" + decimalFormatter.format(fileNumber) + ".xml";
+ try (BufferedWriter chunkWriter =
+ new BufferedWriter(new OutputStreamWriter(fs.create(new Path(filename)), "UTF-8"))) {
+ chunkWriter.write(content.toString(), 0, content.length());
+ }
+ if (fileNumber >= numChunks) {
+ break;
+ }
+ content = new StringBuilder();
+ content.append(header);
+ }
+ }
+ }
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/wikipedia/XmlInputFormat.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/wikipedia/XmlInputFormat.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/wikipedia/XmlInputFormat.java
new file mode 100644
index 0000000..afd350f
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/text/wikipedia/XmlInputFormat.java
@@ -0,0 +1,164 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.text.wikipedia;
+
+import com.google.common.io.Closeables;
+import org.apache.commons.io.Charsets;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.DataOutputBuffer;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.RecordReader;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.lib.input.FileSplit;
+import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+
+/**
+ * Reads records that are delimited by a specific begin/end tag.
+ */
+public class XmlInputFormat extends TextInputFormat {
+
+ private static final Logger log = LoggerFactory.getLogger(XmlInputFormat.class);
+
+ public static final String START_TAG_KEY = "xmlinput.start";
+ public static final String END_TAG_KEY = "xmlinput.end";
+
+ @Override
+ public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context) {
+ try {
+ return new XmlRecordReader((FileSplit) split, context.getConfiguration());
+ } catch (IOException ioe) {
+ log.warn("Error while creating XmlRecordReader", ioe);
+ return null;
+ }
+ }
+
+ /**
+ * RecordReader that scans an XML document and emits each block delimited by the
+ * configured start and end tags as one record.
+ */
+ public static class XmlRecordReader extends RecordReader<LongWritable, Text> {
+
+ private final byte[] startTag;
+ private final byte[] endTag;
+ private final long start;
+ private final long end;
+ private final FSDataInputStream fsin;
+ private final DataOutputBuffer buffer = new DataOutputBuffer();
+ private LongWritable currentKey;
+ private Text currentValue;
+
+ public XmlRecordReader(FileSplit split, Configuration conf) throws IOException {
+ startTag = conf.get(START_TAG_KEY).getBytes(Charsets.UTF_8);
+ endTag = conf.get(END_TAG_KEY).getBytes(Charsets.UTF_8);
+
+ // open the file and seek to the start of the split
+ start = split.getStart();
+ end = start + split.getLength();
+ Path file = split.getPath();
+ FileSystem fs = file.getFileSystem(conf);
+ fsin = fs.open(split.getPath());
+ fsin.seek(start);
+ }
+
+ private boolean next(LongWritable key, Text value) throws IOException {
+ if (fsin.getPos() < end && readUntilMatch(startTag, false)) {
+ try {
+ buffer.write(startTag);
+ if (readUntilMatch(endTag, true)) {
+ key.set(fsin.getPos());
+ value.set(buffer.getData(), 0, buffer.getLength());
+ return true;
+ }
+ } finally {
+ buffer.reset();
+ }
+ }
+ return false;
+ }
+
+ @Override
+ public void close() throws IOException {
+ Closeables.close(fsin, true);
+ }
+
+ @Override
+ public float getProgress() throws IOException {
+ return (fsin.getPos() - start) / (float) (end - start);
+ }
+
+ private boolean readUntilMatch(byte[] match, boolean withinBlock) throws IOException {
+ int i = 0;
+ while (true) {
+ int b = fsin.read();
+ // end of file:
+ if (b == -1) {
+ return false;
+ }
+ // save to buffer:
+ if (withinBlock) {
+ buffer.write(b);
+ }
+
+ // check if we're matching:
+ if (b == match[i]) {
+ i++;
+ if (i >= match.length) {
+ return true;
+ }
+ } else {
+ i = 0;
+ }
+ // see if we've passed the stop point:
+ if (!withinBlock && i == 0 && fsin.getPos() >= end) {
+ return false;
+ }
+ }
+ }
+
+ @Override
+ public LongWritable getCurrentKey() throws IOException, InterruptedException {
+ return currentKey;
+ }
+
+ @Override
+ public Text getCurrentValue() throws IOException, InterruptedException {
+ return currentValue;
+ }
+
+ @Override
+ public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
+ }
+
+ @Override
+ public boolean nextKeyValue() throws IOException, InterruptedException {
+ currentKey = new LongWritable();
+ currentValue = new Text();
+ return next(currentKey, currentValue);
+ }
+ }
+}
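As an aside, a minimal sketch of wiring this input format into a job against the <page> chunks produced by the splitter above; the job name and input path are hypothetical, and the imports are the standard new-API Hadoop classes:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

    Configuration conf = new Configuration();
    conf.set(XmlInputFormat.START_TAG_KEY, "<page>");
    conf.set(XmlInputFormat.END_TAG_KEY, "</page>");
    Job job = Job.getInstance(conf, "wikipedia-pages");      // hypothetical job name
    job.setInputFormatClass(XmlInputFormat.class);
    FileInputFormat.addInputPath(job, new Path("/chunks"));  // hypothetical input path
    // each map() call then receives one <page>...</page> block as its Text value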

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/Bump125.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/Bump125.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/Bump125.java
new file mode 100644
index 0000000..1c55090
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/Bump125.java
@@ -0,0 +1,62 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils;
+
+/**
+ * Helps with making nice intervals at arbitrary scale.
+ *
+ * One use case is where we are producing progress or error messages every time an incoming
+ * record is received. It is generally bad form to produce a message for <i>every</i> input
+ * so it is better to produce a message for each of the first 10 records, then every
+ * other record up to 20, then every 5 records up to 50, then every 10 records up to 100,
+ * more or less, with the pattern repeating scaled up by 100. The total number of messages
+ * scales with the log of the number of input lines, which is much more survivable than
+ * direct output, and because early records all get messages, we get indications early.
+ */
+public class Bump125 {
+ private static final int[] BUMPS = {1, 2, 5};
+
+ static int scale(double value, double base) {
+ double scale = value / base;
+ // scan for correct step
+ int i = 0;
+ while (i < BUMPS.length - 1 && BUMPS[i + 1] <= scale) {
+ i++;
+ }
+ return BUMPS[i];
+ }
+
+ static long base(double value) {
+ return Math.max(1, (long) Math.pow(10, (int) Math.floor(Math.log10(value))));
+ }
+
+ private long counter = 0;
+
+ public long increment() {
+ long delta;
+ if (counter >= 10) {
+ long base = base(counter / 4.0);
+ int scale = scale(counter / 4.0, base);
+ delta = base * scale;
+ } else {
+ delta = 1;
+ }
+ counter += delta;
+ return counter;
+ }
+}
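For reference, a short driver showing the progression increment() actually walks through (values traced from the logic above):

    Bump125 bump = new Bump125();
    for (int n = 0; n < 20; n++) {
      System.out.print(bump.increment() + " ");
    }
    // prints: 1 2 3 4 5 6 7 8 9 10 12 14 16 18 20 25 30 35 40 50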

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/MatrixDumper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/MatrixDumper.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/MatrixDumper.java
new file mode 100644
index 0000000..f63de83
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/MatrixDumper.java
@@ -0,0 +1,138 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.PrintStream;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.io.Charsets;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterator;
+import org.apache.mahout.math.Matrix;
+import org.apache.mahout.math.MatrixWritable;
+
+/**
+ * Export a Matrix in various text formats:
+ * * CSV file
+ *
+ * Input format: Hadoop SequenceFile with Text key and MatrixWritable value, 1 pair
+ * TODO:
+ * Needs a class for the key value; should not hard-code to Text.
+ * Options for row and column headers; stats software can be picky.
+ * Assumes only one matrix in a file.
+ */
+public final class MatrixDumper extends AbstractJob {
+
+ private MatrixDumper() { }
+
+ public static void main(String[] args) throws Exception {
+ ToolRunner.run(new MatrixDumper(), args);
+ }
+
+ @Override
+ public int run(String[] args) throws Exception {
+
+ addInputOption();
+ addOption("output", "o", "Output path", null); // AbstractJob output feature requires param
+ Map<String, List<String>> parsedArgs = parseArguments(args);
+ if (parsedArgs == null) {
+ return -1;
+ }
+ String outputFile = hasOption("output") ? getOption("output") : null;
+ exportCSV(getInputPath(), outputFile, false);
+ return 0;
+ }
+
+ private static void exportCSV(Path inputPath, String outputFile, boolean doLabels) throws IOException {
+ SequenceFileValueIterator<MatrixWritable> it =
+ new SequenceFileValueIterator<>(inputPath, true, new Configuration());
+ Matrix m = it.next().get();
+ it.close();
+ PrintStream ps = getPrintStream(outputFile);
+ String[] columnLabels = getLabels(m.numCols(), m.getColumnLabelBindings(), "col");
+ String[] rowLabels = getLabels(m.numRows(), m.getRowLabelBindings(), "row");
+ if (doLabels) {
+ ps.print("rowid,");
+ ps.print(columnLabels[0]);
+ for (int c = 1; c < m.numCols(); c++) {
+ ps.print(',' + columnLabels[c]);
+ }
+ ps.println();
+ }
+ for (int r = 0; r < m.numRows(); r++) {
+ if (doLabels) {
+ ps.print(rowLabels[r] + ',');
+ }
+ ps.print(Double.toString(m.getQuick(r,0)));
+ for (int c = 1; c < m.numCols(); c++) {
+ ps.print(",");
+ ps.print(Double.toString(m.getQuick(r,c)));
+ }
+ ps.println();
+ }
+ if (ps != System.out) {
+ ps.close();
+ }
+ }
+
+ private static PrintStream getPrintStream(String outputPath) throws IOException {
+ if (outputPath == null) {
+ return System.out;
+ }
+ File outputFile = new File(outputPath);
+ if (outputFile.exists()) {
+ outputFile.delete();
+ }
+ outputFile.createNewFile();
+ OutputStream os = new FileOutputStream(outputFile);
+ return new PrintStream(os, false, Charsets.UTF_8.displayName());
+ }
+
+ /**
+ * Returns the label set, sorted by matrix order.
+ * If there are no labels, fabricates them using the starter string.
+ * @param length the number of labels to generate when none are bound
+ */
+ private static String[] getLabels(int length, Map<String,Integer> labels, String start) {
+ if (labels != null) {
+ return sortLabels(labels);
+ }
+ String[] sorted = new String[length];
+ // labels are 1-based ("col1", "row1", ...) but the array is 0-based
+ for (int i = 0; i < length; i++) {
+ sorted[i] = start + (i + 1);
+ }
+ return sorted;
+ }
+
+ private static String[] sortLabels(Map<String,Integer> labels) {
+ String[] sorted = new String[labels.size()];
+ for (Map.Entry<String,Integer> entry : labels.entrySet()) {
+ sorted[entry.getValue()] = entry.getKey();
+ }
+ return sorted;
+ }
+
+}
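Invocation sketch, assuming AbstractJob's standard --input/--output long options; the paths are hypothetical, and the input must be a SequenceFile holding a single Text/MatrixWritable pair as the header comment notes:

    // dump the matrix to stdout
    MatrixDumper.main(new String[] {"--input", "/tmp/matrix.seq"});
    // or write it to a CSV file
    MatrixDumper.main(new String[] {"--input", "/tmp/matrix.seq", "--output", "/tmp/matrix.csv"});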

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/SequenceFileDumper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/SequenceFileDumper.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/SequenceFileDumper.java
new file mode 100644
index 0000000..e01868a
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/utils/SequenceFileDumper.java
@@ -0,0 +1,168 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils;
+
+import java.io.File;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.util.ArrayList;
+import java.util.List;
+
+import com.google.common.io.Closeables;
+import com.google.common.io.Files;
+import org.apache.commons.io.Charsets;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.FileUtil;
+import org.apache.hadoop.fs.Path;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterator;
+import org.apache.mahout.math.list.IntArrayList;
+import org.apache.mahout.math.map.OpenObjectIntHashMap;
+
+public final class SequenceFileDumper extends AbstractJob {
+
+ public SequenceFileDumper() {
+ setConf(new Configuration());
+ }
+
+ @Override
+ public int run(String[] args) throws Exception {
+
+ addInputOption();
+ addOutputOption();
+ addOption("substring", "b", "The number of chars to print out per value", false);
+ addOption(buildOption("count", "c", "Report the count only", false, false, null));
+ addOption("numItems", "n", "Output at most <n> key value pairs", false);
+ addOption(buildOption("facets", "fa", "Output the counts per key. Note, if there are a lot of unique keys, "
+ + "this can take up a fair amount of memory", false, false, null));
+ addOption(buildOption("quiet", "q", "Print only file contents.", false, false, null));
+
+ if (parseArguments(args, false, true) == null) {
+ return -1;
+ }
+
+ Path[] pathArr;
+ Configuration conf = new Configuration();
+ Path input = getInputPath();
+ FileSystem fs = input.getFileSystem(conf);
+ if (fs.getFileStatus(input).isDir()) {
+ pathArr = FileUtil.stat2Paths(fs.listStatus(input, PathFilters.logsCRCFilter()));
+ } else {
+ pathArr = new Path[1];
+ pathArr[0] = input;
+ }
+
+
+ Writer writer;
+ boolean shouldClose;
+ if (hasOption("output")) {
+ shouldClose = true;
+ writer = Files.newWriter(new File(getOption("output")), Charsets.UTF_8);
+ } else {
+ shouldClose = false;
+ writer = new OutputStreamWriter(System.out, Charsets.UTF_8);
+ }
+ try {
+ for (Path path : pathArr) {
+ if (!hasOption("quiet")) {
+ writer.append("Input Path: ").append(String.valueOf(path)).append('\n');
+ }
+
+ int sub = Integer.MAX_VALUE;
+ if (hasOption("substring")) {
+ sub = Integer.parseInt(getOption("substring"));
+ }
+ boolean countOnly = hasOption("count");
+ SequenceFileIterator<?, ?> iterator = new SequenceFileIterator<>(path, true, conf);
+ if (!hasOption("quiet")) {
+ writer.append("Key class: ").append(iterator.getKeyClass().toString());
+ writer.append(" Value Class: ").append(iterator.getValueClass().toString()).append('\n');
+ }
+ OpenObjectIntHashMap<String> facets = null;
+ if (hasOption("facets")) {
+ facets = new OpenObjectIntHashMap<>();
+ }
+ long count = 0;
+ if (countOnly) {
+ while (iterator.hasNext()) {
+ Pair<?, ?> record = iterator.next();
+ String key = record.getFirst().toString();
+ if (facets != null) {
+ facets.adjustOrPutValue(key, 1, 1); //either insert or add 1
+ }
+ count++;
+ }
+ writer.append("Count: ").append(String.valueOf(count)).append('\n');
+ } else {
+ long numItems = Long.MAX_VALUE;
+ if (hasOption("numItems")) {
+ numItems = Long.parseLong(getOption("numItems"));
+ if (!hasOption("quiet")) {
+ writer.append("Max Items to dump: ").append(String.valueOf(numItems)).append("\n");
+ }
+ }
+ while (iterator.hasNext() && count < numItems) {
+ Pair<?, ?> record = iterator.next();
+ String key = record.getFirst().toString();
+ writer.append("Key: ").append(key);
+ String str = record.getSecond().toString();
+ writer.append(": Value: ").append(str.length() > sub
+ ? str.substring(0, sub) : str);
+ writer.write('\n');
+ if (facets != null) {
+ facets.adjustOrPutValue(key, 1, 1); //either insert or add 1
+ }
+ count++;
+ }
+ if (!hasOption("quiet")) {
+ writer.append("Count: ").append(String.valueOf(count)).append('\n');
+ }
+ }
+ if (facets != null) {
+ List<String> keyList = new ArrayList<>(facets.size());
+
+ IntArrayList valueList = new IntArrayList(facets.size());
+ facets.pairsSortedByKey(keyList, valueList);
+ writer.append("-----Facets---\n");
+ writer.append("Key\t\tCount\n");
+ int i = 0;
+ for (String key : keyList) {
+ writer.append(key).append("\t\t").append(String.valueOf(valueList.get(i++))).append('\n');
+ }
+ }
+ }
+ writer.flush();
+
+ } finally {
+ if (shouldClose) {
+ Closeables.close(writer, false);
+ }
+ }
+
+
+ return 0;
+ }
+
+ public static void main(String[] args) throws Exception {
+ new SequenceFileDumper().run(args);
+ }
+
+}
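Invocation sketch using the options registered in run() above; the paths are hypothetical:

    // print each key/value, truncating values to 80 chars
    SequenceFileDumper.main(new String[] {"--input", "/tmp/part-r-00000", "--substring", "80"});
    // report only the record count, plus per-key counts
    SequenceFileDumper.main(new String[] {"--input", "/tmp/part-r-00000", "--count", "--facets"});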
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/conf/ssvd.props
----------------------------------------------------------------------
diff --git a/community/mahout-mr/conf/ssvd.props b/community/mahout-mr/conf/ssvd.props
new file mode 100644
index 0000000..26a52c7
--- /dev/null
+++ b/community/mahout-mr/conf/ssvd.props
@@ -0,0 +1,14 @@
+#i|input =
+#o|output =
+#k|rank =
+#t|tempDir =
+#p|oversampling =
+#r|blockHeight =
+#s|minSplitSize =
+#U|computeU =
+#uhs|uHalfSigma =
+#V|computeV =
+#vhs|vHalfSigma =
+#t|reduceTasks =
+#w|wide =
+#q|powerIter =
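These conf/*.props templates pair a short option with its long name (shortName|longName =); as I understand the bin/mahout convention, uncommenting a line and supplying a value passes that argument to the job by default. A hypothetical filled-in fragment:

    i|input = /user/me/matrix
    o|output = /user/me/ssvd-out
    k|rank = 100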

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/conf/svd.props
----------------------------------------------------------------------
diff --git a/community/mahout-mr/conf/svd.props b/community/mahout-mr/conf/svd.props
new file mode 100644
index 0000000..8c9a467
--- /dev/null
+++ b/community/mahout-mr/conf/svd.props
@@ -0,0 +1,6 @@
+#i|input =
+#o|output =
+#nr|numRows =
+#nc|numCols =
+#r|rank =
+#t|tempDir =
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/conf/trainlogistic.props
----------------------------------------------------------------------
diff --git a/community/mahout-mr/conf/trainlogistic.props b/community/mahout-mr/conf/trainlogistic.props
new file mode 100644
index 0000000..f474942
--- /dev/null
+++ b/community/mahout-mr/conf/trainlogistic.props
@@ -0,0 +1,2 @@
+#lambda|lambda =
+#passes|passes =

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/conf/transpose.props
----------------------------------------------------------------------
diff --git a/community/mahout-mr/conf/transpose.props b/community/mahout-mr/conf/transpose.props
new file mode 100644
index 0000000..025f945
--- /dev/null
+++ b/community/mahout-mr/conf/transpose.props
@@ -0,0 +1,2 @@
+#i|input =
+#o|output =

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/conf/vectordump.props
----------------------------------------------------------------------
diff --git a/community/mahout-mr/conf/vectordump.props b/community/mahout-mr/conf/vectordump.props
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/community/mahout-mr/conf/vectordump.props
@@ -0,0 +1 @@
+

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/bin/prep_asf_mail_archives.sh
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/bin/prep_asf_mail_archives.sh b/community/mahout-mr/integration/bin/prep_asf_mail_archives.sh
new file mode 100755
index 0000000..77f5d13
--- /dev/null
+++ b/community/mahout-mr/integration/bin/prep_asf_mail_archives.sh
@@ -0,0 +1,106 @@
+#!/bin/bash
+#
+# Performs the setup procedures for clustering the ASF mail archives
+# described in Taming Text.
+#
+# Required Command-line Parameters:
+#
+# $1 - Path to this script's working directory, you will need about
+# 22GB of free space to run this script.
+#
+# $2 - Path to where the ASF Public Archive data is, untarred.
+# If you are running Hadoop and the files are in HDFS, then
+# this will need to be an HDFS path. Default is $1/input
+# $3 - Path to where this script saves the SequenceFile output.
+# If you are running Hadoop and you want the sequence files
+# saved to your HDFS then you need to set this value to an
+# HDFS path and make sure you set HADOOP_HOME so Mahout can
+# find Hadoop. Default is $1/sequence-files
+#
+#
+# Required Environment Variables:
+#
+# MAHOUT_HOME
+# Root directory of your Mahout distribution
+#
+# HADOOP_HOME
+# Only needed if you want to send output to HDFS
+#
+# Example:
+# ./prep_asf_mail_archives.sh /mnt/asf-mail-archives /mnt/asf-archives/asf-mail-archives-7-18-2011 /mnt/asf-mail-archives/output
+#
+# This will run the Mahout org.apache.mahout.text.SequenceFilesFromMailArchives
+# job over the previously extracted archives to create Hadoop SequenceFiles
+# in /mnt/asf-mail-archives/output
+#
+#/**
+# * Licensed to the Apache Software Foundation (ASF) under one or more
+# * contributor license agreements. See the NOTICE file distributed with
+# * this work for additional information regarding copyright ownership.
+# * The ASF licenses this file to You under the Apache License, Version 2.0
+# * (the "License"); you may not use this file except in compliance with
+# * the License. You may obtain a copy of the License at
+# *
+# * http://www.apache.org/licenses/LICENSE-2.0
+# *
+# * Unless required by applicable law or agreed to in writing, software
+# * distributed under the License is distributed on an "AS IS" BASIS,
+# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# * See the License for the specific language governing permissions and
+# * limitations under the License.
+# */
+
+if [ "$MAHOUT_HOME" = "" ]; then
+ echo "Error: MAHOUT_HOME is not set."
+ exit 1
+fi
+
+if [ "$1" = "" ]; then
+ echo "Error: Please pass the path to your prep directory, such as /mnt/asf-mail-archives.\n\n\tUsage: $0 workingDir inputPath outputPath\n"
+ exit 1
+fi
+
+# Location where this script saves files
+PREP_DIR=$1
+
+if [ "$2" != "" ]; then
+ SEQFILE_INPUT_DIR=$2
+else
+ SEQFILE_INPUT_DIR=$PREP_DIR/input
+fi
+
+
+# Change this to an HDFS path if you are running Hadoop
+if [ "$3" != "" ]; then
+ SEQFILE_OUTPUT_DIR=$3
+else
+ SEQFILE_OUTPUT_DIR=$PREP_DIR/sequence-files
+fi
+
+# If output sent to HDFS, clear MAHOUT_LOCAL and make sure HADOOP_HOME is set
+if [[ "$SEQFILE_OUTPUT_DIR" = hdfs://* ]]; then
+ export MAHOUT_LOCAL=
+ if [ "$HADOOP_HOME" = "" ]; then
+ echo "Error: HADOOP_HOME must be set if you want to send output to HDFS."
+ exit 1
+ fi
+else
+ export MAHOUT_LOCAL=$PREP_DIR
+fi
+
+echo "Running $0 with:
+ PREP_DIR = $PREP_DIR
+ SEQFILE_INPUT_DIR = $SEQFILE_INPUT_DIR
+ SEQFILE_OUTPUT_DIR = $SEQFILE_OUTPUT_DIR
+ MAHOUT_LOCAL = $MAHOUT_LOCAL
+ HADOOP_HOME = $HADOOP_HOME"
+
+# MAHOUT_LOCAL is configured above: it is set for local runs and cleared
+# automatically when the output directory is an hdfs:// path
+
+
+# convert the extracted gz files into Hadoop SequenceFiles
+echo "Converting extracted directories to SequenceFiles ..."
+$MAHOUT_HOME/bin/mahout org.apache.mahout.text.SequenceFilesFromMailArchives \
+--input $SEQFILE_INPUT_DIR --output $SEQFILE_OUTPUT_DIR --subject --body \
+-c UTF-8 -chunk 1024 -prefix asf_archives

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/pom.xml
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/pom.xml b/community/mahout-mr/integration/pom.xml
new file mode 100644
index 0000000..cb0c19a
--- /dev/null
+++ b/community/mahout-mr/integration/pom.xml
@@ -0,0 +1,198 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.mahout</groupId>
+ <artifactId>mahout</artifactId>
+ <version>0.13.1-SNAPSHOT</version>
+ <relativePath>../pom.xml</relativePath>
+ </parent>
+
+ <artifactId>mahout-integration</artifactId>
+ <name>Mahout Integration</name>
+ <description>Optional components of Mahout which generally support interaction with third party systems,
+ formats, APIs, etc.</description>
+
+ <packaging>jar</packaging>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-remote-resources-plugin</artifactId>
+ <configuration>
+ <appendedResourcesDirectory>../community/mahout-mr/src/appended-resources</appendedResourcesDirectory>
+ <resourceBundles>
+ <resourceBundle>org.apache:apache-jar-resource-bundle:1.4</resourceBundle>
+ </resourceBundles>
+ <supplementalModels>
+ <supplementalModel>supplemental-models.xml</supplementalModel>
+ </supplementalModels>
+ </configuration>
+ </plugin>
+
+ <plugin>
+ <artifactId>maven-javadoc-plugin</artifactId>
+ </plugin>
+
+ <plugin>
+ <artifactId>maven-source-plugin</artifactId>
+ </plugin>
+
+ </plugins>
+
+ </build>
+
+ <dependencies>
+
+ <!-- own modules -->
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>mahout-hdfs</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>mahout-mr</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>mahout-hdfs</artifactId>
+ <type>test-jar</type>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>mahout-mr</artifactId>
+ <type>test-jar</type>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>mahout-math</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>mahout-math</artifactId>
+ <type>test-jar</type>
+ <scope>test</scope>
+ </dependency>
+
+ <!-- 3rd party -->
+
+ <dependency>
+ <groupId>commons-dbcp</groupId>
+ <artifactId>commons-dbcp</artifactId>
+ <optional>true</optional>
+ </dependency>
+
+ <dependency>
+ <groupId>commons-pool</groupId>
+ <artifactId>commons-pool</artifactId>
+ <optional>true</optional>
+ </dependency>
+
+ <dependency>
+ <groupId>commons-io</groupId>
+ <artifactId>commons-io</artifactId>
+ </dependency>
+
+ <dependency>
+ <groupId>com.google.guava</groupId>
+ <artifactId>guava</artifactId>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.solr</groupId>
+ <artifactId>solr-commons-csv</artifactId>
+ <version>3.5.0</version>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-benchmark</artifactId>
+ <optional>true</optional>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-analyzers-common</artifactId>
+ <optional>true</optional>
+ </dependency>
+
+ <dependency>
+ <groupId>org.mongodb</groupId>
+ <artifactId>mongo-java-driver</artifactId>
+ <version>2.11.2</version>
+ <optional>true</optional>
+ </dependency>
+
+ <dependency>
+ <groupId>org.mongodb</groupId>
+ <artifactId>bson</artifactId>
+ <version>2.11.2</version>
+ <optional>true</optional>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.hbase</groupId>
+ <artifactId>hbase-client</artifactId>
+ </dependency>
+
+ <dependency>
+ <groupId>org.hectorclient</groupId>
+ <artifactId>hector-core</artifactId>
+ <version>1.1-4</version>
+ <optional>true</optional>
+ </dependency>
+
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-api</artifactId>
+ </dependency>
+
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-jcl</artifactId>
+ <scope>test</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <scope>test</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>com.carrotsearch.randomizedtesting</groupId>
+ <artifactId>randomizedtesting-runner</artifactId>
+ <scope>test</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>org.easymock</groupId>
+ <artifactId>easymock</artifactId>
+ <scope>test</scope>
+ </dependency>
+
+ </dependencies>
+
+</project>

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/BenchmarkRunner.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/BenchmarkRunner.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/BenchmarkRunner.java
new file mode 100644
index 0000000..549cf2c
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/BenchmarkRunner.java
@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.benchmark;
+
+import java.util.Random;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.common.TimingStatistics;
+import org.apache.mahout.math.Vector;
+
+import com.google.common.base.Function;
+
+public final class BenchmarkRunner {
+ private static final int BUCKET_SIZE = 10000;
+ private static final Random R = RandomUtils.getRandom();
+ private final long maxTimeUsec;
+ private final long leadTimeUsec;
+
+ public BenchmarkRunner(long leadTimeMs, long maxTimeMs) {
+ // note: despite the *Usec field names, these values are stored in nanoseconds
+ maxTimeUsec = TimeUnit.MILLISECONDS.toNanos(maxTimeMs);
+ leadTimeUsec = TimeUnit.MILLISECONDS.toNanos(leadTimeMs);
+ }
+
+ public abstract static class BenchmarkFn implements Function<Integer, Boolean> {
+ protected int randIndex() {
+ return BenchmarkRunner.randIndex();
+ }
+
+ protected boolean randBool() {
+ return BenchmarkRunner.randBool();
+ }
+
+ /**
+ * Adds a random data dependency so that JVM does not remove dead code.
+ */
+ protected boolean depends(Vector v) {
+ return randIndex() < v.getNumNondefaultElements();
+ }
+ }
+
+ public abstract static class BenchmarkFnD implements Function<Integer, Double> {
+ protected int randIndex() {
+ return BenchmarkRunner.randIndex();
+ }
+
+ protected boolean randBool() {
+ return BenchmarkRunner.randBool();
+ }
+
+ /**
+ * Adds a random data dependency so that JVM does not remove dead code.
+ */
+ protected boolean depends(Vector v) {
+ return randIndex() < v.getNumNondefaultElements();
+ }
+ }
+
+ private static int randIndex() {
+ return R.nextInt(BUCKET_SIZE);
+ }
+
+ private static boolean randBool() {
+ return R.nextBoolean();
+ }
+
+ public TimingStatistics benchmark(BenchmarkFn function) {
+ TimingStatistics stats = new TimingStatistics();
+ boolean result = false;
+ while (true) {
+ int i = R.nextInt(BUCKET_SIZE);
+ TimingStatistics.Call call = stats.newCall(leadTimeUsec);
+ result = result ^ function.apply(i);
+ if (call.end(maxTimeUsec)) {
+ break;
+ }
+ }
+ return stats;
+ }
+
+ public TimingStatistics benchmarkD(BenchmarkFnD function) {
+ TimingStatistics stats = new TimingStatistics();
+ double result = 0;
+ while (true) {
+ int i = R.nextInt(BUCKET_SIZE);
+ TimingStatistics.Call call = stats.newCall(leadTimeUsec);
+ result += function.apply(i);
+ if (call.end(maxTimeUsec)) {
+ break;
+ }
+ }
+ // print result to prevent hotspot from eliminating dead code
+ System.err.println("Result = " + result);
+ return stats;
+ }
+}
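A self-contained sketch of driving the runner directly, assuming mahout-math's DenseVector; the constructor arguments are in milliseconds, as above:

    import org.apache.mahout.common.TimingStatistics;
    import org.apache.mahout.math.DenseVector;
    import org.apache.mahout.math.Vector;

    final Vector v = new DenseVector(1000).assign(1.0);
    BenchmarkRunner runner = new BenchmarkRunner(50, 500);   // 50 ms lead time, 500 ms budget
    TimingStatistics stats = runner.benchmarkD(new BenchmarkRunner.BenchmarkFnD() {
      @Override
      public Double apply(Integer i) {
        return v.norm(2);  // the timed operation; returning it feeds the dead-code guard
      }
    });
    System.out.println(stats);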

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/CloneBenchmark.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/CloneBenchmark.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/CloneBenchmark.java
new file mode 100644
index 0000000..5e6ab4d
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/CloneBenchmark.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.benchmark;
+
+import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_VECTOR;
+import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_SPARSE_VECTOR;
+import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_SPARSE_VECTOR;
+
+import org.apache.mahout.benchmark.BenchmarkRunner.BenchmarkFn;
+
+public class CloneBenchmark {
+ public static final String CLONE = "Clone";
+ private final VectorBenchmarks mark;
+
+ public CloneBenchmark(VectorBenchmarks mark) {
+ this.mark = mark;
+ }
+
+ public void benchmark() {
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ mark.vectors[0][mark.vIndex(i)] = mark.vectors[0][mark.vIndex(i)].clone();
+
+ return depends(mark.vectors[0][mark.vIndex(i)]);
+ }
+ }), CLONE, DENSE_VECTOR);
+
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ mark.vectors[1][mark.vIndex(i)] = mark.vectors[1][mark.vIndex(i)].clone();
+
+ return depends(mark.vectors[1][mark.vIndex(i)]);
+ }
+ }), CLONE, RAND_SPARSE_VECTOR);
+
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ mark.vectors[2][mark.vIndex(i)] = mark.vectors[2][mark.vIndex(i)].clone();
+
+ return depends(mark.vectors[2][mark.vIndex(i)]);
+ }
+ }), CLONE, SEQ_SPARSE_VECTOR);
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/ClosestCentroidBenchmark.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/ClosestCentroidBenchmark.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/ClosestCentroidBenchmark.java
new file mode 100644
index 0000000..b1c2ded
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/ClosestCentroidBenchmark.java
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.benchmark;
+
+import java.io.IOException;
+import java.util.Random;
+
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.common.TimingStatistics;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.math.SparseMatrix;
+import org.apache.mahout.math.Vector;
+
+public class ClosestCentroidBenchmark {
+ private final VectorBenchmarks mark;
+
+ public ClosestCentroidBenchmark(VectorBenchmarks mark) {
+ this.mark = mark;
+ }
+
+ public void benchmark(DistanceMeasure measure) throws IOException {
+ SparseMatrix clusterDistances = new SparseMatrix(mark.numClusters, mark.numClusters);
+ for (int i = 0; i < mark.numClusters; i++) {
+ for (int j = 0; j < mark.numClusters; j++) {
+ double distance = Double.POSITIVE_INFINITY;
+ if (i != j) {
+ distance = measure.distance(mark.clusters[i], mark.clusters[j]);
+ }
+ clusterDistances.setQuick(i, j, distance);
+ }
+ }
+
+ long distanceCalculations = 0;
+ TimingStatistics stats = new TimingStatistics();
+ for (int l = 0; l < mark.loop; l++) {
+ TimingStatistics.Call call = stats.newCall(mark.leadTimeUsec);
+ for (int i = 0; i < mark.numVectors; i++) {
+ Vector vector = mark.vectors[1][mark.vIndex(i)];
+ double minDistance = Double.MAX_VALUE;
+ for (int k = 0; k < mark.numClusters; k++) {
+ double distance = measure.distance(vector, mark.clusters[k]);
+ distanceCalculations++;
+ if (distance < minDistance) {
+ minDistance = distance;
+ }
+ }
+ }
+ if (call.end(mark.maxTimeUsec)) {
+ break;
+ }
+ }
+ mark.printStats(stats, measure.getClass().getName(), "Closest C w/o Elkan's trick", "distanceCalculations = "
+ + distanceCalculations);
+
+ distanceCalculations = 0;
+ stats = new TimingStatistics();
+ Random rand = RandomUtils.getRandom();
+ for (int l = 0; l < mark.loop; l++) {
+ TimingStatistics.Call call = stats.newCall(mark.leadTimeUsec);
+ for (int i = 0; i < mark.numVectors; i++) {
+ Vector vector = mark.vectors[1][mark.vIndex(i)];
+ int closestCentroid = rand.nextInt(mark.numClusters);
+ double dist = measure.distance(vector, mark.clusters[closestCentroid]);
+ distanceCalculations++;
+ for (int k = 0; k < mark.numClusters; k++) {
+ if (closestCentroid != k) {
+ double centroidDist = clusterDistances.getQuick(k, closestCentroid);
+ if (centroidDist < 2 * dist) {
+ dist = measure.distance(vector, mark.clusters[k]);
+ closestCentroid = k;
+ distanceCalculations++;
+ }
+ }
+ }
+ }
+ if (call.end(mark.maxTimeUsec)) {
+ break;
+ }
+ }
+ mark.printStats(stats, measure.getClass().getName(), "Closest C w/ Elkan's trick", "distanceCalculations = "
+ + distanceCalculations);
+ }
+}
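For the record, the pruning test in the second loop (centroidDist < 2 * dist) is the usual triangle-inequality bound: for any metric d,

    d(x, c_k) >= d(c_j, c_k) - d(x, c_j) >= d(x, c_j)   whenever   d(c_j, c_k) >= 2 * d(x, c_j),

so a centroid c_k whose distance to the current best centroid c_j is at least twice the current best distance cannot be closer, and measure.distance(vector, clusters[k]) can be skipped. The bound relies on the triangle inequality, so it does not hold for squared distances.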

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/DistanceBenchmark.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/DistanceBenchmark.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/DistanceBenchmark.java
new file mode 100644
index 0000000..25d0ad7
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/DistanceBenchmark.java
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.benchmark;
+
+import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_FN_RAND;
+import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_FN_SEQ;
+import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_VECTOR;
+import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_FN_DENSE;
+import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_FN_SEQ;
+import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_SPARSE_VECTOR;
+import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_FN_DENSE;
+import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_FN_RAND;
+import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_SPARSE_VECTOR;
+
+import org.apache.mahout.benchmark.BenchmarkRunner.BenchmarkFnD;
+import org.apache.mahout.common.distance.DistanceMeasure;
+
+public class DistanceBenchmark {
+ private final VectorBenchmarks mark;
+
+ public DistanceBenchmark(VectorBenchmarks mark) {
+ this.mark = mark;
+ }
+
+ public void benchmark(final DistanceMeasure measure) {
+ mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
+ @Override
+ public Double apply(Integer i) {
+ return measure.distance(mark.vectors[0][mark.vIndex(i)], mark.vectors[0][mark.vIndex(randIndex())]);
+ }
+ }), measure.getClass().getName(), DENSE_VECTOR);
+
+ mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
+ @Override
+ public Double apply(Integer i) {
+ return measure.distance(mark.vectors[1][mark.vIndex(i)], mark.vectors[1][mark.vIndex(randIndex())]);
+ }
+ }), measure.getClass().getName(), RAND_SPARSE_VECTOR);
+
+ mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
+ @Override
+ public Double apply(Integer i) {
+ return measure.distance(mark.vectors[2][mark.vIndex(i)], mark.vectors[2][mark.vIndex(randIndex())]);
+ }
+ }), measure.getClass().getName(), SEQ_SPARSE_VECTOR);
+
+ mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
+ @Override
+ public Double apply(Integer i) {
+ return measure.distance(mark.vectors[0][mark.vIndex(i)], mark.vectors[1][mark.vIndex(randIndex())]);
+ }
+ }), measure.getClass().getName(), DENSE_FN_RAND);
+
+ mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
+ @Override
+ public Double apply(Integer i) {
+ return measure.distance(mark.vectors[0][mark.vIndex(i)], mark.vectors[2][mark.vIndex(randIndex())]);
+ }
+ }), measure.getClass().getName(), DENSE_FN_SEQ);
+
+ mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
+ @Override
+ public Double apply(Integer i) {
+ return measure.distance(mark.vectors[1][mark.vIndex(i)], mark.vectors[0][mark.vIndex(randIndex())]);
+ }
+ }), measure.getClass().getName(), RAND_FN_DENSE);
+
+ mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
+ @Override
+ public Double apply(Integer i) {
+ return measure.distance(mark.vectors[1][mark.vIndex(i)], mark.vectors[2][mark.vIndex(randIndex())]);
+ }
+ }), measure.getClass().getName(), RAND_FN_SEQ);
+
+ mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
+ @Override
+ public Double apply(Integer i) {
+ return measure.distance(mark.vectors[2][mark.vIndex(i)], mark.vectors[0][mark.vIndex(randIndex())]);
+ }
+ }), measure.getClass().getName(), SEQ_FN_DENSE);
+
+ mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
+ @Override
+ public Double apply(Integer i) {
+ return measure.distance(mark.vectors[2][mark.vIndex(i)], mark.vectors[1][mark.vIndex(randIndex())]);
+ }
+ }), measure.getClass().getName(), SEQ_FN_RAND);
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/DotBenchmark.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/DotBenchmark.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/DotBenchmark.java
new file mode 100644
index 0000000..fc7f911
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/DotBenchmark.java
@@ -0,0 +1,191 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.benchmark;
+
+import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_FN_RAND;
+import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_FN_SEQ;
+import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_VECTOR;
+import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_FN_DENSE;
+import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_FN_SEQ;
+import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_SPARSE_VECTOR;
+import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_FN_DENSE;
+import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_FN_RAND;
+import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_SPARSE_VECTOR;
+
+import org.apache.mahout.benchmark.BenchmarkRunner.BenchmarkFn;
+import org.apache.mahout.benchmark.BenchmarkRunner.BenchmarkFnD;
+
+public class DotBenchmark {
+ private static final String DOT_PRODUCT = "DotProduct";
+ private static final String NORM1 = "Norm1";
+ private static final String NORM2 = "Norm2";
+ private static final String LOG_NORMALIZE = "LogNormalize";
+ private final VectorBenchmarks mark;
+
+ public DotBenchmark(VectorBenchmarks mark) {
+ this.mark = mark;
+ }
+
+ public void benchmark() {
+ benchmarkDot();
+ benchmarkNorm1();
+ benchmarkNorm2();
+ benchmarkLogNormalize();
+ }
+
+ private void benchmarkLogNormalize() {
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ return depends(mark.vectors[0][mark.vIndex(i)].logNormalize());
+ }
+ }), LOG_NORMALIZE, DENSE_VECTOR);
+
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ return depends(mark.vectors[1][mark.vIndex(i)].logNormalize());
+ }
+ }), LOG_NORMALIZE, RAND_SPARSE_VECTOR);
+
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ return depends(mark.vectors[2][mark.vIndex(i)].logNormalize());
+ }
+ }), LOG_NORMALIZE, SEQ_SPARSE_VECTOR);
+ }
+
+ private void benchmarkNorm1() {
+ mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
+ @Override
+ public Double apply(Integer i) {
+ return mark.vectors[0][mark.vIndex(i)].norm(1);
+ }
+ }), NORM1, DENSE_VECTOR);
+
+ mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
+ @Override
+ public Double apply(Integer i) {
+ return mark.vectors[1][mark.vIndex(i)].norm(1);
+ }
+ }), NORM1, RAND_SPARSE_VECTOR);
+
+ mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
+ @Override
+ public Double apply(Integer i) {
+ return mark.vectors[2][mark.vIndex(i)].norm(1);
+ }
+ }), NORM1, SEQ_SPARSE_VECTOR);
+ }
+
+ private void benchmarkNorm2() {
+ mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
+ @Override
+ public Double apply(Integer i) {
+ return mark.vectors[0][mark.vIndex(i)].norm(2);
+ }
+ }), NORM2, DENSE_VECTOR);
+
+ mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
+ @Override
+ public Double apply(Integer i) {
+ return mark.vectors[1][mark.vIndex(i)].norm(2);
+ }
+ }), NORM2, RAND_SPARSE_VECTOR);
+
+ mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
+ @Override
+ public Double apply(Integer i) {
+ return mark.vectors[2][mark.vIndex(i)].norm(2);
+ }
+ }), NORM2, SEQ_SPARSE_VECTOR);
+ }
+
+ private void benchmarkDot() {
+ mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
+ @Override
+ public Double apply(Integer i) {
+ return mark.vectors[0][mark.vIndex(i)].dot(mark.vectors[0][mark.vIndex(randIndex())]);
+ }
+ }), DOT_PRODUCT, DENSE_VECTOR);
+
+ mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
+ @Override
+ public Double apply(Integer i) {
+ return mark.vectors[1][mark.vIndex(i)].dot(mark.vectors[1][mark.vIndex(randIndex())]);
+ }
+ }), DOT_PRODUCT, RAND_SPARSE_VECTOR);
+
+ mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
+ @Override
+ public Double apply(Integer i) {
+ return mark.vectors[2][mark.vIndex(i)].dot(mark.vectors[2][mark.vIndex(randIndex())]);
+ }
+ }), DOT_PRODUCT, SEQ_SPARSE_VECTOR);
+
+ mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
+ @Override
+ public Double apply(Integer i) {
+ return mark.vectors[0][mark.vIndex(i)].dot(mark.vectors[1][mark.vIndex(randIndex())]);
+ }
+ }), DOT_PRODUCT, DENSE_FN_RAND);
+
+ mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
+ @Override
+ public Double apply(Integer i) {
+ return mark.vectors[0][mark.vIndex(i)].dot(mark.vectors[2][mark.vIndex(randIndex())]);
+ }
+ }), DOT_PRODUCT, DENSE_FN_SEQ);
+
+ mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
+ @Override
+ public Double apply(Integer i) {
+ return mark.vectors[1][mark.vIndex(i)].dot(mark.vectors[0][mark.vIndex(randIndex())]);
+ }
+ }), DOT_PRODUCT, RAND_FN_DENSE);
+
+ mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
+ @Override
+ public Double apply(Integer i) {
+ return mark.vectors[1][mark.vIndex(i)].dot(mark.vectors[2][mark.vIndex(randIndex())]);
+ }
+ }), DOT_PRODUCT, RAND_FN_SEQ);
+
+ mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
+ @Override
+ public Double apply(Integer i) {
+ return mark.vectors[2][mark.vIndex(i)].dot(mark.vectors[0][mark.vIndex(randIndex())]);
+ }
+ }), DOT_PRODUCT, SEQ_FN_DENSE);
+
+ mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() {
+ @Override
+ public Double apply(Integer i) {
+ return mark.vectors[2][mark.vIndex(i)].dot(mark.vectors[1][mark.vIndex(randIndex())]);
+ }
+ }), DOT_PRODUCT, SEQ_FN_RAND);
+ }
+
+ public static void main(String[] args) {
+ VectorBenchmarks mark = new VectorBenchmarks(1000000, 100, 1000, 10, 1);
+ mark.createData();
+ new DotBenchmark(mark).benchmarkNorm2();
+ System.out.println(mark);
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/MinusBenchmark.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/MinusBenchmark.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/MinusBenchmark.java
new file mode 100644
index 0000000..82fb693
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/MinusBenchmark.java
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.benchmark;
+
+import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_FN_RAND;
+import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_FN_SEQ;
+import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_VECTOR;
+import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_FN_DENSE;
+import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_FN_SEQ;
+import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_SPARSE_VECTOR;
+import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_FN_DENSE;
+import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_FN_RAND;
+import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_SPARSE_VECTOR;
+
+import org.apache.mahout.benchmark.BenchmarkRunner.BenchmarkFn;
+import org.apache.mahout.math.Vector;
+
+public class MinusBenchmark {
+
+ private static final String MINUS = "Minus";
+ private final VectorBenchmarks mark;
+
+ public MinusBenchmark(VectorBenchmarks mark) {
+ this.mark = mark;
+ }
+
+ public void benchmark() {
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ Vector v = mark.vectors[0][mark.vIndex(i)].minus(mark.vectors[0][mark.vIndex(randIndex())]);
+ return depends(v);
+ }
+ }), MINUS, DENSE_VECTOR);
+
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ Vector v = mark.vectors[1][mark.vIndex(i)].minus(mark.vectors[1][mark.vIndex(randIndex())]);
+ return depends(v);
+ }
+ }), MINUS, RAND_SPARSE_VECTOR);
+
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ Vector v = mark.vectors[2][mark.vIndex(i)].minus(mark.vectors[2][mark.vIndex(randIndex())]);
+ return depends(v);
+ }
+ }), MINUS, SEQ_SPARSE_VECTOR);
+
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ Vector v = mark.vectors[0][mark.vIndex(i)].minus(mark.vectors[1][mark.vIndex(randIndex())]);
+ return depends(v);
+ }
+ }), MINUS, DENSE_FN_RAND);
+
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ Vector v = mark.vectors[0][mark.vIndex(i)].minus(mark.vectors[2][mark.vIndex(randIndex())]);
+ return depends(v);
+ }
+ }), MINUS, DENSE_FN_SEQ);
+
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ Vector v = mark.vectors[1][mark.vIndex(i)].minus(mark.vectors[0][mark.vIndex(randIndex())]);
+ return depends(v);
+ }
+ }), MINUS, RAND_FN_DENSE);
+
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ Vector v = mark.vectors[1][mark.vIndex(i)].minus(mark.vectors[2][mark.vIndex(randIndex())]);
+ return depends(v);
+ }
+ }), MINUS, RAND_FN_SEQ);
+
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ Vector v = mark.vectors[2][mark.vIndex(i)].minus(mark.vectors[0][mark.vIndex(randIndex())]);
+ return depends(v);
+ }
+ }), MINUS, SEQ_FN_DENSE);
+
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ Vector v = mark.vectors[2][mark.vIndex(i)].minus(mark.vectors[1][mark.vIndex(randIndex())]);
+ return depends(v);
+ }
+ }), MINUS, SEQ_FN_RAND);
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/PlusBenchmark.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/PlusBenchmark.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/PlusBenchmark.java
new file mode 100644
index 0000000..bd76e94
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/PlusBenchmark.java
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.benchmark;
+
+import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_FN_RAND;
+import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_FN_SEQ;
+import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_VECTOR;
+import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_FN_DENSE;
+import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_FN_SEQ;
+import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_SPARSE_VECTOR;
+import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_FN_DENSE;
+import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_FN_RAND;
+import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_SPARSE_VECTOR;
+
+import org.apache.mahout.benchmark.BenchmarkRunner.BenchmarkFn;
+import org.apache.mahout.math.Vector;
+
+public class PlusBenchmark {
+
+ private static final String PLUS = "Plus";
+ private final VectorBenchmarks mark;
+
+ public PlusBenchmark(VectorBenchmarks mark) {
+ this.mark = mark;
+ }
+
+ public void benchmark() {
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ Vector v = mark.vectors[0][mark.vIndex(i)].plus(mark.vectors[0][mark.vIndex(randIndex())]);
+ return depends(v);
+ }
+ }), PLUS, DENSE_VECTOR);
+
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ Vector v = mark.vectors[1][mark.vIndex(i)].plus(mark.vectors[1][mark.vIndex(randIndex())]);
+ return depends(v);
+ }
+ }), PLUS, RAND_SPARSE_VECTOR);
+
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ Vector v = mark.vectors[2][mark.vIndex(i)].plus(mark.vectors[2][mark.vIndex(randIndex())]);
+ return depends(v);
+ }
+ }), PLUS, SEQ_SPARSE_VECTOR);
+
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ Vector v = mark.vectors[0][mark.vIndex(i)].plus(mark.vectors[1][mark.vIndex(randIndex())]);
+ return depends(v);
+ }
+ }), PLUS, DENSE_FN_RAND);
+
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ Vector v = mark.vectors[0][mark.vIndex(i)].plus(mark.vectors[2][mark.vIndex(randIndex())]);
+ return depends(v);
+ }
+ }), PLUS, DENSE_FN_SEQ);
+
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ Vector v = mark.vectors[1][mark.vIndex(i)].plus(mark.vectors[0][mark.vIndex(randIndex())]);
+ return depends(v);
+ }
+ }), PLUS, RAND_FN_DENSE);
+
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ Vector v = mark.vectors[1][mark.vIndex(i)].plus(mark.vectors[2][mark.vIndex(randIndex())]);
+ return depends(v);
+ }
+ }), PLUS, RAND_FN_SEQ);
+
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ Vector v = mark.vectors[2][mark.vIndex(i)].plus(mark.vectors[0][mark.vIndex(randIndex())]);
+ return depends(v);
+ }
+ }), PLUS, SEQ_FN_DENSE);
+
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ Vector v = mark.vectors[2][mark.vIndex(i)].plus(mark.vectors[1][mark.vIndex(randIndex())]);
+ return depends(v);
+ }
+ }), PLUS, SEQ_FN_RAND);
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/SerializationBenchmark.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/SerializationBenchmark.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/SerializationBenchmark.java
new file mode 100644
index 0000000..cd403c2
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/SerializationBenchmark.java
@@ -0,0 +1,124 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.benchmark;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.common.TimingStatistics;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterator;
+import org.apache.mahout.math.VectorWritable;
+
+import java.io.IOException;
+
+import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_VECTOR;
+import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_SPARSE_VECTOR;
+import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_SPARSE_VECTOR;
+
+public class SerializationBenchmark {
+ public static final String SERIALIZE = "Serialize";
+ public static final String DESERIALIZE = "Deserialize";
+ private final VectorBenchmarks mark;
+
+ public SerializationBenchmark(VectorBenchmarks mark) {
+ this.mark = mark;
+ }
+
+ public void benchmark() throws IOException {
+ serializeBenchmark();
+ deserializeBenchmark();
+ }
+
+ public void serializeBenchmark() throws IOException {
+ Configuration conf = new Configuration();
+ FileSystem fs = FileSystem.get(conf);
+
+ Writable one = new IntWritable(0);
+ VectorWritable vec = new VectorWritable();
+ TimingStatistics stats = new TimingStatistics();
+
+ try (SequenceFile.Writer writer =
+ new SequenceFile.Writer(fs, conf, new Path("/tmp/dense-vector"),
+ IntWritable.class, VectorWritable.class)){
+ for (int i = 0; i < mark.loop; i++) {
+ TimingStatistics.Call call = stats.newCall(mark.leadTimeUsec);
+ vec.set(mark.vectors[0][mark.vIndex(i)]);
+ writer.append(one, vec);
+ if (call.end(mark.maxTimeUsec)) {
+ break;
+ }
+ }
+ }
+ mark.printStats(stats, SERIALIZE, DENSE_VECTOR);
+
+ stats = new TimingStatistics();
+ try (SequenceFile.Writer writer =
+ new SequenceFile.Writer(fs, conf,
+ new Path("/tmp/randsparse-vector"), IntWritable.class, VectorWritable.class)){
+ for (int i = 0; i < mark.loop; i++) {
+ TimingStatistics.Call call = stats.newCall(mark.leadTimeUsec);
+ vec.set(mark.vectors[1][mark.vIndex(i)]);
+ writer.append(one, vec);
+ if (call.end(mark.maxTimeUsec)) {
+ break;
+ }
+ }
+ }
+ mark.printStats(stats, SERIALIZE, RAND_SPARSE_VECTOR);
+
+ stats = new TimingStatistics();
+ try (SequenceFile.Writer writer =
+ new SequenceFile.Writer(fs, conf,
+ new Path("/tmp/seqsparse-vector"), IntWritable.class, VectorWritable.class)) {
+ for (int i = 0; i < mark.loop; i++) {
+ TimingStatistics.Call call = stats.newCall(mark.leadTimeUsec);
+ vec.set(mark.vectors[2][mark.vIndex(i)]);
+ writer.append(one, vec);
+ if (call.end(mark.maxTimeUsec)) {
+ break;
+ }
+ }
+ }
+ mark.printStats(stats, SERIALIZE, SEQ_SPARSE_VECTOR);
+
+ }
+
+ public void deserializeBenchmark() throws IOException {
+ doDeserializeBenchmark(DENSE_VECTOR, "/tmp/dense-vector");
+ doDeserializeBenchmark(RAND_SPARSE_VECTOR, "/tmp/randsparse-vector");
+ doDeserializeBenchmark(SEQ_SPARSE_VECTOR, "/tmp/seqsparse-vector");
+ }
+
+ private void doDeserializeBenchmark(String name, String pathString) throws IOException {
+ TimingStatistics stats = new TimingStatistics();
+ TimingStatistics.Call call = stats.newCall(mark.leadTimeUsec);
+ SequenceFileValueIterator<Writable> iterator = new SequenceFileValueIterator<>(new Path(pathString), true,
+ new Configuration());
+ while (iterator.hasNext()) {
+ iterator.next();
+ call.end();
+ call = stats.newCall(mark.leadTimeUsec);
+ }
+ iterator.close();
+ mark.printStats(stats, DESERIALIZE, name);
+ }
+
+}
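
SerializationBenchmark uses the old SequenceFile.Writer constructor, which Hadoop 2.x deprecates in favor of the createWriter factory. A sketch of the equivalent writer setup, assuming a Hadoop 2.x client is on the classpath (path and vector contents are illustrative):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.SequenceFile;
    import org.apache.mahout.math.DenseVector;
    import org.apache.mahout.math.VectorWritable;

    public class WriterSketch {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        try (SequenceFile.Writer writer = SequenceFile.createWriter(conf,
            SequenceFile.Writer.file(new Path("/tmp/dense-vector")),
            SequenceFile.Writer.keyClass(IntWritable.class),
            SequenceFile.Writer.valueClass(VectorWritable.class))) {
          // Same key/value pairing the benchmark writes above.
          writer.append(new IntWritable(0), new VectorWritable(new DenseVector(new double[] {1, 2, 3})));
        }
      }
    }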

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/TimesBenchmark.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/TimesBenchmark.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/TimesBenchmark.java
new file mode 100644
index 0000000..bf81228
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/TimesBenchmark.java
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.benchmark;
+
+import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_FN_RAND;
+import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_FN_SEQ;
+import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_VECTOR;
+import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_FN_DENSE;
+import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_FN_SEQ;
+import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_SPARSE_VECTOR;
+import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_FN_DENSE;
+import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_FN_RAND;
+import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_SPARSE_VECTOR;
+
+import org.apache.mahout.benchmark.BenchmarkRunner.BenchmarkFn;
+import org.apache.mahout.math.Vector;
+
+public class TimesBenchmark {
+
+ private static final String TIMES = "Times";
+ private final VectorBenchmarks mark;
+
+ public TimesBenchmark(VectorBenchmarks mark) {
+ this.mark = mark;
+ }
+
+ public void benchmark() {
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ Vector v = mark.vectors[0][mark.vIndex(i)].times(mark.vectors[0][mark.vIndex(randIndex())]);
+ return depends(v);
+ }
+ }), TIMES, DENSE_VECTOR);
+
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ Vector v = mark.vectors[1][mark.vIndex(i)].times(mark.vectors[1][mark.vIndex(randIndex())]);
+ return depends(v);
+ }
+ }), TIMES, RAND_SPARSE_VECTOR);
+
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ Vector v = mark.vectors[2][mark.vIndex(i)].times(mark.vectors[2][mark.vIndex(randIndex())]);
+ return depends(v);
+ }
+ }), TIMES, SEQ_SPARSE_VECTOR);
+
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ Vector v = mark.vectors[0][mark.vIndex(i)].times(mark.vectors[1][mark.vIndex(randIndex())]);
+ return depends(v);
+ }
+ }), TIMES, DENSE_FN_RAND);
+
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ Vector v = mark.vectors[0][mark.vIndex(i)].times(mark.vectors[2][mark.vIndex(randIndex())]);
+ return depends(v);
+ }
+ }), TIMES, DENSE_FN_SEQ);
+
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ Vector v = mark.vectors[1][mark.vIndex(i)].times(mark.vectors[0][mark.vIndex(randIndex())]);
+ return depends(v);
+ }
+ }), TIMES, RAND_FN_DENSE);
+
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ Vector v = mark.vectors[1][mark.vIndex(i)].times(mark.vectors[2][mark.vIndex(randIndex())]);
+ return depends(v);
+ }
+ }), TIMES, RAND_FN_SEQ);
+
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ Vector v = mark.vectors[2][mark.vIndex(i)].times(mark.vectors[0][mark.vIndex(randIndex())]);
+ return depends(v);
+ }
+ }), TIMES, SEQ_FN_DENSE);
+
+ mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ Vector v = mark.vectors[2][mark.vIndex(i)].times(mark.vectors[1][mark.vIndex(randIndex())]);
+ return depends(v);
+ }
+ }), TIMES, SEQ_FN_RAND);
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/VectorBenchmarks.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/VectorBenchmarks.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/VectorBenchmarks.java
new file mode 100644
index 0000000..a076322
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/VectorBenchmarks.java
@@ -0,0 +1,497 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.benchmark;
+
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.OptionException;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.mahout.benchmark.BenchmarkRunner.BenchmarkFn;
+import org.apache.mahout.common.CommandLineUtil;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.common.TimingStatistics;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.common.distance.ChebyshevDistanceMeasure;
+import org.apache.mahout.common.distance.CosineDistanceMeasure;
+import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
+import org.apache.mahout.common.distance.ManhattanDistanceMeasure;
+import org.apache.mahout.common.distance.MinkowskiDistanceMeasure;
+import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
+import org.apache.mahout.common.distance.TanimotoDistanceMeasure;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.RandomAccessSparseVector;
+import org.apache.mahout.math.SequentialAccessSparseVector;
+import org.apache.mahout.math.Vector;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.text.DecimalFormat;
+import java.util.ArrayList;
+import java.util.BitSet;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Random;
+import java.util.concurrent.TimeUnit;
+import java.util.regex.Pattern;
+
+public class VectorBenchmarks {
+ private static final int MAX_TIME_MS = 5000;
+ private static final int LEAD_TIME_MS = 15000;
+ public static final String CLUSTERS = "Clusters";
+ public static final String CREATE_INCREMENTALLY = "Create (incrementally)";
+ public static final String CREATE_COPY = "Create (copy)";
+
+ public static final String DENSE_FN_SEQ = "Dense.fn(Seq)";
+ public static final String RAND_FN_DENSE = "Rand.fn(Dense)";
+ public static final String SEQ_FN_RAND = "Seq.fn(Rand)";
+ public static final String RAND_FN_SEQ = "Rand.fn(Seq)";
+ public static final String SEQ_FN_DENSE = "Seq.fn(Dense)";
+ public static final String DENSE_FN_RAND = "Dense.fn(Rand)";
+ public static final String SEQ_SPARSE_VECTOR = "SeqSparseVector";
+ public static final String RAND_SPARSE_VECTOR = "RandSparseVector";
+ public static final String DENSE_VECTOR = "DenseVector";
+
+ private static final Logger log = LoggerFactory.getLogger(VectorBenchmarks.class);
+ private static final Pattern TAB_NEWLINE_PATTERN = Pattern.compile("[\n\t]");
+ private static final String[] EMPTY = new String[0];
+ private static final DecimalFormat DF = new DecimalFormat("#.##");
+
+ /* package private */
+ final Vector[][] vectors;
+ final Vector[] clusters;
+ final int cardinality;
+ final int numNonZeros;
+ final int numVectors;
+ final int numClusters;
+ final int loop = Integer.MAX_VALUE;
+ final int opsPerUnit;
+ final long maxTimeUsec;
+ final long leadTimeUsec;
+
+ private final List<Vector> randomVectors = new ArrayList<>();
+ private final List<int[]> randomVectorIndices = new ArrayList<>();
+ private final List<double[]> randomVectorValues = new ArrayList<>();
+ private final Map<String, Integer> implType = new HashMap<>();
+ private final Map<String, List<String[]>> statsMap = new HashMap<>();
+ private final BenchmarkRunner runner;
+ private final Random r = RandomUtils.getRandom();
+
+ public VectorBenchmarks(int cardinality, int numNonZeros, int numVectors, int numClusters,
+ int opsPerUnit) {
+ runner = new BenchmarkRunner(LEAD_TIME_MS, MAX_TIME_MS);
+ maxTimeUsec = TimeUnit.MILLISECONDS.toNanos(MAX_TIME_MS);
+ leadTimeUsec = TimeUnit.MILLISECONDS.toNanos(LEAD_TIME_MS);
+
+ this.cardinality = cardinality;
+ this.numNonZeros = numNonZeros;
+ this.numVectors = numVectors;
+ this.numClusters = numClusters;
+ this.opsPerUnit = opsPerUnit;
+
+ setUpVectors(cardinality, numNonZeros, numVectors);
+
+ vectors = new Vector[3][numVectors];
+ clusters = new Vector[numClusters];
+ }
+
+ private void setUpVectors(int cardinality, int numNonZeros, int numVectors) {
+ for (int i = 0; i < numVectors; i++) {
+ Vector v = new SequentialAccessSparseVector(cardinality, numNonZeros); // sparsity!
+ BitSet featureSpace = new BitSet(cardinality);
+ int[] indexes = new int[numNonZeros];
+ double[] values = new double[numNonZeros];
+ int j = 0;
+ while (j < numNonZeros) {
+ double value = r.nextGaussian();
+ int index = r.nextInt(cardinality);
+ if (!featureSpace.get(index) && value != 0) {
+ featureSpace.set(index);
+ indexes[j] = index;
+ values[j++] = value;
+ v.set(index, value);
+ }
+ }
+ randomVectorIndices.add(indexes);
+ randomVectorValues.add(values);
+ randomVectors.add(v);
+ }
+ }
+
+ void printStats(TimingStatistics stats, String benchmarkName, String implName, String content) {
+ printStats(stats, benchmarkName, implName, content, 1);
+ }
+
+ void printStats(TimingStatistics stats, String benchmarkName, String implName) {
+ printStats(stats, benchmarkName, implName, "", 1);
+ }
+
+ private void printStats(TimingStatistics stats, String benchmarkName, String implName,
+ String content, int multiplier) {
+ float speed = multiplier * stats.getNCalls() * (numNonZeros * 1000.0f * 12 / stats.getSumTime());
+ float opsPerSec = stats.getNCalls() * 1000000000.0f / stats.getSumTime();
+ log.info("{} {} \n{} {} \nOps = {} Units/sec\nIOps = {} MBytes/sec", benchmarkName,
+ implName, content, stats.toString(), DF.format(opsPerSec), DF.format(speed));
+
+ if (!implType.containsKey(implName)) {
+ implType.put(implName, implType.size());
+ }
+ int implId = implType.get(implName);
+ if (!statsMap.containsKey(benchmarkName)) {
+ statsMap.put(benchmarkName, new ArrayList<String[]>());
+ }
+ List<String[]> implStats = statsMap.get(benchmarkName);
+ while (implStats.size() < implId + 1) {
+ implStats.add(EMPTY);
+ }
+ implStats.set(
+ implId,
+ TAB_NEWLINE_PATTERN.split(stats + "\tSpeed = " + DF.format(opsPerSec) + " /sec\tRate = "
+ + DF.format(speed) + " MB/s"));
+ }
+
+ public void createData() {
+ for (int i = 0; i < Math.max(numVectors, numClusters); ++i) {
+ vectors[0][vIndex(i)] = new DenseVector(randomVectors.get(vIndex(i)));
+ vectors[1][vIndex(i)] = new RandomAccessSparseVector(randomVectors.get(vIndex(i)));
+ vectors[2][vIndex(i)] = new SequentialAccessSparseVector(randomVectors.get(vIndex(i)));
+ if (numClusters > 0) {
+ clusters[cIndex(i)] = new RandomAccessSparseVector(randomVectors.get(vIndex(i)));
+ }
+ }
+ }
+
+ public void createBenchmark() {
+ printStats(runner.benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ vectors[0][vIndex(i)] = new DenseVector(randomVectors.get(vIndex(i)));
+ return depends(vectors[0][vIndex(i)]);
+ }
+ }), CREATE_COPY, DENSE_VECTOR);
+
+ printStats(runner.benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ vectors[1][vIndex(i)] = new RandomAccessSparseVector(randomVectors.get(vIndex(i)));
+ return depends(vectors[1][vIndex(i)]);
+ }
+ }), CREATE_COPY, RAND_SPARSE_VECTOR);
+
+ printStats(runner.benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ vectors[2][vIndex(i)] = new SequentialAccessSparseVector(randomVectors.get(vIndex(i)));
+ return depends(vectors[2][vIndex(i)]);
+ }
+ }), CREATE_COPY, SEQ_SPARSE_VECTOR);
+
+ if (numClusters > 0) {
+ printStats(runner.benchmark(new BenchmarkFn() {
+ @Override
+ public Boolean apply(Integer i) {
+ clusters[cIndex(i)] = new RandomAccessSparseVector(randomVectors.get(vIndex(i)));
+ return depends(clusters[cIndex(i)]);
+ }
+ }), CREATE_COPY, CLUSTERS);
+ }
+ }
+
+ private boolean buildVectorIncrementally(TimingStatistics stats, int randomIndex, Vector v, boolean useSetQuick) {
+ int[] indexes = randomVectorIndices.get(randomIndex);
+ double[] values = randomVectorValues.get(randomIndex);
+ List<Integer> randomOrder = new ArrayList<>();
+ for (int i = 0; i < indexes.length; i++) {
+ randomOrder.add(i);
+ }
+ Collections.shuffle(randomOrder);
+ int[] permutation = new int[randomOrder.size()];
+ for (int i = 0; i < randomOrder.size(); i++) {
+ permutation[i] = randomOrder.get(i);
+ }
+
+ TimingStatistics.Call call = stats.newCall(leadTimeUsec);
+ if (useSetQuick) {
+ for (int i : permutation) {
+ v.setQuick(indexes[i], values[i]);
+ }
+ } else {
+ for (int i : permutation) {
+ v.set(indexes[i], values[i]);
+ }
+ }
+ return call.end(maxTimeUsec);
+ }
+
+ public void incrementalCreateBenchmark() {
+ TimingStatistics stats = new TimingStatistics();
+ for (int i = 0; i < loop; i++) {
+ vectors[0][vIndex(i)] = new DenseVector(cardinality);
+ if (buildVectorIncrementally(stats, vIndex(i), vectors[0][vIndex(i)], false)) {
+ break;
+ }
+ }
+ printStats(stats, CREATE_INCREMENTALLY, DENSE_VECTOR);
+
+ stats = new TimingStatistics();
+ for (int i = 0; i < loop; i++) {
+ vectors[1][vIndex(i)] = new RandomAccessSparseVector(cardinality);
+ if (buildVectorIncrementally(stats, vIndex(i), vectors[1][vIndex(i)], false)) {
+ break;
+ }
+ }
+ printStats(stats, CREATE_INCREMENTALLY, RAND_SPARSE_VECTOR);
+
+ stats = new TimingStatistics();
+ for (int i = 0; i < loop; i++) {
+ vectors[2][vIndex(i)] = new SequentialAccessSparseVector(cardinality);
+ if (buildVectorIncrementally(stats, vIndex(i), vectors[2][vIndex(i)], false)) {
+ break;
+ }
+ }
+ printStats(stats, CREATE_INCREMENTALLY, SEQ_SPARSE_VECTOR);
+
+ if (numClusters > 0) {
+ stats = new TimingStatistics();
+ for (int i = 0; i < loop; i++) {
+ clusters[cIndex(i)] = new RandomAccessSparseVector(cardinality);
+ if (buildVectorIncrementally(stats, vIndex(i), clusters[cIndex(i)], false)) {
+ break;
+ }
+ }
+ printStats(stats, CREATE_INCREMENTALLY, CLUSTERS);
+ }
+ }
+
+ public int vIndex(int i) {
+ return i % numVectors;
+ }
+
+ public int cIndex(int i) {
+ return i % numClusters;
+ }
+
+ public static void main(String[] args) throws IOException {
+ DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
+ ArgumentBuilder abuilder = new ArgumentBuilder();
+ GroupBuilder gbuilder = new GroupBuilder();
+
+ Option vectorSizeOpt = obuilder
+ .withLongName("vectorSize")
+ .withRequired(false)
+ .withArgument(abuilder.withName("vs").withDefault(1000000).create())
+ .withDescription("Cardinality of the vector. Default: 1000000").withShortName("vs").create();
+ Option numNonZeroOpt = obuilder
+ .withLongName("numNonZero")
+ .withRequired(false)
+ .withArgument(abuilder.withName("nz").withDefault(1000).create())
+ .withDescription("Size of the vector. Default: 1000").withShortName("nz").create();
+ Option numVectorsOpt = obuilder
+ .withLongName("numVectors")
+ .withRequired(false)
+ .withArgument(abuilder.withName("nv").withDefault(25).create())
+ .withDescription("Number of Vectors to create. Default: 25").withShortName("nv").create();
+ Option numClustersOpt = obuilder
+ .withLongName("numClusters")
+ .withRequired(false)
+ .withArgument(abuilder.withName("nc").withDefault(0).create())
+ .withDescription("Number of clusters to create. Set to non zero to run cluster benchmark. Default: 0")
+ .withShortName("nc").create();
+ Option numOpsOpt = obuilder
+ .withLongName("numOps")
+ .withRequired(false)
+ .withArgument(abuilder.withName("numOps").withDefault(10).create())
+ .withDescription(
+ "Number of operations to do per timer. "
+ + "E.g In distance measure, the distance is calculated numOps times"
+ + " and the total time is measured. Default: 10").withShortName("no").create();
+
+ Option helpOpt = DefaultOptionCreator.helpOption();
+
+ Group group = gbuilder.withName("Options").withOption(vectorSizeOpt).withOption(numNonZeroOpt)
+ .withOption(numVectorsOpt).withOption(numOpsOpt).withOption(numClustersOpt).withOption(helpOpt).create();
+
+ try {
+ Parser parser = new Parser();
+ parser.setGroup(group);
+ CommandLine cmdLine = parser.parse(args);
+
+ if (cmdLine.hasOption(helpOpt)) {
+ CommandLineUtil.printHelpWithGenericOptions(group);
+ return;
+ }
+
+ int cardinality = 1000000;
+ if (cmdLine.hasOption(vectorSizeOpt)) {
+ cardinality = Integer.parseInt((String) cmdLine.getValue(vectorSizeOpt));
+
+ }
+
+ int numClusters = 0;
+ if (cmdLine.hasOption(numClustersOpt)) {
+ numClusters = Integer.parseInt((String) cmdLine.getValue(numClustersOpt));
+ }
+
+ int numNonZero = 1000;
+ if (cmdLine.hasOption(numNonZeroOpt)) {
+ numNonZero = Integer.parseInt((String) cmdLine.getValue(numNonZeroOpt));
+ }
+
+ int numVectors = 25;
+ if (cmdLine.hasOption(numVectorsOpt)) {
+ numVectors = Integer.parseInt((String) cmdLine.getValue(numVectorsOpt));
+
+ }
+
+ int numOps = 10;
+ if (cmdLine.hasOption(numOpsOpt)) {
+ numOps = Integer.parseInt((String) cmdLine.getValue(numOpsOpt));
+
+ }
+ VectorBenchmarks mark = new VectorBenchmarks(cardinality, numNonZero, numVectors, numClusters, numOps);
+ runBenchmark(mark);
+
+ // log.info("\n{}", mark);
+ log.info("\n{}", mark.asCsvString());
+ } catch (OptionException e) {
+ CommandLineUtil.printHelp(group);
+ }
+ }
+
+ private static void runBenchmark(VectorBenchmarks mark) throws IOException {
+ // Required to set up data.
+ mark.createData();
+
+ mark.createBenchmark();
+ if (mark.cardinality < 200000) {
+ // Incremental creation is too slow for larger vectors, so only run it for small cardinalities.
+ mark.incrementalCreateBenchmark();
+ }
+
+ new CloneBenchmark(mark).benchmark();
+ new DotBenchmark(mark).benchmark();
+ new PlusBenchmark(mark).benchmark();
+ new MinusBenchmark(mark).benchmark();
+ new TimesBenchmark(mark).benchmark();
+ new SerializationBenchmark(mark).benchmark();
+
+ DistanceBenchmark distanceBenchmark = new DistanceBenchmark(mark);
+ distanceBenchmark.benchmark(new CosineDistanceMeasure());
+ distanceBenchmark.benchmark(new SquaredEuclideanDistanceMeasure());
+ distanceBenchmark.benchmark(new EuclideanDistanceMeasure());
+ distanceBenchmark.benchmark(new ManhattanDistanceMeasure());
+ distanceBenchmark.benchmark(new TanimotoDistanceMeasure());
+ distanceBenchmark.benchmark(new ChebyshevDistanceMeasure());
+ distanceBenchmark.benchmark(new MinkowskiDistanceMeasure());
+
+ if (mark.numClusters > 0) {
+ ClosestCentroidBenchmark centroidBenchmark = new ClosestCentroidBenchmark(mark);
+ centroidBenchmark.benchmark(new CosineDistanceMeasure());
+ centroidBenchmark.benchmark(new SquaredEuclideanDistanceMeasure());
+ centroidBenchmark.benchmark(new EuclideanDistanceMeasure());
+ centroidBenchmark.benchmark(new ManhattanDistanceMeasure());
+ centroidBenchmark.benchmark(new TanimotoDistanceMeasure());
+ centroidBenchmark.benchmark(new ChebyshevDistanceMeasure());
+ centroidBenchmark.benchmark(new MinkowskiDistanceMeasure());
+ }
+ }
+
+ private String asCsvString() {
+ List<String> keys = new ArrayList<>(statsMap.keySet());
+ Collections.sort(keys);
+ Map<Integer,String> implMap = new HashMap<>();
+ for (Entry<String,Integer> e : implType.entrySet()) {
+ implMap.put(e.getValue(), e.getKey());
+ }
+
+ StringBuilder sb = new StringBuilder(1000);
+ for (String benchmarkName : keys) {
+ int i = 0;
+ for (String[] stats : statsMap.get(benchmarkName)) {
+ if (stats.length < 8) {
+ continue;
+ }
+ sb.append(benchmarkName).append(',');
+ sb.append(implMap.get(i++)).append(',');
+ sb.append(stats[7].trim().split("=|/")[1].trim());
+ sb.append('\n');
+ }
+ }
+ sb.append('\n');
+ return sb.toString();
+ }
+
+ @Override
+ public String toString() {
+ int pad = 24;
+ StringBuilder sb = new StringBuilder(1000);
+ sb.append(StringUtils.rightPad("BenchMarks", pad));
+ for (int i = 0; i < implType.size(); i++) {
+ for (Entry<String,Integer> e : implType.entrySet()) {
+ if (e.getValue() == i) {
+ sb.append(StringUtils.rightPad(e.getKey(), pad).substring(0, pad));
+ break;
+ }
+ }
+ }
+ sb.append('\n');
+ List<String> keys = new ArrayList<>(statsMap.keySet());
+ Collections.sort(keys);
+ for (String benchmarkName : keys) {
+ List<String[]> implTokenizedStats = statsMap.get(benchmarkName);
+ int maxStats = 0;
+ for (String[] stat : implTokenizedStats) {
+ maxStats = Math.max(maxStats, stat.length);
+ }
+
+ for (int i = 0; i < maxStats; i++) {
+ boolean printedName = false;
+ for (String[] stats : implTokenizedStats) {
+ if (i == 0 && !printedName) {
+ sb.append(StringUtils.rightPad(benchmarkName, pad));
+ printedName = true;
+ } else if (!printedName) {
+ printedName = true;
+ sb.append(StringUtils.rightPad("", pad));
+ }
+ if (stats.length > i) {
+ sb.append(StringUtils.rightPad(stats[i], pad));
+ } else {
+ sb.append(StringUtils.rightPad("", pad));
+ }
+
+ }
+ sb.append('\n');
+ }
+ sb.append('\n');
+ }
+ return sb.toString();
+ }
+
+ public BenchmarkRunner getRunner() {
+ return runner;
+ }
+}
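
For reference, the suite can be driven either from the command line (e.g. --vectorSize 100000 --numNonZero 100 --numVectors 25 --numOps 10, matching the options parsed in main above) or programmatically, mirroring runBenchmark(). A minimal sketch with illustrative sizes (the class name is hypothetical; it must live in the same package since asCsvString() is private and the benchmark fields are package-private):

    package org.apache.mahout.benchmark;

    public class BenchmarkSketch {
      public static void main(String[] args) {
        // cardinality, numNonZeros, numVectors, numClusters, opsPerUnit
        VectorBenchmarks mark = new VectorBenchmarks(100000, 100, 25, 0, 10);
        mark.createData();                 // required before any benchmark runs
        mark.createBenchmark();
        new DotBenchmark(mark).benchmark();
        System.out.println(mark);          // padded stats table from toString()
      }
    }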
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/MySQLBooleanPrefJDBCDataModel.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/MySQLBooleanPrefJDBCDataModel.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/MySQLBooleanPrefJDBCDataModel.java
new file mode 100644
index 0000000..3e9de2c
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/MySQLBooleanPrefJDBCDataModel.java
@@ -0,0 +1,161 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.model.jdbc;
+
+import javax.sql.DataSource;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.jdbc.AbstractJDBCComponent;
+
+/**
+ * <p>
+ * See also {@link MySQLJDBCDataModel} -- same except deals with a table without preference info:
+ * </p>
+ *
+ * <p>
+ *
+ * <pre>
+ * CREATE TABLE taste_preferences (
+ * user_id BIGINT NOT NULL,
+ * item_id BIGINT NOT NULL,
+ * PRIMARY KEY (user_id, item_id),
+ * INDEX (user_id),
+ * INDEX (item_id)
+ * )
+ * </pre>
+ *
+ * </p>
+ */
+public class MySQLBooleanPrefJDBCDataModel extends AbstractBooleanPrefJDBCDataModel {
+
+ /**
+ * <p>
+ * Creates a {@link MySQLBooleanPrefJDBCDataModel} using the default {@link javax.sql.DataSource} (named
+ * {@link #DEFAULT_DATASOURCE_NAME}) and default table/column names.
+ * </p>
+ *
+ * @throws org.apache.mahout.cf.taste.common.TasteException
+ * if {@link javax.sql.DataSource} can't be found
+ */
+ public MySQLBooleanPrefJDBCDataModel() throws TasteException {
+ this(DEFAULT_DATASOURCE_NAME);
+ }
+
+ /**
+ * <p>
+ * Creates a {@link MySQLBooleanPrefJDBCDataModel} using the default {@link javax.sql.DataSource} found
+ * under the given name, and using default table/column names.
+ * </p>
+ *
+ * @param dataSourceName
+ * name of {@link javax.sql.DataSource} to look up
+ * @throws org.apache.mahout.cf.taste.common.TasteException
+ * if {@link javax.sql.DataSource} can't be found
+ */
+ public MySQLBooleanPrefJDBCDataModel(String dataSourceName) throws TasteException {
+ this(AbstractJDBCComponent.lookupDataSource(dataSourceName),
+ DEFAULT_PREFERENCE_TABLE,
+ DEFAULT_USER_ID_COLUMN,
+ DEFAULT_ITEM_ID_COLUMN,
+ DEFAULT_PREFERENCE_TIME_COLUMN);
+ }
+
+ /**
+ * <p>
+ * Creates a {@link MySQLBooleanPrefJDBCDataModel} using the given {@link javax.sql.DataSource} and default
+ * table/column names.
+ * </p>
+ *
+ * @param dataSource
+ * {@link javax.sql.DataSource} to use
+ */
+ public MySQLBooleanPrefJDBCDataModel(DataSource dataSource) {
+ this(dataSource,
+ DEFAULT_PREFERENCE_TABLE,
+ DEFAULT_USER_ID_COLUMN,
+ DEFAULT_ITEM_ID_COLUMN,
+ DEFAULT_PREFERENCE_TIME_COLUMN);
+ }
+
+ /**
+ * <p>
+ * Creates a {@link MySQLBooleanPrefJDBCDataModel} using the given {@link javax.sql.DataSource} and the given
+ * table/column names.
+ * </p>
+ *
+ * @param dataSource
+ * {@link javax.sql.DataSource} to use
+ * @param preferenceTable
+ * name of table containing preference data
+ * @param userIDColumn
+ * user ID column name
+ * @param itemIDColumn
+ * item ID column name
+ * @param timestampColumn timestamp column name (may be null)
+ */
+ public MySQLBooleanPrefJDBCDataModel(DataSource dataSource,
+ String preferenceTable,
+ String userIDColumn,
+ String itemIDColumn,
+ String timestampColumn) {
+ super(dataSource, preferenceTable, userIDColumn, itemIDColumn,
+ NO_SUCH_COLUMN,
+ // getPreferenceSQL
+ "SELECT 1 FROM " + preferenceTable + " WHERE " + userIDColumn + "=? AND " + itemIDColumn + "=?",
+ // getPreferenceTimeSQL
+ "SELECT " + timestampColumn + " FROM " + preferenceTable + " WHERE " + userIDColumn + "=? AND "
+ + itemIDColumn + "=?",
+ // getUserSQL
+ "SELECT DISTINCT " + userIDColumn + ", " + itemIDColumn + " FROM " + preferenceTable + " WHERE "
+ + userIDColumn + "=?",
+ // getAllUsersSQL
+ "SELECT DISTINCT " + userIDColumn + ", " + itemIDColumn + " FROM " + preferenceTable + " ORDER BY "
+ + userIDColumn,
+ // getNumItemsSQL
+ "SELECT COUNT(DISTINCT " + itemIDColumn + ") FROM " + preferenceTable,
+ // getNumUsersSQL
+ "SELECT COUNT(DISTINCT " + userIDColumn + ") FROM " + preferenceTable,
+ // setPreferenceSQL
+ "INSERT IGNORE INTO " + preferenceTable + '(' + userIDColumn + ',' + itemIDColumn + ") VALUES (?,?)",
+ // removePreference SQL
+ "DELETE FROM " + preferenceTable + " WHERE " + userIDColumn + "=? AND " + itemIDColumn + "=?",
+ // getUsersSQL
+ "SELECT DISTINCT " + userIDColumn + " FROM " + preferenceTable + " ORDER BY " + userIDColumn,
+ // getItemsSQL
+ "SELECT DISTINCT " + itemIDColumn + " FROM " + preferenceTable + " ORDER BY " + itemIDColumn,
+ // getPrefsForItemSQL
+ "SELECT DISTINCT " + userIDColumn + ", " + itemIDColumn + " FROM " + preferenceTable + " WHERE "
+ + itemIDColumn + "=? ORDER BY " + userIDColumn,
+ // getNumPreferenceForItemSQL
+ "SELECT COUNT(1) FROM " + preferenceTable + " WHERE " + itemIDColumn + "=?",
+ // getNumPreferenceForItemsSQL
+ "SELECT COUNT(1) FROM " + preferenceTable + " tp1 JOIN " + preferenceTable + " tp2 " + "USING ("
+ + userIDColumn + ") WHERE tp1." + itemIDColumn + "=? and tp2." + itemIDColumn + "=?",
+ // getMaxPreferenceSQL
+ "SELECT 1.0",
+ // getMinPreferenceSQL
+ "SELECT 1.0");
+ }
+
+ @Override
+ protected int getFetchSize() {
+ // Need to return this for MySQL Connector/J to make it use streaming mode
+ return Integer.MIN_VALUE;
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/MySQLJDBCDataModel.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/MySQLJDBCDataModel.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/MySQLJDBCDataModel.java
new file mode 100644
index 0000000..9904c7e
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/MySQLJDBCDataModel.java
@@ -0,0 +1,247 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.model.jdbc;
+
+import javax.sql.DataSource;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.jdbc.AbstractJDBCComponent;
+
+/**
+ * <p>
+ * A {@link org.apache.mahout.cf.taste.model.JDBCDataModel} backed by a MySQL database and
+ * accessed via JDBC. It may work with other JDBC databases. By default, this class
+ * assumes that there is a {@link DataSource} available under the JNDI name
+ * "jdbc/taste", which gives access to a database with a "taste_preferences" table with the following schema:
+ * </p>
+ *
+ * <table>
+ * <tr>
+ * <th>user_id</th>
+ * <th>item_id</th>
+ * <th>preference</th>
+ * </tr>
+ * <tr>
+ * <td>987</td>
+ * <td>123</td>
+ * <td>0.9</td>
+ * </tr>
+ * <tr>
+ * <td>987</td>
+ * <td>456</td>
+ * <td>0.1</td>
+ * </tr>
+ * <tr>
+ * <td>654</td>
+ * <td>123</td>
+ * <td>0.2</td>
+ * </tr>
+ * <tr>
+ * <td>654</td>
+ * <td>789</td>
+ * <td>0.3</td>
+ * </tr>
+ * </table>
+ *
+ * <p>
+ * {@code preference} must have a type compatible with the Java {@code float} type.
+ * {@code user_id} and {@code item_id} should be compatible with long type (BIGINT). For example,
+ * the following command sets up a suitable table in MySQL, complete with primary key and indexes:
+ * </p>
+ *
+ * <p>
+ *
+ * <pre>
+ * CREATE TABLE taste_preferences (
+ * user_id BIGINT NOT NULL,
+ * item_id BIGINT NOT NULL,
+ * preference FLOAT NOT NULL,
+ * PRIMARY KEY (user_id, item_id),
+ * INDEX (user_id),
+ * INDEX (item_id)
+ * )
+ * </pre>
+ *
+ * </p>
+ *
+ * <p>The table may optionally have a {@code timestamp} column whose type is compatible with Java
+ * {@code long}.</p>
+ *
+ * <h3>Performance Notes</h3>
+ *
+ * <p>
+ * See the notes in {@link AbstractJDBCDataModel} regarding using connection pooling. It's pretty vital to
+ * performance.
+ * </p>
+ *
+ * <p>
+ * Some experimentation suggests that MySQL's InnoDB engine is faster than MyISAM for these kinds of
+ * applications. While MyISAM is the default and, I believe, generally considered the lighter-weight and
+ * faster of the two engines, my guess is the row-level locking of InnoDB helps here. Your mileage may vary.
+ * </p>
+ *
+ * <p>
+ * Here are some key settings that can be tuned for MySQL, and suggested size for a data set of around 1
+ * million elements:
+ * </p>
+ *
+ * <ul>
+ * <li>innodb_buffer_pool_size=64M</li>
+ * <li>myisam_sort_buffer_size=64M</li>
+ * <li>query_cache_limit=64M</li>
+ * <li>query_cache_min_res_unit=512K</li>
+ * <li>query_cache_type=1</li>
+ * <li>query_cache_size=64M</li>
+ * </ul>
+ *
+ * <p>
+ * Also consider setting some parameters on the MySQL Connector/J driver:
+ * </p>
+ *
+ * <pre>
+ * cachePreparedStatements = true
+ * cachePrepStmts = true
+ * cacheResultSetMetadata = true
+ * alwaysSendSetIsolation = false
+ * elideSetAutoCommits = true
+ * </pre>
+ *
+ * <p>
+ * Thanks to Amila Jayasooriya for contributing MySQL notes above as part of Google Summer of Code 2007.
+ * </p>
+ */
+public class MySQLJDBCDataModel extends AbstractJDBCDataModel {
+
+ /**
+ * <p>
+ * Creates a {@link MySQLJDBCDataModel} using the default {@link DataSource} (named
+ * {@link #DEFAULT_DATASOURCE_NAME}) and default table/column names.
+ * </p>
+ *
+ * @throws TasteException
+ * if {@link DataSource} can't be found
+ */
+ public MySQLJDBCDataModel() throws TasteException {
+ this(DEFAULT_DATASOURCE_NAME);
+ }
+
+ /**
+ * <p>
+ * Creates a {@link MySQLJDBCDataModel} using the default {@link DataSource} found under the given name, and
+ * using default table/column names.
+ * </p>
+ *
+ * @param dataSourceName
+ * name of {@link DataSource} to look up
+ * @throws TasteException
+ * if {@link DataSource} can't be found
+ */
+ public MySQLJDBCDataModel(String dataSourceName) throws TasteException {
+ this(AbstractJDBCComponent.lookupDataSource(dataSourceName),
+ DEFAULT_PREFERENCE_TABLE,
+ DEFAULT_USER_ID_COLUMN,
+ DEFAULT_ITEM_ID_COLUMN,
+ DEFAULT_PREFERENCE_COLUMN,
+ DEFAULT_PREFERENCE_TIME_COLUMN);
+ }
+
+ /**
+ * <p>
+ * Creates a {@link MySQLJDBCDataModel} using the given {@link DataSource} and default table/column names.
+ * </p>
+ *
+ * @param dataSource
+ * {@link DataSource} to use
+ */
+ public MySQLJDBCDataModel(DataSource dataSource) {
+ this(dataSource,
+ DEFAULT_PREFERENCE_TABLE,
+ DEFAULT_USER_ID_COLUMN,
+ DEFAULT_ITEM_ID_COLUMN,
+ DEFAULT_PREFERENCE_COLUMN,
+ DEFAULT_PREFERENCE_TIME_COLUMN);
+ }
+
+ /**
+ * <p>
+ * Creates a {@link MySQLJDBCDataModel} using the given {@link DataSource} and the given table/column names.
+ * </p>
+ *
+ * @param dataSource
+ * {@link DataSource} to use
+ * @param preferenceTable
+ * name of table containing preference data
+ * @param userIDColumn
+ * user ID column name
+ * @param itemIDColumn
+ * item ID column name
+ * @param preferenceColumn
+ * preference column name
+ * @param timestampColumn timestamp column name (may be null)
+ */
+ public MySQLJDBCDataModel(DataSource dataSource,
+ String preferenceTable,
+ String userIDColumn,
+ String itemIDColumn,
+ String preferenceColumn,
+ String timestampColumn) {
+ super(dataSource, preferenceTable, userIDColumn, itemIDColumn, preferenceColumn,
+ // getPreferenceSQL
+ "SELECT " + preferenceColumn + " FROM " + preferenceTable + " WHERE " + userIDColumn + "=? AND "
+ + itemIDColumn + "=?",
+ // getPreferenceTimeSQL
+ "SELECT " + timestampColumn + " FROM " + preferenceTable + " WHERE " + userIDColumn + "=? AND "
+ + itemIDColumn + "=?",
+ // getUserSQL
+ "SELECT DISTINCT " + userIDColumn + ", " + itemIDColumn + ", " + preferenceColumn + " FROM " + preferenceTable
+ + " WHERE " + userIDColumn + "=? ORDER BY " + itemIDColumn,
+ // getAllUsersSQL
+ "SELECT DISTINCT " + userIDColumn + ", " + itemIDColumn + ", " + preferenceColumn + " FROM " + preferenceTable
+ + " ORDER BY " + userIDColumn + ", " + itemIDColumn,
+ // getNumItemsSQL
+ "SELECT COUNT(DISTINCT " + itemIDColumn + ") FROM " + preferenceTable,
+ // getNumUsersSQL
+ "SELECT COUNT(DISTINCT " + userIDColumn + ") FROM " + preferenceTable,
+ // setPreferenceSQL
+ "INSERT INTO " + preferenceTable + '(' + userIDColumn + ',' + itemIDColumn + ',' + preferenceColumn
+ + ") VALUES (?,?,?) ON DUPLICATE KEY UPDATE " + preferenceColumn + "=?",
+ // removePreference SQL
+ "DELETE FROM " + preferenceTable + " WHERE " + userIDColumn + "=? AND " + itemIDColumn + "=?",
+ // getUsersSQL
+ "SELECT DISTINCT " + userIDColumn + " FROM " + preferenceTable + " ORDER BY " + userIDColumn,
+ // getItemsSQL
+ "SELECT DISTINCT " + itemIDColumn + " FROM " + preferenceTable + " ORDER BY " + itemIDColumn,
+ // getPrefsForItemSQL
+ "SELECT DISTINCT " + userIDColumn + ", " + itemIDColumn + ", " + preferenceColumn + " FROM " + preferenceTable
+ + " WHERE " + itemIDColumn + "=? ORDER BY " + userIDColumn,
+ // getNumPreferenceForItemSQL
+ "SELECT COUNT(1) FROM " + preferenceTable + " WHERE " + itemIDColumn + "=?",
+ // getNumPreferenceForItemsSQL
+ "SELECT COUNT(1) FROM " + preferenceTable + " tp1 JOIN " + preferenceTable + " tp2 " + "USING ("
+ + userIDColumn + ") WHERE tp1." + itemIDColumn + "=? and tp2." + itemIDColumn + "=?",
+ "SELECT MAX(" + preferenceColumn + ") FROM " + preferenceTable,
+ "SELECT MIN(" + preferenceColumn + ") FROM " + preferenceTable);
+ }
+
+ @Override
+ protected int getFetchSize() {
+ // Need to return this for MySQL Connector/J to make it use streaming mode
+ return Integer.MIN_VALUE;
+ }
+
+}
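
A usage sketch for the model above, wiring it into a classic Taste user-based recommender. Assumptions: Connector/J 5.x's MysqlDataSource (the class moved in Connector/J 8) and the taste_preferences schema from the Javadoc; in production the DataSource should be wrapped in a connection pool, per the performance notes above.

    import com.mysql.jdbc.jdbc2.optional.MysqlDataSource;
    import org.apache.mahout.cf.taste.impl.model.jdbc.MySQLJDBCDataModel;
    import org.apache.mahout.cf.taste.impl.neighborhood.NearestNUserNeighborhood;
    import org.apache.mahout.cf.taste.impl.recommender.GenericUserBasedRecommender;
    import org.apache.mahout.cf.taste.impl.similarity.PearsonCorrelationSimilarity;
    import org.apache.mahout.cf.taste.model.DataModel;
    import org.apache.mahout.cf.taste.recommender.RecommendedItem;
    import org.apache.mahout.cf.taste.recommender.Recommender;

    public class RecommenderSketch {
      public static void main(String[] args) throws Exception {
        MysqlDataSource dataSource = new MysqlDataSource(); // assumption: Connector/J 5.x
        dataSource.setServerName("localhost");
        dataSource.setDatabaseName("taste");
        dataSource.setUser("user");
        dataSource.setPassword("password");

        DataModel model = new MySQLJDBCDataModel(dataSource); // default table/column names
        PearsonCorrelationSimilarity similarity = new PearsonCorrelationSimilarity(model);
        NearestNUserNeighborhood neighborhood = new NearestNUserNeighborhood(10, similarity, model);
        Recommender recommender = new GenericUserBasedRecommender(model, neighborhood, similarity);

        for (RecommendedItem item : recommender.recommend(987L, 3)) { // top 3 for user 987
          System.out.println(item.getItemID() + " : " + item.getValue());
        }
      }
    }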

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/PostgreSQLBooleanPrefJDBCDataModel.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/PostgreSQLBooleanPrefJDBCDataModel.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/PostgreSQLBooleanPrefJDBCDataModel.java
new file mode 100644
index 0000000..6dda281
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/PostgreSQLBooleanPrefJDBCDataModel.java
@@ -0,0 +1,146 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.model.jdbc;
+
+import com.google.common.base.Preconditions;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.common.IOUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import javax.sql.DataSource;
+import java.sql.Connection;
+import java.sql.PreparedStatement;
+import java.sql.SQLException;
+
+/**
+ * <p>
+ * See also {@link org.apache.mahout.cf.taste.impl.model.jdbc.PostgreSQLJDBCDataModel} --
+ * same except deals with a table without preference info:
+ * </p>
+ *
+ * <p>
+ *
+ * <pre>
+ * CREATE TABLE taste_preferences (
+ * user_id BIGINT NOT NULL,
+ * item_id BIGINT NOT NULL,
+ * PRIMARY KEY (user_id, item_id)
+ * );
+ * CREATE INDEX taste_preferences_user_id_index ON taste_preferences (user_id);
+ * CREATE INDEX taste_preferences_item_id_index ON taste_preferences (item_id);
+ * </pre>
+ *
+ * </p>
+ *
+ * @see PostgreSQLJDBCDataModel
+ */
+public class PostgreSQLBooleanPrefJDBCDataModel extends SQL92BooleanPrefJDBCDataModel {
+
+ private static final Logger log = LoggerFactory.getLogger(PostgreSQLBooleanPrefJDBCDataModel.class);
+
+ private static final String POSTGRESQL_DUPLICATE_KEY_STATE = "23505"; // this is brittle...
+
+ /**
+ * <p>
+ * Creates a {@link PostgreSQLBooleanPrefJDBCDataModel} using the default {@link javax.sql.DataSource} (named
+ * {@link #DEFAULT_DATASOURCE_NAME}) and default table/column names.
+ * </p>
+ *
+ * @throws org.apache.mahout.cf.taste.common.TasteException
+ * if {@link javax.sql.DataSource} can't be found
+ */
+ public PostgreSQLBooleanPrefJDBCDataModel() throws TasteException {
+ }
+
+ /**
+ * <p>
+ * Creates a {@link PostgreSQLBooleanPrefJDBCDataModel} using the default {@link javax.sql.DataSource} found
+ * under the given name, and using default table/column names.
+ * </p>
+ *
+ * @param dataSourceName name of {@link javax.sql.DataSource} to look up
+ * @throws org.apache.mahout.cf.taste.common.TasteException
+ * if {@link javax.sql.DataSource} can't be found
+ */
+ public PostgreSQLBooleanPrefJDBCDataModel(String dataSourceName) throws TasteException {
+ super(dataSourceName);
+ }
+
+ /**
+ * <p>
+ * Creates a {@link PostgreSQLBooleanPrefJDBCDataModel} using the given {@link javax.sql.DataSource} and default
+ * table/column names.
+ * </p>
+ *
+ * @param dataSource {@link javax.sql.DataSource} to use
+ */
+ public PostgreSQLBooleanPrefJDBCDataModel(DataSource dataSource) {
+ super(dataSource);
+ }
+
+ /**
+ * <p>
+ * Creates a {@link PostgreSQLBooleanPrefJDBCDataModel} using the given {@link javax.sql.DataSource} and the given
+ * table/column names.
+ * </p>
+ *
+ * @param dataSource {@link javax.sql.DataSource} to use
+ * @param preferenceTable name of table containing preference data
+ * @param userIDColumn user ID column name
+ * @param itemIDColumn item ID column name
+ * @param timestampColumn timestamp column name (may be null)
+ */
+ public PostgreSQLBooleanPrefJDBCDataModel(DataSource dataSource,
+ String preferenceTable,
+ String userIDColumn,
+ String itemIDColumn,
+ String timestampColumn) {
+ super(dataSource, preferenceTable, userIDColumn, itemIDColumn, timestampColumn);
+ }
+
+ /**
+ * Override since PostgreSQL doesn't have the non-standard capability that MySQL has to optionally
+ * ignore an insert that fails because the row already exists.
+ */
+ @Override
+ public void setPreference(long userID, long itemID, float value) throws TasteException {
+ Preconditions.checkArgument(!Float.isNaN(value), "NaN value");
+ log.debug("Setting preference for user {}, item {}", userID, itemID);
+
+ String setPreferenceSQL = getSetPreferenceSQL();
+ Connection conn = null;
+ PreparedStatement stmt = null;
+ try {
+ conn = getDataSource().getConnection();
+ stmt = conn.prepareStatement(setPreferenceSQL);
+ setLongParameter(stmt, 1, userID);
+ setLongParameter(stmt, 2, itemID);
+ log.debug("Executing SQL update: {}", setPreferenceSQL);
+ stmt.executeUpdate();
+ } catch (SQLException sqle) {
+ if (!POSTGRESQL_DUPLICATE_KEY_STATE.equals(sqle.getSQLState())) {
+ log.warn("Exception while setting preference", sqle);
+ throw new TasteException(sqle);
+ }
+ } finally {
+ IOUtils.quietClose(null, stmt, conn);
+ }
+ }
+
+}
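
For reference, a minimal usage sketch of the class above (not part of this commit). The
PGSimpleDataSource wiring, connection details, and IDs are illustrative assumptions; any
configured javax.sql.DataSource works.

import org.apache.mahout.cf.taste.common.TasteException;
import org.apache.mahout.cf.taste.impl.model.jdbc.PostgreSQLBooleanPrefJDBCDataModel;
import org.apache.mahout.cf.taste.model.DataModel;
import org.postgresql.ds.PGSimpleDataSource;

public class BooleanPrefModelSketch {
  public static void main(String[] args) throws TasteException {
    // Hypothetical connection details; adjust to the actual database.
    PGSimpleDataSource dataSource = new PGSimpleDataSource();
    dataSource.setServerName("localhost");
    dataSource.setDatabaseName("taste");
    dataSource.setUser("taste");
    dataSource.setPassword("secret");

    // Default table ("taste_preferences") and column names, per the class javadoc.
    DataModel model = new PostgreSQLBooleanPrefJDBCDataModel(dataSource);

    // The value is not stored for boolean preferences: the insert records only
    // (user_id, item_id), and a duplicate key (SQLSTATE 23505) is silently ignored.
    model.setPreference(1L, 42L, 1.0f);
  }
}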

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/PostgreSQLJDBCDataModel.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/PostgreSQLJDBCDataModel.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/PostgreSQLJDBCDataModel.java
new file mode 100644
index 0000000..b838430
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/PostgreSQLJDBCDataModel.java
@@ -0,0 +1,172 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.model.jdbc;
+
+import com.google.common.base.Preconditions;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.common.IOUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import javax.sql.DataSource;
+import java.sql.Connection;
+import java.sql.PreparedStatement;
+import java.sql.SQLException;
+
+/**
+ * <p>
+ * A {@link org.apache.mahout.cf.taste.model.JDBCDataModel} backed by a PostgreSQL database and
+ * accessed via JDBC. It may work with other JDBC databases. By default, this class
+ * assumes that there is a {@link javax.sql.DataSource} available under the JNDI name
+ * "jdbc/taste", which gives access to a database with a "taste_preferences" table with the following schema:
+ * </p>
+ *
+ * <p>
+ *
+ * <pre>
+ * CREATE TABLE taste_preferences (
+ * user_id BIGINT NOT NULL,
+ * item_id BIGINT NOT NULL,
+ * preference REAL NOT NULL,
+ * PRIMARY KEY (user_id, item_id)
+ * );
+ * CREATE INDEX taste_preferences_user_id_index ON taste_preferences (user_id);
+ * CREATE INDEX taste_preferences_item_id_index ON taste_preferences (item_id);
+ * </pre>
+ *
+ * </p>
+ *
+ * @see PostgreSQLBooleanPrefJDBCDataModel
+ */
+public class PostgreSQLJDBCDataModel extends SQL92JDBCDataModel {
+
+ private static final Logger log = LoggerFactory.getLogger(PostgreSQLJDBCDataModel.class);
+
+ private static final String POSTGRESQL_DUPLICATE_KEY_STATE = "23505"; // this is brittle...
+
+ /**
+ * <p>
+ * Creates a PostgreSQLJDBCDataModel using the default {@link javax.sql.DataSource} (named
+ * {@link #DEFAULT_DATASOURCE_NAME}) and default table/column names.
+ * </p>
+ *
+ * @throws org.apache.mahout.cf.taste.common.TasteException
+ * if {@link javax.sql.DataSource} can't be found
+ */
+ public PostgreSQLJDBCDataModel() throws TasteException {
+ }
+
+ /**
+ * <p>
+ * Creates a PostgreSQLJDBCDataModel using the {@link javax.sql.DataSource} found under the given name, and
+ * using default table/column names.
+ * </p>
+ *
+ * @param dataSourceName name of {@link javax.sql.DataSource} to look up
+ * @throws org.apache.mahout.cf.taste.common.TasteException
+ * if {@link javax.sql.DataSource} can't be found
+ */
+ public PostgreSQLJDBCDataModel(String dataSourceName) throws TasteException {
+ super(dataSourceName);
+ }
+
+ /**
+ * <p>
+ * Creates a PostgreSQLJDBCDataModel using the given {@link javax.sql.DataSource} and default table/column names.
+ * </p>
+ *
+ * @param dataSource {@link javax.sql.DataSource} to use
+ */
+ public PostgreSQLJDBCDataModel(DataSource dataSource) {
+ super(dataSource);
+ }
+
+ /**
+ * <p>
+ * Creates a PostgreSQLJDBCDataModel using the given {@link javax.sql.DataSource} and the given table/column names.
+ * </p>
+ *
+ * @param dataSource {@link javax.sql.DataSource} to use
+ * @param preferenceTable name of table containing preference data
+ * @param userIDColumn user ID column name
+ * @param itemIDColumn item ID column name
+ * @param preferenceColumn preference column name
+ * @param timestampColumn timestamp column name (may be null)
+ */
+ public PostgreSQLJDBCDataModel(DataSource dataSource,
+ String preferenceTable,
+ String userIDColumn,
+ String itemIDColumn,
+ String preferenceColumn,
+ String timestampColumn) {
+ super(dataSource, preferenceTable, userIDColumn, itemIDColumn, preferenceColumn, timestampColumn);
+ }
+
+ /**
+ * Override since PostgreSQL doesn't have the non-standard capability that MySQL has to optionally
+ * insert or update in one statement: an insert is attempted, a duplicate-key failure is tolerated, and an update then runs.
+ */
+ @Override
+ public void setPreference(long userID, long itemID, float value) throws TasteException {
+ Preconditions.checkArgument(!Float.isNaN(value), "NaN value");
+
+ log.debug("Setting preference for user {}, item {}", userID, itemID);
+
+ String setPreferenceSQL = getSetPreferenceSQL();
+
+ Connection conn = null;
+ PreparedStatement stmt1 = null;
+ PreparedStatement stmt2 = null;
+ try {
+ conn = getDataSource().getConnection();
+
+ stmt1 = conn.prepareStatement(setPreferenceSQL);
+ setLongParameter(stmt1, 1, userID);
+ setLongParameter(stmt1, 2, itemID);
+ stmt1.setDouble(3, value);
+
+ log.debug("Executing SQL update: {}", setPreferenceSQL);
+ try {
+ stmt1.executeUpdate();
+ } catch (SQLException sqle) {
+ if (!POSTGRESQL_DUPLICATE_KEY_STATE.equals(sqle.getSQLState())) {
+ throw sqle;
+ }
+ }
+
+ // Update unconditionally: either the insert just created the row, or the key already existed and the duplicate was ignored above
+
+ stmt2 = conn.prepareStatement(getUpdatePreferenceSQL());
+ stmt2.setDouble(1, value);
+ setLongParameter(stmt2, 2, userID);
+ setLongParameter(stmt2, 3, itemID);
+
+ log.debug("Executing SQL update: {}", getUpdatePreferenceSQL());
+ stmt2.executeUpdate();
+
+ } catch (SQLException sqle) {
+ log.warn("Exception while setting preference", sqle);
+ throw new TasteException(sqle);
+ } finally {
+ IOUtils.quietClose(null, stmt1, null);
+ IOUtils.quietClose(null, stmt2, null);
+ IOUtils.quietClose(null, null, conn);
+ }
+ }
+
+}
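
The setPreference() override above emulates an upsert, since neither SQL92 nor (older)
PostgreSQL offers MySQL's ON DUPLICATE KEY UPDATE. Here is a standalone sketch of the same
pattern in plain JDBC; the table and column names follow the javadoc schema, and the
DataSource is assumed to be configured elsewhere.

import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import javax.sql.DataSource;

public final class UpsertSketch {

  private static final String DUPLICATE_KEY_STATE = "23505"; // PostgreSQL unique violation

  public static void upsert(DataSource ds, long userID, long itemID, float value)
      throws SQLException {
    try (Connection conn = ds.getConnection()) {
      // Attempt the insert first; tolerate only a duplicate-key failure.
      try (PreparedStatement insert = conn.prepareStatement(
          "INSERT INTO taste_preferences (user_id, item_id, preference) VALUES (?,?,?)")) {
        insert.setLong(1, userID);
        insert.setLong(2, itemID);
        insert.setDouble(3, value);
        insert.executeUpdate();
      } catch (SQLException sqle) {
        if (!DUPLICATE_KEY_STATE.equals(sqle.getSQLState())) {
          throw sqle;
        }
      }
      // The row now exists either way; make sure it carries the requested value.
      try (PreparedStatement update = conn.prepareStatement(
          "UPDATE taste_preferences SET preference=? WHERE user_id=? AND item_id=?")) {
        update.setDouble(1, value);
        update.setLong(2, userID);
        update.setLong(3, itemID);
        update.executeUpdate();
      }
    }
  }
}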

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/ReloadFromJDBCDataModel.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/ReloadFromJDBCDataModel.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/ReloadFromJDBCDataModel.java
new file mode 100644
index 0000000..0827416
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/ReloadFromJDBCDataModel.java
@@ -0,0 +1,178 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.model.jdbc;
+
+import com.google.common.base.Preconditions;
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.impl.common.RefreshHelper;
+import org.apache.mahout.cf.taste.impl.model.GenericBooleanPrefDataModel;
+import org.apache.mahout.cf.taste.impl.model.GenericDataModel;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.JDBCDataModel;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.Collection;
+import java.util.concurrent.Callable;
+
+/**
+ * A {@link DataModel} which loads, and can re-load, data from a JDBC-backed {@link JDBCDataModel} into memory, as a
+ * {@link GenericDataModel} or {@link GenericBooleanPrefDataModel}. It is intended to provide the speed
+ * advantage of an in-memory representation while remaining able to pull in new data from the database periodically.
+ */
+public final class ReloadFromJDBCDataModel implements DataModel {
+
+ private static final Logger log = LoggerFactory.getLogger(ReloadFromJDBCDataModel.class);
+
+ private DataModel delegateInMemory;
+ private final JDBCDataModel delegate;
+ private final RefreshHelper refreshHelper;
+
+ public ReloadFromJDBCDataModel(JDBCDataModel delegate) throws TasteException {
+ this.delegate = Preconditions.checkNotNull(delegate);
+ refreshHelper = new RefreshHelper(new Callable<Void>() {
+ @Override
+ public Void call() {
+ reload();
+ return null;
+ }
+ });
+ refreshHelper.addDependency(delegate);
+ reload();
+ if (delegateInMemory == null) {
+ throw new TasteException("Failed to load data into memory");
+ }
+ }
+
+ @Override
+ public void refresh(Collection<Refreshable> alreadyRefreshed) {
+ refreshHelper.refresh(alreadyRefreshed);
+ }
+
+ private void reload() {
+ try {
+ // Load new in-memory representation,
+ log.info("Loading new JDBC delegate data...");
+ DataModel newDelegateInMemory =
+ delegate.hasPreferenceValues()
+ ? new GenericDataModel(delegate.exportWithPrefs())
+ : new GenericBooleanPrefDataModel(delegate.exportWithIDsOnly());
+ // and then swap to it.
+ log.info("New data loaded.");
+ delegateInMemory = newDelegateInMemory;
+ } catch (TasteException te) {
+ log.warn("Error while reloading JDBC delegate data", te);
+ // But continue with whatever is loaded
+ }
+ }
+
+ public JDBCDataModel getDelegate() {
+ return delegate;
+ }
+
+ public DataModel getDelegateInMemory() {
+ return delegateInMemory;
+ }
+
+ // Delegated methods:
+
+ @Override
+ public LongPrimitiveIterator getUserIDs() throws TasteException {
+ return delegateInMemory.getUserIDs();
+ }
+
+ @Override
+ public PreferenceArray getPreferencesFromUser(long id) throws TasteException {
+ return delegateInMemory.getPreferencesFromUser(id);
+ }
+
+ @Override
+ public FastIDSet getItemIDsFromUser(long id) throws TasteException {
+ return delegateInMemory.getItemIDsFromUser(id);
+ }
+
+ @Override
+ public Float getPreferenceValue(long userID, long itemID) throws TasteException {
+ return delegateInMemory.getPreferenceValue(userID, itemID);
+ }
+
+ @Override
+ public Long getPreferenceTime(long userID, long itemID) throws TasteException {
+ return delegateInMemory.getPreferenceTime(userID, itemID);
+ }
+
+ @Override
+ public LongPrimitiveIterator getItemIDs() throws TasteException {
+ return delegateInMemory.getItemIDs();
+ }
+
+ @Override
+ public PreferenceArray getPreferencesForItem(long itemID) throws TasteException {
+ return delegateInMemory.getPreferencesForItem(itemID);
+ }
+
+ @Override
+ public int getNumItems() throws TasteException {
+ return delegateInMemory.getNumItems();
+ }
+
+ @Override
+ public int getNumUsers() throws TasteException {
+ return delegateInMemory.getNumUsers();
+ }
+
+ @Override
+ public int getNumUsersWithPreferenceFor(long itemID) throws TasteException {
+ return delegateInMemory.getNumUsersWithPreferenceFor(itemID);
+ }
+
+ @Override
+ public int getNumUsersWithPreferenceFor(long itemID1, long itemID2) throws TasteException {
+ return delegateInMemory.getNumUsersWithPreferenceFor(itemID1, itemID2);
+ }
+
+ @Override
+ public void setPreference(long userID, long itemID, float value) throws TasteException {
+ delegateInMemory.setPreference(userID, itemID, value);
+ }
+
+ @Override
+ public void removePreference(long userID, long itemID) throws TasteException {
+ delegateInMemory.removePreference(userID, itemID);
+ }
+
+ @Override
+ public boolean hasPreferenceValues() {
+ return delegateInMemory.hasPreferenceValues();
+ }
+
+ @Override
+ public float getMaxPreference() {
+ return delegateInMemory.getMaxPreference();
+ }
+
+ @Override
+ public float getMinPreference() {
+ return delegateInMemory.getMinPreference();
+ }
+
+}
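
A usage sketch for the reloading wrapper above (not part of this commit): serve from the
in-memory copy and pull in new data on a schedule. The no-arg PostgreSQLJDBCDataModel
constructor (which looks up the JNDI name "jdbc/taste") and the 30-minute period are
assumptions.

import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

import org.apache.mahout.cf.taste.common.TasteException;
import org.apache.mahout.cf.taste.impl.model.jdbc.PostgreSQLJDBCDataModel;
import org.apache.mahout.cf.taste.impl.model.jdbc.ReloadFromJDBCDataModel;

public class ReloadingModelSketch {
  public static void main(String[] args) throws TasteException {
    // Exports the whole database into memory once at construction.
    ReloadFromJDBCDataModel model =
        new ReloadFromJDBCDataModel(new PostgreSQLJDBCDataModel());

    // refresh() delegates to RefreshHelper, which calls reload() and swaps in
    // the freshly exported in-memory model; a null argument is accepted.
    ScheduledExecutorService scheduler = Executors.newSingleThreadScheduledExecutor();
    scheduler.scheduleAtFixedRate(() -> model.refresh(null), 30, 30, TimeUnit.MINUTES);
  }
}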

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/SQL92BooleanPrefJDBCDataModel.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/SQL92BooleanPrefJDBCDataModel.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/SQL92BooleanPrefJDBCDataModel.java
new file mode 100644
index 0000000..19c575f
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/SQL92BooleanPrefJDBCDataModel.java
@@ -0,0 +1,221 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.model.jdbc;
+
+import com.google.common.base.Preconditions;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.jdbc.AbstractJDBCComponent;
+import org.apache.mahout.common.IOUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import javax.sql.DataSource;
+import java.sql.Connection;
+import java.sql.PreparedStatement;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+
+/**
+ * <p>
+ * See also {@link org.apache.mahout.cf.taste.impl.model.jdbc.SQL92JDBCDataModel} --
+ * the same, except that it deals with a table without preference info:
+ * </p>
+ *
+ * <p>
+ *
+ * <pre>
+ * CREATE TABLE taste_preferences (
+ * user_id BIGINT NOT NULL,
+ * item_id BIGINT NOT NULL,
+ * PRIMARY KEY (user_id, item_id)
+ * );
+ * CREATE INDEX taste_preferences_user_id_index ON taste_preferences (user_id);
+ * CREATE INDEX taste_preferences_item_id_index ON taste_preferences (item_id);
+ * </pre>
+ *
+ * </p>
+ *
+ * @see SQL92JDBCDataModel
+ */
+public class SQL92BooleanPrefJDBCDataModel extends AbstractBooleanPrefJDBCDataModel {
+
+ private static final Logger log = LoggerFactory.getLogger(SQL92BooleanPrefJDBCDataModel.class);
+
+ private final String verifyPreferenceSQL;
+
+ /**
+ * <p>
+ * Creates a SQL92BooleanPrefJDBCDataModel using the default {@link javax.sql.DataSource} (named
+ * {@link #DEFAULT_DATASOURCE_NAME}) and default table/column names.
+ * </p>
+ *
+ * @throws org.apache.mahout.cf.taste.common.TasteException
+ * if {@link javax.sql.DataSource} can't be found
+ */
+ public SQL92BooleanPrefJDBCDataModel() throws TasteException {
+ this(DEFAULT_DATASOURCE_NAME);
+ }
+
+ /**
+ * <p>
+ * Creates a SQL92BooleanPrefJDBCDataModel using the {@link javax.sql.DataSource} found
+ * under the given name, and using default table/column names.
+ * </p>
+ *
+ * @param dataSourceName
+ * name of {@link javax.sql.DataSource} to look up
+ * @throws org.apache.mahout.cf.taste.common.TasteException
+ * if {@link javax.sql.DataSource} can't be found
+ */
+ public SQL92BooleanPrefJDBCDataModel(String dataSourceName) throws TasteException {
+ this(AbstractJDBCComponent.lookupDataSource(dataSourceName),
+ DEFAULT_PREFERENCE_TABLE,
+ DEFAULT_USER_ID_COLUMN,
+ DEFAULT_ITEM_ID_COLUMN,
+ DEFAULT_PREFERENCE_TIME_COLUMN);
+ }
+
+ /**
+ * <p>
+ * Creates a SQL92BooleanPrefJDBCDataModel using the given {@link javax.sql.DataSource} and default
+ * table/column names.
+ * </p>
+ *
+ * @param dataSource
+ * {@link javax.sql.DataSource} to use
+ */
+ public SQL92BooleanPrefJDBCDataModel(DataSource dataSource) {
+ this(dataSource,
+ DEFAULT_PREFERENCE_TABLE,
+ DEFAULT_USER_ID_COLUMN,
+ DEFAULT_ITEM_ID_COLUMN,
+ DEFAULT_PREFERENCE_TIME_COLUMN);
+ }
+
+ /**
+ * <p>
+ * Creates a SQL92BooleanPrefJDBCDataModel using the given {@link javax.sql.DataSource} and the given
+ * table/column names.
+ * </p>
+ *
+ * @param dataSource
+ * {@link javax.sql.DataSource} to use
+ * @param preferenceTable
+ * name of table containing preference data
+ * @param userIDColumn
+ * user ID column name
+ * @param itemIDColumn
+ * item ID column name
+ * @param timestampColumn timestamp column name (may be null)
+ */
+ public SQL92BooleanPrefJDBCDataModel(DataSource dataSource,
+ String preferenceTable,
+ String userIDColumn,
+ String itemIDColumn,
+ String timestampColumn) {
+ super(dataSource, preferenceTable, userIDColumn, itemIDColumn,
+ NO_SUCH_COLUMN,
+ // getPreferenceSQL
+ "SELECT 1 FROM " + preferenceTable + " WHERE " + userIDColumn + "=? AND " + itemIDColumn + "=?",
+ // getPreferenceTimeSQL
+ "SELECT " + timestampColumn + " FROM " + preferenceTable + " WHERE " + userIDColumn + "=? AND "
+ + itemIDColumn + "=?",
+ // getUserSQL
+ "SELECT DISTINCT " + userIDColumn + ", " + itemIDColumn + " FROM " + preferenceTable + " WHERE "
+ + userIDColumn + "=?",
+ // getAllUsersSQL
+ "SELECT DISTINCT " + userIDColumn + ", " + itemIDColumn + " FROM " + preferenceTable + " ORDER BY "
+ + userIDColumn,
+ // getNumItemsSQL
+ "SELECT COUNT(DISTINCT " + itemIDColumn + ") FROM " + preferenceTable,
+ // getNumUsersSQL
+ "SELECT COUNT(DISTINCT " + userIDColumn + ") FROM " + preferenceTable,
+ // setPreferenceSQL
+ "INSERT INTO " + preferenceTable + '(' + userIDColumn + ',' + itemIDColumn + ") VALUES (?,?)",
+ // removePreference SQL
+ "DELETE FROM " + preferenceTable + " WHERE " + userIDColumn + "=? AND " + itemIDColumn + "=?",
+ // getUsersSQL
+ "SELECT DISTINCT " + userIDColumn + " FROM " + preferenceTable + " ORDER BY " + userIDColumn,
+ // getItemsSQL
+ "SELECT DISTINCT " + itemIDColumn + " FROM " + preferenceTable + " ORDER BY " + itemIDColumn,
+ // getPrefsForItemSQL
+ "SELECT DISTINCT " + userIDColumn + ", " + itemIDColumn + " FROM " + preferenceTable + " WHERE "
+ + itemIDColumn + "=? ORDER BY " + userIDColumn,
+ // getNumPreferenceForItemSQL
+ "SELECT COUNT(1) FROM " + preferenceTable + " WHERE " + itemIDColumn + "=?",
+ // getNumPreferenceForItemsSQL
+ "SELECT COUNT(1) FROM " + preferenceTable + " tp1 JOIN " + preferenceTable + " tp2 " + "USING ("
+ + userIDColumn + ") WHERE tp1." + itemIDColumn + "=? and tp2." + itemIDColumn + "=?",
+ // getMaxPreferenceSQL
+ "SELECT 1.0",
+ // getMinPreferenceSQL
+ "SELECT 1.0");
+
+ verifyPreferenceSQL = "SELECT 1 FROM " + preferenceTable + " WHERE " + userIDColumn
+ + "=? AND " + itemIDColumn + "=?";
+ }
+
+ protected String getVerifyPreferenceSQL() {
+ return verifyPreferenceSQL;
+ }
+
+ /**
+ * Override since SQL92 doesn't have the non-standard capability that MySQL has to optionally
+ * ignore an insert that fails because the row already exists; existence is verified first instead.
+ */
+ @Override
+ public void setPreference(long userID, long itemID, float value) throws TasteException {
+ Preconditions.checkArgument(!Float.isNaN(value), "NaN value");
+ log.debug("Setting preference for user {}, item {}", userID, itemID);
+
+ String setPreferenceSQL = getSetPreferenceSQL();
+
+ Connection conn = null;
+ PreparedStatement stmt1 = null;
+ PreparedStatement stmt2 = null;
+ ResultSet rs = null;
+ try {
+ conn = getDataSource().getConnection();
+
+ stmt1 = conn.prepareStatement(verifyPreferenceSQL, ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY);
+ setLongParameter(stmt1, 1, userID);
+ setLongParameter(stmt1, 2, itemID);
+ rs = stmt1.executeQuery();
+
+ // Test whether the record exists already. rs.next() rather than rs.first():
+ // the statement was created TYPE_FORWARD_ONLY, and first() is not legal on a
+ // forward-only result set.
+ if (!rs.next()) {
+ stmt2 = conn.prepareStatement(setPreferenceSQL);
+ setLongParameter(stmt2, 1, userID);
+ setLongParameter(stmt2, 2, itemID);
+ // No value parameter: the boolean-preference insert stores only (user_id, item_id)
+
+ log.debug("Executing SQL update: {}", setPreferenceSQL);
+ stmt2.executeUpdate();
+ }
+ } catch (SQLException sqle) {
+ log.warn("Exception while setting preference", sqle);
+ throw new TasteException(sqle);
+ } finally {
+ IOUtils.quietClose(rs);
+ IOUtils.quietClose(stmt1);
+ IOUtils.quietClose(stmt2);
+ IOUtils.quietClose(conn);
+ }
+ }
+
+}
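
A construction sketch for the class above, mapping it onto a differently named table.
All table and column names here are hypothetical, and the null timestamp column follows
the constructor javadoc's "may be null".

import javax.sql.DataSource;

import org.apache.mahout.cf.taste.impl.model.jdbc.SQL92BooleanPrefJDBCDataModel;
import org.apache.mahout.cf.taste.model.DataModel;

public class CustomSchemaSketch {
  // Maps the model onto a hypothetical "user_likes" table with non-default column names.
  public static DataModel forLikesTable(DataSource dataSource) {
    return new SQL92BooleanPrefJDBCDataModel(
        dataSource,
        "user_likes",   // preferenceTable
        "member_id",    // userIDColumn
        "product_id",   // itemIDColumn
        null);          // timestampColumn (the table keeps no timestamps)
  }
}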

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/SQL92JDBCDataModel.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/SQL92JDBCDataModel.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/SQL92JDBCDataModel.java
new file mode 100644
index 0000000..39de620
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/SQL92JDBCDataModel.java
@@ -0,0 +1,248 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.model.jdbc;
+
+import com.google.common.base.Preconditions;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.jdbc.AbstractJDBCComponent;
+import org.apache.mahout.common.IOUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import javax.sql.DataSource;
+import java.sql.Connection;
+import java.sql.PreparedStatement;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+
+/**
+ * <p>
+ * A {@link org.apache.mahout.cf.taste.model.JDBCDataModel} backed by a SQL92-compatible database and
+ * accessed via JDBC. It should work with most JDBC databases, although it is not optimized for performance.
+ * By default, this class assumes that there is a {@link javax.sql.DataSource} available under the JNDI name
+ * "jdbc/taste", which gives access to a database with a "taste_preferences" table with the following schema:
+ * </p>
+ *
+ * <p>
+ *
+ * <pre>
+ * CREATE TABLE taste_preferences (
+ * user_id BIGINT NOT NULL,
+ * item_id BIGINT NOT NULL,
+ * preference REAL NOT NULL,
+ * PRIMARY KEY (user_id, item_id)
+ * );
+ * CREATE INDEX taste_preferences_user_id_index ON taste_preferences (user_id);
+ * CREATE INDEX taste_preferences_item_id_index ON taste_preferences (item_id);
+ * </pre>
+ *
+ * </p>
+ *
+ * @see SQL92BooleanPrefJDBCDataModel
+ */
+public class SQL92JDBCDataModel extends AbstractJDBCDataModel {
+
+ private static final Logger log = LoggerFactory.getLogger(SQL92JDBCDataModel.class);
+
+ private final String updatePreferenceSQL;
+ private final String verifyPreferenceSQL;
+
+ /**
+ * <p>
+ * Creates a SQL92JDBCDataModel using the default {@link javax.sql.DataSource} (named
+ * {@link #DEFAULT_DATASOURCE_NAME}) and default table/column names.
+ * </p>
+ *
+ * @throws org.apache.mahout.cf.taste.common.TasteException
+ * if {@link javax.sql.DataSource} can't be found
+ */
+ public SQL92JDBCDataModel() throws TasteException {
+ this(DEFAULT_DATASOURCE_NAME);
+ }
+
+ /**
+ * <p>
+ * Creates a SQL92JDBCDataModel using the {@link javax.sql.DataSource} found under the given name, and
+ * using default table/column names.
+ * </p>
+ *
+ * @param dataSourceName
+ * name of {@link javax.sql.DataSource} to look up
+ * @throws org.apache.mahout.cf.taste.common.TasteException
+ * if {@link javax.sql.DataSource} can't be found
+ */
+ public SQL92JDBCDataModel(String dataSourceName) throws TasteException {
+ this(AbstractJDBCComponent.lookupDataSource(dataSourceName),
+ DEFAULT_PREFERENCE_TABLE,
+ DEFAULT_USER_ID_COLUMN,
+ DEFAULT_ITEM_ID_COLUMN,
+ DEFAULT_PREFERENCE_COLUMN,
+ DEFAULT_PREFERENCE_TIME_COLUMN);
+ }
+
+ /**
+ * <p>
+ * Creates a SQL92JDBCDataModel using the given {@link javax.sql.DataSource} and default table/column names.
+ * </p>
+ *
+ * @param dataSource
+ * {@link javax.sql.DataSource} to use
+ */
+ public SQL92JDBCDataModel(DataSource dataSource) {
+ this(dataSource,
+ DEFAULT_PREFERENCE_TABLE,
+ DEFAULT_USER_ID_COLUMN,
+ DEFAULT_ITEM_ID_COLUMN,
+ DEFAULT_PREFERENCE_COLUMN,
+ DEFAULT_PREFERENCE_TIME_COLUMN);
+ }
+
+ /**
+ * <p>
+ * Creates a SQL92JDBCDataModel using the given {@link javax.sql.DataSource} and the given table/column names.
+ * </p>
+ *
+ * @param dataSource
+ * {@link javax.sql.DataSource} to use
+ * @param preferenceTable
+ * name of table containing preference data
+ * @param userIDColumn
+ * user ID column name
+ * @param itemIDColumn
+ * item ID column name
+ * @param preferenceColumn
+ * preference column name
+ * @param timestampColumn timestamp column name (may be null)
+ */
+ public SQL92JDBCDataModel(DataSource dataSource,
+ String preferenceTable,
+ String userIDColumn,
+ String itemIDColumn,
+ String preferenceColumn,
+ String timestampColumn) {
+ super(dataSource, preferenceTable, userIDColumn, itemIDColumn, preferenceColumn,
+ // getPreferenceSQL
+ "SELECT " + preferenceColumn + " FROM " + preferenceTable + " WHERE " + userIDColumn + "=? AND "
+ + itemIDColumn + "=?",
+ // getPreferenceTimeSQL
+ "SELECT " + timestampColumn + " FROM " + preferenceTable + " WHERE " + userIDColumn + "=? AND "
+ + itemIDColumn + "=?",
+ // getUserSQL
+ "SELECT DISTINCT " + userIDColumn + ", " + itemIDColumn + ", " + preferenceColumn + " FROM " + preferenceTable
+ + " WHERE " + userIDColumn + "=? ORDER BY " + itemIDColumn,
+ // getAllUsersSQL
+ "SELECT DISTINCT " + userIDColumn + ", " + itemIDColumn + ", " + preferenceColumn + " FROM " + preferenceTable
+ + " ORDER BY " + userIDColumn + ", " + itemIDColumn,
+ // getNumItemsSQL
+ "SELECT COUNT(DISTINCT " + itemIDColumn + ") FROM " + preferenceTable,
+ // getNumUsersSQL
+ "SELECT COUNT(DISTINCT " + userIDColumn + ") FROM " + preferenceTable,
+ // setPreferenceSQL
+ "INSERT INTO " + preferenceTable + '(' + userIDColumn + ',' + itemIDColumn + ',' + preferenceColumn
+ + ") VALUES (?,?,?)",
+ // removePreference SQL
+ "DELETE FROM " + preferenceTable + " WHERE " + userIDColumn + "=? AND " + itemIDColumn + "=?",
+ // getUsersSQL
+ "SELECT DISTINCT " + userIDColumn + " FROM " + preferenceTable + " ORDER BY " + userIDColumn,
+ // getItemsSQL
+ "SELECT DISTINCT " + itemIDColumn + " FROM " + preferenceTable + " ORDER BY " + itemIDColumn,
+ // getPrefsForItemSQL
+ "SELECT DISTINCT " + userIDColumn + ", " + itemIDColumn + ", " + preferenceColumn + " FROM " + preferenceTable
+ + " WHERE " + itemIDColumn + "=? ORDER BY " + userIDColumn,
+ // getNumPreferenceForItemSQL
+ "SELECT COUNT(1) FROM " + preferenceTable + " WHERE " + itemIDColumn + "=?",
+ // getNumPreferenceForItemsSQL
+ "SELECT COUNT(1) FROM " + preferenceTable + " tp1 JOIN " + preferenceTable + " tp2 " + "USING ("
+ + userIDColumn + ") WHERE tp1." + itemIDColumn + "=? and tp2." + itemIDColumn + "=?",
+ // getMaxPreferenceSQL
+ "SELECT MAX(" + preferenceColumn + ") FROM " + preferenceTable,
+ // getMinPreferenceSQL
+ "SELECT MIN(" + preferenceColumn + ") FROM " + preferenceTable);
+
+ updatePreferenceSQL = "UPDATE " + preferenceTable + " SET " + preferenceColumn + "=? WHERE " + userIDColumn
+ + "=? AND " + itemIDColumn + "=?";
+ verifyPreferenceSQL = "SELECT " + preferenceColumn + " FROM " + preferenceTable + " WHERE " + userIDColumn
+ + "=? AND " + itemIDColumn + "=?";
+ }
+
+ protected String getUpdatePreferenceSQL() {
+ return updatePreferenceSQL;
+ }
+
+ protected String getVerifyPreferenceSQL() {
+ return verifyPreferenceSQL;
+ }
+
+ /**
+ * Override since SQL92 doesn't have the non-standard capability that MySQL has to optionally
+ * insert or update in one statement; existence is verified first, then either an update or an insert is issued.
+ */
+ @Override
+ public void setPreference(long userID, long itemID, float value) throws TasteException {
+ Preconditions.checkArgument(!Float.isNaN(value), "NaN value");
+ log.debug("Setting preference for user {}, item {}", userID, itemID);
+
+ String setPreferenceSQL = getSetPreferenceSQL();
+
+ Connection conn = null;
+ PreparedStatement stmt1 = null;
+ PreparedStatement stmt2 = null;
+ PreparedStatement stmt3 = null;
+ ResultSet rs = null;
+ try {
+ conn = getDataSource().getConnection();
+
+ stmt1 = conn.prepareStatement(verifyPreferenceSQL, ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY);
+ setLongParameter(stmt1, 1, userID);
+ setLongParameter(stmt1, 2, itemID);
+ rs = stmt1.executeQuery();
+
+ // Test whether the record exists already. rs.next() rather than rs.first():
+ // the statement was created TYPE_FORWARD_ONLY, and first() is not legal on a
+ // forward-only result set.
+ if (rs.next()) {
+ // then we update the record.
+ stmt2 = conn.prepareStatement(updatePreferenceSQL);
+ stmt2.setDouble(1, value);
+ setLongParameter(stmt2, 2, userID);
+ setLongParameter(stmt2, 3, itemID);
+
+ log.debug("Executing SQL update: {}", updatePreferenceSQL);
+ stmt2.executeUpdate();
+
+ } else {
+ // we'll insert the record
+ stmt3 = conn.prepareStatement(setPreferenceSQL);
+ setLongParameter(stmt3, 1, userID);
+ setLongParameter(stmt3, 2, itemID);
+ stmt3.setDouble(3, value);
+
+ log.debug("Executing SQL update: {}", setPreferenceSQL);
+ stmt3.executeUpdate();
+ }
+ } catch (SQLException sqle) {
+ log.warn("Exception while setting preference", sqle);
+ throw new TasteException(sqle);
+ } finally {
+ IOUtils.quietClose(rs);
+ IOUtils.quietClose(stmt1);
+ IOUtils.quietClose(stmt2);
+ IOUtils.quietClose(stmt3);
+ IOUtils.quietClose(conn);
+ }
+ }
+
+}
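
Finally, a pipeline sketch tying the new model classes together (all wiring here is
illustrative, not part of this commit). Because the javadoc above notes that the SQL92
model is not optimized for performance, it is wrapped in the ReloadFromJDBCDataModel
added earlier before being handed to a user-based recommender.

import java.util.List;
import javax.sql.DataSource;

import org.apache.mahout.cf.taste.common.TasteException;
import org.apache.mahout.cf.taste.impl.model.jdbc.ReloadFromJDBCDataModel;
import org.apache.mahout.cf.taste.impl.model.jdbc.SQL92JDBCDataModel;
import org.apache.mahout.cf.taste.impl.neighborhood.NearestNUserNeighborhood;
import org.apache.mahout.cf.taste.impl.recommender.GenericUserBasedRecommender;
import org.apache.mahout.cf.taste.impl.similarity.PearsonCorrelationSimilarity;
import org.apache.mahout.cf.taste.model.DataModel;
import org.apache.mahout.cf.taste.recommender.RecommendedItem;
import org.apache.mahout.cf.taste.recommender.Recommender;
import org.apache.mahout.cf.taste.similarity.UserSimilarity;

public class Sql92RecommenderSketch {
  public static List<RecommendedItem> topFive(DataSource dataSource, long userID)
      throws TasteException {
    // In-memory snapshot of the SQL92-backed model; reloadable via refresh(null).
    DataModel model = new ReloadFromJDBCDataModel(new SQL92JDBCDataModel(dataSource));
    UserSimilarity similarity = new PearsonCorrelationSimilarity(model);
    Recommender recommender = new GenericUserBasedRecommender(
        model, new NearestNUserNeighborhood(10, similarity, model), similarity);
    return recommender.recommend(userID, 5);
  }
}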

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/mongodb/MongoDBDataModel.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/mongodb/MongoDBDataModel.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/mongodb/MongoDBDataModel.java
new file mode 100644
index 0000000..92a4019
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/mongodb/MongoDBDataModel.java
@@ -0,0 +1,873 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.model.mongodb;
+
+import com.google.common.base.Preconditions;
+import com.mongodb.BasicDBObject;
+import com.mongodb.DB;
+import com.mongodb.DBCollection;
+import com.mongodb.DBCursor;
+import com.mongodb.DBObject;
+import com.mongodb.Mongo;
+import org.apache.mahout.cf.taste.common.NoSuchItemException;
+import org.apache.mahout.cf.taste.common.NoSuchUserException;
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.impl.model.GenericDataModel;
+import org.apache.mahout.cf.taste.impl.model.GenericPreference;
+import org.apache.mahout.cf.taste.impl.model.GenericUserPreferenceArray;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.Preference;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.bson.types.ObjectId;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.net.UnknownHostException;
+import java.text.DateFormat;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Date;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.concurrent.locks.ReentrantLock;
+import java.util.regex.Pattern;
+
+/**
+ * <p>A {@link DataModel} backed by a MongoDB database. This class expects a
+ * collection in the database which contains a user ID ({@code long} or
+ * {@link ObjectId}), item ID ({@code long} or
+ * {@link ObjectId}), preference value (optional) and timestamps
+ * ("created_at", "deleted_at").</p>
+ *
+ * <p>An example of a document in MongoDB:</p>
+ *
+ * <p>{@code { "_id" : ObjectId("4d7627bf6c7d47ade9fc7780"),
+ * "user_id" : ObjectId("4c2209fef3924d31102bd84b"),
+ * "item_id" : ObjectId("4c2209fef3924d31202bd853"),
+ * "preference" : 0.5,
+ * "created_at" : "Tue Mar 23 2010 20:48:43 GMT-0400 (EDT)" }
+ * }</p>
+ *
+ * <p>Preference value is optional to accommodate applications that have no notion
+ * of a preference value (that is, the user simply expresses a preference for
+ * an item, but no degree of preference).</p>
+ *
+ * <p>The preference value is assumed to be parseable as a {@code double}.</p>
+ *
+ * <p>The user IDs and item IDs are assumed to be parseable as {@code long}s
+ * or {@link ObjectId}s. In case of {@link ObjectId}s, the
+ * model creates a {@code Map<ObjectId,Long>}
+ * (collection "mongo_data_model_map") inside the MongoDB database. This
+ * conversion is needed since Mahout uses the long datatype to feed the
+ * recommender, and MongoDB uses 12 bytes to create its identifiers.</p>
+ *
+ * <p>The timestamps ("created_at", "deleted_at"), if present, are assumed to be
+ * parseable as a {@code long} or {@link Date}. To express
+ * timestamps as {@link Date}s, a {@link DateFormat}
+ * must be provided in the class constructor. The default Date format is
+ * {@code "EE MMM dd yyyy HH:mm:ss 'GMT'Z (zzz)"}. If this parameter
+ * is set to null, timestamps are assumed to be parseable as {@code long}s.
+ * </p>
+ *
+ * <p>It is also acceptable for the documents to contain additional fields.
+ * Those fields will be ignored.</p>
+ *
+ * <p>This class will reload data from the MongoDB database when
+ * {@link #refresh(Collection)} is called. MongoDBDataModel keeps the
+ * timestamp of the last update. This variable and the fields "created_at"
+ * and "deleted_at" help the model to determine if the triple
+ * (user, item, preference) must be added or deleted.</p>
+ */
+public final class MongoDBDataModel implements DataModel {
+
+ private static final Logger log = LoggerFactory.getLogger(MongoDBDataModel.class);
+
+ /** Default MongoDB host. Default: localhost */
+ private static final String DEFAULT_MONGO_HOST = "localhost";
+
+ /** Default MongoDB port. Default: 27017 */
+ private static final int DEFAULT_MONGO_PORT = 27017;
+
+ /** Default MongoDB database. Default: recommender */
+ private static final String DEFAULT_MONGO_DB = "recommender";
+
+ /**
+ * Default MongoDB authentication flag.
+ * Default: false (authentication is not required)
+ */
+ private static final boolean DEFAULT_MONGO_AUTH = false;
+
+ /** Default MongoDB user. Default: recommender */
+ private static final String DEFAULT_MONGO_USERNAME = "recommender";
+
+ /** Default MongoDB password. Default: recommender */
+ private static final String DEFAULT_MONGO_PASSWORD = "recommender";
+
+ /** Default MongoDB table/collection. Default: items */
+ private static final String DEFAULT_MONGO_COLLECTION = "items";
+
+ /**
+ * Default MongoDB update flag. When this flag is activated, the
+ * DataModel updates both model and database. Default: true
+ */
+ private static final boolean DEFAULT_MONGO_MANAGE = true;
+
+ /** Default MongoDB user ID field. Default: user_id */
+ private static final String DEFAULT_MONGO_USER_ID = "user_id";
+
+ /** Default MongoDB item ID field. Default: item_id */
+ private static final String DEFAULT_MONGO_ITEM_ID = "item_id";
+
+ /** Default MongoDB preference value field. Default: preference */
+ private static final String DEFAULT_MONGO_PREFERENCE = "preference";
+
+ /** Default MongoDB final remove flag. Default: false */
+ private static final boolean DEFAULT_MONGO_FINAL_REMOVE = false;
+
+ /**
+ * Default MongoDB date format.
+ * Default: "EE MMM dd yyyy HH:mm:ss 'GMT'Z (zzz)"
+ */
+ private static final DateFormat DEFAULT_DATE_FORMAT =
+ new SimpleDateFormat("EE MMM dd yyyy HH:mm:ss 'GMT'Z (zzz)", Locale.ENGLISH);
+
+ public static final String DEFAULT_MONGO_MAP_COLLECTION = "mongo_data_model_map";
+
+ private static final Pattern ID_PATTERN = Pattern.compile("[a-f0-9]{24}");
+
+ /** MongoDB host */
+ private String mongoHost = DEFAULT_MONGO_HOST;
+ /** MongoDB port */
+ private int mongoPort = DEFAULT_MONGO_PORT;
+ /** MongoDB database */
+ private String mongoDB = DEFAULT_MONGO_DB;
+ /**
+ * MongoDB authentication flag. If this flag is set to false,
+ * authentication is not required.
+ */
+ private boolean mongoAuth = DEFAULT_MONGO_AUTH;
+ /** MongoDB user */
+ private String mongoUsername = DEFAULT_MONGO_USERNAME;
+ /** MongoDB pass */
+ private String mongoPassword = DEFAULT_MONGO_PASSWORD;
+ /** MongoDB table/collection */
+ private String mongoCollection = DEFAULT_MONGO_COLLECTION;
+ /** MongoDB mapping table/collection */
+ private String mongoMapCollection = DEFAULT_MONGO_MAP_COLLECTION;
+ /**
+ * MongoDB update flag. When this flag is activated, the
+ * DataModel updates both model and database
+ */
+ private boolean mongoManage = DEFAULT_MONGO_MANAGE;
+ /** MongoDB user ID field */
+ private String mongoUserID = DEFAULT_MONGO_USER_ID;
+ /** MongoDB item ID field */
+ private String mongoItemID = DEFAULT_MONGO_ITEM_ID;
+ /** MongoDB preference value field */
+ private String mongoPreference = DEFAULT_MONGO_PREFERENCE;
+ /** MongoDB final remove flag. Default: false */
+ private boolean mongoFinalRemove = DEFAULT_MONGO_FINAL_REMOVE;
+ /** MongoDB date format */
+ private DateFormat dateFormat = DEFAULT_DATE_FORMAT;
+ private DBCollection collection;
+ private DBCollection collectionMap;
+ private Date mongoTimestamp;
+ private final ReentrantLock reloadLock;
+ private DataModel delegate;
+ private boolean userIsObject;
+ private boolean itemIsObject;
+ private boolean preferenceIsString;
+ private long idCounter;
+
+ /**
+ * Creates a new MongoDBDataModel
+ */
+ public MongoDBDataModel() throws UnknownHostException {
+ this.reloadLock = new ReentrantLock();
+ buildModel();
+ }
+
+ /**
+ * Creates a new MongoDBDataModel with MongoDB basic configuration
+ * (without authentication)
+ *
+ * @param host MongoDB host.
+ * @param port MongoDB port. Default: 27017
+ * @param database MongoDB database
+ * @param collection MongoDB collection/table
+ * @param manage If true, the model adds and removes users and items
+ * from MongoDB database when the model is refreshed.
+ * @param finalRemove If true, the model removes the user/item completely
+ * from the MongoDB database. If false, the model adds the "deleted_at"
+ * field with the current date to the "deleted" user/item.
+ * @param format MongoDB date format. If null, the model uses timestamps.
+ * @throws UnknownHostException if the database host cannot be resolved
+ */
+ public MongoDBDataModel(String host,
+ int port,
+ String database,
+ String collection,
+ boolean manage,
+ boolean finalRemove,
+ DateFormat format) throws UnknownHostException {
+ mongoHost = host;
+ mongoPort = port;
+ mongoDB = database;
+ mongoCollection = collection;
+ mongoManage = manage;
+ mongoFinalRemove = finalRemove;
+ dateFormat = format;
+ this.reloadLock = new ReentrantLock();
+ buildModel();
+ }
+
+ /**
+ * Creates a new MongoDBDataModel with MongoDB advanced configuration
+ * (without authentication)
+ *
+ * @param userIDField Mongo user ID field
+ * @param itemIDField Mongo item ID field
+ * @param preferenceField Mongo preference value field
+ * @throws UnknownHostException if the database host cannot be resolved
+ * @see #MongoDBDataModel(String, int, String, String, boolean, boolean, DateFormat)
+ */
+ public MongoDBDataModel(String host,
+ int port,
+ String database,
+ String collection,
+ boolean manage,
+ boolean finalRemove,
+ DateFormat format,
+ String userIDField,
+ String itemIDField,
+ String preferenceField,
+ String mappingCollection) throws UnknownHostException {
+ mongoHost = host;
+ mongoPort = port;
+ mongoDB = database;
+ mongoCollection = collection;
+ mongoManage = manage;
+ mongoFinalRemove = finalRemove;
+ dateFormat = format;
+ mongoUserID = userIDField;
+ mongoItemID = itemIDField;
+ mongoPreference = preferenceField;
+ mongoMapCollection = mappingCollection;
+ this.reloadLock = new ReentrantLock();
+ buildModel();
+ }
+
+ /**
+ * Creates a new MongoDBDataModel with MongoDB basic configuration
+ * (with authentication)
+ *
+ * @param user Mongo username (authentication)
+ * @param password Mongo password (authentication)
+ * @throws UnknownHostException if the database host cannot be resolved
+ * @see #MongoDBDataModel(String, int, String, String, boolean, boolean, DateFormat)
+ */
+ public MongoDBDataModel(String host,
+ int port,
+ String database,
+ String collection,
+ boolean manage,
+ boolean finalRemove,
+ DateFormat format,
+ String user,
+ String password) throws UnknownHostException {
+ mongoHost = host;
+ mongoPort = port;
+ mongoDB = database;
+ mongoCollection = collection;
+ mongoManage = manage;
+ mongoFinalRemove = finalRemove;
+ dateFormat = format;
+ mongoAuth = true;
+ mongoUsername = user;
+ mongoPassword = password;
+ this.reloadLock = new ReentrantLock();
+ buildModel();
+ }
+
+ /**
+ * Creates a new MongoDBDataModel with MongoDB advanced configuration
+ * (with authentication)
+ *
+ * @throws UnknownHostException if the database host cannot be resolved
+ * @see #MongoDBDataModel(String, int, String, String, boolean, boolean, DateFormat, String, String)
+ */
+ public MongoDBDataModel(String host,
+ int port,
+ String database,
+ String collection,
+ boolean manage,
+ boolean finalRemove,
+ DateFormat format,
+ String user,
+ String password,
+ String userIDField,
+ String itemIDField,
+ String preferenceField,
+ String mappingCollection) throws UnknownHostException {
+ mongoHost = host;
+ mongoPort = port;
+ mongoDB = database;
+ mongoCollection = collection;
+ mongoManage = manage;
+ mongoFinalRemove = finalRemove;
+ dateFormat = format;
+ mongoAuth = true;
+ mongoUsername = user;
+ mongoPassword = password;
+ mongoUserID = userIDField;
+ mongoItemID = itemIDField;
+ mongoPreference = preferenceField;
+ mongoMapCollection = mappingCollection;
+ this.reloadLock = new ReentrantLock();
+ buildModel();
+ }
+
+ /**
+ * <p>
+ * Adds/removes (user, item) pairs to/from the model.
+ * </p>
+ *
+ * @param userID MongoDB user identifier
+ * @param items List of (item, preference) pairs to be added or
+ * deleted
+ * @param add If true, this flag indicates that the pairs (user, item)
+ * must be added to the model. If false, it indicates deletion.
+ * @see #refresh(Collection)
+ */
+ public void refreshData(String userID,
+ Iterable<List<String>> items,
+ boolean add) throws NoSuchUserException, NoSuchItemException {
+ checkData(userID, items, add);
+ long id = Long.parseLong(fromIdToLong(userID, true));
+ for (List<String> item : items) {
+ item.set(0, fromIdToLong(item.get(0), false));
+ }
+ if (reloadLock.tryLock()) {
+ try {
+ if (add) {
+ delegate = addUserItem(id, items);
+ } else {
+ delegate = removeUserItem(id, items);
+ }
+ } finally {
+ reloadLock.unlock();
+ }
+ }
+ }
+
+
+ /**
+ * <p>
+ * Triggers "refresh" -- whatever that means -- of the implementation.
+ * The general contract is that any {@link Refreshable} should always leave itself in a
+ * consistent, operational state, and that the refresh atomically updates
+ * internal state from old to new.
+ * </p>
+ *
+ * @param alreadyRefreshed {@link Refreshable}s that are known to have already been refreshed as
+ * a result of an initial call to a method on some object. This ensures
+ * that objects in a refresh dependency graph aren't refreshed twice
+ * needlessly.
+ * @see #refreshData(String, Iterable, boolean)
+ */
+ @Override
+ public void refresh(Collection<Refreshable> alreadyRefreshed) {
+ BasicDBObject query = new BasicDBObject();
+ query.put("deleted_at", new BasicDBObject("$gt", mongoTimestamp));
+ DBCursor cursor = collection.find(query);
+ Date ts = new Date(0);
+ while (cursor.hasNext()) {
+ Map<String,Object> user = (Map<String,Object>) cursor.next().toMap();
+ String userID = getID(user.get(mongoUserID), true);
+ Collection<List<String>> items = new ArrayList<>();
+ List<String> item = new ArrayList<>();
+ item.add(getID(user.get(mongoItemID), false));
+ item.add(Float.toString(getPreference(user.get(mongoPreference))));
+ items.add(item);
+ try {
+ refreshData(userID, items, false);
+ } catch (NoSuchUserException e) {
+ log.warn("No such user ID: {}", userID);
+ } catch (NoSuchItemException e) {
+ log.warn("No such items: {}", items);
+ }
+ if (ts.compareTo(getDate(user.get("created_at"))) < 0) {
+ ts = getDate(user.get("created_at"));
+ }
+ }
+ query = new BasicDBObject();
+ query.put("created_at", new BasicDBObject("$gt", mongoTimestamp));
+ cursor = collection.find(query);
+ while (cursor.hasNext()) {
+ Map<String,Object> user = (Map<String,Object>) cursor.next().toMap();
+ if (!user.containsKey("deleted_at")) {
+ String userID = getID(user.get(mongoUserID), true);
+ Collection<List<String>> items = new ArrayList<>();
+ List<String> item = new ArrayList<>();
+ item.add(getID(user.get(mongoItemID), false));
+ item.add(Float.toString(getPreference(user.get(mongoPreference))));
+ items.add(item);
+ try {
+ refreshData(userID, items, true);
+ } catch (NoSuchUserException e) {
+ log.warn("No such user ID: {}", userID);
+ } catch (NoSuchItemException e) {
+ log.warn("No such items: {}", items);
+ }
+ if (ts.compareTo(getDate(user.get("created_at"))) < 0) {
+ ts = getDate(user.get("created_at"));
+ }
+ }
+ }
+ if (mongoTimestamp.compareTo(ts) < 0) {
+ mongoTimestamp = ts;
+ }
+ }
+
+ /**
+ * <p>
+ * Translates the MongoDB identifier to Mahout/MongoDBDataModel's internal
+ * identifier, if required.
+ * </p>
+ * <p>
+ * If MongoDB identifiers are long datatypes, it returns the id.
+ * </p>
+ * <p>
+ * This conversion is needed since Mahout uses the long datatype to feed the
+ * recommender, and MongoDB uses 12 bytes to create its identifiers.
+ * </p>
+ *
+ * @param id MongoDB identifier
+ * @param isUser true if the ID identifies a user, false if it identifies an item
+ * @return String containing the translation of the external MongoDB ID to
+ * internal long ID (mapping).
+ * @see #fromLongToId(long)
+ * @see <a href="http://www.mongodb.org/display/DOCS/Object%20IDs">
+ * Mongo Object IDs</a>
+ */
+ public String fromIdToLong(String id, boolean isUser) {
+ DBObject objectIdLong = collectionMap.findOne(new BasicDBObject("element_id", id));
+ if (objectIdLong != null) {
+ Map<String,Object> idLong = (Map<String,Object>) objectIdLong.toMap();
+ Object value = idLong.get("long_value");
+ return value == null ? null : value.toString();
+ } else {
+ objectIdLong = new BasicDBObject();
+ String longValue = Long.toString(idCounter++);
+ objectIdLong.put("element_id", id);
+ objectIdLong.put("long_value", longValue);
+ collectionMap.insert(objectIdLong);
+ log.info("Adding Translation {}: {} long_value: {}",
+ isUser ? "User ID" : "Item ID", id, longValue);
+ return longValue;
+ }
+ }
+
+ /**
+ * <p>
+ * Translates the Mahout/MongoDBDataModel's internal identifier to MongoDB
+ * identifier, if required.
+ * </p>
+ * <p>
+ * If MongoDB identifiers are long datatypes, it returns the id in String
+ * format.
+ * </p>
+ * <p>
+ * This conversion is needed since Mahout uses the long datatype to feed the
+ * recommender, and MongoDB uses 12 bytes to create its identifiers.
+ * </p>
+ *
+ * @param id Mahout's internal identifier
+ * @return String containing the translation of the internal long ID to
+ * external MongoDB ID (mapping).
+ * @see #fromIdToLong(String, boolean)
+ * @see <a href="http://www.mongodb.org/display/DOCS/Object%20IDs">
+ * Mongo Object IDs</a>
+ */
+ public String fromLongToId(long id) {
+ DBObject objectIdLong = collectionMap.findOne(new BasicDBObject("long_value", Long.toString(id)));
+ Map<String,Object> idLong = (Map<String,Object>) objectIdLong.toMap();
+ Object value = idLong.get("element_id");
+ return value == null ? null : value.toString();
+ }
+
+ /**
+ * <p>
+ * Checks if an ID is currently in the model.
+ * </p>
+ *
+ * @param ID user or item ID
+ * @return true if the ID is in the model; false otherwise.
+ */
+ public boolean isIDInModel(String ID) {
+ DBObject objectIdLong = collectionMap.findOne(new BasicDBObject("element_id", ID));
+ return objectIdLong != null;
+ }
+
+ /**
+ * <p>
+ * Date of the latest update of the model.
+ * </p>
+ *
+ * @return Date with the latest update of the model.
+ */
+ public Date mongoUpdateDate() {
+ return mongoTimestamp;
+ }
+
+ private void buildModel() throws UnknownHostException {
+ userIsObject = false;
+ itemIsObject = false;
+ idCounter = 0;
+ preferenceIsString = true;
+ Mongo mongoDDBB = new Mongo(mongoHost, mongoPort);
+ DB db = mongoDDBB.getDB(mongoDB);
+ mongoTimestamp = new Date(0);
+ FastByIDMap<Collection<Preference>> userIDPrefMap = new FastByIDMap<>();
+ if (!mongoAuth || db.authenticate(mongoUsername, mongoPassword.toCharArray())) {
+ collection = db.getCollection(mongoCollection);
+ collectionMap = db.getCollection(mongoMapCollection);
+ DBObject indexObj = new BasicDBObject();
+ indexObj.put("element_id", 1);
+ collectionMap.ensureIndex(indexObj);
+ indexObj = new BasicDBObject();
+ indexObj.put("long_value", 1);
+ collectionMap.ensureIndex(indexObj);
+ collectionMap.remove(new BasicDBObject());
+ DBCursor cursor = collection.find();
+ while (cursor.hasNext()) {
+ Map<String,Object> user = (Map<String,Object>) cursor.next().toMap();
+ if (!user.containsKey("deleted_at")) {
+ long userID = Long.parseLong(fromIdToLong(getID(user.get(mongoUserID), true), true));
+ long itemID = Long.parseLong(fromIdToLong(getID(user.get(mongoItemID), false), false));
+ float ratingValue = getPreference(user.get(mongoPreference));
+ Collection<Preference> userPrefs = userIDPrefMap.get(userID);
+ if (userPrefs == null) {
+ userPrefs = new ArrayList<>(2);
+ userIDPrefMap.put(userID, userPrefs);
+ }
+ userPrefs.add(new GenericPreference(userID, itemID, ratingValue));
+ if (user.containsKey("created_at")
+ && mongoTimestamp.compareTo(getDate(user.get("created_at"))) < 0) {
+ mongoTimestamp = getDate(user.get("created_at"));
+ }
+ }
+ }
+ }
+ delegate = new GenericDataModel(GenericDataModel.toDataMap(userIDPrefMap, true));
+ }
+
+ private void removeMongoUserItem(String userID, String itemID) {
+ String userId = fromLongToId(Long.parseLong(userID));
+ String itemId = fromLongToId(Long.parseLong(itemID));
+ if (isUserItemInDB(userId, itemId)) {
+ mongoTimestamp = new Date();
+ BasicDBObject query = new BasicDBObject();
+ query.put(mongoUserID, userIsObject ? new ObjectId(userId) : userId);
+ query.put(mongoItemID, itemIsObject ? new ObjectId(itemId) : itemId);
+ if (mongoFinalRemove) {
+ log.info(collection.remove(query).toString());
+ } else {
+ BasicDBObject update = new BasicDBObject();
+ update.put("$set", new BasicDBObject("deleted_at", mongoTimestamp));
+ log.info(collection.update(query, update).toString());
+ }
+ log.info("Removing userID: {} itemID: {}", userID, itemID);
+ }
+ }
+
+ private void addMongoUserItem(String userID, String itemID, String preferenceValue) {
+ String userId = fromLongToId(Long.parseLong(userID));
+ String itemId = fromLongToId(Long.parseLong(itemID));
+ if (!isUserItemInDB(userId, itemId)) {
+ mongoTimestamp = new Date();
+ BasicDBObject user = new BasicDBObject();
+ Object userIdObject = userIsObject ? new ObjectId(userId) : userId;
+ Object itemIdObject = itemIsObject ? new ObjectId(itemId) : itemId;
+ user.put(mongoUserID, userIdObject);
+ user.put(mongoItemID, itemIdObject);
+ user.put(mongoPreference, preferenceIsString ? preferenceValue : Double.parseDouble(preferenceValue));
+ user.put("created_at", mongoTimestamp);
+ collection.insert(user);
+ log.info("Adding userID: {} itemID: {} preferenceValue: {}", userID, itemID, preferenceValue);
+ }
+ }
+
+ private boolean isUserItemInDB(String userID, String itemID) {
+ BasicDBObject query = new BasicDBObject();
+ Object userId = userIsObject ? new ObjectId(userID) : userID;
+ Object itemId = itemIsObject ? new ObjectId(itemID) : itemID;
+ query.put(mongoUserID, userId);
+ query.put(mongoItemID, itemId);
+ return collection.findOne(query) != null;
+ }
+
+ private DataModel removeUserItem(long userID, Iterable<List<String>> items) {
+ FastByIDMap<PreferenceArray> rawData = ((GenericDataModel) delegate).getRawUserData();
+ for (List<String> item : items) {
+ PreferenceArray prefs = rawData.get(userID);
+ long itemID = Long.parseLong(item.get(0));
+ if (prefs != null) {
+ boolean exists = false;
+ int length = prefs.length();
+ for (int i = 0; i < length; i++) {
+ if (prefs.getItemID(i) == itemID) {
+ exists = true;
+ break;
+ }
+ }
+ if (exists) {
+ rawData.remove(userID);
+ if (length > 1) {
+ PreferenceArray newPrefs = new GenericUserPreferenceArray(length - 1);
+ for (int i = 0, j = 0; i < length; i++, j++) {
+ if (prefs.getItemID(i) == itemID) {
+ j--;
+ } else {
+ newPrefs.set(j, prefs.get(i));
+ }
+ }
+ rawData.put(userID, newPrefs);
+ }
+ log.info("Removing userID: {} itemID: {}", userID, itemID);
+ if (mongoManage) {
+ removeMongoUserItem(Long.toString(userID), Long.toString(itemID));
+ }
+ }
+ }
+ }
+ return new GenericDataModel(rawData);
+ }
+
+ private DataModel addUserItem(long userID, Iterable<List<String>> items) {
+ FastByIDMap<PreferenceArray> rawData = ((GenericDataModel) delegate).getRawUserData();
+ PreferenceArray prefs = rawData.get(userID);
+ for (List<String> item : items) {
+ long itemID = Long.parseLong(item.get(0));
+ float preferenceValue = Float.parseFloat(item.get(1));
+ boolean exists = false;
+ if (prefs != null) {
+ for (int i = 0; i < prefs.length(); i++) {
+ if (prefs.getItemID(i) == itemID) {
+ exists = true;
+ prefs.setValue(i, preferenceValue);
+ break;
+ }
+ }
+ }
+ if (!exists) {
+ if (prefs == null) {
+ prefs = new GenericUserPreferenceArray(1);
+ } else {
+ PreferenceArray newPrefs = new GenericUserPreferenceArray(prefs.length() + 1);
+ for (int i = 0, j = 1; i < prefs.length(); i++, j++) {
+ newPrefs.set(j, prefs.get(i));
+ }
+ prefs = newPrefs;
+ }
+ prefs.setUserID(0, userID);
+ prefs.setItemID(0, itemID);
+ prefs.setValue(0, preferenceValue);
+ log.info("Adding userID: {} itemID: {} preferenceValue: {}", userID, itemID, preferenceValue);
+ rawData.put(userID, prefs);
+ if (mongoManage) {
+ addMongoUserItem(Long.toString(userID),
+ Long.toString(itemID),
+ Float.toString(preferenceValue));
+ }
+ }
+ }
+ return new GenericDataModel(rawData);
+ }
+
+ private Date getDate(Object date) {
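+ // Timestamps can come back from MongoDB either as java.util.Date values or as formatted strings.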
+ if (date.getClass().getName().contains("Date")) {
+ return (Date) date;
+ }
+ if (date.getClass().getName().contains("String")) {
+ try {
+ synchronized (dateFormat) {
+ return dateFormat.parse(date.toString());
+ }
+ } catch (ParseException ioe) {
+ log.warn("Error parsing timestamp", ioe);
+ }
+ }
+ return new Date(0);
+ }
+
+ private float getPreference(Object value) {
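+ // Record whether stored preferences are strings so that later writes (addMongoUserItem) match the collection's existing schema.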
+ if (value != null) {
+ if (value.getClass().getName().contains("String")) {
+ preferenceIsString = true;
+ return Float.parseFloat(value.toString());
+ } else {
+ preferenceIsString = false;
+ return Double.valueOf(value.toString()).floatValue();
+ }
+ } else {
+ return 0.5f;
+ }
+ }
+
+ private String getID(Object id, boolean isUser) {
+ if (id.getClass().getName().contains("ObjectId")) {
+ if (isUser) {
+ userIsObject = true;
+ } else {
+ itemIsObject = true;
+ }
+ return ((ObjectId) id).toStringMongod();
+ } else {
+ return id.toString();
+ }
+ }
+
+ private void checkData(String userID,
+ Iterable<List<String>> items,
+ boolean add) throws NoSuchUserException, NoSuchItemException {
+ Preconditions.checkNotNull(userID);
+ Preconditions.checkNotNull(items);
+ Preconditions.checkArgument(!userID.isEmpty(), "userID is empty");
+ for (List<String> item : items) {
+ Preconditions.checkNotNull(item.get(0));
+ Preconditions.checkArgument(!item.get(0).isEmpty(), "item is empty");
+ }
+ if (userIsObject && !ID_PATTERN.matcher(userID).matches()) {
+ throw new IllegalArgumentException();
+ }
+ for (List<String> item : items) {
+ if (itemIsObject && !ID_PATTERN.matcher(item.get(0)).matches()) {
+ throw new IllegalArgumentException();
+ }
+ }
+ if (!add && !isIDInModel(userID)) {
+ throw new NoSuchUserException();
+ }
+ for (List<String> item : items) {
+ if (!add && !isIDInModel(item.get(0))) {
+ throw new NoSuchItemException();
+ }
+ }
+ }
+
+ /**
+ * Cleanup mapping collection.
+ */
+ public void cleanupMappingCollection() {
+ collectionMap.drop();
+ }
+
+ @Override
+ public LongPrimitiveIterator getUserIDs() throws TasteException {
+ return delegate.getUserIDs();
+ }
+
+ @Override
+ public PreferenceArray getPreferencesFromUser(long id) throws TasteException {
+ return delegate.getPreferencesFromUser(id);
+ }
+
+ @Override
+ public FastIDSet getItemIDsFromUser(long userID) throws TasteException {
+ return delegate.getItemIDsFromUser(userID);
+ }
+
+ @Override
+ public LongPrimitiveIterator getItemIDs() throws TasteException {
+ return delegate.getItemIDs();
+ }
+
+ @Override
+ public PreferenceArray getPreferencesForItem(long itemID) throws TasteException {
+ return delegate.getPreferencesForItem(itemID);
+ }
+
+ @Override
+ public Float getPreferenceValue(long userID, long itemID) throws TasteException {
+ return delegate.getPreferenceValue(userID, itemID);
+ }
+
+ @Override
+ public Long getPreferenceTime(long userID, long itemID) throws TasteException {
+ return delegate.getPreferenceTime(userID, itemID);
+ }
+
+ @Override
+ public int getNumItems() throws TasteException {
+ return delegate.getNumItems();
+ }
+
+ @Override
+ public int getNumUsers() throws TasteException {
+ return delegate.getNumUsers();
+ }
+
+ @Override
+ public int getNumUsersWithPreferenceFor(long itemID) throws TasteException {
+ return delegate.getNumUsersWithPreferenceFor(itemID);
+ }
+
+ @Override
+ public int getNumUsersWithPreferenceFor(long itemID1, long itemID2) throws TasteException {
+ return delegate.getNumUsersWithPreferenceFor(itemID1, itemID2);
+ }
+
+ @Override
+ public void setPreference(long userID, long itemID, float value) {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public void removePreference(long userID, long itemID) {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public boolean hasPreferenceValues() {
+ return delegate.hasPreferenceValues();
+ }
+
+ @Override
+ public float getMaxPreference() {
+ return delegate.getMaxPreference();
+ }
+
+ @Override
+ public float getMinPreference() {
+ return delegate.getMinPreference();
+ }
+
+ @Override
+ public String toString() {
+ return "MongoDBDataModel";
+ }
+
+}
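For readers following the add/remove logic above, here is a minimal, self-contained sketch (not part of this patch; the class name, user ID, item IDs, and values are invented for illustration) of the Taste structures that MongoDBDataModel delegates to: a user's preferences are packed into a GenericUserPreferenceArray, collected into a FastByIDMap keyed by user ID, and wrapped in a GenericDataModel.

import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
import org.apache.mahout.cf.taste.impl.model.GenericDataModel;
import org.apache.mahout.cf.taste.impl.model.GenericUserPreferenceArray;
import org.apache.mahout.cf.taste.model.DataModel;
import org.apache.mahout.cf.taste.model.PreferenceArray;

public class TasteRawDataSketch {

  public static void main(String[] args) throws Exception {
    // Pack two preferences for user 1, the same layout addUserItem() builds.
    PreferenceArray prefs = new GenericUserPreferenceArray(2);
    prefs.setUserID(0, 1L); // a user array holds one user ID; setting index 0 sets it for all entries
    prefs.setItemID(0, 100L);
    prefs.setValue(0, 3.5f);
    prefs.setItemID(1, 101L);
    prefs.setValue(1, 4.0f);

    // The raw backing store: user ID -> packed preference array.
    FastByIDMap<PreferenceArray> rawData = new FastByIDMap<PreferenceArray>();
    rawData.put(1L, prefs);

    DataModel model = new GenericDataModel(rawData);
    System.out.println(model.getPreferenceValue(1L, 100L)); // prints 3.5
  }
}

This also shows why removeUserItem and addUserItem above end with "return new GenericDataModel(rawData)": the delegate is rebuilt from the raw map after each mutation rather than being modified in place.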
r***@apache.org
2018-06-27 14:52:19 UTC
Permalink
MAHOUT-2042 and MAHOUT-2045 Delete directories which were moved/no longer in use


Project: http://git-wip-us.apache.org/repos/asf/mahout/repo
Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/e0573de3
Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/e0573de3
Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/e0573de3

Branch: refs/heads/branch-0.14.0
Commit: e0573de33887e9d0909f1a5c34313680916b5aee
Parents: 0908c52
Author: Trevor a.k.a @rawkintrevo <***@gmail.com>
Authored: Wed Jun 27 09:51:06 2018 -0500
Committer: Trevor a.k.a @rawkintrevo <***@gmail.com>
Committed: Wed Jun 27 09:51:06 2018 -0500

----------------------------------------------------------------------
bin/mahout.bu | 395 ---
community/mahout-mr/conf/arff.vector.props | 9 +
community/mahout-mr/conf/canopy.props | 14 +
community/mahout-mr/conf/cat.props | 4 +
community/mahout-mr/conf/cleansvd.props | 3 +
community/mahout-mr/conf/clusterdump.props | 0
community/mahout-mr/conf/clusterpp.props | 3 +
.../mahout-mr/conf/driver.classes.default.props | 69 +
.../mahout-mr/conf/evaluateFactorization.props | 0
.../conf/evaluateFactorizationParallel.props | 0
community/mahout-mr/conf/fkmeans.props | 17 +
community/mahout-mr/conf/flink-config.yaml | 67 +
community/mahout-mr/conf/itemsimilarity.props | 9 +
community/mahout-mr/conf/kmeans.props | 13 +
community/mahout-mr/conf/log4j.xml | 15 +
community/mahout-mr/conf/lucene.vector.props | 0
community/mahout-mr/conf/matrixmult.props | 6 +
community/mahout-mr/conf/parallelALS.props | 0
.../conf/predictFromFactorization.props | 0
.../mahout-mr/conf/recommendfactorized.props | 0
.../mahout-mr/conf/recommenditembased.props | 14 +
community/mahout-mr/conf/rowid.props | 2 +
community/mahout-mr/conf/rowsimilarity.props | 8 +
community/mahout-mr/conf/runlogistic.props | 1 +
community/mahout-mr/conf/seq2sparse.props | 15 +
community/mahout-mr/conf/seqdirectory.props | 3 +
community/mahout-mr/conf/seqdumper.props | 0
community/mahout-mr/conf/seqwiki.props | 0
community/mahout-mr/conf/splitDataset.props | 0
community/mahout-mr/conf/ssvd.props | 14 +
community/mahout-mr/conf/svd.props | 6 +
community/mahout-mr/conf/trainlogistic.props | 2 +
community/mahout-mr/conf/transpose.props | 2 +
community/mahout-mr/conf/vectordump.props | 1 +
.../integration/bin/prep_asf_mail_archives.sh | 106 +
community/mahout-mr/integration/pom.xml | 198 ++
.../mahout/benchmark/BenchmarkRunner.java | 111 +
.../apache/mahout/benchmark/CloneBenchmark.java | 62 +
.../benchmark/ClosestCentroidBenchmark.java | 98 +
.../mahout/benchmark/DistanceBenchmark.java | 104 +
.../apache/mahout/benchmark/DotBenchmark.java | 191 ++
.../apache/mahout/benchmark/MinusBenchmark.java | 115 +
.../apache/mahout/benchmark/PlusBenchmark.java | 115 +
.../benchmark/SerializationBenchmark.java | 124 +
.../apache/mahout/benchmark/TimesBenchmark.java | 115 +
.../mahout/benchmark/VectorBenchmarks.java | 497 ++++
.../model/cassandra/CassandraDataModel.java | 465 ++++
.../taste/impl/model/hbase/HBaseDataModel.java | 497 ++++
.../jdbc/AbstractBooleanPrefJDBCDataModel.java | 137 ++
.../impl/model/jdbc/AbstractJDBCDataModel.java | 787 ++++++
.../model/jdbc/ConnectionPoolDataSource.java | 122 +
.../impl/model/jdbc/GenericJDBCDataModel.java | 146 ++
.../jdbc/MySQLBooleanPrefJDBCDataModel.java | 161 ++
.../impl/model/jdbc/MySQLJDBCDataModel.java | 247 ++
.../PostgreSQLBooleanPrefJDBCDataModel.java | 146 ++
.../model/jdbc/PostgreSQLJDBCDataModel.java | 172 ++
.../model/jdbc/ReloadFromJDBCDataModel.java | 178 ++
.../jdbc/SQL92BooleanPrefJDBCDataModel.java | 221 ++
.../impl/model/jdbc/SQL92JDBCDataModel.java | 248 ++
.../impl/model/mongodb/MongoDBDataModel.java | 873 +++++++
.../AbstractJDBCInMemoryItemSimilarity.java | 132 +
.../jdbc/AbstractJDBCItemSimilarity.java | 213 ++
.../jdbc/MySQLJDBCInMemoryItemSimilarity.java | 47 +
.../jdbc/MySQLJDBCItemSimilarity.java | 103 +
.../jdbc/SQL92JDBCInMemoryItemSimilarity.java | 51 +
.../jdbc/SQL92JDBCItemSimilarity.java | 57 +
.../mahout/cf/taste/web/RecommenderServlet.java | 215 ++
.../cf/taste/web/RecommenderSingleton.java | 57 +
.../mahout/cf/taste/web/RecommenderWrapper.java | 126 +
.../classifier/ConfusionMatrixDumper.java | 425 ++++
.../mahout/clustering/cdbw/CDbwEvaluator.java | 387 +++
.../clustering/conversion/InputDriver.java | 114 +
.../clustering/conversion/InputMapper.java | 81 +
.../clustering/evaluation/ClusterEvaluator.java | 196 ++
.../evaluation/RepresentativePointsDriver.java | 243 ++
.../evaluation/RepresentativePointsMapper.java | 117 +
.../evaluation/RepresentativePointsReducer.java | 70 +
.../mahout/clustering/lda/LDAPrintTopics.java | 229 ++
.../text/MailArchivesClusteringAnalyzer.java | 164 ++
.../text/MultipleTextFileInputFormat.java | 46 +
.../mahout/text/PrefixAdditionFilter.java | 67 +
.../mahout/text/SequenceFilesFromDirectory.java | 214 ++
.../text/SequenceFilesFromDirectoryFilter.java | 99 +
.../text/SequenceFilesFromDirectoryMapper.java | 61 +
.../text/SequenceFilesFromMailArchives.java | 369 +++
.../SequenceFilesFromMailArchivesMapper.java | 244 ++
.../mahout/text/TextParagraphSplittingJob.java | 73 +
.../mahout/text/WholeFileRecordReader.java | 125 +
.../mahout/text/WikipediaToSequenceFile.java | 210 ++
.../text/wikipedia/WikipediaAnalyzer.java | 49 +
.../WikipediaDatasetCreatorDriver.java | 190 ++
.../WikipediaDatasetCreatorMapper.java | 142 ++
.../WikipediaDatasetCreatorReducer.java | 38 +
.../mahout/text/wikipedia/WikipediaMapper.java | 179 ++
.../text/wikipedia/WikipediaXmlSplitter.java | 234 ++
.../mahout/text/wikipedia/XmlInputFormat.java | 164 ++
.../java/org/apache/mahout/utils/Bump125.java | 62 +
.../org/apache/mahout/utils/MatrixDumper.java | 138 ++
.../apache/mahout/utils/SequenceFileDumper.java | 168 ++
.../org/apache/mahout/utils/SplitInput.java | 673 +++++
.../org/apache/mahout/utils/SplitInputJob.java | 213 ++
.../utils/clustering/AbstractClusterWriter.java | 160 ++
.../utils/clustering/CSVClusterWriter.java | 69 +
.../mahout/utils/clustering/ClusterDumper.java | 328 +++
.../utils/clustering/ClusterDumperWriter.java | 100 +
.../mahout/utils/clustering/ClusterWriter.java | 53 +
.../utils/clustering/GraphMLClusterWriter.java | 216 ++
.../utils/clustering/JsonClusterWriter.java | 188 ++
.../apache/mahout/utils/email/MailOptions.java | 186 ++
.../mahout/utils/email/MailProcessor.java | 183 ++
.../apache/mahout/utils/io/ChunkedWrapper.java | 42 +
.../apache/mahout/utils/io/ChunkedWriter.java | 86 +
.../apache/mahout/utils/io/IOWriterWrapper.java | 45 +
.../apache/mahout/utils/io/WrappedWriter.java | 31 +
.../nlp/collocations/llr/BloomTokenFilter.java | 78 +
.../mahout/utils/regex/AnalyzerTransformer.java | 75 +
.../mahout/utils/regex/ChainTransformer.java | 55 +
.../apache/mahout/utils/regex/FPGFormatter.java | 34 +
.../mahout/utils/regex/IdentityFormatter.java | 26 +
.../mahout/utils/regex/IdentityTransformer.java | 30 +
.../utils/regex/RegexConverterDriver.java | 101 +
.../mahout/utils/regex/RegexFormatter.java | 24 +
.../apache/mahout/utils/regex/RegexMapper.java | 80 +
.../mahout/utils/regex/RegexTransformer.java | 27 +
.../apache/mahout/utils/regex/RegexUtils.java | 69 +
.../utils/regex/URLDecodeTransformer.java | 43 +
.../apache/mahout/utils/vectors/RowIdJob.java | 99 +
.../apache/mahout/utils/vectors/TermEntry.java | 46 +
.../apache/mahout/utils/vectors/TermInfo.java | 33 +
.../mahout/utils/vectors/VectorDumper.java | 266 ++
.../mahout/utils/vectors/VectorHelper.java | 256 ++
.../mahout/utils/vectors/arff/ARFFIterator.java | 144 ++
.../mahout/utils/vectors/arff/ARFFModel.java | 76 +
.../mahout/utils/vectors/arff/ARFFType.java | 62 +
.../utils/vectors/arff/ARFFVectorIterable.java | 155 ++
.../mahout/utils/vectors/arff/Driver.java | 263 ++
.../utils/vectors/arff/MapBackedARFFModel.java | 282 +++
.../utils/vectors/csv/CSVVectorIterator.java | 69 +
.../vectors/io/DelimitedTermInfoWriter.java | 73 +
.../vectors/io/SequenceFileVectorWriter.java | 75 +
.../mahout/utils/vectors/io/TermInfoWriter.java | 29 +
.../utils/vectors/io/TextualVectorWriter.java | 70 +
.../mahout/utils/vectors/io/VectorWriter.java | 52 +
.../vectors/lucene/AbstractLuceneIterator.java | 140 ++
.../utils/vectors/lucene/CachedTermInfo.java | 79 +
.../utils/vectors/lucene/ClusterLabels.java | 381 +++
.../mahout/utils/vectors/lucene/Driver.java | 349 +++
.../utils/vectors/lucene/LuceneIterable.java | 80 +
.../utils/vectors/lucene/LuceneIterator.java | 99 +
.../mahout/utils/vectors/lucene/TFDFMapper.java | 64 +
.../vectors/lucene/TermInfoClusterInOut.java | 81 +
.../MySQLJDBCInMemoryItemSimilarityTest.java | 79 +
.../mahout/clustering/TestClusterDumper.java | 236 ++
.../mahout/clustering/TestClusterEvaluator.java | 321 +++
.../clustering/cdbw/TestCDbwEvaluator.java | 326 +++
.../MailArchivesClusteringAnalyzerTest.java | 66 +
.../text/SequenceFilesFromMailArchivesTest.java | 240 ++
.../org/apache/mahout/text/TestPathFilter.java | 32 +
.../text/TestSequenceFilesFromDirectory.java | 313 +++
.../mahout/text/doc/MultipleFieldsDocument.java | 58 +
.../mahout/text/doc/NumericFieldDocument.java | 54 +
.../mahout/text/doc/SingleFieldDocument.java | 63 +
.../apache/mahout/text/doc/TestDocument.java | 29 +
.../mahout/text/doc/UnstoredFieldsDocument.java | 43 +
.../org/apache/mahout/utils/Bump125Test.java | 42 +
.../org/apache/mahout/utils/SplitInputTest.java | 418 ++++
.../mahout/utils/email/MailProcessorTest.java | 72 +
.../collocations/llr/BloomTokenFilterTest.java | 154 ++
.../mahout/utils/regex/RegexMapperTest.java | 104 +
.../mahout/utils/regex/RegexUtilsTest.java | 61 +
.../utils/vectors/RandomVectorIterable.java | 73 +
.../mahout/utils/vectors/VectorHelperTest.java | 140 ++
.../mahout/utils/vectors/arff/ARFFTypeTest.java | 35 +
.../vectors/arff/ARFFVectorIterableTest.java | 289 +++
.../mahout/utils/vectors/arff/DriverTest.java | 54 +
.../vectors/arff/MapBackedARFFModelTest.java | 60 +
.../vectors/csv/CSVVectorIteratorTest.java | 57 +
.../utils/vectors/io/VectorWriterTest.java | 67 +
.../vectors/lucene/CachedTermInfoTest.java | 121 +
.../mahout/utils/vectors/lucene/DriverTest.java | 136 ++
.../vectors/lucene/LuceneIterableTest.java | 195 ++
.../integration/src/test/resources/date.arff | 18 +
.../resources/expected-arff-dictionary-2.csv | 22 +
.../test/resources/expected-arff-dictionary.csv | 22 +
.../test/resources/expected-arff-schema-2.json | 1 +
.../test/resources/expected-arff-schema.json | 1 +
.../src/test/resources/non-numeric-1.arff | 24 +
.../src/test/resources/non-numeric-2.arff | 24 +
.../src/test/resources/quoted-id.arff | 9 +
.../src/test/resources/sample-dense.arff | 20 +
.../src/test/resources/sample-sparse.arff | 24 +
.../integration/src/test/resources/sample.arff | 11 +
.../integration/src/test/resources/test.mbox | 1038 ++++++++
.../appended-resources/supplemental-models.xml | 279 +++
.../src/images/logos/ mahout-powered.svg | 630 +++++
.../mahout-mr/src/images/logos/favicon.ico | Bin 0 -> 28838 bytes
.../mahout-mr/src/images/logos/favicon128.png | Bin 0 -> 5259 bytes
.../mahout-mr/src/images/logos/favicon16.png | Bin 0 -> 1009 bytes
.../mahout-mr/src/images/logos/favicon32.png | Bin 0 -> 1847 bytes
.../mahout-mr/src/images/logos/favicon64.png | Bin 0 -> 3148 bytes
.../src/images/logos/mahout-logo-100.png | Bin 0 -> 19477 bytes
.../src/images/logos/mahout-logo-200.png | Bin 0 -> 46360 bytes
.../src/images/logos/mahout-logo-300.png | Bin 0 -> 70139 bytes
.../src/images/logos/mahout-logo-400.png | Bin 0 -> 55468 bytes
.../images/logos/mahout-logo-poweredby-100.png | Bin 0 -> 24623 bytes
.../images/logos/mahout-logo-poweredby-55.png | Bin 0 -> 11684 bytes
.../logos/mahout-logo-transparent-400.png | Bin 0 -> 61970 bytes
.../mahout-mr/src/images/logos/mahout-logo.svg | 627 +++++
community/mahout-mr/src/main/assembly/src.xml | 64 +
distribution/pom.xml | 407 ----
.../src/main/assembly/scala-2.10_spark-1.6.xml | 249 --
.../src/main/assembly/scala-2.11_spark-2.0.xml | 249 --
.../src/main/assembly/scala-2.11_spark-2.1.xml | 249 --
distribution/src/main/assembly/src.xml | 64 -
hdfs/pom.xml | 246 --
.../java/org/apache/mahout/common/IOUtils.java | 194 --
.../org/apache/mahout/math/MatrixWritable.java | 202 --
.../org/apache/mahout/math/VarIntWritable.java | 86 -
.../org/apache/mahout/math/VarLongWritable.java | 83 -
.../java/org/apache/mahout/math/Varint.java | 167 --
.../org/apache/mahout/math/VectorWritable.java | 267 --
.../apache/mahout/math/MatrixWritableTest.java | 141 --
.../java/org/apache/mahout/math/VarintTest.java | 189 --
.../apache/mahout/math/VectorWritableTest.java | 116 -
integration/bin/prep_asf_mail_archives.sh | 106 -
integration/pom.xml | 198 --
.../mahout/benchmark/BenchmarkRunner.java | 111 -
.../apache/mahout/benchmark/CloneBenchmark.java | 62 -
.../benchmark/ClosestCentroidBenchmark.java | 98 -
.../mahout/benchmark/DistanceBenchmark.java | 104 -
.../apache/mahout/benchmark/DotBenchmark.java | 191 --
.../apache/mahout/benchmark/MinusBenchmark.java | 115 -
.../apache/mahout/benchmark/PlusBenchmark.java | 115 -
.../benchmark/SerializationBenchmark.java | 124 -
.../apache/mahout/benchmark/TimesBenchmark.java | 115 -
.../mahout/benchmark/VectorBenchmarks.java | 497 ----
.../model/cassandra/CassandraDataModel.java | 465 ----
.../taste/impl/model/hbase/HBaseDataModel.java | 497 ----
.../jdbc/AbstractBooleanPrefJDBCDataModel.java | 137 --
.../impl/model/jdbc/AbstractJDBCDataModel.java | 787 ------
.../model/jdbc/ConnectionPoolDataSource.java | 122 -
.../impl/model/jdbc/GenericJDBCDataModel.java | 146 --
.../jdbc/MySQLBooleanPrefJDBCDataModel.java | 161 --
.../impl/model/jdbc/MySQLJDBCDataModel.java | 247 --
.../PostgreSQLBooleanPrefJDBCDataModel.java | 146 --
.../model/jdbc/PostgreSQLJDBCDataModel.java | 172 --
.../model/jdbc/ReloadFromJDBCDataModel.java | 178 --
.../jdbc/SQL92BooleanPrefJDBCDataModel.java | 221 --
.../impl/model/jdbc/SQL92JDBCDataModel.java | 248 --
.../impl/model/mongodb/MongoDBDataModel.java | 873 -------
.../AbstractJDBCInMemoryItemSimilarity.java | 132 -
.../jdbc/AbstractJDBCItemSimilarity.java | 213 --
.../jdbc/MySQLJDBCInMemoryItemSimilarity.java | 47 -
.../jdbc/MySQLJDBCItemSimilarity.java | 103 -
.../jdbc/SQL92JDBCInMemoryItemSimilarity.java | 51 -
.../jdbc/SQL92JDBCItemSimilarity.java | 57 -
.../mahout/cf/taste/web/RecommenderServlet.java | 215 --
.../cf/taste/web/RecommenderSingleton.java | 57 -
.../mahout/cf/taste/web/RecommenderWrapper.java | 126 -
.../classifier/ConfusionMatrixDumper.java | 425 ----
.../mahout/clustering/cdbw/CDbwEvaluator.java | 387 ---
.../clustering/conversion/InputDriver.java | 114 -
.../clustering/conversion/InputMapper.java | 81 -
.../clustering/evaluation/ClusterEvaluator.java | 196 --
.../evaluation/RepresentativePointsDriver.java | 243 --
.../evaluation/RepresentativePointsMapper.java | 117 -
.../evaluation/RepresentativePointsReducer.java | 70 -
.../mahout/clustering/lda/LDAPrintTopics.java | 229 --
.../text/MailArchivesClusteringAnalyzer.java | 164 --
.../text/MultipleTextFileInputFormat.java | 46 -
.../mahout/text/PrefixAdditionFilter.java | 67 -
.../mahout/text/SequenceFilesFromDirectory.java | 214 --
.../text/SequenceFilesFromDirectoryFilter.java | 99 -
.../text/SequenceFilesFromDirectoryMapper.java | 61 -
.../text/SequenceFilesFromMailArchives.java | 369 ---
.../SequenceFilesFromMailArchivesMapper.java | 244 --
.../mahout/text/TextParagraphSplittingJob.java | 73 -
.../mahout/text/WholeFileRecordReader.java | 125 -
.../mahout/text/WikipediaToSequenceFile.java | 210 --
.../text/wikipedia/WikipediaAnalyzer.java | 49 -
.../WikipediaDatasetCreatorDriver.java | 190 --
.../WikipediaDatasetCreatorMapper.java | 142 --
.../WikipediaDatasetCreatorReducer.java | 38 -
.../mahout/text/wikipedia/WikipediaMapper.java | 179 --
.../text/wikipedia/WikipediaXmlSplitter.java | 234 --
.../mahout/text/wikipedia/XmlInputFormat.java | 164 --
.../java/org/apache/mahout/utils/Bump125.java | 62 -
.../org/apache/mahout/utils/MatrixDumper.java | 138 --
.../apache/mahout/utils/SequenceFileDumper.java | 168 --
.../org/apache/mahout/utils/SplitInput.java | 673 -----
.../org/apache/mahout/utils/SplitInputJob.java | 213 --
.../utils/clustering/AbstractClusterWriter.java | 160 --
.../utils/clustering/CSVClusterWriter.java | 69 -
.../mahout/utils/clustering/ClusterDumper.java | 328 ---
.../utils/clustering/ClusterDumperWriter.java | 100 -
.../mahout/utils/clustering/ClusterWriter.java | 53 -
.../utils/clustering/GraphMLClusterWriter.java | 216 --
.../utils/clustering/JsonClusterWriter.java | 188 --
.../apache/mahout/utils/email/MailOptions.java | 186 --
.../mahout/utils/email/MailProcessor.java | 183 --
.../apache/mahout/utils/io/ChunkedWrapper.java | 42 -
.../apache/mahout/utils/io/ChunkedWriter.java | 86 -
.../apache/mahout/utils/io/IOWriterWrapper.java | 45 -
.../apache/mahout/utils/io/WrappedWriter.java | 31 -
.../nlp/collocations/llr/BloomTokenFilter.java | 78 -
.../mahout/utils/regex/AnalyzerTransformer.java | 75 -
.../mahout/utils/regex/ChainTransformer.java | 55 -
.../apache/mahout/utils/regex/FPGFormatter.java | 34 -
.../mahout/utils/regex/IdentityFormatter.java | 26 -
.../mahout/utils/regex/IdentityTransformer.java | 30 -
.../utils/regex/RegexConverterDriver.java | 101 -
.../mahout/utils/regex/RegexFormatter.java | 24 -
.../apache/mahout/utils/regex/RegexMapper.java | 80 -
.../mahout/utils/regex/RegexTransformer.java | 27 -
.../apache/mahout/utils/regex/RegexUtils.java | 69 -
.../utils/regex/URLDecodeTransformer.java | 43 -
.../apache/mahout/utils/vectors/RowIdJob.java | 99 -
.../apache/mahout/utils/vectors/TermEntry.java | 46 -
.../apache/mahout/utils/vectors/TermInfo.java | 33 -
.../mahout/utils/vectors/VectorDumper.java | 266 --
.../mahout/utils/vectors/VectorHelper.java | 256 --
.../mahout/utils/vectors/arff/ARFFIterator.java | 144 --
.../mahout/utils/vectors/arff/ARFFModel.java | 76 -
.../mahout/utils/vectors/arff/ARFFType.java | 62 -
.../utils/vectors/arff/ARFFVectorIterable.java | 155 --
.../mahout/utils/vectors/arff/Driver.java | 263 --
.../utils/vectors/arff/MapBackedARFFModel.java | 282 ---
.../utils/vectors/csv/CSVVectorIterator.java | 69 -
.../vectors/io/DelimitedTermInfoWriter.java | 73 -
.../vectors/io/SequenceFileVectorWriter.java | 75 -
.../mahout/utils/vectors/io/TermInfoWriter.java | 29 -
.../utils/vectors/io/TextualVectorWriter.java | 70 -
.../mahout/utils/vectors/io/VectorWriter.java | 52 -
.../vectors/lucene/AbstractLuceneIterator.java | 140 --
.../utils/vectors/lucene/CachedTermInfo.java | 79 -
.../utils/vectors/lucene/ClusterLabels.java | 381 ---
.../mahout/utils/vectors/lucene/Driver.java | 349 ---
.../utils/vectors/lucene/LuceneIterable.java | 80 -
.../utils/vectors/lucene/LuceneIterator.java | 99 -
.../mahout/utils/vectors/lucene/TFDFMapper.java | 64 -
.../vectors/lucene/TermInfoClusterInOut.java | 81 -
.../MySQLJDBCInMemoryItemSimilarityTest.java | 79 -
.../mahout/clustering/TestClusterDumper.java | 236 --
.../mahout/clustering/TestClusterEvaluator.java | 321 ---
.../clustering/cdbw/TestCDbwEvaluator.java | 326 ---
.../MailArchivesClusteringAnalyzerTest.java | 66 -
.../text/SequenceFilesFromMailArchivesTest.java | 240 --
.../org/apache/mahout/text/TestPathFilter.java | 32 -
.../text/TestSequenceFilesFromDirectory.java | 313 ---
.../mahout/text/doc/MultipleFieldsDocument.java | 58 -
.../mahout/text/doc/NumericFieldDocument.java | 54 -
.../mahout/text/doc/SingleFieldDocument.java | 63 -
.../apache/mahout/text/doc/TestDocument.java | 29 -
.../mahout/text/doc/UnstoredFieldsDocument.java | 43 -
.../org/apache/mahout/utils/Bump125Test.java | 42 -
.../org/apache/mahout/utils/SplitInputTest.java | 418 ----
.../mahout/utils/email/MailProcessorTest.java | 72 -
.../collocations/llr/BloomTokenFilterTest.java | 154 --
.../mahout/utils/regex/RegexMapperTest.java | 104 -
.../mahout/utils/regex/RegexUtilsTest.java | 61 -
.../utils/vectors/RandomVectorIterable.java | 73 -
.../mahout/utils/vectors/VectorHelperTest.java | 140 --
.../mahout/utils/vectors/arff/ARFFTypeTest.java | 35 -
.../vectors/arff/ARFFVectorIterableTest.java | 289 ---
.../mahout/utils/vectors/arff/DriverTest.java | 54 -
.../vectors/arff/MapBackedARFFModelTest.java | 60 -
.../vectors/csv/CSVVectorIteratorTest.java | 57 -
.../utils/vectors/io/VectorWriterTest.java | 67 -
.../vectors/lucene/CachedTermInfoTest.java | 121 -
.../mahout/utils/vectors/lucene/DriverTest.java | 136 --
.../vectors/lucene/LuceneIterableTest.java | 195 --
integration/src/test/resources/date.arff | 18 -
.../resources/expected-arff-dictionary-2.csv | 22 -
.../test/resources/expected-arff-dictionary.csv | 22 -
.../test/resources/expected-arff-schema-2.json | 1 -
.../test/resources/expected-arff-schema.json | 1 -
.../src/test/resources/non-numeric-1.arff | 24 -
.../src/test/resources/non-numeric-2.arff | 24 -
integration/src/test/resources/quoted-id.arff | 9 -
.../src/test/resources/sample-dense.arff | 20 -
.../src/test/resources/sample-sparse.arff | 24 -
integration/src/test/resources/sample.arff | 11 -
integration/src/test/resources/test.mbox | 1038 --------
math-scala/pom.xml | 244 --
.../classifier/naivebayes/NBClassifier.scala | 119 -
.../mahout/classifier/naivebayes/NBModel.scala | 215 --
.../classifier/naivebayes/NaiveBayes.scala | 383 ---
.../classifier/stats/ClassifierStats.scala | 467 ----
.../classifier/stats/ConfusionMatrix.scala | 459 ----
.../common/io/GenericMatrixKryoSerializer.scala | 188 --
.../mahout/common/io/VectorKryoSerializer.scala | 248 --
.../apache/mahout/drivers/MahoutDriver.scala | 44 -
.../mahout/drivers/MahoutOptionParser.scala | 220 --
.../org/apache/mahout/logging/package.scala | 73 -
.../apache/mahout/math/algorithms/Fitter.scala | 27 -
.../apache/mahout/math/algorithms/Model.scala | 26 -
.../math/algorithms/SupervisedFitter.scala | 29 -
.../math/algorithms/SupervisedModel.scala | 26 -
.../math/algorithms/UnsupervisedFitter.scala | 28 -
.../math/algorithms/UnsupervisedModel.scala | 24 -
.../math/algorithms/clustering/Canopy.scala | 157 --
.../algorithms/clustering/ClusteringModel.scala | 45 -
.../common/distance/DistanceMetrics.scala | 48 -
.../algorithms/preprocessing/AsFactor.scala | 129 -
.../algorithms/preprocessing/MeanCenter.scala | 91 -
.../preprocessing/PreprocessorModel.scala | 58 -
.../preprocessing/StandardScaler.scala | 108 -
.../regression/CochraneOrcuttModel.scala | 151 --
.../regression/LinearRegressorModel.scala | 178 --
.../regression/OrdinaryLeastSquaresModel.scala | 71 -
.../algorithms/regression/RegressorModel.scala | 66 -
.../regression/tests/AutocorrelationTests.scala | 57 -
.../regression/tests/FittnessTests.scala | 133 -
.../apache/mahout/math/backend/Backend.scala | 33 -
.../mahout/math/backend/RootSolverFactory.scala | 84 -
.../mahout/math/backend/SolverFactory.scala | 55 -
.../mahout/math/backend/incore/package.scala | 17 -
.../mahout/math/backend/jvm/JvmBackend.scala | 51 -
.../mahout/math/cf/SimilarityAnalysis.scala | 453 ----
.../apache/mahout/math/decompositions/ALS.scala | 141 --
.../apache/mahout/math/decompositions/DQR.scala | 78 -
.../mahout/math/decompositions/DSPCA.scala | 162 --
.../mahout/math/decompositions/DSSVD.scala | 100 -
.../mahout/math/decompositions/SSVD.scala | 167 --
.../mahout/math/decompositions/package.scala | 141 --
.../org/apache/mahout/math/drm/BCast.scala | 24 -
.../org/apache/mahout/math/drm/CacheHint.scala | 36 -
.../mahout/math/drm/CheckpointedDrm.scala | 43 -
.../mahout/math/drm/CheckpointedOps.scala | 49 -
.../mahout/math/drm/DistributedContext.scala | 27 -
.../mahout/math/drm/DistributedEngine.scala | 268 --
.../mahout/math/drm/DrmDoubleScalarOps.scala | 37 -
.../org/apache/mahout/math/drm/DrmLike.scala | 60 -
.../org/apache/mahout/math/drm/DrmLikeOps.scala | 140 --
.../apache/mahout/math/drm/RLikeDrmOps.scala | 172 --
.../math/drm/logical/AbstractBinaryOp.scala | 44 -
.../math/drm/logical/AbstractUnaryOp.scala | 32 -
.../math/drm/logical/CheckpointAction.scala | 48 -
.../apache/mahout/math/drm/logical/OpAB.scala | 47 -
.../mahout/math/drm/logical/OpABAnyKey.scala | 48 -
.../apache/mahout/math/drm/logical/OpABt.scala | 48 -
.../apache/mahout/math/drm/logical/OpAewB.scala | 52 -
.../mahout/math/drm/logical/OpAewScalar.scala | 55 -
.../math/drm/logical/OpAewUnaryFunc.scala | 50 -
.../math/drm/logical/OpAewUnaryFuncFusion.scala | 67 -
.../apache/mahout/math/drm/logical/OpAt.scala | 43 -
.../apache/mahout/math/drm/logical/OpAtA.scala | 42 -
.../mahout/math/drm/logical/OpAtAnyKey.scala | 40 -
.../apache/mahout/math/drm/logical/OpAtB.scala | 48 -
.../apache/mahout/math/drm/logical/OpAtx.scala | 49 -
.../apache/mahout/math/drm/logical/OpAx.scala | 48 -
.../mahout/math/drm/logical/OpCbind.scala | 48 -
.../mahout/math/drm/logical/OpCbindScalar.scala | 42 -
.../mahout/math/drm/logical/OpMapBlock.scala | 48 -
.../apache/mahout/math/drm/logical/OpPar.scala | 23 -
.../mahout/math/drm/logical/OpRbind.scala | 46 -
.../mahout/math/drm/logical/OpRowRange.scala | 44 -
.../math/drm/logical/OpTimesLeftMatrix.scala | 51 -
.../math/drm/logical/OpTimesRightMatrix.scala | 51 -
.../mahout/math/drm/logical/TEwFunc.scala | 37 -
.../org/apache/mahout/math/drm/package.scala | 375 ---
.../mahout/math/indexeddataset/BiMap.scala | 128 -
.../math/indexeddataset/IndexedDataset.scala | 61 -
.../math/indexeddataset/ReaderWriter.scala | 117 -
.../mahout/math/indexeddataset/Schema.scala | 105 -
.../apache/mahout/math/scalabindings/MMul.scala | 295 ---
.../math/scalabindings/MahoutCollections.scala | 46 -
.../scalabindings/MatlabLikeMatrixOps.scala | 66 -
.../math/scalabindings/MatlabLikeOps.scala | 35 -
.../math/scalabindings/MatlabLikeTimesOps.scala | 28 -
.../scalabindings/MatlabLikeVectorOps.scala | 73 -
.../mahout/math/scalabindings/MatrixOps.scala | 332 ---
.../scalabindings/RLikeDoubleScalarOps.scala | 63 -
.../math/scalabindings/RLikeMatrixOps.scala | 172 --
.../mahout/math/scalabindings/RLikeOps.scala | 38 -
.../math/scalabindings/RLikeVectorOps.scala | 110 -
.../mahout/math/scalabindings/VectorOps.scala | 174 --
.../mahout/math/scalabindings/package.scala | 477 ----
.../org/apache/mahout/nlp/tfidf/TFIDF.scala | 112 -
.../org/apache/mahout/util/IOUtilsScala.scala | 64 -
.../classifier/naivebayes/NBTestBase.scala | 291 ---
.../stats/ClassifierStatsTestBase.scala | 257 --
.../math/algorithms/ClusteringSuiteBase.scala | 48 -
.../math/algorithms/PreprocessorSuiteBase.scala | 118 -
.../math/algorithms/RegressionSuiteBase.scala | 180 --
.../algorithms/RegressionTestsSuiteBase.scala | 126 -
.../mahout/math/backend/BackendSuite.scala | 59 -
.../decompositions/DecompositionsSuite.scala | 113 -
.../DistributedDecompositionsSuiteBase.scala | 219 --
.../mahout/math/drm/DrmLikeOpsSuiteBase.scala | 153 --
.../mahout/math/drm/DrmLikeSuiteBase.scala | 74 -
.../mahout/math/drm/RLikeDrmOpsSuiteBase.scala | 655 -----
.../scalabindings/MahoutCollectionsSuite.scala | 42 -
.../mahout/math/scalabindings/MathSuite.scala | 267 --
.../MatlabLikeMatrixOpsSuite.scala | 67 -
.../math/scalabindings/MatrixOpsSuite.scala | 228 --
.../scalabindings/RLikeMatrixOpsSuite.scala | 369 ---
.../scalabindings/RLikeVectorOpsSuite.scala | 72 -
.../math/scalabindings/VectorOpsSuite.scala | 110 -
.../apache/mahout/nlp/tfidf/TFIDFtestBase.scala | 184 --
.../mahout/test/DistributedMahoutSuite.scala | 28 -
.../mahout/test/LoggerConfiguration.scala | 16 -
.../org/apache/mahout/test/MahoutSuite.scala | 54 -
math/pom.xml | 256 --
.../math/buffer/ValueTypeBufferConsumer.java.t | 42 -
.../math/function/KeyTypeObjectProcedure.java.t | 50 -
.../math/function/KeyTypeProcedure.java.t | 46 -
.../function/KeyTypeValueTypeProcedure.java.t | 49 -
.../function/ObjectValueTypeProcedure.java.t | 49 -
.../math/function/ValueTypeComparator.java.t | 81 -
.../math/list/AbstractValueTypeList.java.t | 851 -------
.../mahout/math/list/ValueTypeArrayList.java.t | 659 -----
.../math/map/AbstractKeyTypeObjectMap.java.t | 467 ----
.../math/map/AbstractKeyTypeValueTypeMap.java.t | 509 ----
.../math/map/AbstractObjectValueTypeMap.java.t | 516 ----
.../math/map/OpenKeyTypeObjectHashMap.java.t | 548 -----
.../math/map/OpenKeyTypeValueTypeHashMap.java.t | 632 -----
.../math/map/OpenObjectValueTypeHashMap.java.t | 567 -----
.../mahout/math/set/AbstractKeyTypeSet.java.t | 181 --
.../mahout/math/set/OpenKeyTypeHashSet.java.t | 423 ----
.../apache/mahout/collections/Arithmetic.java | 489 ----
.../apache/mahout/collections/Constants.java | 75 -
.../org/apache/mahout/common/RandomUtils.java | 100 -
.../org/apache/mahout/common/RandomWrapper.java | 105 -
.../org/apache/mahout/math/AbstractMatrix.java | 834 -------
.../org/apache/mahout/math/AbstractVector.java | 684 ------
.../java/org/apache/mahout/math/Algebra.java | 73 -
.../java/org/apache/mahout/math/Arrays.java | 662 -----
.../org/apache/mahout/math/BinarySearch.java | 403 ---
.../mahout/math/CardinalityException.java | 30 -
.../java/org/apache/mahout/math/Centroid.java | 89 -
.../mahout/math/CholeskyDecomposition.java | 227 --
.../org/apache/mahout/math/ConstantVector.java | 177 --
.../apache/mahout/math/DelegatingVector.java | 336 ---
.../org/apache/mahout/math/DenseMatrix.java | 193 --
.../mahout/math/DenseSymmetricMatrix.java | 62 -
.../org/apache/mahout/math/DenseVector.java | 442 ----
.../org/apache/mahout/math/DiagonalMatrix.java | 378 ---
.../org/apache/mahout/math/FileBasedMatrix.java | 185 --
.../math/FileBasedSparseBinaryMatrix.java | 535 ----
.../mahout/math/FunctionalMatrixView.java | 99 -
.../org/apache/mahout/math/IndexException.java | 30 -
.../apache/mahout/math/LengthCachingVector.java | 35 -
.../java/org/apache/mahout/math/Matrices.java | 167 --
.../java/org/apache/mahout/math/Matrix.java | 413 ----
.../org/apache/mahout/math/MatrixSlice.java | 36 -
.../org/apache/mahout/math/MatrixTimesOps.java | 35 -
.../apache/mahout/math/MatrixVectorView.java | 292 ---
.../java/org/apache/mahout/math/MatrixView.java | 160 --
.../java/org/apache/mahout/math/MurmurHash.java | 158 --
.../org/apache/mahout/math/MurmurHash3.java | 84 -
.../org/apache/mahout/math/NamedVector.java | 328 ---
.../apache/mahout/math/OldQRDecomposition.java | 234 --
.../mahout/math/OrderedIntDoubleMapping.java | 265 --
.../mahout/math/OrthonormalityVerifier.java | 46 -
.../apache/mahout/math/PermutedVectorView.java | 250 --
.../apache/mahout/math/PersistentObject.java | 58 -
.../org/apache/mahout/math/PivotedMatrix.java | 288 ---
.../main/java/org/apache/mahout/math/QR.java | 27 -
.../org/apache/mahout/math/QRDecomposition.java | 181 --
.../mahout/math/RandomAccessSparseVector.java | 303 ---
.../apache/mahout/math/RandomTrinaryMatrix.java | 146 --
.../math/SequentialAccessSparseVector.java | 379 ---
.../mahout/math/SingularValueDecomposition.java | 669 -----
.../java/org/apache/mahout/math/Sorting.java | 2297 ------------------
.../apache/mahout/math/SparseColumnMatrix.java | 220 --
.../org/apache/mahout/math/SparseMatrix.java | 245 --
.../org/apache/mahout/math/SparseRowMatrix.java | 289 ---
.../java/org/apache/mahout/math/Swapper.java | 35 -
.../mahout/math/TransposedMatrixView.java | 147 --
.../org/apache/mahout/math/UpperTriangular.java | 160 --
.../java/org/apache/mahout/math/Vector.java | 434 ----
.../mahout/math/VectorBinaryAggregate.java | 481 ----
.../apache/mahout/math/VectorBinaryAssign.java | 667 -----
.../org/apache/mahout/math/VectorIterable.java | 56 -
.../java/org/apache/mahout/math/VectorView.java | 238 --
.../org/apache/mahout/math/WeightedVector.java | 87 -
.../mahout/math/WeightedVectorComparator.java | 54 -
.../math/als/AlternatingLeastSquaresSolver.java | 116 -
...itFeedbackAlternatingLeastSquaresSolver.java | 171 --
.../math/decomposer/AsyncEigenVerifier.java | 80 -
.../mahout/math/decomposer/EigenStatus.java | 50 -
.../math/decomposer/SimpleEigenVerifier.java | 41 -
.../math/decomposer/SingularVectorVerifier.java | 25 -
.../math/decomposer/hebbian/EigenUpdater.java | 25 -
.../math/decomposer/hebbian/HebbianSolver.java | 342 ---
.../math/decomposer/hebbian/HebbianUpdater.java | 71 -
.../math/decomposer/hebbian/TrainingState.java | 143 --
.../math/decomposer/lanczos/LanczosSolver.java | 213 --
.../math/decomposer/lanczos/LanczosState.java | 107 -
.../org/apache/mahout/math/flavor/BackEnum.java | 26 -
.../apache/mahout/math/flavor/MatrixFlavor.java | 82 -
.../math/flavor/TraversingStructureEnum.java | 48 -
.../math/function/DoubleDoubleFunction.java | 98 -
.../mahout/math/function/DoubleFunction.java | 48 -
.../mahout/math/function/FloatFunction.java | 36 -
.../apache/mahout/math/function/Functions.java | 1730 -------------
.../mahout/math/function/IntFunction.java | 41 -
.../math/function/IntIntDoubleFunction.java | 43 -
.../mahout/math/function/IntIntFunction.java | 25 -
.../org/apache/mahout/math/function/Mult.java | 71 -
.../math/function/ObjectObjectProcedure.java | 40 -
.../mahout/math/function/ObjectProcedure.java | 47 -
.../apache/mahout/math/function/PlusMult.java | 123 -
.../math/function/SquareRootFunction.java | 26 -
.../mahout/math/function/TimesFunction.java | 77 -
.../mahout/math/function/VectorFunction.java | 27 -
.../mahout/math/function/package-info.java | 4 -
.../apache/mahout/math/jet/math/Arithmetic.java | 328 ---
.../apache/mahout/math/jet/math/Constants.java | 49 -
.../apache/mahout/math/jet/math/Polynomial.java | 98 -
.../mahout/math/jet/math/package-info.java | 5 -
.../random/AbstractContinousDistribution.java | 51 -
.../random/AbstractDiscreteDistribution.java | 27 -
.../math/jet/random/AbstractDistribution.java | 87 -
.../mahout/math/jet/random/Exponential.java | 81 -
.../apache/mahout/math/jet/random/Gamma.java | 302 ---
.../math/jet/random/NegativeBinomial.java | 106 -
.../apache/mahout/math/jet/random/Normal.java | 110 -
.../apache/mahout/math/jet/random/Poisson.java | 296 ---
.../apache/mahout/math/jet/random/Uniform.java | 164 --
.../math/jet/random/engine/MersenneTwister.java | 275 ---
.../math/jet/random/engine/RandomEngine.java | 169 --
.../math/jet/random/engine/package-info.java | 7 -
.../math/jet/random/sampling/RandomSampler.java | 503 ----
.../org/apache/mahout/math/jet/stat/Gamma.java | 681 ------
.../mahout/math/jet/stat/Probability.java | 203 --
.../mahout/math/jet/stat/package-info.java | 5 -
.../apache/mahout/math/list/AbstractList.java | 247 --
.../mahout/math/list/AbstractObjectList.java | 80 -
.../mahout/math/list/ObjectArrayList.java | 419 ----
.../mahout/math/list/SimpleLongArrayList.java | 102 -
.../apache/mahout/math/list/package-info.java | 144 --
.../apache/mahout/math/map/HashFunctions.java | 115 -
.../org/apache/mahout/math/map/OpenHashMap.java | 652 -----
.../org/apache/mahout/math/map/PrimeFinder.java | 145 --
.../mahout/math/map/QuickOpenIntIntHashMap.java | 215 --
.../apache/mahout/math/map/package-info.java | 250 --
.../org/apache/mahout/math/package-info.java | 4 -
.../math/random/AbstractSamplerFunction.java | 39 -
.../mahout/math/random/ChineseRestaurant.java | 111 -
.../apache/mahout/math/random/Empirical.java | 124 -
.../apache/mahout/math/random/IndianBuffet.java | 157 --
.../org/apache/mahout/math/random/Missing.java | 59 -
.../apache/mahout/math/random/MultiNormal.java | 118 -
.../apache/mahout/math/random/Multinomial.java | 202 --
.../org/apache/mahout/math/random/Normal.java | 40 -
.../mahout/math/random/PoissonSampler.java | 67 -
.../org/apache/mahout/math/random/Sampler.java | 25 -
.../mahout/math/random/WeightedThing.java | 71 -
.../org/apache/mahout/math/set/AbstractSet.java | 188 --
.../org/apache/mahout/math/set/HashUtils.java | 56 -
.../org/apache/mahout/math/set/OpenHashSet.java | 548 -----
.../math/solver/ConjugateGradientSolver.java | 213 --
.../mahout/math/solver/EigenDecomposition.java | 892 -------
.../mahout/math/solver/JacobiConditioner.java | 47 -
.../org/apache/mahout/math/solver/LSMR.java | 565 -----
.../mahout/math/solver/Preconditioner.java | 36 -
.../mahout/math/ssvd/SequentialBigSvd.java | 69 -
.../apache/mahout/math/stats/LogLikelihood.java | 220 --
.../math/stats/OnlineExponentialAverage.java | 62 -
.../mahout/math/stats/OnlineSummarizer.java | 93 -
.../math/list/ValueTypeArrayListTest.java.t | 237 --
.../map/OpenKeyTypeObjectHashMapTest.java.t | 431 ----
.../map/OpenKeyTypeValueTypeHashMapTest.java.t | 379 ---
.../map/OpenObjectValueTypeHashMapTest.java.t | 423 ----
.../math/set/OpenKeyTypeHashSetTest.java.t | 179 --
.../apache/mahout/common/RandomUtilsTest.java | 81 -
.../apache/mahout/math/AbstractVectorTest.java | 658 -----
.../org/apache/mahout/math/CentroidTest.java | 72 -
.../mahout/math/CholeskyDecompositionTest.java | 152 --
.../apache/mahout/math/DenseSymmetricTest.java | 65 -
.../apache/mahout/math/DiagonalMatrixTest.java | 92 -
.../apache/mahout/math/FileBasedMatrixTest.java | 89 -
.../math/FileBasedSparseBinaryMatrixTest.java | 95 -
.../org/apache/mahout/math/FunctionTest.java | 133 -
.../org/apache/mahout/math/MahoutTestCase.java | 109 -
.../org/apache/mahout/math/MatricesTest.java | 123 -
.../java/org/apache/mahout/math/MatrixTest.java | 645 -----
.../mahout/math/MatrixVectorViewTest.java | 58 -
.../org/apache/mahout/math/MurmurHash3Test.java | 48 -
.../org/apache/mahout/math/MurmurHashTest.java | 120 -
.../mahout/math/OldQRDecompositionTest.java | 187 --
.../mahout/math/PermutedVectorViewTest.java | 105 -
.../apache/mahout/math/PivotedMatrixTest.java | 65 -
.../apache/mahout/math/QRDecompositionTest.java | 280 ---
.../org/apache/mahout/math/TestDenseMatrix.java | 45 -
.../org/apache/mahout/math/TestDenseVector.java | 47 -
.../org/apache/mahout/math/TestMatrixView.java | 470 ----
.../math/TestOrderedIntDoubleMapping.java | 104 -
.../math/TestRandomAccessSparseVector.java | 65 -
.../math/TestSequentialAccessSparseVector.java | 62 -
.../math/TestSingularValueDecomposition.java | 327 ---
.../mahout/math/TestSparseColumnMatrix.java | 37 -
.../apache/mahout/math/TestSparseMatrix.java | 101 -
.../apache/mahout/math/TestSparseRowMatrix.java | 180 --
.../org/apache/mahout/math/TestVectorView.java | 314 ---
.../apache/mahout/math/UpperTriangularTest.java | 54 -
.../math/VectorBinaryAggregateCostTest.java | 330 ---
.../mahout/math/VectorBinaryAggregateTest.java | 143 --
.../mahout/math/VectorBinaryAssignCostTest.java | 243 --
.../mahout/math/VectorBinaryAssignTest.java | 75 -
.../java/org/apache/mahout/math/VectorTest.java | 1135 ---------
.../apache/mahout/math/WeightedVectorTest.java | 88 -
.../als/AlternatingLeastSquaresSolverTest.java | 151 --
.../mahout/math/decomposer/SolverTest.java | 177 --
.../decomposer/hebbian/TestHebbianSolver.java | 207 --
.../decomposer/lanczos/TestLanczosSolver.java | 97 -
.../math/jet/random/DistributionChecks.java | 118 -
.../mahout/math/jet/random/ExponentialTest.java | 102 -
.../mahout/math/jet/random/GammaTest.java | 131 -
.../math/jet/random/NegativeBinomialTest.java | 60 -
.../mahout/math/jet/random/NormalTest.java | 71 -
.../jet/random/engine/MersenneTwisterTest.java | 704 ------
.../apache/mahout/math/jet/stat/GammaTest.java | 138 --
.../mahout/math/jet/stat/ProbabilityTest.java | 196 --
.../mahout/math/list/ObjectArrayListTest.java | 51 -
.../math/random/ChineseRestaurantTest.java | 158 --
.../mahout/math/random/EmpiricalTest.java | 78 -
.../mahout/math/random/IndianBuffetTest.java | 43 -
.../mahout/math/random/MultiNormalTest.java | 81 -
.../mahout/math/random/MultinomialTest.java | 269 --
.../apache/mahout/math/random/NormalTest.java | 62 -
.../mahout/math/random/PoissonSamplerTest.java | 56 -
.../mahout/math/randomized/RandomBlasting.java | 355 ---
.../apache/mahout/math/set/HashUtilsTest.java | 90 -
.../math/solver/EigenDecompositionTest.java | 120 -
.../org/apache/mahout/math/solver/LSMRTest.java | 105 -
.../solver/TestConjugateGradientSolver.java | 231 --
.../mahout/math/ssvd/SequentialBigSvdTest.java | 86 -
.../mahout/math/stats/LogLikelihoodTest.java | 197 --
.../stats/OnlineExponentialAverageTest.java | 69 -
.../mahout/math/stats/OnlineSummarizerTest.java | 108 -
math/src/test/resources/beta-test-data.csv | 1005 --------
math/src/test/resources/hanging-svd.tsv | 90 -
.../resources/negative-binomial-test-data.csv | 62 -
math/src/test/resources/words.txt | 1168 ---------
pom.xml.bu | 1252 ----------
refactor-readme.md | 90 -
spark/pom.xml | 271 ---
spark/src/main/assembly/dependency-reduced.xml | 51 -
.../classifier/naivebayes/SparkNaiveBayes.scala | 170 --
.../org/apache/mahout/common/DrmMetadata.scala | 73 -
.../apache/mahout/common/HDFSPathSearch.scala | 81 -
.../org/apache/mahout/common/HDFSUtil.scala | 28 -
.../apache/mahout/common/Hadoop2HDFSUtil.scala | 83 -
.../mahout/drivers/ItemSimilarityDriver.scala | 213 --
.../mahout/drivers/MahoutSparkDriver.scala | 103 -
.../drivers/MahoutSparkOptionParser.scala | 47 -
.../mahout/drivers/RowSimilarityDriver.scala | 148 --
.../apache/mahout/drivers/TestNBDriver.scala | 108 -
.../drivers/TextDelimitedReaderWriter.scala | 336 ---
.../apache/mahout/drivers/TrainNBDriver.scala | 111 -
.../sparkbindings/SparkDistributedContext.scala | 30 -
.../mahout/sparkbindings/SparkEngine.scala | 387 ---
.../apache/mahout/sparkbindings/blas/ABt.scala | 339 ---
.../apache/mahout/sparkbindings/blas/AewB.scala | 239 --
.../mahout/sparkbindings/blas/AinCoreB.scala | 63 -
.../apache/mahout/sparkbindings/blas/At.scala | 85 -
.../apache/mahout/sparkbindings/blas/AtA.scala | 271 ---
.../apache/mahout/sparkbindings/blas/AtB.scala | 358 ---
.../apache/mahout/sparkbindings/blas/Ax.scala | 63 -
.../mahout/sparkbindings/blas/CbindAB.scala | 126 -
.../mahout/sparkbindings/blas/DrmRddOps.scala | 43 -
.../mahout/sparkbindings/blas/MapBlock.scala | 43 -
.../apache/mahout/sparkbindings/blas/Par.scala | 56 -
.../mahout/sparkbindings/blas/RbindAB.scala | 50 -
.../mahout/sparkbindings/blas/Slicing.scala | 27 -
.../mahout/sparkbindings/blas/package.scala | 217 --
.../drm/CheckpointedDrmSpark.scala | 224 --
.../drm/CheckpointedDrmSparkOps.scala | 16 -
.../mahout/sparkbindings/drm/DrmRddInput.scala | 41 -
.../mahout/sparkbindings/drm/SparkBCast.scala | 27 -
.../mahout/sparkbindings/drm/package.scala | 112 -
.../indexeddataset/IndexedDatasetSpark.scala | 129 -
.../io/MahoutKryoRegistrator.scala | 76 -
.../io/UnsupportedSerializer.scala | 31 -
.../io/WritableKryoSerializer.scala | 47 -
.../apache/mahout/sparkbindings/package.scala | 299 ---
.../mahout/cf/SimilarityAnalysisSuite.scala | 447 ----
.../naivebayes/NBSparkTestSuite.scala | 159 --
.../stats/ClassifierStatsSparkTestSuite.scala | 26 -
.../drivers/ItemSimilarityDriverSuite.scala | 832 -------
.../drivers/RowSimilarityDriverSuite.scala | 139 --
.../TextDelimitedReaderWriterSuite.scala | 53 -
.../math/algorithms/ClusteringSuite.scala | 25 -
.../math/algorithms/PreprocessorSuite.scala | 24 -
.../math/algorithms/RegressionSuite.scala | 25 -
.../math/algorithms/RegressionTestsSuite.scala | 25 -
.../DistributedDecompositionsSuite.scala | 32 -
.../mahout/nlp/tfidf/TFIDFSparkTestSuite.scala | 25 -
.../sparkbindings/SparkBindingsSuite.scala | 52 -
.../mahout/sparkbindings/blas/BlasSuite.scala | 208 --
.../sparkbindings/drm/DrmLikeOpsSuite.scala | 57 -
.../mahout/sparkbindings/drm/DrmLikeSuite.scala | 162 --
.../sparkbindings/drm/RLikeDrmOpsSuite.scala | 179 --
.../mahout/sparkbindings/io/IOSuite.scala | 195 --
.../test/DistributedSparkSuite.scala | 83 -
.../test/LoggerConfiguration.scala | 30 -
src/conf/arff.vector.props | 9 -
src/conf/canopy.props | 14 -
src/conf/cat.props | 4 -
src/conf/cleansvd.props | 3 -
src/conf/clusterdump.props | 0
src/conf/clusterpp.props | 3 -
src/conf/driver.classes.default.props | 69 -
src/conf/evaluateFactorization.props | 0
src/conf/evaluateFactorizationParallel.props | 0
src/conf/fkmeans.props | 17 -
src/conf/flink-config.yaml | 67 -
src/conf/itemsimilarity.props | 9 -
src/conf/kmeans.props | 13 -
src/conf/log4j.xml | 15 -
src/conf/lucene.vector.props | 0
src/conf/matrixmult.props | 6 -
src/conf/parallelALS.props | 0
src/conf/predictFromFactorization.props | 0
src/conf/recommendfactorized.props | 0
src/conf/recommenditembased.props | 14 -
src/conf/rowid.props | 2 -
src/conf/rowsimilarity.props | 8 -
src/conf/runlogistic.props | 1 -
src/conf/seq2sparse.props | 15 -
src/conf/seqdirectory.props | 3 -
src/conf/seqdumper.props | 0
src/conf/seqwiki.props | 0
src/conf/splitDataset.props | 0
src/conf/ssvd.props | 14 -
src/conf/svd.props | 6 -
src/conf/trainlogistic.props | 2 -
src/conf/transpose.props | 2 -
src/conf/vectordump.props | 1 -
.../appended-resources/supplemental-models.xml | 279 ---
src/main/images/logos/ mahout-powered.svg | 630 -----
src/main/images/logos/favicon.ico | Bin 28838 -> 0 bytes
src/main/images/logos/favicon128.png | Bin 5259 -> 0 bytes
src/main/images/logos/favicon16.png | Bin 1009 -> 0 bytes
src/main/images/logos/favicon32.png | Bin 1847 -> 0 bytes
src/main/images/logos/favicon64.png | Bin 3148 -> 0 bytes
src/main/images/logos/mahout-logo-100.png | Bin 19477 -> 0 bytes
src/main/images/logos/mahout-logo-200.png | Bin 46360 -> 0 bytes
src/main/images/logos/mahout-logo-300.png | Bin 70139 -> 0 bytes
src/main/images/logos/mahout-logo-400.png | Bin 55468 -> 0 bytes
.../images/logos/mahout-logo-poweredby-100.png | Bin 24623 -> 0 bytes
.../images/logos/mahout-logo-poweredby-55.png | Bin 11684 -> 0 bytes
.../logos/mahout-logo-transparent-400.png | Bin 61970 -> 0 bytes
src/main/images/logos/mahout-logo.svg | 627 -----
847 files changed, 25722 insertions(+), 103784 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/bin/mahout.bu
----------------------------------------------------------------------
diff --git a/bin/mahout.bu b/bin/mahout.bu
deleted file mode 100755
index 20f9c3d..0000000
--- a/bin/mahout.bu
+++ /dev/null
@@ -1,395 +0,0 @@
-#!/bin/bash
-#
-# The Mahout command script
-#
-# Environment Variables
-#
-# MAHOUT_JAVA_HOME The java implementation to use. Overrides JAVA_HOME.
-#
-# MAHOUT_HEAPSIZE The maximum amount of heap to use, in MB.
-# Default is 4000.
-#
-# HADOOP_CONF_DIR The location of a hadoop config directory
-#
-# MAHOUT_OPTS Extra Java runtime options.
-#
-# MAHOUT_CONF_DIR The location of the program short-name to class name
-# mappings and the default properties files
-# defaults to "$MAHOUT_HOME/src/conf"
-#
-# MAHOUT_LOCAL set to anything other than an empty string to force
-# mahout to run locally even if
-# HADOOP_CONF_DIR and HADOOP_HOME are set
-#
-# MAHOUT_CORE set to anything other than an empty string to force
-# mahout to run in developer 'core' mode, just as if the
-# -core option was presented on the command-line
-# Command-line Options
-#
-# -core -core is used to switch into 'developer mode' when
-# running mahout locally. If specified, the classes
-# from the 'target/classes' directories in each project
-# are used. Otherwise classes will be retrieved from
-# jars in the binary release collection or *-job.jar files
-# found in build directories. When running on hadoop
-# the job files will always be used.
-
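# Illustration only (not from the original mahout.bu; the heap size and input
# path are invented): since the settings above are ordinary environment
# variables, a forced-local run could look like
#
#   MAHOUT_HEAPSIZE=2048 MAHOUT_LOCAL=true bin/mahout seqdumper -i /tmp/seqfile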
-#
-#/**
-# * Licensed to the Apache Software Foundation (ASF) under one or more
-# * contributor license agreements. See the NOTICE file distributed with
-# * this work for additional information regarding copyright ownership.
-# * The ASF licenses this file to You under the Apache License, Version 2.0
-# * (the "License"); you may not use this file except in compliance with
-# * the License. You may obtain a copy of the License at
-# *
-# * http://www.apache.org/licenses/LICENSE-2.0
-# *
-# * Unless required by applicable law or agreed to in writing, software
-# * distributed under the License is distributed on an "AS IS" BASIS,
-# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# * See the License for the specific language governing permissions and
-# * limitations under the License.
-# */
-
-cygwin=false
-case "`uname`" in
-CYGWIN*) cygwin=true;;
-esac
-
-# Check that mahout home is set, if not set it to one dir up.
-
-# resolve links - $0 may be a softlink
-THIS="$0"
-while [ -h "$THIS" ]; do
- ls=`ls -ld "$THIS"`
- link=`expr "$ls" : '.*-> \(.*\)$'`
- if expr "$link" : '.*/.*' > /dev/null; then
- THIS="$link"
- else
- THIS=`dirname "$THIS"`/"$link"
- fi
-done
-
-IS_CORE=0
-if [ "$1" == "-core" ] ; then
- IS_CORE=1
- shift
-fi
-
-if [ "$1" == "-spark" ]; then
- SPARK=1
- shift
-fi
-
-if [ "$1" == "spark-shell" ]; then
- SPARK=1
-fi
-
-if [ "$1" == "spark-itemsimilarity" ]; then
- SPARK=1
-fi
-
-if [ "$1" == "spark-rowsimilarity" ]; then
- SPARK=1
-fi
-
-if [ "$1" == "spark-trainnb" ]; then
- SPARK=1
-fi
-
-if [ "$1" == "spark-testnb" ]; then
- SPARK=1
-fi
-
-if [ "$MAHOUT_CORE" != "" ]; then
- IS_CORE=1
-fi
-
-if [ "$1" == "h2o-node" ]; then
- H2O=1
-fi
-
-# some directories
-THIS_DIR=`dirname "$THIS"`
-MAHOUT_HOME=`cd "$THIS_DIR/.." ; pwd`
-
-# some Java parameters
-if [ "$MAHOUT_JAVA_HOME" != "" ]; then
- #echo "run java in $MAHOUT_JAVA_HOME"
- JAVA_HOME=$MAHOUT_JAVA_HOME
-fi
-
-if [ "$JAVA_HOME" = "" ]; then
- echo "Error: JAVA_HOME is not set."
- exit 1
-fi
-
-JAVA=$JAVA_HOME/bin/java
-JAVA_HEAP_MAX=-Xmx4g
-
-# check envvars which might override default args
-if [ "$MAHOUT_HEAPSIZE" != "" ]; then
- #echo "run with heapsize $MAHOUT_HEAPSIZE"
- JAVA_HEAP_MAX="-Xmx""$MAHOUT_HEAPSIZE""m"
- #echo $JAVA_HEAP_MAX
-fi
-
-if [ "x$MAHOUT_CONF_DIR" = "x" ]; then
- if [ -d $MAHOUT_HOME/src/conf ]; then
- MAHOUT_CONF_DIR=$MAHOUT_HOME/src/conf
- else
- if [ -d $MAHOUT_HOME/conf ]; then
- MAHOUT_CONF_DIR=$MAHOUT_HOME/conf
- else
- echo No MAHOUT_CONF_DIR found
- fi
- fi
-fi
-
-
-# CLASSPATH initially contains $MAHOUT_CONF_DIR, or defaults to $MAHOUT_HOME/src/conf
-CLASSPATH=${CLASSPATH}:$MAHOUT_CONF_DIR
-
-if [ "$MAHOUT_LOCAL" != "" ]; then
- echo "MAHOUT_LOCAL is set, so we don't add HADOOP_CONF_DIR to classpath."
-elif [ -n "$HADOOP_CONF_DIR" ] ; then
- echo "MAHOUT_LOCAL is not set; adding HADOOP_CONF_DIR to classpath."
- CLASSPATH=${CLASSPATH}:$HADOOP_CONF_DIR
-fi
-
-CLASSPATH=${CLASSPATH}:$JAVA_HOME/lib/tools.jar
-
-# so that filenames w/ spaces are handled correctly in loops below
-IFS=
-
-if [ $IS_CORE == 0 ]
-then
- # add release dependencies to CLASSPATH
- for f in $MAHOUT_HOME/lib/*.jar; do
- CLASSPATH=${CLASSPATH}:$f;
- done
-
- if [ "$SPARK" != "1" ]; then
- if [ "$SPARK_HOME" == "" ]; then
- echo "Have you set SPARK_HOME ?"
- fi
- # add dev targets if they exist
- for f in $MAHOUT_HOME/examples/target/mahout-examples-*-job.jar $MAHOUT_HOME/mahout-examples-*-job.jar ; do
- CLASSPATH=${CLASSPATH}:$f;
- done
- fi
-
- # add scala dev target
- for f in $MAHOUT_HOME/math-scala/target/mahout-math-scala_*.jar ; do
- CLASSPATH=${CLASSPATH}:$f;
- done
-
- if [ "$H2O" == "1" ]; then
- for f in $MAHOUT_HOME/hdfs/target/mahout-hdfs-*.jar; do
- CLASSPATH=${CLASSPATH}:$f;
- done
-
- for f in $MAHOUT_HOME/h2o/target/mahout-h2o*.jar; do
- CLASSPATH=${CLASSPATH}:$f;
- done
-
- fi
-
- # add jars for running from the command line if we requested shell or spark CLI driver
- if [ "$SPARK" == "1" ]; then
-
- for f in $MAHOUT_HOME/lib/mahout-hdfs-*.jar ; do
- CLASSPATH=${CLASSPATH}:$f;
- done
-
- for f in $MAHOUT_HOME/lib/mahout-core-*.jar ; do
- CLASSPATH=${CLASSPATH}:$f;
- done
-
- for f in $MAHOUT_HOME/lib/spark_*.jar ; do
- CLASSPATH=${CLASSPATH}:$f;
- done
-
- for f in $MAHOUT_HOME/lib/spark-cli_*.jar ; do
- CLASSPATH=${CLASSPATH}:$f;
- done
-
- # viennacl jars- may or may not be available depending on build profile
- for f in $MAHOUT_HOME/viennacl/target/mahout-native-viennacl_*.jar ; do
- CLASSPATH=${CLASSPATH}:$f;
- done
-
- # viennacl jars- may or may not be available depending on build profile
- for f in $MAHOUT_HOME/viennacl-omp/target/mahout-native-viennacl-omp_*.jar ; do
- CLASSPATH=${CLASSPATH}:$f;
- done
-
-
- SPARK_CP_BIN="${MAHOUT_HOME}/bin/compute-classpath.sh"
- if [ -x "${SPARK_CP_BIN}" ]; then
- SPARK_CLASSPATH=$("${SPARK_CP_BIN}" 2>/dev/null)
- CLASSPATH="${CLASSPATH}:${SPARK_CLASSPATH}"
- else
- echo "Cannot find Spark classpath. Is 'SPARK_HOME' set?"
- exit -1
- fi
-
- SPARK_ASSEMBLY_BIN="${MAHOUT_HOME}/bin/mahout-spark-class.sh"
- if [ -x "${SPARK_ASSEMBLY_BIN}" ]; then
- SPARK_ASSEMBLY_CLASSPATH=$("${SPARK_ASSEMBLY_BIN}" 2>/dev/null)
- CLASSPATH="${CLASSPATH}:${SPARK_ASSEMBLY_BIN}"
- else
- echo "Cannot find Spark assembly classpath. Is 'SPARK_HOME' set?"
- exit -1
- fi
- fi
-
- # add vcl jars at any point.
- # viennacl jars- may or may not be available depending on build profile
- for f in $MAHOUT_HOME/viennacl/target/mahout-native-viennacl_*.jar ; do
- CLASSPATH=${CLASSPATH}:$f;
- done
-
- # viennacl jars- may or may not be available depending on build profile
- for f in $MAHOUT_HOME/viennacl-omp/target/mahout-native-viennacl-omp_*.jar ; do
- CLASSPATH=${CLASSPATH}:$f;
- done
-
- # add release dependencies to CLASSPATH
- for f in $MAHOUT_HOME/lib/*.jar; do
- CLASSPATH=${CLASSPATH}:$f;
- done
-else
- CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/math/target/classes
- CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/hdfs/target/classes
- CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/mr/target/classes
- CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/integration/target/classes
- CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/examples/target/classes
- CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/math-scala/target/classes
- CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/spark/target/classes
- CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/spark-shell/target/classes
- CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/h2o/target/classes
-fi
-
-# add development dependencies to CLASSPATH
-if [ "$SPARK" != "1" ]; then
- for f in $MAHOUT_HOME/examples/target/dependency/*.jar; do
- CLASSPATH=${CLASSPATH}:$f;
- done
-fi
-
-
-# cygwin path translation
-if $cygwin; then
- CLASSPATH=`cygpath -p -w "$CLASSPATH"`
-fi
-
-# restore ordinary behaviour
-unset IFS
-JARS=$(echo "$MAHOUT_HOME"/*.jar | tr ' ' ',')
-case "$1" in
- (spark-shell)
- save_stty=$(stty -g 2>/dev/null);
-    "$SPARK_HOME/bin/spark-shell" --jars "$JARS" -i "$MAHOUT_HOME/bin/load-shell.scala" --conf spark.kryo.referenceTracking=false --conf spark.kryo.registrator=org.apache.mahout.sparkbindings.io.MahoutKryoRegistrator --conf spark.kryoserializer.buffer=32k --conf spark.kryoserializer.buffer.max=600m --conf spark.serializer=org.apache.spark.serializer.KryoSerializer "$@"
- stty sane; stty $save_stty
- ;;
- # Spark CLI drivers go here
- (spark-itemsimilarity)
- shift
- "$JAVA" $JAVA_HEAP_MAX -classpath "$CLASSPATH" "org.apache.mahout.drivers.ItemSimilarityDriver" "$@"
- ;;
- (spark-rowsimilarity)
- shift
- "$JAVA" $JAVA_HEAP_MAX -classpath "$CLASSPATH" "org.apache.mahout.drivers.RowSimilarityDriver" "$@"
- ;;
- (spark-trainnb)
- shift
- "$JAVA" $JAVA_HEAP_MAX -classpath "$CLASSPATH" "org.apache.mahout.drivers.TrainNBDriver" "$@"
- ;;
- (spark-testnb)
- shift
- "$JAVA" $JAVA_HEAP_MAX -classpath "$CLASSPATH" "org.apache.mahout.drivers.TestNBDriver" "$@"
- ;;
-
- (h2o-node)
- shift
- "$JAVA" $JAVA_HEAP_MAX -classpath "$CLASSPATH" "water.H2O" -md5skip "$@" -name mah2out
- ;;
- (*)
-
- # default log directory & file
- if [ "$MAHOUT_LOG_DIR" = "" ]; then
- MAHOUT_LOG_DIR="$MAHOUT_HOME/logs"
- fi
- if [ "$MAHOUT_LOGFILE" = "" ]; then
- MAHOUT_LOGFILE='mahout.log'
- fi
-
- #Fix log path under cygwin
- if $cygwin; then
- MAHOUT_LOG_DIR=`cygpath -p -w "$MAHOUT_LOG_DIR"`
- fi
-
- MAHOUT_OPTS="$MAHOUT_OPTS -Dhadoop.log.dir=$MAHOUT_LOG_DIR"
- MAHOUT_OPTS="$MAHOUT_OPTS -Dhadoop.log.file=$MAHOUT_LOGFILE"
-
-
- if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then
- MAHOUT_OPTS="$MAHOUT_OPTS -Djava.library.path=$JAVA_LIBRARY_PATH"
- fi
-
- CLASS=org.apache.mahout.driver.MahoutDriver
-
- for f in $MAHOUT_HOME/examples/target/mahout-examples-*-job.jar $MAHOUT_HOME/mahout-examples-*-job.jar ; do
- if [ -e "$f" ]; then
- MAHOUT_JOB=$f
- fi
- done
-
- # run it
-
- HADOOP_BINARY=$(PATH="${HADOOP_HOME:-${HADOOP_PREFIX}}/bin:$PATH" which hadoop 2>/dev/null)
- if [ -x "$HADOOP_BINARY" ] ; then
- HADOOP_BINARY_CLASSPATH=$("$HADOOP_BINARY" classpath)
- fi
- if [ ! -x "$HADOOP_BINARY" ] || [ "$MAHOUT_LOCAL" != "" ] ; then
- if [ ! -x "$HADOOP_BINARY" ] ; then
-      echo "hadoop binary is not in PATH, HADOOP_HOME/bin, or HADOOP_PREFIX/bin; running locally"
- elif [ "$MAHOUT_LOCAL" != "" ] ; then
- echo "MAHOUT_LOCAL is set, running locally"
- fi
- CLASSPATH="${CLASSPATH}:${MAHOUT_HOME}/lib/hadoop/*"
- case $1 in
- (classpath)
- echo $CLASSPATH
- ;;
- (*)
- exec "$JAVA" $JAVA_HEAP_MAX $MAHOUT_OPTS -classpath "$CLASSPATH" $CLASS "$@"
- esac
- else
- echo "Running on hadoop, using $HADOOP_BINARY and HADOOP_CONF_DIR=$HADOOP_CONF_DIR"
-
- if [ "$MAHOUT_JOB" = "" ] ; then
- echo "ERROR: Could not find mahout-examples-*.job in $MAHOUT_HOME or $MAHOUT_HOME/examples/target, please run 'mvn install' to create the .job file"
- exit 1
- else
- case "$1" in
- (hadoop)
- shift
- export HADOOP_CLASSPATH=$MAHOUT_CONF_DIR:${HADOOP_CLASSPATH}:$CLASSPATH
- exec "$HADOOP_BINARY" "$@"
- ;;
- (classpath)
- echo $CLASSPATH
- ;;
- (*)
- echo "MAHOUT-JOB: $MAHOUT_JOB"
- export HADOOP_CLASSPATH=$MAHOUT_CONF_DIR:${HADOOP_CLASSPATH}
- exec "$HADOOP_BINARY" jar $MAHOUT_JOB $CLASS "$@"
- esac
- fi
- fi
- ;;
-esac
-

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/conf/arff.vector.props
----------------------------------------------------------------------
diff --git a/community/mahout-mr/conf/arff.vector.props b/community/mahout-mr/conf/arff.vector.props
new file mode 100644
index 0000000..c8faebf
--- /dev/null
+++ b/community/mahout-mr/conf/arff.vector.props
@@ -0,0 +1,9 @@
+# The following parameters must be specified
+#d|input = /path/to/input
+#o|output = /path/to/output
+#t|dictOut = /path/to/dictionaryFileOrDirectory
+
+# The following parameters all have default values if not specified
+#m|max = <Max number of vectors to output. Defaults to Long.MAX_VALUE>
+#e|outputWriter = <Defaults to 'seq' for SequenceFileVectorWriter>
+#l|delimiter = <Delimiter for outputting the dictionary. Defaults to '\t'>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/conf/canopy.props
----------------------------------------------------------------------
diff --git a/community/mahout-mr/conf/canopy.props b/community/mahout-mr/conf/canopy.props
new file mode 100644
index 0000000..f79f1e9
--- /dev/null
+++ b/community/mahout-mr/conf/canopy.props
@@ -0,0 +1,14 @@
+# The following parameters must be specified
+#i|input = /path/to/input
+#o|output = /path/to/output
+#t1|t1 = <T1 threshold value>
+#t2|t2 = <T2 threshold value>
+
+# The following parameters all have default values if not specified
+#ow|overwrite = <clear output directory if present>
+#cl|clustering = <cluster points if present>
+#dm|distance = <distance measure class name. Default: SquaredEuclideanDistanceMeasure>
+
+
+
+

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/conf/cat.props
----------------------------------------------------------------------
diff --git a/community/mahout-mr/conf/cat.props b/community/mahout-mr/conf/cat.props
new file mode 100644
index 0000000..6b1ddb1
--- /dev/null
+++ b/community/mahout-mr/conf/cat.props
@@ -0,0 +1,2 @@
+#lambda|lambda =
+#passes|passes =

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/conf/cleansvd.props
----------------------------------------------------------------------
diff --git a/community/mahout-mr/conf/cleansvd.props b/community/mahout-mr/conf/cleansvd.props
new file mode 100644
index 0000000..0c4e804
--- /dev/null
+++ b/community/mahout-mr/conf/cleansvd.props
@@ -0,0 +1,3 @@
+#ci|corpusInput =
+#ei|eigenInput =
+#o|output =

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/conf/clusterdump.props
----------------------------------------------------------------------
diff --git a/community/mahout-mr/conf/clusterdump.props b/community/mahout-mr/conf/clusterdump.props
new file mode 100644
index 0000000..e69de29

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/conf/clusterpp.props
----------------------------------------------------------------------
diff --git a/community/mahout-mr/conf/clusterpp.props b/community/mahout-mr/conf/clusterpp.props
new file mode 100644
index 0000000..5b96a89
--- /dev/null
+++ b/community/mahout-mr/conf/clusterpp.props
@@ -0,0 +1,3 @@
+# The following parameters must be specified
+#i|input = /path/to/initial/cluster/output
+#o|output = /path/to/output

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/conf/driver.classes.default.props
----------------------------------------------------------------------
diff --git a/community/mahout-mr/conf/driver.classes.default.props b/community/mahout-mr/conf/driver.classes.default.props
new file mode 100644
index 0000000..cb37efb
--- /dev/null
+++ b/community/mahout-mr/conf/driver.classes.default.props
@@ -0,0 +1,69 @@
+#Utils
+org.apache.mahout.utils.vectors.VectorDumper = vectordump : Dump vectors from a sequence file to text
+org.apache.mahout.utils.clustering.ClusterDumper = clusterdump : Dump cluster output to text
+org.apache.mahout.utils.SequenceFileDumper = seqdumper : Generic Sequence File dumper
+org.apache.mahout.utils.vectors.lucene.Driver = lucene.vector : Generate Vectors from a Lucene index
+org.apache.mahout.utils.vectors.arff.Driver = arff.vector : Generate Vectors from an ARFF file or directory
+org.apache.mahout.utils.vectors.RowIdJob = rowid : Map SequenceFile<Text,VectorWritable> to {SequenceFile<IntWritable,VectorWritable>, SequenceFile<IntWritable,Text>}
+org.apache.mahout.utils.SplitInput = split : Split Input data into test and train sets
+org.apache.mahout.utils.MatrixDumper = matrixdump : Dump matrix in CSV format
+org.apache.mahout.utils.regex.RegexConverterDriver = regexconverter : Convert text files on a per line basis based on regular expressions
+org.apache.mahout.text.SequenceFilesFromDirectory = seqdirectory : Generate sequence files (of Text) from a directory
+org.apache.mahout.vectorizer.SparseVectorsFromSequenceFiles = seq2sparse: Sparse Vector generation from Text sequence files
+org.apache.mahout.vectorizer.EncodedVectorsFromSequenceFiles = seq2encoded: Encoded Sparse Vector generation from Text sequence files
+org.apache.mahout.text.WikipediaToSequenceFile = seqwiki : Wikipedia xml dump to sequence file
+org.apache.mahout.text.SequenceFilesFromMailArchives = seqmailarchives : Creates SequenceFile from a directory containing gzipped mail archives
+org.apache.mahout.clustering.streaming.tools.ResplitSequenceFiles = resplit : Splits a set of SequenceFiles into a number of equal splits
+org.apache.mahout.clustering.streaming.tools.ClusterQualitySummarizer = qualcluster : Runs clustering experiments and summarizes results in a CSV
+org.apache.mahout.classifier.df.tools.Describe = describe : Describe the fields and target variable in a data set
+
+#Math
+org.apache.mahout.math.hadoop.TransposeJob = transpose : Take the transpose of a matrix
+org.apache.mahout.math.hadoop.MatrixMultiplicationJob = matrixmult : Take the product of two matrices
+org.apache.mahout.math.hadoop.decomposer.DistributedLanczosSolver = svd : Lanczos Singular Value Decomposition
+org.apache.mahout.math.hadoop.decomposer.EigenVerificationJob = cleansvd : Cleanup and verification of SVD output
+org.apache.mahout.math.hadoop.similarity.cooccurrence.RowSimilarityJob = rowsimilarity : Compute the pairwise similarities of the rows of a matrix
+org.apache.mahout.math.hadoop.similarity.VectorDistanceSimilarityJob = vecdist : Compute the distances between a set of Vectors (or Cluster or Canopy, they must fit in memory) and a list of Vectors
+org.apache.mahout.math.hadoop.stochasticsvd.SSVDCli = ssvd : Stochastic SVD
+
+#Clustering
+org.apache.mahout.clustering.kmeans.KMeansDriver = kmeans : K-means clustering
+org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver = fkmeans : Fuzzy K-means clustering
+org.apache.mahout.clustering.lda.cvb.CVB0Driver = cvb : LDA via Collapsed Variational Bayes (0th deriv. approx.)
+org.apache.mahout.clustering.lda.cvb.InMemoryCollapsedVariationalBayes0 = cvb0_local : LDA via Collapsed Variational Bayes, in memory locally.
+org.apache.mahout.clustering.canopy.CanopyDriver = canopy : Canopy clustering
+org.apache.mahout.clustering.spectral.kmeans.SpectralKMeansDriver = spectralkmeans : Spectral k-means clustering
+org.apache.mahout.clustering.topdown.postprocessor.ClusterOutputPostProcessorDriver = clusterpp : Groups clustering output into clusters
+org.apache.mahout.clustering.streaming.mapreduce.StreamingKMeansDriver = streamingkmeans : Streaming k-means clustering
+
+#Classification
+#new bayes
+org.apache.mahout.classifier.naivebayes.training.TrainNaiveBayesJob = trainnb : Train the Vector-based Bayes classifier
+org.apache.mahout.classifier.naivebayes.test.TestNaiveBayesDriver = testnb : Test the Vector-based Bayes classifier
+
+#SGD
+org.apache.mahout.classifier.sgd.TrainLogistic = trainlogistic : Train a logistic regression using stochastic gradient descent
+org.apache.mahout.classifier.sgd.RunLogistic = runlogistic : Run a logistic regression model against CSV data
+org.apache.mahout.classifier.sgd.PrintResourceOrFile = cat : Print a file or resource as the logistic regression models would see it
+org.apache.mahout.classifier.sgd.TrainAdaptiveLogistic = trainAdaptiveLogistic : Train an AdaptiveLogisticRegression model
+org.apache.mahout.classifier.sgd.ValidateAdaptiveLogistic = validateAdaptiveLogistic : Validate an AdaptiveLogisticRegression model against a hold-out data set
+org.apache.mahout.classifier.sgd.RunAdaptiveLogistic = runAdaptiveLogistic : Score new production data using a previously trained and validated AdaptiveLogisticRegression model
+#HMM
+org.apache.mahout.classifier.sequencelearning.hmm.BaumWelchTrainer = baumwelch : Baum-Welch algorithm for unsupervised HMM training
+org.apache.mahout.classifier.sequencelearning.hmm.ViterbiEvaluator = viterbi : Viterbi decoding of hidden states from given output states sequence
+org.apache.mahout.classifier.sequencelearning.hmm.RandomSequenceGenerator = hmmpredict : Generate a random sequence of observations from a given HMM
+#Classifier Utils
+org.apache.mahout.classifier.ConfusionMatrixDumper = cmdump : Dump confusion matrix in HTML or text formats
+
+#Recommenders
+org.apache.mahout.cf.taste.hadoop.als.DatasetSplitter = splitDataset : split a rating dataset into training and probe parts
+org.apache.mahout.cf.taste.hadoop.als.FactorizationEvaluator = evaluateFactorization : compute RMSE and MAE of a rating matrix factorization against probes
+org.apache.mahout.cf.taste.hadoop.similarity.item.ItemSimilarityJob = itemsimilarity : Compute the item-item-similarities for item-based collaborative filtering
+org.apache.mahout.cf.taste.hadoop.item.RecommenderJob = recommenditembased : Compute recommendations using item-based collaborative filtering
+org.apache.mahout.cf.taste.hadoop.als.ParallelALSFactorizationJob = parallelALS : ALS-WR factorization of a rating matrix
+org.apache.mahout.cf.taste.hadoop.als.RecommenderJob = recommendfactorized : Compute recommendations using the factorization of a rating matrix
+prepare20newsgroups = deprecated : Try the new vector-backed naive Bayes classifier; see examples/bin/classify-20newsgroups.sh
+trainclassifier = deprecated : Try the new vector-backed naive Bayes classifier; see examples/bin/classify-20newsgroups.sh
+testclassifier = deprecated : Try the new vector-backed naive Bayes classifier; see examples/bin/classify-20newsgroups.sh
+lda = deprecated : Try the new Collapsed Variational Bayes LDA: bin/mahout cvb or bin/mahout cvb0_local
+ldatopics = deprecated : Try the new Collapsed Variational Bayes LDA: bin/mahout cvb or bin/mahout cvb0_local
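Each entry above maps a fully qualified driver class to a "shortcut : description" pair. As a rough illustration only, here is a sketch of how such a file could be read with java.util.Properties; the class ShortcutTable and its load() helper are hypothetical and not necessarily how MahoutDriver actually resolves shortcuts:

import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Properties;

public class ShortcutTable {

  // Returns a map from shortcut (e.g. "kmeans") to driver class name
  // (e.g. "org.apache.mahout.clustering.kmeans.KMeansDriver").
  public static Map<String, String> load(String path) throws IOException {
    Properties props = new Properties();
    try (FileReader in = new FileReader(path)) {
      props.load(in);
    }
    Map<String, String> shortcuts = new HashMap<>();
    for (String className : props.stringPropertyNames()) {
      // the value looks like "kmeans : K-means clustering";
      // the shortcut is the part before the first ':'
      String value = props.getProperty(className);
      int colon = value.indexOf(':');
      String shortcut = (colon >= 0 ? value.substring(0, colon) : value).trim();
      shortcuts.put(shortcut, className);
    }
    return shortcuts;
  }
}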

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/conf/evaluateFactorization.props
----------------------------------------------------------------------
diff --git a/community/mahout-mr/conf/evaluateFactorization.props b/community/mahout-mr/conf/evaluateFactorization.props
new file mode 100644
index 0000000..e69de29

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/conf/evaluateFactorizationParallel.props
----------------------------------------------------------------------
diff --git a/community/mahout-mr/conf/evaluateFactorizationParallel.props b/community/mahout-mr/conf/evaluateFactorizationParallel.props
new file mode 100644
index 0000000..e69de29

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/conf/fkmeans.props
----------------------------------------------------------------------
diff --git a/community/mahout-mr/conf/fkmeans.props b/community/mahout-mr/conf/fkmeans.props
new file mode 100644
index 0000000..ad994d6
--- /dev/null
+++ b/community/mahout-mr/conf/fkmeans.props
@@ -0,0 +1,17 @@
+# The following parameters must be specified
+#i|input = /path/to/input
+#c|clusters = /path/to/initial/clusters
+#o|output = /path/to/output
+#m|m = <the fuzziness coefficient; must be greater than 1.0>
+#x|max = <the maximum number of iterations to attempt>
+
+# The following parameters all have default values if not specified
+#ow|overwrite = <clear output directory if present>
+#dm|distance = <distance measure class name. Default: SquaredEuclideanDistanceMeasure>
+#cd|convergenceDelta = <the convergence threshold. Default: 0.5>
+#u|numMap = <the number of mapper tasks to launch. Default: 10>
+#r|numReduce = <the number of reduce tasks to launch. Default: 2>
+#cl|clustering = <cluster points if present>
+#e|emitMostLikely = <emit most likely cluster if clustering. Default: true>
+#t|threshold = <threshold if clustering and not emitMostLikely. Default: 0.0>
+#rs|randomSeed = <Value to seed the RNG with, if set>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/conf/flink-config.yaml
----------------------------------------------------------------------
diff --git a/community/mahout-mr/conf/flink-config.yaml b/community/mahout-mr/conf/flink-config.yaml
new file mode 100644
index 0000000..968cb04
--- /dev/null
+++ b/community/mahout-mr/conf/flink-config.yaml
@@ -0,0 +1,67 @@
+################################################################################
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+
+#==============================================================================
+# Common
+#==============================================================================
+
+# The number of task slots that each TaskManager offers. Each slot runs one parallel pipeline.
+
+taskmanager.numberOfTaskSlots: 1
+
+# The default parallelism used for programs that did not specify any other parallelism.
+
+parallelism.default: 4
+
+#==============================================================================
+# Advanced
+#==============================================================================
+
+# The number of buffers for the network stack.
+#
+# taskmanager.network.numberOfBuffers: 2048
+
+
+# Directories for temporary files.
+#
+# Add a delimited list for multiple directories, using the system directory
+# delimiter (colon ':' on unix) or a comma, e.g.:
+# /data1/tmp:/data2/tmp:/data3/tmp
+#
+# Note: Each directory entry is read from and written to by a different I/O
+# thread. You can include the same directory multiple times in order to create
+# multiple I/O threads against that directory. This is for example relevant for
+# high-throughput RAIDs.
+#
+# If not specified, the system-specific Java temporary directory (java.io.tmpdir
+# property) is taken.
+#
+taskmanager.tmp.dirs: /tmp/mahoutcache
+
+
+# Path to the Hadoop configuration directory.
+#
+# This configuration is used when writing into HDFS. Unless specified otherwise,
+# HDFS file creation will use HDFS default settings with respect to block-size,
+# replication factor, etc.
+#
+# You can also directly specify the paths to hdfs-default.xml and hdfs-site.xml
+# via keys 'fs.hdfs.hdfsdefault' and 'fs.hdfs.hdfssite'.
+#
+# fs.hdfs.hadoopconf: /path/to/hadoop/conf/

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/conf/itemsimilarity.props
----------------------------------------------------------------------
diff --git a/community/mahout-mr/conf/itemsimilarity.props b/community/mahout-mr/conf/itemsimilarity.props
new file mode 100644
index 0000000..fdc3322
--- /dev/null
+++ b/community/mahout-mr/conf/itemsimilarity.props
@@ -0,0 +1,9 @@
+# The following parameters must be specified
+#i|input = /path/to/input
+#o|output = /path/to/output
+#s|similarityClassname = <Name of distributed similarity class to instantiate>
+
+# The following parameters all have default values if not specified
+#m|maxSimilaritiesPerItem = <try to cap the number of similar items per item to this number. Default: 100>
+#mo|maxCooccurrencesPerItem = <try to cap the number of cooccurrences per item to this number. Default: 100>
+#b|booleanData = <Treat input as without pref values. Default: false>

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/conf/kmeans.props
----------------------------------------------------------------------
diff --git a/community/mahout-mr/conf/kmeans.props b/community/mahout-mr/conf/kmeans.props
new file mode 100644
index 0000000..1b54e80
--- /dev/null
+++ b/community/mahout-mr/conf/kmeans.props
@@ -0,0 +1,13 @@
+# The following parameters must be specified
+#i|input = /path/to/input
+#c|clusters = /path/to/initial/clusters
+#o|output = /path/to/output
+#x|max = <the maximum number of iterations to attempt>
+
+# The following parameters all have default values if not specified
+#ow|overwrite = <clear output directory if present>
+#cl|clustering = <cluster points if present>
+#dm|distance = <distance measure class name. Default: SquaredEuclideanDistanceMeasure>
+#cd|convergenceDelta = <the convergence threshold. Default: 0.5>
+#r|numReduce = <the number of reduce tasks to launch. Default: 1>
+#rs|randomSeed = <Value to seed the RNG with, if set>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/conf/log4j.xml
----------------------------------------------------------------------
diff --git a/community/mahout-mr/conf/log4j.xml b/community/mahout-mr/conf/log4j.xml
new file mode 100644
index 0000000..6231b48
--- /dev/null
+++ b/community/mahout-mr/conf/log4j.xml
@@ -0,0 +1,15 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE log4j:configuration SYSTEM "log4j.dtd">
+<log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/">
+ <appender class="org.apache.log4j.ConsoleAppender" name="console">
+ <param value="System.err" name="target"/>
+ <layout class="org.apache.log4j.PatternLayout">
+ <param value="%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n" name="ConversionPattern"/>
+ </layout>
+ </appender>
+ <!-- Change the level below to adjust logging levels. -->
+ <root>
+ <level value="info"/>
+ <appender-ref ref="console"/>
+ </root>
+</log4j:configuration>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/conf/lucene.vector.props
----------------------------------------------------------------------
diff --git a/community/mahout-mr/conf/lucene.vector.props b/community/mahout-mr/conf/lucene.vector.props
new file mode 100644
index 0000000..e69de29

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/conf/matrixmult.props
----------------------------------------------------------------------
diff --git a/community/mahout-mr/conf/matrixmult.props b/community/mahout-mr/conf/matrixmult.props
new file mode 100644
index 0000000..95218b3
--- /dev/null
+++ b/community/mahout-mr/conf/matrixmult.props
@@ -0,0 +1,6 @@
+#nra|numRowsA =
+#nca|numColsA =
+#nrb|numRowsB =
+#ncb|numColsB =
+#ia|inputPathA =
+#ib|inputPathB =

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/conf/parallelALS.props
----------------------------------------------------------------------
diff --git a/community/mahout-mr/conf/parallelALS.props b/community/mahout-mr/conf/parallelALS.props
new file mode 100644
index 0000000..e69de29

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/conf/predictFromFactorization.props
----------------------------------------------------------------------
diff --git a/community/mahout-mr/conf/predictFromFactorization.props b/community/mahout-mr/conf/predictFromFactorization.props
new file mode 100644
index 0000000..e69de29

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/conf/recommendfactorized.props
----------------------------------------------------------------------
diff --git a/community/mahout-mr/conf/recommendfactorized.props b/community/mahout-mr/conf/recommendfactorized.props
new file mode 100644
index 0000000..e69de29

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/conf/recommenditembased.props
----------------------------------------------------------------------
diff --git a/community/mahout-mr/conf/recommenditembased.props b/community/mahout-mr/conf/recommenditembased.props
new file mode 100644
index 0000000..68375d9
--- /dev/null
+++ b/community/mahout-mr/conf/recommenditembased.props
@@ -0,0 +1,14 @@
+# The following parameters must be specified
+#i|input = /path/to/input
+#o|output = /path/to/output
+
+# The following parameters all have default values if not specified
+#n|numRecommendations = <Number of recommendations per user. Default: 10>
+#u|usersFile = <File of users to recommend for. Default: null>
+#i|itemsFile = <File of items to recommend for. Default: null>
+#f|filterFile = <File containing comma-separated userID,itemID pairs. Used to exclude the item from the recommendations for that user. Default: null>
+#b|booleanData = <Treat input as without pref values. Default: false>
+#mp|maxPrefsPerUser = <Maximum number of preferences considered per user in final recommendation phase. Default: 10>
+#m|maxSimilaritiesPerItem = <Maximum number of similarities considered per item. Default: 100>
+#mo|maxCooccurrencesPerItem = <try to cap the number of cooccurrences per item to this number. Default: 100>
+#s|similarityClassname = <Name of distributed similarity class to instantiate. Default: org.apache.mahout.math.hadoop.similarity.vector.DistributedCooccurrenceVectorSimilarity>

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/conf/rowid.props
----------------------------------------------------------------------
diff --git a/community/mahout-mr/conf/rowid.props b/community/mahout-mr/conf/rowid.props
new file mode 100644
index 0000000..a1a040e
--- /dev/null
+++ b/community/mahout-mr/conf/rowid.props
@@ -0,0 +1,2 @@
+#i|input =
+#o|output =

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/conf/rowsimilarity.props
----------------------------------------------------------------------
diff --git a/community/mahout-mr/conf/rowsimilarity.props b/community/mahout-mr/conf/rowsimilarity.props
new file mode 100644
index 0000000..4774bcd
--- /dev/null
+++ b/community/mahout-mr/conf/rowsimilarity.props
@@ -0,0 +1,8 @@
+# The following parameters must be specified
+#i|input = /path/to/input
+#o|output = /path/to/output
+#r|numberOfColumns = <Number of columns in the input matrix>
+#s|similarityClassname = <Name of distributed similarity class to instantiate>
+
+# The following parameters all have default values if not specified
+#m|maxSimilaritiesPerRow = <Number of maximum similarities per row. Default: 100>

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/conf/runlogistic.props
----------------------------------------------------------------------
diff --git a/community/mahout-mr/conf/runlogistic.props b/community/mahout-mr/conf/runlogistic.props
new file mode 100644
index 0000000..0519ecb
--- /dev/null
+++ b/community/mahout-mr/conf/runlogistic.props
@@ -0,0 +1 @@
+
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/conf/seq2sparse.props
----------------------------------------------------------------------
diff --git a/community/mahout-mr/conf/seq2sparse.props b/community/mahout-mr/conf/seq2sparse.props
new file mode 100644
index 0000000..a50a139
--- /dev/null
+++ b/community/mahout-mr/conf/seq2sparse.props
@@ -0,0 +1,15 @@
+#o|output =
+#i|input =
+#s|minSupport =
+#a|analyzerName =
+#chunk|chunkSize =
+#md|minDF =
+#x|maxDFPercent =
+#wt|weight =
+#n|norm =
+#ml|minLLR =
+#nr|numReducers =
+#ng|maxNGramSize =
+#w|overwrite =
+#h|help =
+#seq|sequentialAccessVector =

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/conf/seqdirectory.props
----------------------------------------------------------------------
diff --git a/community/mahout-mr/conf/seqdirectory.props b/community/mahout-mr/conf/seqdirectory.props
new file mode 100644
index 0000000..21c0d8f
--- /dev/null
+++ b/community/mahout-mr/conf/seqdirectory.props
@@ -0,0 +1,3 @@
+#i|input =
+#o|output =
+#c|charset =

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/conf/seqdumper.props
----------------------------------------------------------------------
diff --git a/community/mahout-mr/conf/seqdumper.props b/community/mahout-mr/conf/seqdumper.props
new file mode 100644
index 0000000..e69de29

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/conf/seqwiki.props
----------------------------------------------------------------------
diff --git a/community/mahout-mr/conf/seqwiki.props b/community/mahout-mr/conf/seqwiki.props
new file mode 100644
index 0000000..e69de29

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/conf/splitDataset.props
----------------------------------------------------------------------
diff --git a/community/mahout-mr/conf/splitDataset.props b/community/mahout-mr/conf/splitDataset.props
new file mode 100644
index 0000000..e69de29
r***@apache.org
2018-06-27 14:52:15 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/AbstractJDBCInMemoryItemSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/AbstractJDBCInMemoryItemSimilarity.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/AbstractJDBCInMemoryItemSimilarity.java
new file mode 100644
index 0000000..3ae9990
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/AbstractJDBCInMemoryItemSimilarity.java
@@ -0,0 +1,132 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity.jdbc;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.jdbc.AbstractJDBCComponent;
+import org.apache.mahout.cf.taste.impl.common.jdbc.ResultSetIterator;
+import org.apache.mahout.cf.taste.impl.model.jdbc.ConnectionPoolDataSource;
+import org.apache.mahout.cf.taste.impl.similarity.GenericItemSimilarity;
+import org.apache.mahout.cf.taste.similarity.ItemSimilarity;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import javax.sql.DataSource;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.concurrent.locks.ReentrantLock;
+
+/**
+ * Loads all item-item similarities from the database into RAM.
+ */
+abstract class AbstractJDBCInMemoryItemSimilarity extends AbstractJDBCComponent implements ItemSimilarity {
+
+ private ItemSimilarity delegate;
+
+ private final DataSource dataSource;
+ private final String getAllItemSimilaritiesSQL;
+ private final ReentrantLock reloadLock;
+
+ private static final Logger log = LoggerFactory.getLogger(AbstractJDBCInMemoryItemSimilarity.class);
+
+ AbstractJDBCInMemoryItemSimilarity(DataSource dataSource, String getAllItemSimilaritiesSQL) {
+
+ AbstractJDBCComponent.checkNotNullAndLog("getAllItemSimilaritiesSQL", getAllItemSimilaritiesSQL);
+
+ if (!(dataSource instanceof ConnectionPoolDataSource)) {
+ log.warn("You are not using ConnectionPoolDataSource. Make sure your DataSource pools connections "
+ + "to the database itself, or database performance will be severely reduced.");
+ }
+
+ this.dataSource = dataSource;
+ this.getAllItemSimilaritiesSQL = getAllItemSimilaritiesSQL;
+ this.reloadLock = new ReentrantLock();
+
+ reload();
+ }
+
+ @Override
+ public double itemSimilarity(long itemID1, long itemID2) throws TasteException {
+ return delegate.itemSimilarity(itemID1, itemID2);
+ }
+
+ @Override
+ public double[] itemSimilarities(long itemID1, long[] itemID2s) throws TasteException {
+ return delegate.itemSimilarities(itemID1, itemID2s);
+ }
+
+ @Override
+ public long[] allSimilarItemIDs(long itemID) throws TasteException {
+ return delegate.allSimilarItemIDs(itemID);
+ }
+
+ @Override
+ public void refresh(Collection<Refreshable> alreadyRefreshed) {
+ log.debug("Reloading...");
+ reload();
+ }
+
+ protected void reload() {
+ if (reloadLock.tryLock()) {
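+      // tryLock() rather than lock(): if another thread is already reloading,
+      // skip this redundant reload instead of blocking behind it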
+ try {
+ delegate = new GenericItemSimilarity(new JDBCSimilaritiesIterable(dataSource, getAllItemSimilaritiesSQL));
+ } finally {
+ reloadLock.unlock();
+ }
+ }
+ }
+
+ private static final class JDBCSimilaritiesIterable implements Iterable<GenericItemSimilarity.ItemItemSimilarity> {
+
+ private final DataSource dataSource;
+ private final String getAllItemSimilaritiesSQL;
+
+ private JDBCSimilaritiesIterable(DataSource dataSource, String getAllItemSimilaritiesSQL) {
+ this.dataSource = dataSource;
+ this.getAllItemSimilaritiesSQL = getAllItemSimilaritiesSQL;
+ }
+
+ @Override
+ public Iterator<GenericItemSimilarity.ItemItemSimilarity> iterator() {
+ try {
+ return new JDBCSimilaritiesIterator(dataSource, getAllItemSimilaritiesSQL);
+ } catch (SQLException sqle) {
+ throw new IllegalStateException(sqle);
+ }
+ }
+ }
+
+ private static final class JDBCSimilaritiesIterator
+ extends ResultSetIterator<GenericItemSimilarity.ItemItemSimilarity> {
+
+ private JDBCSimilaritiesIterator(DataSource dataSource, String getAllItemSimilaritiesSQL) throws SQLException {
+ super(dataSource, getAllItemSimilaritiesSQL);
+ }
+
+ @Override
+ protected GenericItemSimilarity.ItemItemSimilarity parseElement(ResultSet resultSet) throws SQLException {
+ return new GenericItemSimilarity.ItemItemSimilarity(resultSet.getLong(1),
+ resultSet.getLong(2),
+ resultSet.getDouble(3));
+ }
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/AbstractJDBCItemSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/AbstractJDBCItemSimilarity.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/AbstractJDBCItemSimilarity.java
new file mode 100644
index 0000000..1b8d109
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/AbstractJDBCItemSimilarity.java
@@ -0,0 +1,213 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity.jdbc;
+
+import java.sql.Connection;
+import java.sql.PreparedStatement;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.util.Collection;
+
+import javax.sql.DataSource;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.jdbc.AbstractJDBCComponent;
+import org.apache.mahout.cf.taste.impl.model.jdbc.ConnectionPoolDataSource;
+import org.apache.mahout.cf.taste.similarity.ItemSimilarity;
+import org.apache.mahout.common.IOUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * An {@link ItemSimilarity} which draws pre-computed item-item similarities from a database table via JDBC.
+ */
+public abstract class AbstractJDBCItemSimilarity extends AbstractJDBCComponent implements ItemSimilarity {
+
+ private static final Logger log = LoggerFactory.getLogger(AbstractJDBCItemSimilarity.class);
+
+ static final String DEFAULT_SIMILARITY_TABLE = "taste_item_similarity";
+ static final String DEFAULT_ITEM_A_ID_COLUMN = "item_id_a";
+ static final String DEFAULT_ITEM_B_ID_COLUMN = "item_id_b";
+ static final String DEFAULT_SIMILARITY_COLUMN = "similarity";
+
+ private final DataSource dataSource;
+ private final String similarityTable;
+ private final String itemAIDColumn;
+ private final String itemBIDColumn;
+ private final String similarityColumn;
+ private final String getItemItemSimilaritySQL;
+ private final String getAllSimilarItemIDsSQL;
+
+ protected AbstractJDBCItemSimilarity(DataSource dataSource,
+ String getItemItemSimilaritySQL,
+ String getAllSimilarItemIDsSQL) {
+ this(dataSource,
+ DEFAULT_SIMILARITY_TABLE,
+ DEFAULT_ITEM_A_ID_COLUMN,
+ DEFAULT_ITEM_B_ID_COLUMN,
+ DEFAULT_SIMILARITY_COLUMN,
+ getItemItemSimilaritySQL,
+ getAllSimilarItemIDsSQL);
+ }
+
+ protected AbstractJDBCItemSimilarity(DataSource dataSource,
+ String similarityTable,
+ String itemAIDColumn,
+ String itemBIDColumn,
+ String similarityColumn,
+ String getItemItemSimilaritySQL,
+ String getAllSimilarItemIDsSQL) {
+ AbstractJDBCComponent.checkNotNullAndLog("similarityTable", similarityTable);
+ AbstractJDBCComponent.checkNotNullAndLog("itemAIDColumn", itemAIDColumn);
+ AbstractJDBCComponent.checkNotNullAndLog("itemBIDColumn", itemBIDColumn);
+ AbstractJDBCComponent.checkNotNullAndLog("similarityColumn", similarityColumn);
+
+ AbstractJDBCComponent.checkNotNullAndLog("getItemItemSimilaritySQL", getItemItemSimilaritySQL);
+ AbstractJDBCComponent.checkNotNullAndLog("getAllSimilarItemIDsSQL", getAllSimilarItemIDsSQL);
+
+ if (!(dataSource instanceof ConnectionPoolDataSource)) {
+ log.warn("You are not using ConnectionPoolDataSource. Make sure your DataSource pools connections "
+ + "to the database itself, or database performance will be severely reduced.");
+ }
+
+ this.dataSource = dataSource;
+ this.similarityTable = similarityTable;
+ this.itemAIDColumn = itemAIDColumn;
+ this.itemBIDColumn = itemBIDColumn;
+ this.similarityColumn = similarityColumn;
+ this.getItemItemSimilaritySQL = getItemItemSimilaritySQL;
+ this.getAllSimilarItemIDsSQL = getAllSimilarItemIDsSQL;
+ }
+
+ protected String getSimilarityTable() {
+ return similarityTable;
+ }
+
+ protected String getItemAIDColumn() {
+ return itemAIDColumn;
+ }
+
+ protected String getItemBIDColumn() {
+ return itemBIDColumn;
+ }
+
+ protected String getSimilarityColumn() {
+ return similarityColumn;
+ }
+
+ @Override
+ public double itemSimilarity(long itemID1, long itemID2) throws TasteException {
+ if (itemID1 == itemID2) {
+ return 1.0;
+ }
+ Connection conn = null;
+ PreparedStatement stmt = null;
+ try {
+ conn = dataSource.getConnection();
+ stmt = conn.prepareStatement(getItemItemSimilaritySQL, ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY);
+ stmt.setFetchDirection(ResultSet.FETCH_FORWARD);
+ stmt.setFetchSize(getFetchSize());
+ return doItemSimilarity(stmt, itemID1, itemID2);
+ } catch (SQLException sqle) {
+ log.warn("Exception while retrieving similarity", sqle);
+ throw new TasteException(sqle);
+ } finally {
+ IOUtils.quietClose(null, stmt, conn);
+ }
+ }
+
+ @Override
+ public double[] itemSimilarities(long itemID1, long[] itemID2s) throws TasteException {
+ double[] result = new double[itemID2s.length];
+ Connection conn = null;
+ PreparedStatement stmt = null;
+ try {
+ conn = dataSource.getConnection();
+ stmt = conn.prepareStatement(getItemItemSimilaritySQL, ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY);
+ stmt.setFetchDirection(ResultSet.FETCH_FORWARD);
+ stmt.setFetchSize(getFetchSize());
+ for (int i = 0; i < itemID2s.length; i++) {
+ result[i] = doItemSimilarity(stmt, itemID1, itemID2s[i]);
+ }
+ } catch (SQLException sqle) {
+ log.warn("Exception while retrieving item similarities", sqle);
+ throw new TasteException(sqle);
+ } finally {
+ IOUtils.quietClose(null, stmt, conn);
+ }
+ return result;
+ }
+
+ @Override
+ public long[] allSimilarItemIDs(long itemID) throws TasteException {
+ FastIDSet allSimilarItemIDs = new FastIDSet();
+ Connection conn = null;
+ PreparedStatement stmt = null;
+ ResultSet rs = null;
+ try {
+ conn = dataSource.getConnection();
+ stmt = conn.prepareStatement(getAllSimilarItemIDsSQL, ResultSet.TYPE_FORWARD_ONLY,
+ ResultSet.CONCUR_READ_ONLY);
+ stmt.setFetchDirection(ResultSet.FETCH_FORWARD);
+ stmt.setFetchSize(getFetchSize());
+ stmt.setLong(1, itemID);
+ stmt.setLong(2, itemID);
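+      // an item may appear in either column (each pair is stored once, lesser ID first),
+      // so the same ID is bound to both placeholders of the query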
+ rs = stmt.executeQuery();
+ while (rs.next()) {
+ allSimilarItemIDs.add(rs.getLong(1));
+ allSimilarItemIDs.add(rs.getLong(2));
+ }
+ } catch (SQLException sqle) {
+ log.warn("Exception while retrieving all similar itemIDs", sqle);
+ throw new TasteException(sqle);
+ } finally {
+ IOUtils.quietClose(rs, stmt, conn);
+ }
+ allSimilarItemIDs.remove(itemID);
+ return allSimilarItemIDs.toArray();
+ }
+
+ @Override
+ public void refresh(Collection<Refreshable> alreadyRefreshed) {
+ // do nothing
+ }
+
+ private double doItemSimilarity(PreparedStatement stmt, long itemID1, long itemID2) throws SQLException {
+ // Order as smaller - larger
+ if (itemID1 > itemID2) {
+ long temp = itemID1;
+ itemID1 = itemID2;
+ itemID2 = temp;
+ }
+ stmt.setLong(1, itemID1);
+ stmt.setLong(2, itemID2);
+ log.debug("Executing SQL query: {}", getItemItemSimilaritySQL);
+ ResultSet rs = null;
+ try {
+ rs = stmt.executeQuery();
+ // If not found, perhaps the items exist but have no presence in the table,
+ // so NaN is appropriate
+ return rs.next() ? rs.getDouble(1) : Double.NaN;
+ } finally {
+ IOUtils.quietClose(rs);
+ }
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/MySQLJDBCInMemoryItemSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/MySQLJDBCInMemoryItemSimilarity.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/MySQLJDBCInMemoryItemSimilarity.java
new file mode 100644
index 0000000..cc831d9
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/MySQLJDBCInMemoryItemSimilarity.java
@@ -0,0 +1,47 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity.jdbc;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+
+import javax.sql.DataSource;
+
+public class MySQLJDBCInMemoryItemSimilarity extends SQL92JDBCInMemoryItemSimilarity {
+
+ public MySQLJDBCInMemoryItemSimilarity() throws TasteException {
+ }
+
+ public MySQLJDBCInMemoryItemSimilarity(String dataSourceName) throws TasteException {
+ super(dataSourceName);
+ }
+
+ public MySQLJDBCInMemoryItemSimilarity(DataSource dataSource) {
+ super(dataSource);
+ }
+
+ public MySQLJDBCInMemoryItemSimilarity(DataSource dataSource, String getAllItemSimilaritiesSQL) {
+ super(dataSource, getAllItemSimilaritiesSQL);
+ }
+
+ @Override
+ protected int getFetchSize() {
+ // Need to return this for MySQL Connector/J to make it use streaming mode
+ return Integer.MIN_VALUE;
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/MySQLJDBCItemSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/MySQLJDBCItemSimilarity.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/MySQLJDBCItemSimilarity.java
new file mode 100644
index 0000000..af0742e
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/MySQLJDBCItemSimilarity.java
@@ -0,0 +1,103 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity.jdbc;
+
+import javax.sql.DataSource;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+
+/**
+ * <p>
+ * An {@link org.apache.mahout.cf.taste.similarity.ItemSimilarity} backed by a MySQL database
+ * and accessed via JDBC. It may work with other JDBC
+ * databases. By default, this class assumes that there is a {@link DataSource} available under the JNDI name
+ * "jdbc/taste", which gives access to a database with a "taste_item_similarity" table with the following
+ * schema:
+ * </p>
+ *
+ * <table>
+ * <tr>
+ * <th>item_id_a</th>
+ * <th>item_id_b</th>
+ * <th>similarity</th>
+ * </tr>
+ * <tr>
+ * <td>ABC</td>
+ * <td>DEF</td>
+ * <td>0.9</td>
+ * </tr>
+ * <tr>
+ * <td>DEF</td>
+ * <td>EFG</td>
+ * <td>0.1</td>
+ * </tr>
+ * </table>
+ *
+ * <p>
+ * For example, the following command sets up a suitable table in MySQL, complete with primary key and
+ * indexes:
+ * </p>
+ *
+ * <p>
+ *
+ * <pre>
+ * CREATE TABLE taste_item_similarity (
+ * item_id_a BIGINT NOT NULL,
+ * item_id_b BIGINT NOT NULL,
+ * similarity FLOAT NOT NULL,
+ *   PRIMARY KEY (item_id_a, item_id_b)
+ * )
+ * </pre>
+ *
+ * </p>
+ *
+ * <p>
+ * Note that for each row, item_id_a should be less than item_id_b. Storing each pair both
+ * ways would be redundant, so every pair is stored once with the lesser ID first.
+ *
+ * @see org.apache.mahout.cf.taste.impl.model.jdbc.MySQLJDBCDataModel
+ */
+public class MySQLJDBCItemSimilarity extends SQL92JDBCItemSimilarity {
+
+ public MySQLJDBCItemSimilarity() throws TasteException {
+ }
+
+ public MySQLJDBCItemSimilarity(String dataSourceName) throws TasteException {
+ super(dataSourceName);
+ }
+
+ public MySQLJDBCItemSimilarity(DataSource dataSource) {
+ super(dataSource);
+ }
+
+ public MySQLJDBCItemSimilarity(DataSource dataSource,
+ String similarityTable,
+ String itemAIDColumn,
+ String itemBIDColumn,
+ String similarityColumn) {
+ super(dataSource, similarityTable, itemAIDColumn, itemBIDColumn, similarityColumn);
+ }
+
+ @Override
+ protected int getFetchSize() {
+ // Need to return this for MySQL Connector/J to make it use streaming mode
+ return Integer.MIN_VALUE;
+ }
+
+}
+
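A minimal usage sketch, assuming a MySQL DataSource is available: newMySQLDataSource() below is a hypothetical placeholder for driver-specific setup, and Mahout's ConnectionPoolDataSource wrapper (seen in the imports of the abstract classes above) is assumed to accept the underlying DataSource in its constructor.

import javax.sql.DataSource;
import org.apache.mahout.cf.taste.common.TasteException;
import org.apache.mahout.cf.taste.impl.model.jdbc.ConnectionPoolDataSource;
import org.apache.mahout.cf.taste.impl.similarity.jdbc.MySQLJDBCItemSimilarity;
import org.apache.mahout.cf.taste.similarity.ItemSimilarity;

public class SimilarityLookupExample {

  public static void main(String[] args) throws TasteException {
    // wrap the raw DataSource so connections are pooled, as the class's warning recommends
    DataSource pooled = new ConnectionPoolDataSource(newMySQLDataSource());
    ItemSimilarity similarity = new MySQLJDBCItemSimilarity(pooled);
    // reads the stored value from taste_item_similarity, or NaN if no row exists
    System.out.println(similarity.itemSimilarity(123L, 456L));
  }

  // hypothetical placeholder: configure and return a MySQL DataSource here
  private static DataSource newMySQLDataSource() {
    throw new UnsupportedOperationException("supply a MySQL DataSource");
  }
}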

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/SQL92JDBCInMemoryItemSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/SQL92JDBCInMemoryItemSimilarity.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/SQL92JDBCInMemoryItemSimilarity.java
new file mode 100644
index 0000000..b311a5e
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/SQL92JDBCInMemoryItemSimilarity.java
@@ -0,0 +1,51 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity.jdbc;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.jdbc.AbstractJDBCComponent;
+
+import javax.sql.DataSource;
+
+public class SQL92JDBCInMemoryItemSimilarity extends AbstractJDBCInMemoryItemSimilarity {
+
+ static final String DEFAULT_GET_ALL_ITEMSIMILARITIES_SQL =
+ "SELECT " + AbstractJDBCItemSimilarity.DEFAULT_ITEM_A_ID_COLUMN + ", "
+ + AbstractJDBCItemSimilarity.DEFAULT_ITEM_B_ID_COLUMN + ", "
+ + AbstractJDBCItemSimilarity.DEFAULT_SIMILARITY_COLUMN + " FROM "
+ + AbstractJDBCItemSimilarity.DEFAULT_SIMILARITY_TABLE;
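+  // this single SELECT reads the whole similarity table; the superclass's reload()
+  // materializes the rows into an in-memory GenericItemSimilarity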
+
+
+ public SQL92JDBCInMemoryItemSimilarity() throws TasteException {
+ this(AbstractJDBCComponent.lookupDataSource(AbstractJDBCComponent.DEFAULT_DATASOURCE_NAME),
+ DEFAULT_GET_ALL_ITEMSIMILARITIES_SQL);
+ }
+
+ public SQL92JDBCInMemoryItemSimilarity(String dataSourceName) throws TasteException {
+ this(AbstractJDBCComponent.lookupDataSource(dataSourceName), DEFAULT_GET_ALL_ITEMSIMILARITIES_SQL);
+ }
+
+ public SQL92JDBCInMemoryItemSimilarity(DataSource dataSource) {
+ this(dataSource, DEFAULT_GET_ALL_ITEMSIMILARITIES_SQL);
+ }
+
+ public SQL92JDBCInMemoryItemSimilarity(DataSource dataSource, String getAllItemSimilaritiesSQL) {
+ super(dataSource, getAllItemSimilaritiesSQL);
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/SQL92JDBCItemSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/SQL92JDBCItemSimilarity.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/SQL92JDBCItemSimilarity.java
new file mode 100644
index 0000000..f449561
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/SQL92JDBCItemSimilarity.java
@@ -0,0 +1,57 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity.jdbc;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+
+import javax.sql.DataSource;
+
+public class SQL92JDBCItemSimilarity extends AbstractJDBCItemSimilarity {
+
+ public SQL92JDBCItemSimilarity() throws TasteException {
+ this(DEFAULT_DATASOURCE_NAME);
+ }
+
+ public SQL92JDBCItemSimilarity(String dataSourceName) throws TasteException {
+ this(lookupDataSource(dataSourceName));
+ }
+
+ public SQL92JDBCItemSimilarity(DataSource dataSource) {
+ this(dataSource,
+ DEFAULT_SIMILARITY_TABLE,
+ DEFAULT_ITEM_A_ID_COLUMN,
+ DEFAULT_ITEM_B_ID_COLUMN,
+ DEFAULT_SIMILARITY_COLUMN);
+ }
+
+ public SQL92JDBCItemSimilarity(DataSource dataSource,
+ String similarityTable,
+ String itemAIDColumn,
+ String itemBIDColumn,
+ String similarityColumn) {
+ super(dataSource,
+ similarityTable,
+ itemAIDColumn,
+ itemBIDColumn, similarityColumn,
+ "SELECT " + similarityColumn + " FROM " + similarityTable + " WHERE "
+ + itemAIDColumn + "=? AND " + itemBIDColumn + "=?",
+ "SELECT " + itemAIDColumn + ", " + itemBIDColumn + " FROM " + similarityTable + " WHERE "
+ + itemAIDColumn + "=? OR " + itemBIDColumn + "=?");
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/web/RecommenderServlet.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/web/RecommenderServlet.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/web/RecommenderServlet.java
new file mode 100644
index 0000000..a5a89c6
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/web/RecommenderServlet.java
@@ -0,0 +1,215 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.web;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.Preference;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.cf.taste.recommender.RecommendedItem;
+import org.apache.mahout.cf.taste.recommender.Recommender;
+
+import javax.servlet.ServletConfig;
+import javax.servlet.ServletException;
+import javax.servlet.http.HttpServlet;
+import javax.servlet.http.HttpServletRequest;
+import javax.servlet.http.HttpServletResponse;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.List;
+
+/**
+ * <p>A servlet which returns recommendations, as its name implies. The servlet accepts GET and POST
+ * HTTP requests, and looks for the following parameters:</p>
+ *
+ * <ul>
+ * <li><em>userID</em>: the user ID for which to produce recommendations</li>
+ * <li><em>howMany</em>: the number of recommendations to produce</li>
+ * <li><em>debug</em>: (optional) output a lot of information that is useful in debugging.
+ * Defaults to false, of course.</li>
+ * <li><em>format</em>: (optional) the response format, one of "text", "xml" or "json".
+ * Defaults to "text".</li>
+ * </ul>
+ *
+ * <p>The response is text, and contains a list of the IDs of recommended items, in descending
+ * order of relevance, one per line.</p>
+ *
+ * <p>For example, you can get 10 recommendations for user 123 from the following URL (assuming
+ * you are running Taste in a web application locally on port 8080):<br/>
+ * {@code http://localhost:8080/taste/RecommenderServlet?userID=123&howMany=10}</p>
+ *
+ * <p>This servlet requires one {@code init-param} in {@code web.xml}: it must find
+ * a parameter named "recommender-class" which is the name of a class that implements
+ * {@link Recommender} and has a no-arg constructor. The servlet will instantiate and use
+ * this {@link Recommender} to produce recommendations.</p>
+ */
+public final class RecommenderServlet extends HttpServlet {
+
+ private static final int NUM_TOP_PREFERENCES = 20;
+ private static final int DEFAULT_HOW_MANY = 20;
+
+ private Recommender recommender;
+
+ @Override
+ public void init(ServletConfig config) throws ServletException {
+ super.init(config);
+ String recommenderClassName = config.getInitParameter("recommender-class");
+ if (recommenderClassName == null) {
+ throw new ServletException("Servlet init-param \"recommender-class\" is not defined");
+ }
+ RecommenderSingleton.initializeIfNeeded(recommenderClassName);
+ recommender = RecommenderSingleton.getInstance().getRecommender();
+ }
+
+ @Override
+ public void doGet(HttpServletRequest request,
+ HttpServletResponse response) throws ServletException {
+
+ String userIDString = request.getParameter("userID");
+ if (userIDString == null) {
+ throw new ServletException("userID was not specified");
+ }
+ long userID = Long.parseLong(userIDString);
+ String howManyString = request.getParameter("howMany");
+ int howMany = howManyString == null ? DEFAULT_HOW_MANY : Integer.parseInt(howManyString);
+ boolean debug = Boolean.parseBoolean(request.getParameter("debug"));
+ String format = request.getParameter("format");
+ if (format == null) {
+ format = "text";
+ }
+
+ try {
+ List<RecommendedItem> items = recommender.recommend(userID, howMany);
+ if ("text".equals(format)) {
+ writePlainText(response, userID, debug, items);
+ } else if ("xml".equals(format)) {
+ writeXML(response, items);
+ } else if ("json".equals(format)) {
+ writeJSON(response, items);
+ } else {
+ throw new ServletException("Bad format parameter: " + format);
+ }
+ } catch (TasteException | IOException te) {
+ throw new ServletException(te);
+ }
+
+ }
+
+ private static void writeXML(HttpServletResponse response, Iterable<RecommendedItem> items) throws IOException {
+ response.setContentType("application/xml");
+ response.setCharacterEncoding("UTF-8");
+ response.setHeader("Cache-Control", "no-cache");
+ PrintWriter writer = response.getWriter();
+ writer.print("<?xml version=\"1.0\" encoding=\"UTF-8\"?><recommendedItems>");
+ for (RecommendedItem recommendedItem : items) {
+ writer.print("<item><value>");
+ writer.print(recommendedItem.getValue());
+ writer.print("</value><id>");
+ writer.print(recommendedItem.getItemID());
+ writer.print("</id></item>");
+ }
+ writer.println("</recommendedItems>");
+ }
+
+ private static void writeJSON(HttpServletResponse response, Iterable<RecommendedItem> items) throws IOException {
+ response.setContentType("application/json");
+ response.setCharacterEncoding("UTF-8");
+ response.setHeader("Cache-Control", "no-cache");
+ PrintWriter writer = response.getWriter();
+ writer.print("{\"recommendedItems\":{\"item\":[");
+ boolean first = true;
+ for (RecommendedItem recommendedItem : items) {
+ if (first) {
+ first = false;
+ } else {
+ writer.print(',');
+ }
+ writer.print("{\"value\":\"");
+ writer.print(recommendedItem.getValue());
+ writer.print("\",\"id\":\"");
+ writer.print(recommendedItem.getItemID());
+ writer.print("\"}");
+ }
+ writer.println("]}}");
+ }
+
+ private void writePlainText(HttpServletResponse response,
+ long userID,
+ boolean debug,
+ Iterable<RecommendedItem> items) throws IOException, TasteException {
+ response.setContentType("text/plain");
+ response.setCharacterEncoding("UTF-8");
+ response.setHeader("Cache-Control", "no-cache");
+ PrintWriter writer = response.getWriter();
+ if (debug) {
+ writeDebugRecommendations(userID, items, writer);
+ } else {
+ writeRecommendations(items, writer);
+ }
+ }
+
+ private static void writeRecommendations(Iterable<RecommendedItem> items, PrintWriter writer) {
+ for (RecommendedItem recommendedItem : items) {
+ writer.print(recommendedItem.getValue());
+ writer.print('\t');
+ writer.println(recommendedItem.getItemID());
+ }
+ }
+
+ private void writeDebugRecommendations(long userID, Iterable<RecommendedItem> items, PrintWriter writer)
+ throws TasteException {
+ DataModel dataModel = recommender.getDataModel();
+ writer.print("User:");
+ writer.println(userID);
+ writer.print("Recommender: ");
+ writer.println(recommender);
+ writer.println();
+ writer.print("Top ");
+ writer.print(NUM_TOP_PREFERENCES);
+ writer.println(" Preferences:");
+ PreferenceArray rawPrefs = dataModel.getPreferencesFromUser(userID);
+ int length = rawPrefs.length();
+ PreferenceArray sortedPrefs = rawPrefs.clone();
+ sortedPrefs.sortByValueReversed();
+ // Cap this at NUM_TOP_PREFERENCES just to be brief
+ int max = Math.min(NUM_TOP_PREFERENCES, length);
+ for (int i = 0; i < max; i++) {
+ Preference pref = sortedPrefs.get(i);
+ writer.print(pref.getValue());
+ writer.print('\t');
+ writer.println(pref.getItemID());
+ }
+ writer.println();
+ writer.println("Recommendations:");
+ for (RecommendedItem recommendedItem : items) {
+ writer.print(recommendedItem.getValue());
+ writer.print('\t');
+ writer.println(recommendedItem.getItemID());
+ }
+ }
+
+ @Override
+ public void doPost(HttpServletRequest request,
+ HttpServletResponse response) throws ServletException {
+ doGet(request, response);
+ }
+
+ @Override
+ public String toString() {
+ return "RecommenderServlet[recommender:" + recommender + ']';
+ }
+
+}
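
As a usage illustration only (not in this commit), a small client that reads the servlet's plain-text output; the host, port, and context path are assumptions about your deployment:

    import java.io.BufferedReader;
    import java.io.IOException;
    import java.io.InputStreamReader;
    import java.net.URL;
    import java.nio.charset.StandardCharsets;

    public class RecommenderClientExample {
      public static void main(String[] args) throws IOException {
        URL url = new URL("http://localhost:8080/taste/RecommenderServlet?userID=123&howMany=10");
        try (BufferedReader in = new BufferedReader(
            new InputStreamReader(url.openStream(), StandardCharsets.UTF_8))) {
          String line;
          while ((line = in.readLine()) != null) {
            // Plain-text format: estimated value, a tab, then the item ID, one item per line
            String[] parts = line.split("\t");
            System.out.println("item " + parts[1] + " scored " + parts[0]);
          }
        }
      }
    }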

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/web/RecommenderSingleton.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/web/RecommenderSingleton.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/web/RecommenderSingleton.java
new file mode 100644
index 0000000..265d7c0
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/web/RecommenderSingleton.java
@@ -0,0 +1,57 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.web;
+
+import org.apache.mahout.cf.taste.recommender.Recommender;
+import org.apache.mahout.common.ClassUtils;
+
+/**
+ * <p>A singleton which holds an instance of a {@link Recommender}. This is used to share
+ * a {@link Recommender} between {@link RecommenderServlet} and {@code RecommenderService.jws}.</p>
+ */
+public final class RecommenderSingleton {
+
+ private final Recommender recommender;
+
+ private static RecommenderSingleton instance;
+
+ public static synchronized RecommenderSingleton getInstance() {
+ if (instance == null) {
+ throw new IllegalStateException("Not initialized");
+ }
+ return instance;
+ }
+
+ public static synchronized void initializeIfNeeded(String recommenderClassName) {
+ if (instance == null) {
+ instance = new RecommenderSingleton(recommenderClassName);
+ }
+ }
+
+ private RecommenderSingleton(String recommenderClassName) {
+ if (recommenderClassName == null) {
+ throw new IllegalArgumentException("Recommender class name is null");
+ }
+ recommender = ClassUtils.instantiateAs(recommenderClassName, Recommender.class);
+ }
+
+ public Recommender getRecommender() {
+ return recommender;
+ }
+
+}
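
The intended call sequence, sketched below; "com.example.MyRecommender" is a placeholder for any Recommender implementation with a no-arg constructor:

    import org.apache.mahout.cf.taste.recommender.Recommender;
    import org.apache.mahout.cf.taste.web.RecommenderSingleton;

    public class SingletonUsageExample {
      public static void main(String[] args) {
        // Initialize once (idempotent), then share the same instance everywhere.
        RecommenderSingleton.initializeIfNeeded("com.example.MyRecommender");
        Recommender shared = RecommenderSingleton.getInstance().getRecommender();
        System.out.println("Sharing recommender: " + shared);
      }
    }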

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/web/RecommenderWrapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/web/RecommenderWrapper.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/web/RecommenderWrapper.java
new file mode 100644
index 0000000..e927098
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/web/RecommenderWrapper.java
@@ -0,0 +1,126 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.web;
+
+import com.google.common.io.Files;
+import com.google.common.io.InputSupplier;
+import com.google.common.io.Resources;
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.recommender.IDRescorer;
+import org.apache.mahout.cf.taste.recommender.RecommendedItem;
+import org.apache.mahout.cf.taste.recommender.Recommender;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.util.Collection;
+import java.util.List;
+
+/**
+ * Users of the packaging and deployment mechanism in this module need
+ * to produce a {@link Recommender} implementation with a no-arg constructor,
+ * which will internally build the desired {@link Recommender} and delegate
+ * to it. This wrapper simplifies that process. Simply extend this class and
+ * implement {@link #buildRecommender()}.
+ */
+public abstract class RecommenderWrapper implements Recommender {
+
+ private static final Logger log = LoggerFactory.getLogger(RecommenderWrapper.class);
+
+ private final Recommender delegate;
+
+ protected RecommenderWrapper() throws TasteException, IOException {
+ this.delegate = buildRecommender();
+ }
+
+ /**
+ * @return the {@link Recommender} which should be used to produce recommendations
+ * by this wrapper implementation
+ */
+ protected abstract Recommender buildRecommender() throws IOException, TasteException;
+
+ @Override
+ public List<RecommendedItem> recommend(long userID, int howMany) throws TasteException {
+ return delegate.recommend(userID, howMany);
+ }
+
+ @Override
+ public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer) throws TasteException {
+ return delegate.recommend(userID, howMany, rescorer);
+ }
+
+ @Override
+ public float estimatePreference(long userID, long itemID) throws TasteException {
+ return delegate.estimatePreference(userID, itemID);
+ }
+
+ @Override
+ public void setPreference(long userID, long itemID, float value) throws TasteException {
+ delegate.setPreference(userID, itemID, value);
+ }
+
+ @Override
+ public void removePreference(long userID, long itemID) throws TasteException {
+ delegate.removePreference(userID, itemID);
+ }
+
+ @Override
+ public DataModel getDataModel() {
+ return delegate.getDataModel();
+ }
+
+ @Override
+ public void refresh(Collection<Refreshable> alreadyRefreshed) {
+ delegate.refresh(alreadyRefreshed);
+ }
+
+ /**
+ * Reads the given resource into a temporary file. This is intended to be used
+ * to read data files which are stored as a resource available on the classpath,
+ * such as in a JAR file. However, for convenience, the resource name will also
+ * be interpreted as a relative path to a local file, if no such resource is
+ * found. This facilitates testing.
+ *
+ * @param resourceName name of resource in classpath, or relative path to file
+ * @return temporary {@link File} with resource data
+ * @throws IOException if an error occurs while reading or writing data
+ */
+ public static File readResourceToTempFile(String resourceName) throws IOException {
+ String absoluteResource = resourceName.startsWith("/") ? resourceName : '/' + resourceName;
+ log.info("Loading resource {}", absoluteResource);
+ InputSupplier<? extends InputStream> inSupplier;
+ try {
+ URL resourceURL = Resources.getResource(RecommenderWrapper.class, absoluteResource);
+ inSupplier = Resources.newInputStreamSupplier(resourceURL);
+ } catch (IllegalArgumentException iae) {
+ File resourceFile = new File(resourceName);
+ log.info("Falling back to load file {}", resourceFile.getAbsolutePath());
+ inSupplier = Files.newInputStreamSupplier(resourceFile);
+ }
+ File tempFile = File.createTempFile("taste", null);
+ tempFile.deleteOnExit();
+ Files.copy(inSupplier, tempFile);
+ return tempFile;
+ }
+
+}
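
A sketch of the subclassing pattern the Javadoc above describes (not part of this commit); "ratings.csv" is a hypothetical data file bundled on the classpath, and the similarity and neighborhood choices are illustrative:

    import java.io.File;
    import java.io.IOException;

    import org.apache.mahout.cf.taste.common.TasteException;
    import org.apache.mahout.cf.taste.impl.model.file.FileDataModel;
    import org.apache.mahout.cf.taste.impl.neighborhood.NearestNUserNeighborhood;
    import org.apache.mahout.cf.taste.impl.recommender.GenericUserBasedRecommender;
    import org.apache.mahout.cf.taste.impl.similarity.PearsonCorrelationSimilarity;
    import org.apache.mahout.cf.taste.model.DataModel;
    import org.apache.mahout.cf.taste.neighborhood.UserNeighborhood;
    import org.apache.mahout.cf.taste.recommender.Recommender;
    import org.apache.mahout.cf.taste.similarity.UserSimilarity;
    import org.apache.mahout.cf.taste.web.RecommenderWrapper;

    public class MyRecommender extends RecommenderWrapper {

      public MyRecommender() throws TasteException, IOException {
      }

      @Override
      protected Recommender buildRecommender() throws IOException, TasteException {
        // "ratings.csv" is a placeholder resource shipped in the deployed JAR.
        File dataFile = readResourceToTempFile("ratings.csv");
        DataModel model = new FileDataModel(dataFile);
        UserSimilarity similarity = new PearsonCorrelationSimilarity(model);
        UserNeighborhood neighborhood = new NearestNUserNeighborhood(10, similarity, model);
        return new GenericUserBasedRecommender(model, neighborhood, similarity);
      }
    }

Such a class has the required no-arg constructor and can be named in the servlet's "recommender-class" init-param.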

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/classifier/ConfusionMatrixDumper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/classifier/ConfusionMatrixDumper.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/classifier/ConfusionMatrixDumper.java
new file mode 100644
index 0000000..03a3000
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/classifier/ConfusionMatrixDumper.java
@@ -0,0 +1,425 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier;
+
+import com.google.common.collect.Lists;
+import org.apache.commons.io.Charsets;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.math.Matrix;
+import org.apache.mahout.math.MatrixWritable;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.PrintStream;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Exports a ConfusionMatrix in various text formats: a toString() version, a grayscale HTML table,
+ * a summary HTML table, and a table of counts, all with optional HTML wrappers.
+ *
+ * Input format: Hadoop SequenceFile with Text key and MatrixWritable value, 1 pair
+ *
+ * Intended to consume the ConfusionMatrix SequenceFile written by the Bayes TestClassifier class
+ */
+public final class ConfusionMatrixDumper extends AbstractJob {
+
+ private static final String TAB_SEPARATOR = "|";
+
+ // HTML wrapper - default CSS
+ private static final String HEADER = "<html>"
+ + "<head>\n"
+ + "<title>TITLE</title>\n"
+ + "</head>"
+ + "<body>\n"
+ + "<style type='text/css'> \n"
+ + "table\n"
+ + "{\n"
+ + "border:3px solid black; text-align:left;\n"
+ + "}\n"
+ + "th.normalHeader\n"
+ + "{\n"
+ + "border:1px solid black;border-collapse:collapse;text-align:center;"
+ + "background-color:white\n"
+ + "}\n"
+ + "th.tallHeader\n"
+ + "{\n"
+ + "border:1px solid black;border-collapse:collapse;text-align:center;"
+ + "background-color:white; height:6em\n"
+ + "}\n"
+ + "tr.label\n"
+ + "{\n"
+ + "border:1px solid black;border-collapse:collapse;text-align:center;"
+ + "background-color:white\n"
+ + "}\n"
+ + "tr.row\n"
+ + "{\n"
+ + "border:1px solid gray;text-align:center;background-color:snow\n"
+ + "}\n"
+ + "td\n"
+ + "{\n"
+ + "min-width:2em\n"
+ + "}\n"
+ + "td.cell\n"
+ + "{\n"
+ + "border:1px solid black;text-align:right;background-color:snow\n"
+ + "}\n"
+ + "td.empty\n"
+ + "{\n"
+ + "border:0px;text-align:right;background-color:snow\n"
+ + "}\n"
+ + "td.white\n"
+ + "{\n"
+ + "border:0px solid black;text-align:right;background-color:white\n"
+ + "}\n"
+ + "td.black\n"
+ + "{\n"
+ + "border:0px solid red;text-align:right;background-color:black\n"
+ + "}\n"
+ + "td.gray1\n"
+ + "{\n"
+ + "border:0px solid green;text-align:right; background-color:LightGray\n"
+ + "}\n" + "td.gray2\n" + "{\n"
+ + "border:0px solid blue;text-align:right;background-color:gray\n"
+ + "}\n" + "td.gray3\n" + "{\n"
+ + "border:0px solid red;text-align:right;background-color:DarkGray\n"
+ + "}\n" + "th" + "{\n" + " text-align: center;\n"
+ + " vertical-align: bottom;\n"
+ + " padding-bottom: 3px;\n" + " padding-left: 5px;\n"
+ + " padding-right: 5px;\n" + "}\n" + " .verticalText\n"
+ + " {\n" + " text-align: center;\n"
+ + " vertical-align: middle;\n" + " width: 20px;\n"
+ + " margin: 0px;\n" + " padding: 0px;\n"
+ + " padding-left: 3px;\n" + " padding-right: 3px;\n"
+ + " padding-top: 10px;\n" + " white-space: nowrap;\n"
+ + " -webkit-transform: rotate(-90deg); \n"
+ + " -moz-transform: rotate(-90deg); \n" + " };\n"
+ + "</style>\n";
+ private static final String FOOTER = "</html></body>";
+
+ // CSS style names.
+ private static final String CSS_TABLE = "table";
+ private static final String CSS_LABEL = "label";
+ private static final String CSS_TALL_HEADER = "tall";
+ private static final String CSS_VERTICAL = "verticalText";
+ private static final String CSS_CELL = "cell";
+ private static final String CSS_EMPTY = "empty";
+ private static final String[] CSS_GRAY_CELLS = {"white", "gray1", "gray2", "gray3", "black"};
+
+ private ConfusionMatrixDumper() {}
+
+ public static void main(String[] args) throws Exception {
+ ToolRunner.run(new ConfusionMatrixDumper(), args);
+ }
+
+ @Override
+ public int run(String[] args) throws IOException {
+ addInputOption();
+ addOption("output", "o", "Output path", null); // AbstractJob output feature requires param
+ addOption(DefaultOptionCreator.overwriteOption().create());
+ addFlag("html", null, "Create complete HTML page");
+ addFlag("text", null, "Dump simple text");
+ Map<String,List<String>> parsedArgs = parseArguments(args);
+ if (parsedArgs == null) {
+ return -1;
+ }
+
+ Path inputPath = getInputPath();
+ String outputFile = hasOption("output") ? getOption("output") : null;
+ boolean text = parsedArgs.containsKey("--text");
+ boolean wrapHtml = parsedArgs.containsKey("--html");
+ PrintStream out = getPrintStream(outputFile);
+ if (text) {
+ exportText(inputPath, out);
+ } else {
+ exportTable(inputPath, out, wrapHtml);
+ }
+ out.flush();
+ if (out != System.out) {
+ out.close();
+ }
+ return 0;
+ }
+
+ private static void exportText(Path inputPath, PrintStream out) throws IOException {
+ MatrixWritable mw = new MatrixWritable();
+ Text key = new Text();
+ readSeqFile(inputPath, key, mw);
+ Matrix m = mw.get();
+ ConfusionMatrix cm = new ConfusionMatrix(m);
+ out.println(String.format("%-40s", "Label") + TAB_SEPARATOR + String.format("%-10s", "Total")
+ + TAB_SEPARATOR + String.format("%-10s", "Correct") + TAB_SEPARATOR
+ + String.format("%-6s", "%") + TAB_SEPARATOR);
+ out.println(String.format("%-70s", "-").replace(' ', '-'));
+ List<String> labels = stripDefault(cm);
+ for (String label : labels) {
+ int correct = cm.getCorrect(label);
+ double accuracy = cm.getAccuracy(label);
+ int count = getCount(cm, label);
+ out.println(String.format("%-40s", label) + TAB_SEPARATOR + String.format("%-10s", count)
+ + TAB_SEPARATOR + String.format("%-10s", correct) + TAB_SEPARATOR
+ + String.format("%-6s", (int) Math.round(accuracy)) + TAB_SEPARATOR);
+ }
+ out.println(String.format("%-70s", "-").replace(' ', '-'));
+ out.println(cm.toString());
+ }
+
+ private static void exportTable(Path inputPath, PrintStream out, boolean wrapHtml) throws IOException {
+ MatrixWritable mw = new MatrixWritable();
+ Text key = new Text();
+ readSeqFile(inputPath, key, mw);
+ String fileName = inputPath.getName();
+ fileName = fileName.substring(fileName.lastIndexOf('/') + 1, fileName.length());
+ Matrix m = mw.get();
+ ConfusionMatrix cm = new ConfusionMatrix(m);
+ if (wrapHtml) {
+ printHeader(out, fileName);
+ }
+ out.println("<p/>");
+ printSummaryTable(cm, out);
+ out.println("<p/>");
+ printGrayTable(cm, out);
+ out.println("<p/>");
+ printCountsTable(cm, out);
+ out.println("<p/>");
+ printTextInBox(cm, out);
+ out.println("<p/>");
+ if (wrapHtml) {
+ printFooter(out);
+ }
+ }
+
+ private static List<String> stripDefault(ConfusionMatrix cm) {
+ List<String> stripped = Lists.newArrayList(cm.getLabels().iterator());
+ String defaultLabel = cm.getDefaultLabel();
+ int unclassified = cm.getTotal(defaultLabel);
+ if (unclassified > 0) {
+ return stripped;
+ }
+ stripped.remove(defaultLabel);
+ return stripped;
+ }
+
+ // TODO: test - this should work with HDFS files
+ private static void readSeqFile(Path path, Text key, MatrixWritable m) throws IOException {
+ Configuration conf = new Configuration();
+ FileSystem fs = FileSystem.get(conf);
+ // close the reader to avoid leaking the underlying stream
+ try (SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf)) {
+ reader.next(key, m);
+ }
+ }
+
+ // TODO: test - this might not work with HDFS files?
+ // after all, it does no seeks
+ private static PrintStream getPrintStream(String outputFilename) throws IOException {
+ if (outputFilename != null) {
+ File outputFile = new File(outputFilename);
+ if (outputFile.exists()) {
+ outputFile.delete();
+ }
+ outputFile.createNewFile();
+ OutputStream os = new FileOutputStream(outputFile);
+ return new PrintStream(os, false, Charsets.UTF_8.displayName());
+ } else {
+ return System.out;
+ }
+ }
+
+ private static int getLabelTotal(ConfusionMatrix cm, String rowLabel) {
+ Iterator<String> iter = cm.getLabels().iterator();
+ int count = 0;
+ while (iter.hasNext()) {
+ count += cm.getCount(rowLabel, iter.next());
+ }
+ return count;
+ }
+
+ // HTML generator code
+
+ private static void printTextInBox(ConfusionMatrix cm, PrintStream out) {
+ out.println("<div style='width:90%;overflow:scroll;'>");
+ out.println("<pre>");
+ out.println(cm.toString());
+ out.println("</pre>");
+ out.println("</div>");
+ }
+
+ public static void printSummaryTable(ConfusionMatrix cm, PrintStream out) {
+ format("<table class='%s'>\n", out, CSS_TABLE);
+ format("<tr class='%s'>", out, CSS_LABEL);
+ out.println("<td>Label</td><td>Total</td><td>Correct</td><td>%</td>");
+ out.println("</tr>");
+ List<String> labels = stripDefault(cm);
+ for (String label : labels) {
+ printSummaryRow(cm, out, label);
+ }
+ out.println("</table>");
+ }
+
+ private static void printSummaryRow(ConfusionMatrix cm, PrintStream out, String label) {
+ format("<tr class='%s'>", out, CSS_CELL);
+ int correct = cm.getCorrect(label);
+ double accuracy = cm.getAccuracy(label);
+ int count = getCount(cm, label);
+ format("<td class='%s'>%s</td><td>%d</td><td>%d</td><td>%d</td>", out, CSS_CELL, label, count, correct,
+ (int) Math.round(accuracy));
+ out.println("</tr>");
+ }
+
+ private static int getCount(ConfusionMatrix cm, String label) {
+ int count = 0;
+ for (String s : cm.getLabels()) {
+ count += cm.getCount(label, s);
+ }
+ return count;
+ }
+
+ public static void printGrayTable(ConfusionMatrix cm, PrintStream out) {
+ format("<table class='%s'>\n", out, CSS_TABLE);
+ printCountsHeader(cm, out, true);
+ printGrayRows(cm, out);
+ out.println("</table>");
+ }
+
+ /**
+ * Print each value in a four-value grayscale based on count/max. Gives a mostly white matrix with grays in
+ * misclassified, and black in diagonal. TODO: Using the sqrt(count/max) as the rating is more stringent
+ */
+ private static void printGrayRows(ConfusionMatrix cm, PrintStream out) {
+ List<String> labels = stripDefault(cm);
+ for (String label : labels) {
+ printGrayRow(cm, out, labels, label);
+ }
+ }
+
+ private static void printGrayRow(ConfusionMatrix cm,
+ PrintStream out,
+ Iterable<String> labels,
+ String rowLabel) {
+ format("<tr class='%s'>", out, CSS_LABEL);
+ format("<td>%s</td>", out, rowLabel);
+ int total = getLabelTotal(cm, rowLabel);
+ for (String columnLabel : labels) {
+ printGrayCell(cm, out, total, rowLabel, columnLabel);
+ }
+ out.println("</tr>");
+ }
+
+ // assign white/light/medium/dark to 0,1/4,1/2,3/4 of total number of inputs
+ // assign black to count = total, meaning complete success
+ // alternative rating is to use sqrt(total) instead of total - this is more drastic
+ private static void printGrayCell(ConfusionMatrix cm,
+ PrintStream out,
+ int total,
+ String rowLabel,
+ String columnLabel) {
+
+ int count = cm.getCount(rowLabel, columnLabel);
+ if (count == 0) {
+ out.format("<td class='%s'/>", CSS_EMPTY);
+ } else {
+ // 0 is white, full is black, everything else gray
+ int rating = (int) ((count / (double) total) * 4);
+ String css = CSS_GRAY_CELLS[rating];
+ format("<td class='%s' title='%s'>%s</td>", out, css, columnLabel, count);
+ }
+ }
+
+ public static void printCountsTable(ConfusionMatrix cm, PrintStream out) {
+ format("<table class='%s'>\n", out, CSS_TABLE);
+ printCountsHeader(cm, out, false);
+ printCountsRows(cm, out);
+ out.println("</table>");
+ }
+
+ private static void printCountsRows(ConfusionMatrix cm, PrintStream out) {
+ List<String> labels = stripDefault(cm);
+ for (String label : labels) {
+ printCountsRow(cm, out, labels, label);
+ }
+ }
+
+ private static void printCountsRow(ConfusionMatrix cm,
+ PrintStream out,
+ Iterable<String> labels,
+ String rowLabel) {
+ out.println("<tr>");
+ format("<td class='%s'>%s</td>", out, CSS_LABEL, rowLabel);
+ for (String columnLabel : labels) {
+ printCountsCell(cm, out, rowLabel, columnLabel);
+ }
+ out.println("</tr>");
+ }
+
+ private static void printCountsCell(ConfusionMatrix cm, PrintStream out, String rowLabel, String columnLabel) {
+ int count = cm.getCount(rowLabel, columnLabel);
+ String s = count == 0 ? "" : Integer.toString(count);
+ format("<td class='%s' title='%s'>%s</td>", out, CSS_CELL, columnLabel, s);
+ }
+
+ private static void printCountsHeader(ConfusionMatrix cm, PrintStream out, boolean vertical) {
+ List<String> labels = stripDefault(cm);
+ int longest = getLongestHeader(labels);
+ if (vertical) {
+ // do vertical - CSS text rotation is fiddly
+ out.format("<tr class='%s' style='height:%dem'><th>&nbsp;</th>%n", CSS_TALL_HEADER, longest / 2);
+ for (String label : labels) {
+ out.format("<th><div class='%s'>%s</div></th>", CSS_VERTICAL, label);
+ }
+ out.println("</tr>");
+ } else {
+ // header - empty cell in upper left
+ out.format("<tr class='%s'><td class='%s'></td>%n", CSS_TABLE, CSS_LABEL);
+ for (String label : labels) {
+ out.format("<td>%s</td>", label);
+ }
+ out.format("</tr>");
+ }
+ }
+
+ private static int getLongestHeader(Iterable<String> labels) {
+ int max = 0;
+ for (String label : labels) {
+ max = Math.max(label.length(), max);
+ }
+ return max;
+ }
+
+ private static void format(String format, PrintStream out, Object... args) {
+ String format2 = String.format(format, args);
+ out.println(format2);
+ }
+
+ public static void printHeader(PrintStream out, CharSequence title) {
+ out.println(HEADER.replace("TITLE", title));
+ }
+
+ public static void printFooter(PrintStream out) {
+ out.println(FOOTER);
+ }
+
+}
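
A sketch of invoking the dumper programmatically (illustrative only); the paths are placeholders:

    import org.apache.mahout.classifier.ConfusionMatrixDumper;

    public class DumpConfusionMatrix {
      public static void main(String[] args) throws Exception {
        // --html wraps the tables in a complete page; pass --text for the simple dump instead.
        String[] dumperArgs = {
            "--input", "/tmp/testdata/confusion-matrix", // SequenceFile<Text,MatrixWritable>, one pair
            "--output", "/tmp/confusion.html",
            "--html"
        };
        ConfusionMatrixDumper.main(dumperArgs);
      }
    }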

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/clustering/cdbw/CDbwEvaluator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/clustering/cdbw/CDbwEvaluator.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/clustering/cdbw/CDbwEvaluator.java
new file mode 100644
index 0000000..545c1ff
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/clustering/cdbw/CDbwEvaluator.java
@@ -0,0 +1,387 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.cdbw;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.GaussianAccumulator;
+import org.apache.mahout.clustering.OnlineGaussianAccumulator;
+import org.apache.mahout.clustering.evaluation.RepresentativePointsDriver;
+import org.apache.mahout.clustering.evaluation.RepresentativePointsMapper;
+import org.apache.mahout.clustering.iterator.ClusterWritable;
+import org.apache.mahout.common.ClassUtils;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable;
+import org.apache.mahout.math.RandomAccessSparseVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.Vector.Element;
+import org.apache.mahout.math.VectorWritable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.TreeMap;
+
+/**
+ * This class calculates the CDbw metric as defined in
+ * http://www.db-net.aueb.gr/index.php/corporate/content/download/227/833/file/HV_poster2002.pdf
+ */
+public final class CDbwEvaluator {
+
+ private static final Logger log = LoggerFactory.getLogger(CDbwEvaluator.class);
+
+ private final Map<Integer,List<VectorWritable>> representativePoints;
+ private final Map<Integer,Double> stDevs = new HashMap<>();
+ private final List<Cluster> clusters;
+ private final DistanceMeasure measure;
+ private Double interClusterDensity = null;
+ // these are symmetric so we only compute half of them
+ private Map<Integer,Map<Integer,Double>> minimumDistances = null;
+ // these are symmetric too
+ private Map<Integer,Map<Integer,Double>> interClusterDensities = null;
+ // these are symmetric too
+ private Map<Integer,Map<Integer,int[]>> closestRepPointIndices = null;
+
+ /**
+ * For testing only
+ *
+ * @param representativePoints
+ * a Map<Integer,List<VectorWritable>> of representative points keyed by clusterId
+ * @param clusters
+ * a List<Cluster> of the clusters
+ * @param measure
+ * an appropriate DistanceMeasure
+ */
+ public CDbwEvaluator(Map<Integer,List<VectorWritable>> representativePoints, List<Cluster> clusters,
+ DistanceMeasure measure) {
+ this.representativePoints = representativePoints;
+ this.clusters = clusters;
+ this.measure = measure;
+ for (Integer cId : representativePoints.keySet()) {
+ computeStd(cId);
+ }
+ }
+
+ /**
+ * Initialize a new instance from job information
+ *
+ * @param conf
+ * a Configuration with appropriate parameters
+ * @param clustersIn
+ * a Path to the input clusters directory
+ */
+ public CDbwEvaluator(Configuration conf, Path clustersIn) {
+ measure = ClassUtils
+ .instantiateAs(conf.get(RepresentativePointsDriver.DISTANCE_MEASURE_KEY), DistanceMeasure.class);
+ representativePoints = RepresentativePointsMapper.getRepresentativePoints(conf);
+ clusters = loadClusters(conf, clustersIn);
+ for (Integer cId : representativePoints.keySet()) {
+ computeStd(cId);
+ }
+ }
+
+ /**
+ * Load the clusters from their sequence files
+ *
+ * @param clustersIn
+ * a Path to the directory containing input cluster files
+ * @return a List<Cluster> of the clusters
+ */
+ private static List<Cluster> loadClusters(Configuration conf, Path clustersIn) {
+ List<Cluster> clusters = new ArrayList<>();
+ for (ClusterWritable clusterWritable : new SequenceFileDirValueIterable<ClusterWritable>(clustersIn, PathType.LIST,
+ PathFilters.logsCRCFilter(), conf)) {
+ Cluster cluster = clusterWritable.getValue();
+ clusters.add(cluster);
+ }
+ return clusters;
+ }
+
+ /**
+ * Compute the standard deviation of the representative points for the given cluster. Store these in stDevs, indexed
+ * by cI
+ *
+ * @param cI
+ * an int clusterId.
+ */
+ private void computeStd(int cI) {
+ List<VectorWritable> repPts = representativePoints.get(cI);
+ GaussianAccumulator accumulator = new OnlineGaussianAccumulator();
+ for (VectorWritable vw : repPts) {
+ accumulator.observe(vw.get(), 1.0);
+ }
+ accumulator.compute();
+ double d = accumulator.getAverageStd();
+ stDevs.put(cI, d);
+ }
+
+ /**
+ * Compute the density of points near the midpoint between the two closest points of the clusters (eqn 2) used for
+ * inter-cluster density calculation
+ *
+ * @param uIJ
+ * the Vector midpoint between the closest representative points of the clusters
+ * @param cI
+ * the int clusterId of the i-th cluster
+ * @param cJ
+ * the int clusterId of the j-th cluster
+ * @param avgStd
+ * the double average standard deviation of the two clusters
+ * @return a double
+ */
+ private double density(Vector uIJ, int cI, int cJ, double avgStd) {
+ List<VectorWritable> repI = representativePoints.get(cI);
+ List<VectorWritable> repJ = representativePoints.get(cJ);
+ double sum = 0.0;
+ // count the number of representative points of the clusters which are within the
+ // average std of the two clusters from the midpoint uIJ (eqn 3)
+ for (VectorWritable vwI : repI) {
+ if (uIJ != null && measure.distance(uIJ, vwI.get()) <= avgStd) {
+ sum++;
+ }
+ }
+ for (VectorWritable vwJ : repJ) {
+ if (uIJ != null && measure.distance(uIJ, vwJ.get()) <= avgStd) {
+ sum++;
+ }
+ }
+ int nI = repI.size();
+ int nJ = repJ.size();
+ return sum / (nI + nJ);
+ }
+
+ /**
+ * Compute the CDbw validity metric (eqn 8). The goal of this metric is to reward clusterings which have a high
+ * intraClusterDensity and also a high cluster separation.
+ *
+ * @return a double
+ */
+ public double getCDbw() {
+ return intraClusterDensity() * separation();
+ }
+
+ /**
+ * The average density within clusters is defined as the percentage of representative points that reside in the
+ * neighborhood of the clusters' centers. The goal is for the density within clusters to be significantly high (eqn 5).
+ *
+ * @return a double
+ */
+ public double intraClusterDensity() {
+ double avgDensity = 0;
+ int count = 0;
+ for (Element elem : intraClusterDensities().nonZeroes()) {
+ double value = elem.get();
+ if (!Double.isNaN(value)) {
+ avgDensity += value;
+ count++;
+ }
+ }
+ return avgDensity / count;
+ }
+
+ /**
+ * This function evaluates the density of points in the regions between each pair of clusters (eqn 1). The goal is for
+ * the density in the area between clusters to be significantly low.
+ *
+ * @return a Map<Integer,Map<Integer,Double>> of the inter-cluster densities
+ */
+ public Map<Integer,Map<Integer,Double>> interClusterDensities() {
+ if (interClusterDensities != null) {
+ return interClusterDensities;
+ }
+ interClusterDensities = new TreeMap<>();
+ // find the closest representative points between the clusters
+ for (int i = 0; i < clusters.size(); i++) {
+ int cI = clusters.get(i).getId();
+ Map<Integer,Double> map = new TreeMap<>();
+ interClusterDensities.put(cI, map);
+ for (int j = i + 1; j < clusters.size(); j++) {
+ int cJ = clusters.get(j).getId();
+ double minDistance = minimumDistance(cI, cJ); // the distance between the closest representative points
+ Vector uIJ = midpointVector(cI, cJ); // the midpoint between the closest representative points
+ double stdSum = stDevs.get(cI) + stDevs.get(cJ);
+ double density = density(uIJ, cI, cJ, stdSum / 2);
+ double interDensity = minDistance * density / stdSum;
+ map.put(cJ, interDensity);
+ if (log.isDebugEnabled()) {
+ log.debug("minDistance[{},{}]={}", cI, cJ, minDistance);
+ log.debug("interDensity[{},{}]={}", cI, cJ, density);
+ log.debug("density[{},{}]={}", cI, cJ, interDensity);
+ }
+ }
+ }
+ return interClusterDensities;
+ }
+
+ /**
+ * Calculate the separation of clusters (eqn 4) taking into account both the distances between the clusters' closest
+ * points and the inter-cluster density. The goal is for the distances between clusters to be high while the
+ * representative point density in the areas between them is low.
+ *
+ * @return a double
+ */
+ public double separation() {
+ double minDistanceSum = 0;
+ Map<Integer,Map<Integer,Double>> distances = minimumDistances();
+ for (Map<Integer,Double> map : distances.values()) {
+ for (Double dist : map.values()) {
+ if (!Double.isInfinite(dist)) {
+ minDistanceSum += dist * 2; // account for other half of calculated triangular minimumDistances matrix
+ }
+ }
+ }
+ return minDistanceSum / (1.0 + interClusterDensity());
+ }
+
+ /**
+ * This function evaluates the average density of points in the regions between clusters (eqn 1). The goal is for the
+ * density in the area between clusters to be significantly low.
+ *
+ * @return a double
+ */
+ public double interClusterDensity() {
+ if (interClusterDensity != null) {
+ return interClusterDensity;
+ }
+ double sum = 0.0;
+ int count = 0;
+ Map<Integer,Map<Integer,Double>> distances = interClusterDensities();
+ for (Map<Integer,Double> row : distances.values()) {
+ for (Double density : row.values()) {
+ if (!Double.isNaN(density)) {
+ sum += density;
+ count++;
+ }
+ }
+ }
+ log.debug("interClusterDensity={}", sum);
+ interClusterDensity = sum / count;
+ return interClusterDensity;
+ }
+
+ /**
+ * The average density within clusters is defined as the percentage of representative points that reside in the
+ * neighborhood of the clusters' centers. The goal is for the density within clusters to be significantly high (eqn 5).
+ *
+ * @return a Vector of the intra-densities of each clusterId
+ */
+ public Vector intraClusterDensities() {
+ Vector densities = new RandomAccessSparseVector(Integer.MAX_VALUE);
+ // compute the average standard deviation of the clusters
+ double stdev = 0.0;
+ for (Integer cI : representativePoints.keySet()) {
+ stdev += stDevs.get(cI);
+ }
+ int c = representativePoints.size();
+ stdev /= c;
+ for (Cluster cluster : clusters) {
+ Integer cI = cluster.getId();
+ List<VectorWritable> repPtsI = representativePoints.get(cI);
+ int r = repPtsI.size();
+ double sumJ = 0.0;
+ // compute the term density (eqn 6)
+ for (VectorWritable pt : repPtsI) {
+ // compute f(x, vIJ) (eqn 7)
+ Vector repJ = pt.get();
+ double densityIJ = measure.distance(cluster.getCenter(), repJ) <= stdev ? 1.0 : 0.0;
+ // accumulate sumJ
+ sumJ += densityIJ / stdev;
+ }
+ densities.set(cI, sumJ / r);
+ }
+ return densities;
+ }
+
+ /**
+ * Calculate and cache the distances between the clusters' closest representative points. Also cache the indices of
+ * the closest representative points for later use.
+ *
+ * @return a Map<Integer,Map<Integer,Double>> of the closest distances, keyed by clusterId
+ */
+ private Map<Integer,Map<Integer,Double>> minimumDistances() {
+ if (minimumDistances != null) {
+ return minimumDistances;
+ }
+ minimumDistances = new TreeMap<>();
+ closestRepPointIndices = new TreeMap<>();
+ for (int i = 0; i < clusters.size(); i++) {
+ Integer cI = clusters.get(i).getId();
+ Map<Integer,Double> map = new TreeMap<>();
+ Map<Integer,int[]> treeMap = new TreeMap<>();
+ closestRepPointIndices.put(cI, treeMap);
+ minimumDistances.put(cI, map);
+ List<VectorWritable> closRepI = representativePoints.get(cI);
+ for (int j = i + 1; j < clusters.size(); j++) {
+ // find min{d(closRepI, closRepJ)}
+ Integer cJ = clusters.get(j).getId();
+ List<VectorWritable> closRepJ = representativePoints.get(cJ);
+ double minDistance = Double.MAX_VALUE;
+ int[] midPointIndices = null;
+ for (int xI = 0; xI < closRepI.size(); xI++) {
+ VectorWritable aRepI = closRepI.get(xI);
+ for (int xJ = 0; xJ < closRepJ.size(); xJ++) {
+ VectorWritable aRepJ = closRepJ.get(xJ);
+ double distance = measure.distance(aRepI.get(), aRepJ.get());
+ if (distance < minDistance) {
+ minDistance = distance;
+ midPointIndices = new int[] {xI, xJ};
+ }
+ }
+ }
+ map.put(cJ, minDistance);
+ treeMap.put(cJ, midPointIndices);
+ }
+ }
+ return minimumDistances;
+ }
+
+ private double minimumDistance(int cI, int cJ) {
+ Map<Integer,Double> distances = minimumDistances().get(cI);
+ if (distances != null) {
+ return distances.get(cJ);
+ } else {
+ return minimumDistances().get(cJ).get(cI);
+ }
+ }
+
+ private Vector midpointVector(int cI, int cJ) {
+ Map<Integer,Double> distances = minimumDistances().get(cI);
+ if (distances != null) {
+ int[] ks = closestRepPointIndices.get(cI).get(cJ);
+ if (ks == null) {
+ return null;
+ }
+ return representativePoints.get(cI).get(ks[0]).get().plus(representativePoints.get(cJ).get(ks[1]).get())
+ .divide(2);
+ } else {
+ int[] ks = closestRepPointIndices.get(cJ).get(cI);
+ if (ks == null) {
+ return null;
+ }
+ return representativePoints.get(cJ).get(ks[1]).get().plus(representativePoints.get(cI).get(ks[0]).get())
+ .divide(2);
+ }
+
+ }
+}
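
A sketch using the testing constructor above (not part of this commit); Kluster's (Vector, int, DistanceMeasure) constructor is assumed from Mahout's k-means package, and the points are illustrative:

    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;

    import org.apache.mahout.clustering.Cluster;
    import org.apache.mahout.clustering.cdbw.CDbwEvaluator;
    import org.apache.mahout.clustering.kmeans.Kluster;
    import org.apache.mahout.common.distance.DistanceMeasure;
    import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
    import org.apache.mahout.math.DenseVector;
    import org.apache.mahout.math.VectorWritable;

    public class CDbwExample {
      public static void main(String[] args) {
        DistanceMeasure measure = new EuclideanDistanceMeasure();

        // Two tiny clusters around (0,0) and (5,5); values are illustrative only.
        List<Cluster> clusters = new ArrayList<>();
        clusters.add(new Kluster(new DenseVector(new double[]{0, 0}), 0, measure));
        clusters.add(new Kluster(new DenseVector(new double[]{5, 5}), 1, measure));

        Map<Integer, List<VectorWritable>> repPoints = new HashMap<>();
        repPoints.put(0, Arrays.asList(
            new VectorWritable(new DenseVector(new double[]{0.1, 0.0})),
            new VectorWritable(new DenseVector(new double[]{-0.1, 0.1}))));
        repPoints.put(1, Arrays.asList(
            new VectorWritable(new DenseVector(new double[]{5.1, 4.9})),
            new VectorWritable(new DenseVector(new double[]{4.9, 5.1}))));

        CDbwEvaluator evaluator = new CDbwEvaluator(repPoints, clusters, measure);
        // Higher CDbw rewards dense clusters that are well separated.
        System.out.println("CDbw = " + evaluator.getCDbw());
      }
    }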

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/clustering/conversion/InputDriver.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/clustering/conversion/InputDriver.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/clustering/conversion/InputDriver.java
new file mode 100644
index 0000000..6a2b376
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/clustering/conversion/InputDriver.java
@@ -0,0 +1,114 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.conversion;
+
+import java.io.IOException;
+
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.OptionException;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.mahout.common.CommandLineUtil;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.math.VectorWritable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This class converts text files containing space-delimited floating point numbers into
+ * Mahout sequence files of VectorWritable, suitable as input to the clustering jobs in
+ * particular and to any Mahout job requiring vector input in general.
+ *
+ */
+public final class InputDriver {
+
+ private static final Logger log = LoggerFactory.getLogger(InputDriver.class);
+
+ private InputDriver() {
+ }
+
+ public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
+ DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
+ ArgumentBuilder abuilder = new ArgumentBuilder();
+ GroupBuilder gbuilder = new GroupBuilder();
+
+ Option inputOpt = DefaultOptionCreator.inputOption().withRequired(false).create();
+ Option outputOpt = DefaultOptionCreator.outputOption().withRequired(false).create();
+ Option vectorOpt = obuilder.withLongName("vector").withRequired(false).withArgument(
+ abuilder.withName("v").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The vector implementation to use.").withShortName("v").create();
+
+ Option helpOpt = DefaultOptionCreator.helpOption();
+
+ Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(
+ vectorOpt).withOption(helpOpt).create();
+
+ try {
+ Parser parser = new Parser();
+ parser.setGroup(group);
+ CommandLine cmdLine = parser.parse(args);
+ if (cmdLine.hasOption(helpOpt)) {
+ CommandLineUtil.printHelp(group);
+ return;
+ }
+
+ Path input = new Path(cmdLine.getValue(inputOpt, "testdata").toString());
+ Path output = new Path(cmdLine.getValue(outputOpt, "output").toString());
+ String vectorClassName = cmdLine.getValue(vectorOpt,
+ "org.apache.mahout.math.RandomAccessSparseVector").toString();
+ runJob(input, output, vectorClassName);
+ } catch (OptionException e) {
+ log.error("Exception parsing command line: ", e);
+ CommandLineUtil.printHelp(group);
+ }
+ }
+
+ public static void runJob(Path input, Path output, String vectorClassName)
+ throws IOException, InterruptedException, ClassNotFoundException {
+ Configuration conf = new Configuration();
+ conf.set("vector.implementation.class.name", vectorClassName);
+ Job job = new Job(conf, "Input Driver running over input: " + input);
+
+ job.setOutputKeyClass(Text.class);
+ job.setOutputValueClass(VectorWritable.class);
+ job.setOutputFormatClass(SequenceFileOutputFormat.class);
+ job.setMapperClass(InputMapper.class);
+ job.setNumReduceTasks(0);
+ job.setJarByClass(InputDriver.class);
+
+ FileInputFormat.addInputPath(job, input);
+ FileOutputFormat.setOutputPath(job, output);
+
+ boolean succeeded = job.waitForCompletion(true);
+ if (!succeeded) {
+ throw new IllegalStateException("Job failed!");
+ }
+ }
+
+}
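
A minimal programmatic invocation (illustrative only); the paths are placeholders and the vector class shown is the documented default:

    import org.apache.hadoop.fs.Path;
    import org.apache.mahout.clustering.conversion.InputDriver;

    public class ConvertTextToVectors {
      public static void main(String[] args) throws Exception {
        // Reads space-delimited numbers from "testdata" and writes SequenceFiles of VectorWritable.
        InputDriver.runJob(new Path("testdata"),
                           new Path("output"),
                           "org.apache.mahout.math.RandomAccessSparseVector");
      }
    }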

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/clustering/conversion/InputMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/clustering/conversion/InputMapper.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/clustering/conversion/InputMapper.java
new file mode 100644
index 0000000..e4c72c6
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/clustering/conversion/InputMapper.java
@@ -0,0 +1,81 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.conversion;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+
+import java.io.IOException;
+import java.lang.reflect.Constructor;
+import java.lang.reflect.InvocationTargetException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.regex.Pattern;
+
+public class InputMapper extends Mapper<LongWritable, Text, Text, VectorWritable> {
+
+ private static final Pattern SPACE = Pattern.compile(" ");
+
+ private Constructor<?> constructor;
+
+ @Override
+ protected void map(LongWritable key, Text values, Context context) throws IOException, InterruptedException {
+
+ String[] numbers = SPACE.split(values.toString());
+ // sometimes there are multiple separator spaces
+ Collection<Double> doubles = new ArrayList<>();
+ for (String value : numbers) {
+ if (!value.isEmpty()) {
+ doubles.add(Double.valueOf(value));
+ }
+ }
+ // ignore empty lines in data file
+ if (!doubles.isEmpty()) {
+ try {
+ Vector result = (Vector) constructor.newInstance(doubles.size());
+ int index = 0;
+ for (Double d : doubles) {
+ result.set(index++, d);
+ }
+ VectorWritable vectorWritable = new VectorWritable(result);
+ context.write(new Text(String.valueOf(index)), vectorWritable);
+
+ } catch (InstantiationException | IllegalAccessException | InvocationTargetException e) {
+ throw new IllegalStateException(e);
+ }
+ }
+ }
+
+ @Override
+ protected void setup(Context context) throws IOException, InterruptedException {
+ super.setup(context);
+ Configuration conf = context.getConfiguration();
+ String vectorImplClassName = conf.get("vector.implementation.class.name");
+ try {
+ Class<? extends Vector> outputClass = conf.getClassByName(vectorImplClassName).asSubclass(Vector.class);
+ constructor = outputClass.getConstructor(int.class);
+ } catch (NoSuchMethodException | ClassNotFoundException e) {
+ throw new IllegalStateException(e);
+ }
+ }
+
+}
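
Note on the emitted key: the mapper writes each vector under the text form of its element count, so a line like "0.1 0.2  0.3" (doubled separators and blank lines are tolerated) becomes a three-element vector keyed by "3".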
r***@apache.org
2018-06-27 14:52:17 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/cassandra/CassandraDataModel.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/cassandra/CassandraDataModel.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/cassandra/CassandraDataModel.java
new file mode 100644
index 0000000..b220993
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/cassandra/CassandraDataModel.java
@@ -0,0 +1,465 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.model.cassandra;
+
+import com.google.common.base.Preconditions;
+import me.prettyprint.cassandra.model.HColumnImpl;
+import me.prettyprint.cassandra.serializers.BytesArraySerializer;
+import me.prettyprint.cassandra.serializers.FloatSerializer;
+import me.prettyprint.cassandra.serializers.LongSerializer;
+import me.prettyprint.cassandra.service.OperationType;
+import me.prettyprint.hector.api.Cluster;
+import me.prettyprint.hector.api.ConsistencyLevelPolicy;
+import me.prettyprint.hector.api.HConsistencyLevel;
+import me.prettyprint.hector.api.Keyspace;
+import me.prettyprint.hector.api.beans.ColumnSlice;
+import me.prettyprint.hector.api.beans.HColumn;
+import me.prettyprint.hector.api.factory.HFactory;
+import me.prettyprint.hector.api.mutation.Mutator;
+import me.prettyprint.hector.api.query.ColumnQuery;
+import me.prettyprint.hector.api.query.CountQuery;
+import me.prettyprint.hector.api.query.SliceQuery;
+import org.apache.mahout.cf.taste.common.NoSuchItemException;
+import org.apache.mahout.cf.taste.common.NoSuchUserException;
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.Cache;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.impl.common.Retriever;
+import org.apache.mahout.cf.taste.impl.model.GenericItemPreferenceArray;
+import org.apache.mahout.cf.taste.impl.model.GenericUserPreferenceArray;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+
+import java.io.Closeable;
+import java.util.Collection;
+import java.util.List;
+import java.util.concurrent.atomic.AtomicReference;
+
+/**
+ * <p>A {@link DataModel} based on a Cassandra keyspace. By default it uses keyspace "recommender" but this
+ * can be configured. Create the keyspace before using this class; this can be done on the Cassandra command
+ * line with a command like {@code create keyspace recommender;}.</p>
+ *
+ * <p>Within the keyspace, this model uses four column families:</p>
+ *
+ * <p>First, it uses a column family called "users". This is keyed by the user ID as an 8-byte long.
+ * It contains a column for every preference the user expresses. The column name is the item ID, again as
+ * an 8-byte long, and the value is the preference, represented as an IEEE 32-bit floating point value.</p>
+ *
+ * <p>It uses an analogous column family called "items" for the same data, but keyed by item ID rather
+ * than user ID. In this column family, column names are user IDs instead.</p>
+ *
+ * <p>It uses a column family called "userIDs" as well, with an identical schema. It has one row under key
+ * 0. It contains a column for every user ID in the model. It has no values.</p>
+ *
+ * <p>Finally it also uses an analogous column family "itemIDs" containing item IDs.</p>
+ *
+ * <p>Each of these four column families needs to be created ahead of time. Again the
+ * Cassandra CLI can be used to do so, with commands like {@code create column family users;}.</p>
+ *
+ * <p>Note that this class uses a long-lived Cassandra client which will run until terminated. You
+ * must {@link #close()} this implementation when done or the JVM will not terminate.</p>
+ *
+ * <p>This implementation still relies heavily on reading data into memory and caching,
+ * as the underlying computation remains too data-intensive to run effectively even against Cassandra. It will
+ * take some time to "warm up", as the first few requests block while user and item data are loaded into
+ * caches. Even so, it sends a great deal of query traffic to Cassandra. It is advisable to employ caching wrapper
+ * classes in your implementation, like {@link org.apache.mahout.cf.taste.impl.recommender.CachingRecommender}
+ * or {@link org.apache.mahout.cf.taste.impl.similarity.CachingItemSimilarity}.</p>
+ */
+public final class CassandraDataModel implements DataModel, Closeable {
+
+ /** Default Cassandra host. Default: localhost */
+ private static final String DEFAULT_HOST = "localhost";
+
+ /** Default Cassandra port. Default: 9160 */
+ private static final int DEFAULT_PORT = 9160;
+
+ /** Default Cassandra keyspace. Default: recommender */
+ private static final String DEFAULT_KEYSPACE = "recommender";
+
+ static final String USERS_CF = "users";
+ static final String ITEMS_CF = "items";
+ static final String USER_IDS_CF = "userIDs";
+ static final String ITEM_IDS_CF = "itemIDs";
+ private static final long ID_ROW_KEY = 0L;
+ private static final byte[] EMPTY = new byte[0];
+
+ private final Cluster cluster;
+ private final Keyspace keyspace;
+ private final Cache<Long,PreferenceArray> userCache;
+ private final Cache<Long,PreferenceArray> itemCache;
+ private final Cache<Long,FastIDSet> itemIDsFromUserCache;
+ private final Cache<Long,FastIDSet> userIDsFromItemCache;
+ private final AtomicReference<Integer> userCountCache;
+ private final AtomicReference<Integer> itemCountCache;
+
+ /**
+ * Uses the standard Cassandra host and port (localhost:9160), and keyspace name ("recommender").
+ */
+ public CassandraDataModel() {
+ this(DEFAULT_HOST, DEFAULT_PORT, DEFAULT_KEYSPACE);
+ }
+
+ /**
+ * @param host Cassandra server host name
+ * @param port Cassandra server port
+ * @param keyspaceName name of Cassandra keyspace to use
+ */
+ public CassandraDataModel(String host, int port, String keyspaceName) {
+
+ Preconditions.checkNotNull(host);
+ Preconditions.checkArgument(port > 0, "port must be greater than 0");
+ Preconditions.checkNotNull(keyspaceName);
+
+ cluster = HFactory.getOrCreateCluster(CassandraDataModel.class.getSimpleName(), host + ':' + port);
+ keyspace = HFactory.createKeyspace(keyspaceName, cluster);
+ keyspace.setConsistencyLevelPolicy(new OneConsistencyLevelPolicy());
+
+ userCache = new Cache<>(new UserPrefArrayRetriever(), 1 << 20);
+ itemCache = new Cache<>(new ItemPrefArrayRetriever(), 1 << 20);
+ itemIDsFromUserCache = new Cache<>(new ItemIDsFromUserRetriever(), 1 << 20);
+ userIDsFromItemCache = new Cache<>(new UserIDsFromItemRetriever(), 1 << 20);
+ userCountCache = new AtomicReference<>(null);
+ itemCountCache = new AtomicReference<>(null);
+ }
+
+ @Override
+ public LongPrimitiveIterator getUserIDs() {
+ SliceQuery<Long,Long,?> query = buildNoValueSliceQuery(USER_IDS_CF);
+ query.setKey(ID_ROW_KEY);
+ FastIDSet userIDs = new FastIDSet();
+ for (HColumn<Long,?> userIDColumn : query.execute().get().getColumns()) {
+ userIDs.add(userIDColumn.getName());
+ }
+ return userIDs.iterator();
+ }
+
+ @Override
+ public PreferenceArray getPreferencesFromUser(long userID) throws TasteException {
+ return userCache.get(userID);
+ }
+
+ @Override
+ public FastIDSet getItemIDsFromUser(long userID) throws TasteException {
+ return itemIDsFromUserCache.get(userID);
+ }
+
+ @Override
+ public LongPrimitiveIterator getItemIDs() {
+ SliceQuery<Long,Long,?> query = buildNoValueSliceQuery(ITEM_IDS_CF);
+ query.setKey(ID_ROW_KEY);
+ FastIDSet itemIDs = new FastIDSet();
+ for (HColumn<Long,?> itemIDColumn : query.execute().get().getColumns()) {
+ itemIDs.add(itemIDColumn.getName());
+ }
+ return itemIDs.iterator();
+ }
+
+ @Override
+ public PreferenceArray getPreferencesForItem(long itemID) throws TasteException {
+ return itemCache.get(itemID);
+ }
+
+ @Override
+ public Float getPreferenceValue(long userID, long itemID) {
+ ColumnQuery<Long,Long,Float> query =
+ HFactory.createColumnQuery(keyspace, LongSerializer.get(), LongSerializer.get(), FloatSerializer.get());
+ query.setColumnFamily(USERS_CF);
+ query.setKey(userID);
+ query.setName(itemID);
+ HColumn<Long,Float> column = query.execute().get();
+ return column == null ? null : column.getValue();
+ }
+
+ @Override
+ public Long getPreferenceTime(long userID, long itemID) {
+ ColumnQuery<Long,Long,?> query =
+ HFactory.createColumnQuery(keyspace, LongSerializer.get(), LongSerializer.get(), BytesArraySerializer.get());
+ query.setColumnFamily(USERS_CF);
+ query.setKey(userID);
+ query.setName(itemID);
+ HColumn<Long,?> result = query.execute().get();
+ return result == null ? null : result.getClock();
+ }
+
+ @Override
+ public int getNumItems() {
+ Integer itemCount = itemCountCache.get();
+ if (itemCount == null) {
+ CountQuery<Long,Long> countQuery =
+ HFactory.createCountQuery(keyspace, LongSerializer.get(), LongSerializer.get());
+ countQuery.setKey(ID_ROW_KEY);
+ countQuery.setColumnFamily(ITEM_IDS_CF);
+ countQuery.setRange(null, null, Integer.MAX_VALUE);
+ itemCount = countQuery.execute().get();
+ itemCountCache.set(itemCount);
+ }
+ return itemCount;
+ }
+
+ @Override
+ public int getNumUsers() {
+ Integer userCount = userCountCache.get();
+ if (userCount == null) {
+ CountQuery<Long,Long> countQuery =
+ HFactory.createCountQuery(keyspace, LongSerializer.get(), LongSerializer.get());
+ countQuery.setKey(ID_ROW_KEY);
+ countQuery.setColumnFamily(USER_IDS_CF);
+ countQuery.setRange(null, null, Integer.MAX_VALUE);
+ userCount = countQuery.execute().get();
+ userCountCache.set(userCount);
+ }
+ return userCount;
+ }
+
+ @Override
+ public int getNumUsersWithPreferenceFor(long itemID) throws TasteException {
+ /*
+ CountQuery<Long,Long> query = HFactory.createCountQuery(keyspace, LongSerializer.get(), LongSerializer.get());
+ query.setColumnFamily(ITEMS_CF);
+ query.setKey(itemID);
+ query.setRange(null, null, Integer.MAX_VALUE);
+ return query.execute().get();
+ */
+ return userIDsFromItemCache.get(itemID).size();
+ }
+
+ @Override
+ public int getNumUsersWithPreferenceFor(long itemID1, long itemID2) throws TasteException {
+ FastIDSet userIDs1 = userIDsFromItemCache.get(itemID1);
+ FastIDSet userIDs2 = userIDsFromItemCache.get(itemID2);
+ return userIDs1.size() < userIDs2.size()
+ ? userIDs2.intersectionSize(userIDs1)
+ : userIDs1.intersectionSize(userIDs2);
+ }
+
+ @Override
+ public void setPreference(long userID, long itemID, float value) {
+
+ if (Float.isNaN(value)) {
+ value = 1.0f;
+ }
+
+ long now = System.currentTimeMillis();
+
+ Mutator<Long> mutator = HFactory.createMutator(keyspace, LongSerializer.get());
+
+ HColumn<Long,Float> itemForUsers = new HColumnImpl<>(LongSerializer.get(), FloatSerializer.get());
+ itemForUsers.setName(itemID);
+ itemForUsers.setClock(now);
+ itemForUsers.setValue(value);
+ mutator.addInsertion(userID, USERS_CF, itemForUsers);
+
+ HColumn<Long,Float> userForItems = new HColumnImpl<>(LongSerializer.get(), FloatSerializer.get());
+ userForItems.setName(userID);
+ userForItems.setClock(now);
+ userForItems.setValue(value);
+ mutator.addInsertion(itemID, ITEMS_CF, userForItems);
+
+ HColumn<Long,byte[]> userIDs = new HColumnImpl<>(LongSerializer.get(), BytesArraySerializer.get());
+ userIDs.setName(userID);
+ userIDs.setClock(now);
+ userIDs.setValue(EMPTY);
+ mutator.addInsertion(ID_ROW_KEY, USER_IDS_CF, userIDs);
+
+ HColumn<Long,byte[]> itemIDs = new HColumnImpl<>(LongSerializer.get(), BytesArraySerializer.get());
+ itemIDs.setName(itemID);
+ itemIDs.setClock(now);
+ itemIDs.setValue(EMPTY);
+ mutator.addInsertion(ID_ROW_KEY, ITEM_IDS_CF, itemIDs);
+
+ mutator.execute();
+ }
+
+ @Override
+ public void removePreference(long userID, long itemID) {
+ Mutator<Long> mutator = HFactory.createMutator(keyspace, LongSerializer.get());
+ mutator.addDeletion(userID, USERS_CF, itemID, LongSerializer.get());
+ mutator.addDeletion(itemID, ITEMS_CF, userID, LongSerializer.get());
+ mutator.execute();
+ // Not deleting from userIDs, itemIDs though
+ }
+
+ /**
+ * @return true
+ */
+ @Override
+ public boolean hasPreferenceValues() {
+ return true;
+ }
+
+ /**
+ * @return Float#NaN
+ */
+ @Override
+ public float getMaxPreference() {
+ return Float.NaN;
+ }
+
+ /**
+ * @return Float#NaN
+ */
+ @Override
+ public float getMinPreference() {
+ return Float.NaN;
+ }
+
+ @Override
+ public void refresh(Collection<Refreshable> alreadyRefreshed) {
+ userCache.clear();
+ itemCache.clear();
+ userIDsFromItemCache.clear();
+ itemIDsFromUserCache.clear();
+ userCountCache.set(null);
+ itemCountCache.set(null);
+ }
+
+ @Override
+ public String toString() {
+ return "CassandraDataModel[" + keyspace + ']';
+ }
+
+ @Override
+ public void close() {
+ HFactory.shutdownCluster(cluster);
+ }
+
+
+ private SliceQuery<Long,Long,byte[]> buildNoValueSliceQuery(String cf) {
+ SliceQuery<Long,Long,byte[]> query =
+ HFactory.createSliceQuery(keyspace, LongSerializer.get(), LongSerializer.get(), BytesArraySerializer.get());
+ query.setColumnFamily(cf);
+ query.setRange(null, null, false, Integer.MAX_VALUE);
+ return query;
+ }
+
+ private SliceQuery<Long,Long,Float> buildValueSliceQuery(String cf) {
+ SliceQuery<Long,Long,Float> query =
+ HFactory.createSliceQuery(keyspace, LongSerializer.get(), LongSerializer.get(), FloatSerializer.get());
+ query.setColumnFamily(cf);
+ query.setRange(null, null, false, Integer.MAX_VALUE);
+ return query;
+ }
+
+
+ private static final class OneConsistencyLevelPolicy implements ConsistencyLevelPolicy {
+ @Override
+ public HConsistencyLevel get(OperationType op) {
+ return HConsistencyLevel.ONE;
+ }
+
+ @Override
+ public HConsistencyLevel get(OperationType op, String cfName) {
+ return HConsistencyLevel.ONE;
+ }
+ }
+
+ private final class UserPrefArrayRetriever implements Retriever<Long, PreferenceArray> {
+ @Override
+ public PreferenceArray get(Long userID) throws TasteException {
+ SliceQuery<Long,Long,Float> query = buildValueSliceQuery(USERS_CF);
+ query.setKey(userID);
+
+ ColumnSlice<Long,Float> result = query.execute().get();
+ if (result == null) {
+ throw new NoSuchUserException(userID);
+ }
+ List<HColumn<Long,Float>> itemIDColumns = result.getColumns();
+ if (itemIDColumns.isEmpty()) {
+ throw new NoSuchUserException(userID);
+ }
+ int size = itemIDColumns.size();
+ PreferenceArray prefs = new GenericUserPreferenceArray(size);
+ prefs.setUserID(0, userID);
+ for (int i = 0; i < size; i++) {
+ HColumn<Long,Float> itemIDColumn = itemIDColumns.get(i);
+ prefs.setItemID(i, itemIDColumn.getName());
+ prefs.setValue(i, itemIDColumn.getValue());
+ }
+ return prefs;
+ }
+ }
+
+ private final class ItemPrefArrayRetriever implements Retriever<Long, PreferenceArray> {
+ @Override
+ public PreferenceArray get(Long itemID) throws TasteException {
+ SliceQuery<Long,Long,Float> query = buildValueSliceQuery(ITEMS_CF);
+ query.setKey(itemID);
+ ColumnSlice<Long,Float> result = query.execute().get();
+ if (result == null) {
+ throw new NoSuchItemException(itemID);
+ }
+ List<HColumn<Long,Float>> userIDColumns = result.getColumns();
+ if (userIDColumns.isEmpty()) {
+ throw new NoSuchItemException(itemID);
+ }
+ int size = userIDColumns.size();
+ PreferenceArray prefs = new GenericItemPreferenceArray(size);
+ prefs.setItemID(0, itemID);
+ for (int i = 0; i < size; i++) {
+ HColumn<Long,Float> userIDColumn = userIDColumns.get(i);
+ prefs.setUserID(i, userIDColumn.getName());
+ prefs.setValue(i, userIDColumn.getValue());
+ }
+ return prefs;
+ }
+ }
+
+ private final class UserIDsFromItemRetriever implements Retriever<Long, FastIDSet> {
+ @Override
+ public FastIDSet get(Long itemID) throws TasteException {
+ SliceQuery<Long,Long,byte[]> query = buildNoValueSliceQuery(ITEMS_CF);
+ query.setKey(itemID);
+ ColumnSlice<Long,byte[]> result = query.execute().get();
+ if (result == null) {
+ throw new NoSuchItemException(itemID);
+ }
+ List<HColumn<Long,byte[]>> columns = result.getColumns();
+ FastIDSet userIDs = new FastIDSet(columns.size());
+ for (HColumn<Long,?> userIDColumn : columns) {
+ userIDs.add(userIDColumn.getName());
+ }
+ return userIDs;
+ }
+ }
+
+ private final class ItemIDsFromUserRetriever implements Retriever<Long, FastIDSet> {
+ @Override
+ public FastIDSet get(Long userID) throws TasteException {
+ SliceQuery<Long,Long,byte[]> query = buildNoValueSliceQuery(USERS_CF);
+ query.setKey(userID);
+ FastIDSet itemIDs = new FastIDSet();
+ ColumnSlice<Long,byte[]> result = query.execute().get();
+ if (result == null) {
+ throw new NoSuchUserException(userID);
+ }
+ List<HColumn<Long,byte[]>> columns = result.getColumns();
+ if (columns.isEmpty()) {
+ throw new NoSuchUserException(userID);
+ }
+ for (HColumn<Long,?> itemIDColumn : columns) {
+ itemIDs.add(itemIDColumn.getName());
+ }
+ return itemIDs;
+ }
+ }
+
+}
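
For anyone trying this model, the schema setup described in the Javadoc plus minimal usage might look like the sketch below. The host, port, and keyspace are the documented defaults; the IDs and surrounding wiring are illustrative only:

    // One-time schema setup on the Cassandra CLI, per the Javadoc above:
    //   create keyspace recommender;
    //   use recommender;
    //   create column family users;
    //   create column family items;
    //   create column family userIDs;
    //   create column family itemIDs;
    import org.apache.mahout.cf.taste.impl.model.cassandra.CassandraDataModel;

    public final class CassandraDataModelSketch {
      public static void main(String[] args) {
        // Explicit defaults: localhost:9160, keyspace "recommender"
        CassandraDataModel model = new CassandraDataModel("localhost", 9160, "recommender");
        try {
          model.setPreference(1234L, 5678L, 3.0f);                    // illustrative IDs
          System.out.println(model.getPreferenceValue(1234L, 5678L)); // 3.0
        } finally {
          model.close(); // required, or the long-lived Hector client keeps running
        }
      }
    }

The per-model caches (the Cache fields above) are what produce the warm-up behavior the Javadoc describes: the first access per user or item blocks on a slice query, later accesses are served from memory.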

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/hbase/HBaseDataModel.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/hbase/HBaseDataModel.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/hbase/HBaseDataModel.java
new file mode 100644
index 0000000..9735ffe
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/hbase/HBaseDataModel.java
@@ -0,0 +1,497 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.model.hbase;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.HBaseConfiguration;
+import org.apache.hadoop.hbase.HColumnDescriptor;
+import org.apache.hadoop.hbase.HTableDescriptor;
+import org.apache.hadoop.hbase.KeyValue;
+import org.apache.hadoop.hbase.client.Delete;
+import org.apache.hadoop.hbase.client.Get;
+import org.apache.hadoop.hbase.client.HBaseAdmin;
+import org.apache.hadoop.hbase.client.HTableFactory;
+import org.apache.hadoop.hbase.client.HTableInterface;
+import org.apache.hadoop.hbase.client.HTablePool;
+import org.apache.hadoop.hbase.client.Put;
+import org.apache.hadoop.hbase.client.Result;
+import org.apache.hadoop.hbase.client.ResultScanner;
+import org.apache.hadoop.hbase.client.Scan;
+import org.apache.hadoop.hbase.filter.FilterList;
+import org.apache.hadoop.hbase.filter.FirstKeyOnlyFilter;
+import org.apache.hadoop.hbase.filter.KeyOnlyFilter;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.mahout.cf.taste.common.NoSuchItemException;
+import org.apache.mahout.cf.taste.common.NoSuchUserException;
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.impl.model.GenericItemPreferenceArray;
+import org.apache.mahout.cf.taste.impl.model.GenericUserPreferenceArray;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.SortedMap;
+
+/**
+ * <p>Naive approach of storing one preference as one value in the table.
+ * Preferences are indexed as (user, item) and (item, user) for O(1) lookups.</p>
+ *
+ * <p>The default table name is "taste"; this can be set through a constructor
+ * argument. Each row key starts with "i" or "u" followed by the
+ * actual id encoded as a big-endian long.</p>
+ *
+ * <p>E.g., "u\x00\x00\x00\x00\x00\x00\x04\xd2" is user 1234L</p>
+ *
+ * <p>There are two column families: "users" and "items".</p>
+ *
+ * <p>The "users" column family holds user->item preferences. Each userID is the
+ * column qualifier and the value is the preference.</p>
+ *
+ * <p>The "items" column fmaily holds item->user preferences. Each itemID is the
+ * column qualifier and the value is the preference.</p>
+ *
+ * <p>User IDs and item IDs are cached in a FastIDSet since it requires a full
+ * table scan to build these sets. Preferences are not cached since they
+ * are pretty cheap lookups in HBase (also caching the Preferences defeats
+ * the purpose of a scalable storage engine like HBase).</p>
+ */
+public final class HBaseDataModel implements DataModel, Closeable {
+
+ private static final Logger log = LoggerFactory.getLogger(HBaseDataModel.class);
+
+ private static final String DEFAULT_TABLE = "taste";
+ private static final byte[] USERS_CF = Bytes.toBytes("users");
+ private static final byte[] ITEMS_CF = Bytes.toBytes("items");
+
+ private final HTablePool pool;
+ private final String tableName;
+
+ // Cache of user and item ids
+ private volatile FastIDSet itemIDs;
+ private volatile FastIDSet userIDs;
+
+ public HBaseDataModel(String zkConnect) throws IOException {
+ this(zkConnect, DEFAULT_TABLE);
+ }
+
+ public HBaseDataModel(String zkConnect, String tableName) throws IOException {
+ log.info("Using HBase table {}", tableName);
+ Configuration conf = HBaseConfiguration.create();
+ conf.set("hbase.zookeeper.quorum", zkConnect);
+ HTableFactory tableFactory = new HTableFactory();
+ this.pool = new HTablePool(conf, 8, tableFactory);
+ this.tableName = tableName;
+
+ bootstrap(conf);
+ // Warm the cache
+ refresh(null);
+ }
+
+ public HBaseDataModel(HTablePool pool, String tableName, Configuration conf) throws IOException {
+ log.info("Using HBase table {}", tableName);
+ this.pool = pool;
+ this.tableName = tableName;
+
+ bootstrap(conf);
+
+ // Warm the cache
+ refresh(null);
+ }
+
+ public String getTableName() {
+ return tableName;
+ }
+
+ /**
+ * Create the table (assumes it does not already exist)
+ */
+ private void bootstrap(Configuration conf) throws IOException {
+ HTableDescriptor tDesc = new HTableDescriptor(Bytes.toBytes(tableName));
+ tDesc.addFamily(new HColumnDescriptor(USERS_CF));
+ tDesc.addFamily(new HColumnDescriptor(ITEMS_CF));
+ try (HBaseAdmin admin = new HBaseAdmin(conf)) {
+ admin.createTable(tDesc);
+ log.info("Created table {}", tableName);
+ }
+ }
+
+ /**
+ * Prefix a user id with "u" and convert to byte[]
+ */
+ private static byte[] userToBytes(long userID) {
+ ByteBuffer bb = ByteBuffer.allocate(9);
+ bb.put((byte)0x75); // The letter "u"
+ bb.putLong(userID);
+ return bb.array();
+ }
+
+ /**
+ * Prefix an item id with "i" and convert to byte[]
+ */
+ private static byte[] itemToBytes(long itemID) {
+ ByteBuffer bb = ByteBuffer.allocate(9);
+ bb.put((byte)0x69); // The letter "i"
+ bb.putLong(itemID);
+ return bb.array();
+ }
+
+ /**
+ * Extract the id out of a prefix byte[] id
+ */
+ private static long bytesToUserOrItemID(byte[] ba) {
+ ByteBuffer bb = ByteBuffer.wrap(ba);
+ return bb.getLong(1);
+ }
+
+ /* DataModel interface */
+
+ @Override
+ public LongPrimitiveIterator getUserIDs() {
+ return userIDs.iterator();
+ }
+
+ @Override
+ public PreferenceArray getPreferencesFromUser(long userID) throws TasteException {
+ Result result;
+ try {
+ HTableInterface table = pool.getTable(tableName);
+ Get get = new Get(userToBytes(userID));
+ get.addFamily(ITEMS_CF);
+ result = table.get(get);
+ table.close();
+ } catch (IOException e) {
+ throw new TasteException("Failed to retrieve user preferences from HBase", e);
+ }
+
+ if (result.isEmpty()) {
+ throw new NoSuchUserException(userID);
+ }
+
+ SortedMap<byte[], byte[]> families = result.getFamilyMap(ITEMS_CF);
+ PreferenceArray prefs = new GenericUserPreferenceArray(families.size());
+ prefs.setUserID(0, userID);
+ int i = 0;
+ for (Map.Entry<byte[], byte[]> entry : families.entrySet()) {
+ prefs.setItemID(i, Bytes.toLong(entry.getKey()));
+ prefs.setValue(i, Bytes.toFloat(entry.getValue()));
+ i++;
+ }
+ return prefs;
+ }
+
+ @Override
+ public FastIDSet getItemIDsFromUser(long userID) throws TasteException {
+ Result result;
+ try {
+ HTableInterface table = pool.getTable(tableName);
+ Get get = new Get(userToBytes(userID));
+ get.addFamily(ITEMS_CF);
+ result = table.get(get);
+ table.close();
+ } catch (IOException e) {
+ throw new TasteException("Failed to retrieve item IDs from HBase", e);
+ }
+
+ if (result.isEmpty()) {
+ throw new NoSuchUserException(userID);
+ }
+
+ SortedMap<byte[],byte[]> families = result.getFamilyMap(ITEMS_CF);
+ FastIDSet ids = new FastIDSet(families.size());
+ for (byte[] family : families.keySet()) {
+ ids.add(Bytes.toLong(family));
+ }
+ return ids;
+ }
+
+ @Override
+ public LongPrimitiveIterator getItemIDs() {
+ return itemIDs.iterator();
+ }
+
+ @Override
+ public PreferenceArray getPreferencesForItem(long itemID) throws TasteException {
+ Result result;
+ try {
+ HTableInterface table = pool.getTable(tableName);
+ Get get = new Get(itemToBytes(itemID));
+ get.addFamily(USERS_CF);
+ result = table.get(get);
+ table.close();
+ } catch (IOException e) {
+ throw new TasteException("Failed to retrieve item preferences from HBase", e);
+ }
+
+ if (result.isEmpty()) {
+ throw new NoSuchItemException(itemID);
+ }
+
+ SortedMap<byte[], byte[]> families = result.getFamilyMap(USERS_CF);
+ PreferenceArray prefs = new GenericItemPreferenceArray(families.size());
+ prefs.setItemID(0, itemID);
+ int i = 0;
+ for (Map.Entry<byte[], byte[]> entry : families.entrySet()) {
+ prefs.setUserID(i, Bytes.toLong(entry.getKey()));
+ prefs.setValue(i, Bytes.toFloat(entry.getValue()));
+ i++;
+ }
+ return prefs;
+ }
+
+ @Override
+ public Float getPreferenceValue(long userID, long itemID) throws TasteException {
+ Result result;
+ try {
+ HTableInterface table = pool.getTable(tableName);
+ Get get = new Get(userToBytes(userID));
+ get.addColumn(ITEMS_CF, Bytes.toBytes(itemID));
+ result = table.get(get);
+ table.close();
+ } catch (IOException e) {
+ throw new TasteException("Failed to retrieve user preferences from HBase", e);
+ }
+
+ if (result.isEmpty()) {
+ throw new NoSuchUserException(userID);
+ }
+
+ if (result.containsColumn(ITEMS_CF, Bytes.toBytes(itemID))) {
+ return Bytes.toFloat(result.getValue(ITEMS_CF, Bytes.toBytes(itemID)));
+ } else {
+ return null;
+ }
+ }
+
+ @Override
+ public Long getPreferenceTime(long userID, long itemID) throws TasteException {
+ Result result;
+ try {
+ HTableInterface table = pool.getTable(tableName);
+ Get get = new Get(userToBytes(userID));
+ get.addColumn(ITEMS_CF, Bytes.toBytes(itemID));
+ result = table.get(get);
+ table.close();
+ } catch (IOException e) {
+ throw new TasteException("Failed to retrieve user preferences from HBase", e);
+ }
+
+ if (result.isEmpty()) {
+ throw new NoSuchUserException(userID);
+ }
+
+ if (result.containsColumn(ITEMS_CF, Bytes.toBytes(itemID))) {
+ KeyValue kv = result.getColumnLatest(ITEMS_CF, Bytes.toBytes(itemID));
+ return kv.getTimestamp();
+ } else {
+ return null;
+ }
+ }
+
+ @Override
+ public int getNumItems() {
+ return itemIDs.size();
+ }
+
+ @Override
+ public int getNumUsers() {
+ return userIDs.size();
+ }
+
+ @Override
+ public int getNumUsersWithPreferenceFor(long itemID) throws TasteException {
+ PreferenceArray prefs = getPreferencesForItem(itemID);
+ return prefs.length();
+ }
+
+ @Override
+ public int getNumUsersWithPreferenceFor(long itemID1, long itemID2) throws TasteException {
+ Result[] results;
+ try {
+ HTableInterface table = pool.getTable(tableName);
+ List<Get> gets = new ArrayList<>(2);
+ gets.add(new Get(itemToBytes(itemID1)));
+ gets.add(new Get(itemToBytes(itemID2)));
+ gets.get(0).addFamily(USERS_CF);
+ gets.get(1).addFamily(USERS_CF);
+ results = table.get(gets);
+ table.close();
+ } catch (IOException e) {
+ throw new TasteException("Failed to retrieve item preferences from HBase", e);
+ }
+
+ if (results[0].isEmpty()) {
+ throw new NoSuchItemException(itemID1);
+ }
+ if (results[1].isEmpty()) {
+ throw new NoSuchItemException(itemID2);
+ }
+
+ // First item
+ Result result = results[0];
+ SortedMap<byte[], byte[]> families = result.getFamilyMap(USERS_CF);
+ FastIDSet idSet1 = new FastIDSet(families.size());
+ for (byte[] id : families.keySet()) {
+ idSet1.add(Bytes.toLong(id));
+ }
+
+ // Second item
+ result = results[1];
+ families = result.getFamilyMap(USERS_CF);
+ FastIDSet idSet2 = new FastIDSet(families.size());
+ for (byte[] id : families.keySet()) {
+ idSet2.add(Bytes.toLong(id));
+ }
+
+ return idSet1.intersectionSize(idSet2);
+ }
+
+ @Override
+ public void setPreference(long userID, long itemID, float value) throws TasteException {
+ try {
+ HTableInterface table = pool.getTable(tableName);
+ List<Put> puts = new ArrayList<>(2);
+ puts.add(new Put(userToBytes(userID)));
+ puts.add(new Put(itemToBytes(itemID)));
+ puts.get(0).add(ITEMS_CF, Bytes.toBytes(itemID), Bytes.toBytes(value));
+ puts.get(1).add(USERS_CF, Bytes.toBytes(userID), Bytes.toBytes(value));
+ table.put(puts);
+ table.close();
+ } catch (IOException e) {
+ throw new TasteException("Failed to store preference in HBase", e);
+ }
+ }
+
+ @Override
+ public void removePreference(long userID, long itemID) throws TasteException {
+ try {
+ HTableInterface table = pool.getTable(tableName);
+ List<Delete> deletes = new ArrayList<>(2);
+ deletes.add(new Delete(userToBytes(userID)));
+ deletes.add(new Delete(itemToBytes(itemID)));
+ deletes.get(0).deleteColumns(ITEMS_CF, Bytes.toBytes(itemID));
+ deletes.get(1).deleteColumns(USERS_CF, Bytes.toBytes(userID));
+ table.delete(deletes);
+ table.close();
+ } catch (IOException e) {
+ throw new TasteException("Failed to remove preference from HBase", e);
+ }
+ }
+
+ @Override
+ public boolean hasPreferenceValues() {
+ return true;
+ }
+
+ @Override
+ public float getMaxPreference() {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public float getMinPreference() {
+ throw new UnsupportedOperationException();
+ }
+
+ /* Closeable interface */
+
+ @Override
+ public void close() throws IOException {
+ pool.close();
+ }
+
+ /* Refreshable interface */
+
+ @Override
+ public void refresh(Collection<Refreshable> alreadyRefreshed) {
+ if (alreadyRefreshed == null || !alreadyRefreshed.contains(this)) {
+ try {
+ log.info("Refreshing item and user ID caches");
+ long t1 = System.currentTimeMillis();
+ refreshItemIDs();
+ refreshUserIDs();
+ long t2 = System.currentTimeMillis();
+ log.info("Finished refreshing caches in {} ms", t2 - t1);
+ } catch (IOException e) {
+ throw new IllegalStateException("Could not reload DataModel", e);
+ }
+ }
+ }
+
+ /*
+ * Refresh the item id cache. Warning: this does a large table scan
+ */
+ private synchronized void refreshItemIDs() throws IOException {
+ // Get the list of item ids
+ HTableInterface table = pool.getTable(tableName);
+ Scan scan = new Scan(new byte[]{0x69}, new byte[]{0x70});
+ scan.setFilter(new FilterList(FilterList.Operator.MUST_PASS_ALL, new KeyOnlyFilter(), new FirstKeyOnlyFilter()));
+ ResultScanner scanner = table.getScanner(scan);
+ Collection<Long> ids = new LinkedList<>();
+ for (Result result : scanner) {
+ ids.add(bytesToUserOrItemID(result.getRow()));
+ }
+ table.close();
+
+ // Copy into FastIDSet
+ FastIDSet itemIDs = new FastIDSet(ids.size());
+ for (long l : ids) {
+ itemIDs.add(l);
+ }
+
+ // Swap with the active
+ this.itemIDs = itemIDs;
+ }
+
+ /*
+ * Refresh the user id cache. Warning: this does a large table scan
+ */
+ private synchronized void refreshUserIDs() throws IOException {
+ // Get the list of user ids
+ HTableInterface table = pool.getTable(tableName);
+ Scan scan = new Scan(new byte[]{0x75}, new byte[]{0x76});
+ scan.setFilter(new FilterList(FilterList.Operator.MUST_PASS_ALL, new KeyOnlyFilter(), new FirstKeyOnlyFilter()));
+ ResultScanner scanner = table.getScanner(scan);
+ Collection<Long> ids = new LinkedList<>();
+ for (Result result : scanner) {
+ ids.add(bytesToUserOrItemID(result.getRow()));
+ }
+ table.close();
+
+ // Copy into FastIDSet
+ FastIDSet userIDs = new FastIDSet(ids.size());
+ for (long l : ids) {
+ userIDs.add(l);
+ }
+
+ // Swap with the active
+ this.userIDs = userIDs;
+ }
+
+}
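
A short sketch of the row-key scheme and basic usage described above; "zkhost:2181" is a placeholder ZooKeeper quorum and the IDs are illustrative:

    import java.nio.ByteBuffer;
    import org.apache.mahout.cf.taste.impl.model.hbase.HBaseDataModel;

    public final class HBaseDataModelSketch {
      public static void main(String[] args) throws Exception {
        // Row-key encoding: one prefix byte plus a big-endian long, so user
        // 1234L becomes u\x00\x00\x00\x00\x00\x00\x04\xd2 as in the Javadoc.
        ByteBuffer bb = ByteBuffer.allocate(9);
        bb.put((byte) 'u').putLong(1234L);

        // "zkhost:2181" is a placeholder quorum; the table defaults to "taste"
        // and is created by bootstrap() in the constructor.
        HBaseDataModel model = new HBaseDataModel("zkhost:2181");
        try {
          model.setPreference(1234L, 5678L, 3.0f);
          System.out.println(model.getPreferenceValue(1234L, 5678L));
        } finally {
          model.close();
        }
      }
    }

The "i"/"u" prefixes are what let refreshItemIDs() and refreshUserIDs() scan the two key ranges ([0x69, 0x70) and [0x75, 0x76)) independently.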

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/AbstractBooleanPrefJDBCDataModel.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/AbstractBooleanPrefJDBCDataModel.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/AbstractBooleanPrefJDBCDataModel.java
new file mode 100644
index 0000000..79ca1ac
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/AbstractBooleanPrefJDBCDataModel.java
@@ -0,0 +1,137 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.model.jdbc;
+
+import java.sql.Connection;
+import java.sql.PreparedStatement;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+
+import javax.sql.DataSource;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.model.BooleanPreference;
+import org.apache.mahout.cf.taste.model.Preference;
+import org.apache.mahout.common.IOUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.base.Preconditions;
+
+public abstract class AbstractBooleanPrefJDBCDataModel extends AbstractJDBCDataModel {
+
+ private static final Logger log = LoggerFactory.getLogger(AbstractBooleanPrefJDBCDataModel.class);
+
+ static final String NO_SUCH_COLUMN = "NO_SUCH_COLUMN";
+
+ private final String setPreferenceSQL;
+
+ protected AbstractBooleanPrefJDBCDataModel(DataSource dataSource,
+ String preferenceTable,
+ String userIDColumn,
+ String itemIDColumn,
+ String preferenceColumn,
+ String getPreferenceSQL,
+ String getPreferenceTimeSQL,
+ String getUserSQL,
+ String getAllUsersSQL,
+ String getNumItemsSQL,
+ String getNumUsersSQL,
+ String setPreferenceSQL,
+ String removePreferenceSQL,
+ String getUsersSQL,
+ String getItemsSQL,
+ String getPrefsForItemSQL,
+ String getNumPreferenceForItemSQL,
+ String getNumPreferenceForItemsSQL,
+ String getMaxPreferenceSQL,
+ String getMinPreferenceSQL) {
+ super(dataSource,
+ preferenceTable,
+ userIDColumn,
+ itemIDColumn,
+ preferenceColumn,
+ getPreferenceSQL,
+ getPreferenceTimeSQL,
+ getUserSQL,
+ getAllUsersSQL,
+ getNumItemsSQL,
+ getNumUsersSQL,
+ setPreferenceSQL,
+ removePreferenceSQL,
+ getUsersSQL,
+ getItemsSQL,
+ getPrefsForItemSQL,
+ getNumPreferenceForItemSQL,
+ getNumPreferenceForItemsSQL,
+ getMaxPreferenceSQL,
+ getMinPreferenceSQL);
+ this.setPreferenceSQL = setPreferenceSQL;
+ }
+
+ @Override
+ protected Preference buildPreference(ResultSet rs) throws SQLException {
+ return new BooleanPreference(getLongColumn(rs, 1), getLongColumn(rs, 2));
+ }
+
+ @Override
+ String getSetPreferenceSQL() {
+ return setPreferenceSQL;
+ }
+
+ @Override
+ public void setPreference(long userID, long itemID, float value) throws TasteException {
+ Preconditions.checkArgument(!Float.isNaN(value), "NaN value");
+ log.debug("Setting preference for user {}, item {}", userID, itemID);
+
+ Connection conn = null;
+ PreparedStatement stmt = null;
+
+ try {
+ conn = getDataSource().getConnection();
+ stmt = conn.prepareStatement(setPreferenceSQL);
+ setLongParameter(stmt, 1, userID);
+ setLongParameter(stmt, 2, itemID);
+
+ log.debug("Executing SQL update: {}", setPreferenceSQL);
+ stmt.executeUpdate();
+
+ } catch (SQLException sqle) {
+ log.warn("Exception while setting preference", sqle);
+ throw new TasteException(sqle);
+ } finally {
+ IOUtils.quietClose(null, stmt, conn);
+ }
+ }
+
+ @Override
+ public boolean hasPreferenceValues() {
+ return false;
+ }
+
+ @Override
+ public float getMaxPreference() {
+ return 1.0f;
+ }
+
+ @Override
+ public float getMinPreference() {
+ return 1.0f;
+ }
+
+}
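
To make the boolean semantics above concrete: the model stores only (user, item) pairs, so setPreference() binds just two parameters and every preference materializes with the fixed value 1.0f. A minimal sketch, with an illustrative DDL using the AbstractJDBCDataModel defaults (taste_preferences, user_id, item_id):

    // Illustrative DDL matching the AbstractJDBCDataModel defaults:
    //   CREATE TABLE taste_preferences (
    //     user_id BIGINT NOT NULL,
    //     item_id BIGINT NOT NULL,
    //     PRIMARY KEY (user_id, item_id)
    //   );
    import org.apache.mahout.cf.taste.impl.model.BooleanPreference;
    import org.apache.mahout.cf.taste.model.Preference;

    public final class BooleanPrefSketch {
      public static void main(String[] args) {
        // buildPreference() above maps a (user_id, item_id) row to a
        // BooleanPreference; no value is stored, so getValue() is always 1.0f.
        Preference p = new BooleanPreference(1234L, 5678L);
        System.out.println(p.getUserID() + " -> " + p.getItemID() + " = " + p.getValue());
      }
    }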

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/AbstractJDBCDataModel.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/AbstractJDBCDataModel.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/AbstractJDBCDataModel.java
new file mode 100644
index 0000000..66f0a77
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/AbstractJDBCDataModel.java
@@ -0,0 +1,787 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.model.jdbc;
+
+import com.google.common.base.Preconditions;
+import org.apache.mahout.cf.taste.common.NoSuchItemException;
+import org.apache.mahout.cf.taste.common.NoSuchUserException;
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.Cache;
+import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.impl.common.Retriever;
+import org.apache.mahout.cf.taste.impl.common.jdbc.AbstractJDBCComponent;
+import org.apache.mahout.cf.taste.impl.common.jdbc.ConnectionPoolDataSource;
+import org.apache.mahout.cf.taste.impl.common.jdbc.ResultSetIterator;
+import org.apache.mahout.cf.taste.impl.model.GenericItemPreferenceArray;
+import org.apache.mahout.cf.taste.impl.model.GenericPreference;
+import org.apache.mahout.cf.taste.impl.model.GenericUserPreferenceArray;
+import org.apache.mahout.cf.taste.model.JDBCDataModel;
+import org.apache.mahout.cf.taste.model.Preference;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.common.IOUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.sql.Connection;
+import java.sql.PreparedStatement;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+import javax.sql.DataSource;
+
+/**
+ * <p>
+ * An abstract superclass for {@link JDBCDataModel} implementations, providing most of the common
+ * functionality that any such implementation would need.
+ * </p>
+ *
+ * <p>
+ * Performance will be a concern with any {@link JDBCDataModel}. There are going to be lots of
+ * simultaneous reads and some writes to one table. Make sure the table is set up optimally -- for example,
+ * you'll want to establish indexes.
+ * </p>
+ *
+ * <p>
+ * You'll also want to use connection pooling of some kind. Most servlet and J2EE containers, such as Tomcat,
+ * provide connection pooling, so make sure the {@link DataSource} they expose is using pooling. Outside such a
+ * container, you can use packages like Jakarta's <a href="http://jakarta.apache.org/commons/dbcp/">DBCP</a>
+ * to create a {@link DataSource} on top of your database whose {@link Connection}s are pooled.
+ * </p>
+ */
+public abstract class AbstractJDBCDataModel extends AbstractJDBCComponent implements JDBCDataModel {
+
+ private static final Logger log = LoggerFactory.getLogger(AbstractJDBCDataModel.class);
+
+ public static final String DEFAULT_PREFERENCE_TABLE = "taste_preferences";
+ public static final String DEFAULT_USER_ID_COLUMN = "user_id";
+ public static final String DEFAULT_ITEM_ID_COLUMN = "item_id";
+ public static final String DEFAULT_PREFERENCE_COLUMN = "preference";
+ public static final String DEFAULT_PREFERENCE_TIME_COLUMN = "timestamp";
+
+ private final DataSource dataSource;
+ private final String preferenceTable;
+ private final String userIDColumn;
+ private final String itemIDColumn;
+ private final String preferenceColumn;
+ private final String getPreferenceSQL;
+ private final String getPreferenceTimeSQL;
+ private final String getUserSQL;
+ private final String getAllUsersSQL;
+ private final String getNumItemsSQL;
+ private final String getNumUsersSQL;
+ private final String setPreferenceSQL;
+ private final String removePreferenceSQL;
+ private final String getUsersSQL;
+ private final String getItemsSQL;
+ private final String getPrefsForItemSQL;
+ private final String getNumPreferenceForItemsSQL;
+ private final String getMaxPreferenceSQL;
+ private final String getMinPreferenceSQL;
+ private int cachedNumUsers;
+ private int cachedNumItems;
+ private final Cache<Long,Integer> itemPrefCounts;
+ private float maxPreference;
+ private float minPreference;
+
+ protected AbstractJDBCDataModel(DataSource dataSource,
+ String getPreferenceSQL,
+ String getPreferenceTimeSQL,
+ String getUserSQL,
+ String getAllUsersSQL,
+ String getNumItemsSQL,
+ String getNumUsersSQL,
+ String setPreferenceSQL,
+ String removePreferenceSQL,
+ String getUsersSQL,
+ String getItemsSQL,
+ String getPrefsForItemSQL,
+ String getNumPreferenceForItemSQL,
+ String getNumPreferenceForItemsSQL,
+ String getMaxPreferenceSQL,
+ String getMinPreferenceSQL) {
+ this(dataSource,
+ DEFAULT_PREFERENCE_TABLE,
+ DEFAULT_USER_ID_COLUMN,
+ DEFAULT_ITEM_ID_COLUMN,
+ DEFAULT_PREFERENCE_COLUMN,
+ getPreferenceSQL,
+ getPreferenceTimeSQL,
+ getUserSQL,
+ getAllUsersSQL,
+ getNumItemsSQL,
+ getNumUsersSQL,
+ setPreferenceSQL,
+ removePreferenceSQL,
+ getUsersSQL,
+ getItemsSQL,
+ getPrefsForItemSQL,
+ getNumPreferenceForItemSQL,
+ getNumPreferenceForItemsSQL,
+ getMaxPreferenceSQL,
+ getMinPreferenceSQL);
+ }
+
+ protected AbstractJDBCDataModel(DataSource dataSource,
+ String preferenceTable,
+ String userIDColumn,
+ String itemIDColumn,
+ String preferenceColumn,
+ String getPreferenceSQL,
+ String getPreferenceTimeSQL,
+ String getUserSQL,
+ String getAllUsersSQL,
+ String getNumItemsSQL,
+ String getNumUsersSQL,
+ String setPreferenceSQL,
+ String removePreferenceSQL,
+ String getUsersSQL,
+ String getItemsSQL,
+ String getPrefsForItemSQL,
+ String getNumPreferenceForItemSQL,
+ String getNumPreferenceForItemsSQL,
+ String getMaxPreferenceSQL,
+ String getMinPreferenceSQL) {
+
+ log.debug("Creating AbstractJDBCModel...");
+
+ AbstractJDBCComponent.checkNotNullAndLog("preferenceTable", preferenceTable);
+ AbstractJDBCComponent.checkNotNullAndLog("userIDColumn", userIDColumn);
+ AbstractJDBCComponent.checkNotNullAndLog("itemIDColumn", itemIDColumn);
+ AbstractJDBCComponent.checkNotNullAndLog("preferenceColumn", preferenceColumn);
+
+ AbstractJDBCComponent.checkNotNullAndLog("dataSource", dataSource);
+ AbstractJDBCComponent.checkNotNullAndLog("getUserSQL", getUserSQL);
+ AbstractJDBCComponent.checkNotNullAndLog("getAllUsersSQL", getAllUsersSQL);
+ AbstractJDBCComponent.checkNotNullAndLog("getPreferenceSQL", getPreferenceSQL);
+ // getPreferenceTimeSQL can be null
+ AbstractJDBCComponent.checkNotNullAndLog("getNumItemsSQL", getNumItemsSQL);
+ AbstractJDBCComponent.checkNotNullAndLog("getNumUsersSQL", getNumUsersSQL);
+ AbstractJDBCComponent.checkNotNullAndLog("setPreferenceSQL", setPreferenceSQL);
+ AbstractJDBCComponent.checkNotNullAndLog("removePreferenceSQL", removePreferenceSQL);
+ AbstractJDBCComponent.checkNotNullAndLog("getUsersSQL", getUsersSQL);
+ AbstractJDBCComponent.checkNotNullAndLog("getItemsSQL", getItemsSQL);
+ AbstractJDBCComponent.checkNotNullAndLog("getPrefsForItemSQL", getPrefsForItemSQL);
+ AbstractJDBCComponent.checkNotNullAndLog("getNumPreferenceForItemSQL", getNumPreferenceForItemSQL);
+ AbstractJDBCComponent.checkNotNullAndLog("getNumPreferenceForItemsSQL", getNumPreferenceForItemsSQL);
+ AbstractJDBCComponent.checkNotNullAndLog("getMaxPreferenceSQL", getMaxPreferenceSQL);
+ AbstractJDBCComponent.checkNotNullAndLog("getMinPreferenceSQL", getMinPreferenceSQL);
+
+ if (!(dataSource instanceof ConnectionPoolDataSource)) {
+ log.warn("You are not using ConnectionPoolDataSource. Make sure your DataSource pools connections "
+ + "to the database itself, or database performance will be severely reduced.");
+ }
+
+ this.preferenceTable = preferenceTable;
+ this.userIDColumn = userIDColumn;
+ this.itemIDColumn = itemIDColumn;
+ this.preferenceColumn = preferenceColumn;
+
+ this.dataSource = dataSource;
+ this.getPreferenceSQL = getPreferenceSQL;
+ this.getPreferenceTimeSQL = getPreferenceTimeSQL;
+ this.getUserSQL = getUserSQL;
+ this.getAllUsersSQL = getAllUsersSQL;
+ this.getNumItemsSQL = getNumItemsSQL;
+ this.getNumUsersSQL = getNumUsersSQL;
+ this.setPreferenceSQL = setPreferenceSQL;
+ this.removePreferenceSQL = removePreferenceSQL;
+ this.getUsersSQL = getUsersSQL;
+ this.getItemsSQL = getItemsSQL;
+ this.getPrefsForItemSQL = getPrefsForItemSQL;
+ //this.getNumPreferenceForItemSQL = getNumPreferenceForItemSQL;
+ this.getNumPreferenceForItemsSQL = getNumPreferenceForItemsSQL;
+ this.getMaxPreferenceSQL = getMaxPreferenceSQL;
+ this.getMinPreferenceSQL = getMinPreferenceSQL;
+
+ this.cachedNumUsers = -1;
+ this.cachedNumItems = -1;
+ this.itemPrefCounts = new Cache<>(new ItemPrefCountRetriever(getNumPreferenceForItemSQL));
+
+ this.maxPreference = Float.NaN;
+ this.minPreference = Float.NaN;
+ }
+
+ /** @return the {@link DataSource} that this instance is using */
+ @Override
+ public DataSource getDataSource() {
+ return dataSource;
+ }
+
+ public String getPreferenceTable() {
+ return preferenceTable;
+ }
+
+ public String getUserIDColumn() {
+ return userIDColumn;
+ }
+
+ public String getItemIDColumn() {
+ return itemIDColumn;
+ }
+
+ public String getPreferenceColumn() {
+ return preferenceColumn;
+ }
+
+ String getSetPreferenceSQL() {
+ return setPreferenceSQL;
+ }
+
+ @Override
+ public LongPrimitiveIterator getUserIDs() throws TasteException {
+ log.debug("Retrieving all users...");
+ try {
+ return new ResultSetIDIterator(getUsersSQL);
+ } catch (SQLException sqle) {
+ throw new TasteException(sqle);
+ }
+ }
+
+ /**
+ * @throws NoSuchUserException
+ * if there is no such user
+ */
+ @Override
+ public PreferenceArray getPreferencesFromUser(long userID) throws TasteException {
+
+ log.debug("Retrieving user ID '{}'", userID);
+
+ Connection conn = null;
+ PreparedStatement stmt = null;
+ ResultSet rs = null;
+
+ try {
+ conn = dataSource.getConnection();
+ stmt = conn.prepareStatement(getUserSQL, ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY);
+ stmt.setFetchDirection(ResultSet.FETCH_FORWARD);
+ stmt.setFetchSize(getFetchSize());
+ setLongParameter(stmt, 1, userID);
+
+ log.debug("Executing SQL query: {}", getUserSQL);
+ rs = stmt.executeQuery();
+
+ List<Preference> prefs = new ArrayList<>();
+ while (rs.next()) {
+ prefs.add(buildPreference(rs));
+ }
+
+ if (prefs.isEmpty()) {
+ throw new NoSuchUserException(userID);
+ }
+
+ return new GenericUserPreferenceArray(prefs);
+
+ } catch (SQLException sqle) {
+ log.warn("Exception while retrieving user", sqle);
+ throw new TasteException(sqle);
+ } finally {
+ IOUtils.quietClose(rs, stmt, conn);
+ }
+
+ }
+
+ @Override
+ public FastByIDMap<PreferenceArray> exportWithPrefs() throws TasteException {
+ log.debug("Exporting all data");
+
+ Connection conn = null;
+ Statement stmt = null;
+ ResultSet rs = null;
+
+ FastByIDMap<PreferenceArray> result = new FastByIDMap<>();
+
+ try {
+ conn = dataSource.getConnection();
+ stmt = conn.createStatement(ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY);
+ stmt.setFetchDirection(ResultSet.FETCH_FORWARD);
+ stmt.setFetchSize(getFetchSize());
+
+ log.debug("Executing SQL query: {}", getAllUsersSQL);
+ rs = stmt.executeQuery(getAllUsersSQL);
+
+ Long currentUserID = null;
+ List<Preference> currentPrefs = new ArrayList<>();
+ while (rs.next()) {
+ long nextUserID = getLongColumn(rs, 1);
+ if (currentUserID != null && !currentUserID.equals(nextUserID) && !currentPrefs.isEmpty()) {
+ result.put(currentUserID, new GenericUserPreferenceArray(currentPrefs));
+ currentPrefs.clear();
+ }
+ currentPrefs.add(buildPreference(rs));
+ currentUserID = nextUserID;
+ }
+ if (!currentPrefs.isEmpty()) {
+ result.put(currentUserID, new GenericUserPreferenceArray(currentPrefs));
+ }
+
+ return result;
+
+ } catch (SQLException sqle) {
+ log.warn("Exception while exporting all data", sqle);
+ throw new TasteException(sqle);
+ } finally {
+ IOUtils.quietClose(rs, stmt, conn);
+
+ }
+ }
+
+ @Override
+ public FastByIDMap<FastIDSet> exportWithIDsOnly() throws TasteException {
+ log.debug("Exporting all data");
+
+ Connection conn = null;
+ Statement stmt = null;
+ ResultSet rs = null;
+
+ FastByIDMap<FastIDSet> result = new FastByIDMap<>();
+
+ try {
+ conn = dataSource.getConnection();
+ stmt = conn.createStatement(ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY);
+ stmt.setFetchDirection(ResultSet.FETCH_FORWARD);
+ stmt.setFetchSize(getFetchSize());
+
+ log.debug("Executing SQL query: {}", getAllUsersSQL);
+ rs = stmt.executeQuery(getAllUsersSQL);
+
+ boolean currentUserIDSet = false;
+ long currentUserID = 0L; // value isn't used
+ FastIDSet currentItemIDs = new FastIDSet(2);
+ while (rs.next()) {
+ long nextUserID = getLongColumn(rs, 1);
+ if (currentUserIDSet && currentUserID != nextUserID && !currentItemIDs.isEmpty()) {
+ result.put(currentUserID, currentItemIDs);
+ currentItemIDs = new FastIDSet(2);
+ }
+ currentItemIDs.add(getLongColumn(rs, 2));
+ currentUserID = nextUserID;
+ currentUserIDSet = true;
+ }
+ if (!currentItemIDs.isEmpty()) {
+ result.put(currentUserID, currentItemIDs);
+ }
+
+ return result;
+
+ } catch (SQLException sqle) {
+ log.warn("Exception while exporting all data", sqle);
+ throw new TasteException(sqle);
+ } finally {
+ IOUtils.quietClose(rs, stmt, conn);
+
+ }
+ }
+
+ /**
+ * @throws NoSuchUserException
+ * if there is no such user
+ */
+ @Override
+ public FastIDSet getItemIDsFromUser(long userID) throws TasteException {
+
+ log.debug("Retrieving items for user ID '{}'", userID);
+
+ Connection conn = null;
+ PreparedStatement stmt = null;
+ ResultSet rs = null;
+
+ try {
+ conn = dataSource.getConnection();
+ stmt = conn.prepareStatement(getUserSQL, ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY);
+ stmt.setFetchDirection(ResultSet.FETCH_FORWARD);
+ stmt.setFetchSize(getFetchSize());
+ setLongParameter(stmt, 1, userID);
+
+ log.debug("Executing SQL query: {}", getUserSQL);
+ rs = stmt.executeQuery();
+
+ FastIDSet result = new FastIDSet();
+ while (rs.next()) {
+ result.add(getLongColumn(rs, 2));
+ }
+
+ if (result.isEmpty()) {
+ throw new NoSuchUserException(userID);
+ }
+
+ return result;
+
+ } catch (SQLException sqle) {
+ log.warn("Exception while retrieving item s", sqle);
+ throw new TasteException(sqle);
+ } finally {
+ IOUtils.quietClose(rs, stmt, conn);
+ }
+
+ }
+
+ @Override
+ public Float getPreferenceValue(long userID, long itemID) throws TasteException {
+ log.debug("Retrieving preferences for item ID '{}'", itemID);
+ Connection conn = null;
+ PreparedStatement stmt = null;
+ ResultSet rs = null;
+ try {
+ conn = dataSource.getConnection();
+ stmt = conn.prepareStatement(getPreferenceSQL, ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY);
+ stmt.setFetchDirection(ResultSet.FETCH_FORWARD);
+ stmt.setFetchSize(1);
+ setLongParameter(stmt, 1, userID);
+ setLongParameter(stmt, 2, itemID);
+
+ log.debug("Executing SQL query: {}", getPreferenceSQL);
+ rs = stmt.executeQuery();
+ if (rs.next()) {
+ return rs.getFloat(1);
+ } else {
+ return null;
+ }
+ } catch (SQLException sqle) {
+ log.warn("Exception while retrieving prefs for item", sqle);
+ throw new TasteException(sqle);
+ } finally {
+ IOUtils.quietClose(rs, stmt, conn);
+ }
+ }
+
+ @Override
+ public Long getPreferenceTime(long userID, long itemID) throws TasteException {
+ if (getPreferenceTimeSQL == null) {
+ return null;
+ }
+ log.debug("Retrieving preference time for item ID '{}'", itemID);
+ Connection conn = null;
+ PreparedStatement stmt = null;
+ ResultSet rs = null;
+ try {
+ conn = dataSource.getConnection();
+ stmt = conn.prepareStatement(getPreferenceTimeSQL, ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY);
+ stmt.setFetchDirection(ResultSet.FETCH_FORWARD);
+ stmt.setFetchSize(1);
+ setLongParameter(stmt, 1, userID);
+ setLongParameter(stmt, 2, itemID);
+
+ log.debug("Executing SQL query: {}", getPreferenceTimeSQL);
+ rs = stmt.executeQuery();
+ if (rs.next()) {
+ return rs.getLong(1);
+ } else {
+ return null;
+ }
+ } catch (SQLException sqle) {
+ log.warn("Exception while retrieving time for item", sqle);
+ throw new TasteException(sqle);
+ } finally {
+ IOUtils.quietClose(rs, stmt, conn);
+ }
+ }
+
+ @Override
+ public LongPrimitiveIterator getItemIDs() throws TasteException {
+ log.debug("Retrieving all items...");
+ try {
+ return new ResultSetIDIterator(getItemsSQL);
+ } catch (SQLException sqle) {
+ throw new TasteException(sqle);
+ }
+ }
+
+ @Override
+ public PreferenceArray getPreferencesForItem(long itemID) throws TasteException {
+ List<Preference> list = doGetPreferencesForItem(itemID);
+ if (list.isEmpty()) {
+ throw new NoSuchItemException(itemID);
+ }
+ return new GenericItemPreferenceArray(list);
+ }
+
+ protected List<Preference> doGetPreferencesForItem(long itemID) throws TasteException {
+ log.debug("Retrieving preferences for item ID '{}'", itemID);
+ Connection conn = null;
+ PreparedStatement stmt = null;
+ ResultSet rs = null;
+ try {
+ conn = dataSource.getConnection();
+ stmt = conn.prepareStatement(getPrefsForItemSQL, ResultSet.TYPE_FORWARD_ONLY,
+ ResultSet.CONCUR_READ_ONLY);
+ stmt.setFetchDirection(ResultSet.FETCH_FORWARD);
+ stmt.setFetchSize(getFetchSize());
+ setLongParameter(stmt, 1, itemID);
+
+ log.debug("Executing SQL query: {}", getPrefsForItemSQL);
+ rs = stmt.executeQuery();
+ List<Preference> prefs = new ArrayList<>();
+ while (rs.next()) {
+ prefs.add(buildPreference(rs));
+ }
+ return prefs;
+ } catch (SQLException sqle) {
+ log.warn("Exception while retrieving prefs for item", sqle);
+ throw new TasteException(sqle);
+ } finally {
+ IOUtils.quietClose(rs, stmt, conn);
+ }
+ }
+
+ @Override
+ public int getNumItems() throws TasteException {
+ if (cachedNumItems < 0) {
+ cachedNumItems = getNumThings("items", getNumItemsSQL);
+ }
+ return cachedNumItems;
+ }
+
+ @Override
+ public int getNumUsers() throws TasteException {
+ if (cachedNumUsers < 0) {
+ cachedNumUsers = getNumThings("users", getNumUsersSQL);
+ }
+ return cachedNumUsers;
+ }
+
+ @Override
+ public int getNumUsersWithPreferenceFor(long itemID) throws TasteException {
+ return itemPrefCounts.get(itemID);
+ }
+
+ @Override
+ public int getNumUsersWithPreferenceFor(long itemID1, long itemID2) throws TasteException {
+ return getNumThings("user preferring items", getNumPreferenceForItemsSQL, itemID1, itemID2);
+ }
+
+ private int getNumThings(String name, String sql, long... args) throws TasteException {
+ log.debug("Retrieving number of {} in model", name);
+ Connection conn = null;
+ PreparedStatement stmt = null;
+ ResultSet rs = null;
+ try {
+ conn = dataSource.getConnection();
+ stmt = conn.prepareStatement(sql, ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY);
+ stmt.setFetchDirection(ResultSet.FETCH_FORWARD);
+ stmt.setFetchSize(getFetchSize());
+ if (args != null) {
+ for (int i = 1; i <= args.length; i++) {
+ setLongParameter(stmt, i, args[i - 1]);
+ }
+ }
+ log.debug("Executing SQL query: {}", sql);
+ rs = stmt.executeQuery();
+ rs.next();
+ return rs.getInt(1);
+ } catch (SQLException sqle) {
+ log.warn("Exception while retrieving number of {}", name, sqle);
+ throw new TasteException(sqle);
+ } finally {
+ IOUtils.quietClose(rs, stmt, conn);
+ }
+ }
+
+ @Override
+ public void setPreference(long userID, long itemID, float value) throws TasteException {
+ Preconditions.checkArgument(!Float.isNaN(value), "NaN value");
+
+ log.debug("Setting preference for user {}, item {}", userID, itemID);
+
+ Connection conn = null;
+ PreparedStatement stmt = null;
+
+ try {
+ conn = dataSource.getConnection();
+ stmt = conn.prepareStatement(setPreferenceSQL);
+ setLongParameter(stmt, 1, userID);
+ setLongParameter(stmt, 2, itemID);
+ stmt.setDouble(3, value);
+ stmt.setDouble(4, value);
+
+ log.debug("Executing SQL update: {}", setPreferenceSQL);
+ stmt.executeUpdate();
+
+ } catch (SQLException sqle) {
+ log.warn("Exception while setting preference", sqle);
+ throw new TasteException(sqle);
+ } finally {
+ IOUtils.quietClose(null, stmt, conn);
+ }
+ }
+
+ @Override
+ public void removePreference(long userID, long itemID) throws TasteException {
+
+ log.debug("Removing preference for user '{}', item '{}'", userID, itemID);
+
+ Connection conn = null;
+ PreparedStatement stmt = null;
+
+ try {
+ conn = dataSource.getConnection();
+ stmt = conn.prepareStatement(removePreferenceSQL);
+ setLongParameter(stmt, 1, userID);
+ setLongParameter(stmt, 2, itemID);
+
+ log.debug("Executing SQL update: {}", removePreferenceSQL);
+ stmt.executeUpdate();
+
+ } catch (SQLException sqle) {
+ log.warn("Exception while removing preference", sqle);
+ throw new TasteException(sqle);
+ } finally {
+ IOUtils.quietClose(null, stmt, conn);
+ }
+ }
+
+ @Override
+ public void refresh(Collection<Refreshable> alreadyRefreshed) {
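+ // invalidate all cached aggregates; they are lazily recomputed on next access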
+ cachedNumUsers = -1;
+ cachedNumItems = -1;
+ minPreference = Float.NaN;
+ maxPreference = Float.NaN;
+ itemPrefCounts.clear();
+ }
+
+ @Override
+ public boolean hasPreferenceValues() {
+ return true;
+ }
+
+ @Override
+ public float getMaxPreference() {
+ if (Float.isNaN(maxPreference)) {
+ Connection conn = null;
+ PreparedStatement stmt = null;
+ ResultSet rs = null;
+ try {
+ conn = dataSource.getConnection();
+ stmt = conn.prepareStatement(getMaxPreferenceSQL);
+
+ log.debug("Executing SQL query: {}", getMaxPreferenceSQL);
+ rs = stmt.executeQuery();
+ rs.next();
+ maxPreference = rs.getFloat(1);
+
+ } catch (SQLException sqle) {
+ log.warn("Exception while removing preference", sqle);
+ // do nothing
+ } finally {
+ IOUtils.quietClose(rs, stmt, conn);
+ }
+ }
+ return maxPreference;
+ }
+
+ @Override
+ public float getMinPreference() {
+ if (Float.isNaN(minPreference)) {
+ Connection conn = null;
+ PreparedStatement stmt = null;
+ ResultSet rs = null;
+ try {
+ conn = dataSource.getConnection();
+ stmt = conn.prepareStatement(getMinPreferenceSQL);
+
+ log.debug("Executing SQL query: {}", getMinPreferenceSQL);
+ rs = stmt.executeQuery();
+ rs.next();
+ minPreference = rs.getFloat(1);
+
+ } catch (SQLException sqle) {
+ log.warn("Exception while removing preference", sqle);
+ // do nothing
+ } finally {
+ IOUtils.quietClose(rs, stmt, conn);
+ }
+ }
+ return minPreference;
+ }
+
+ // Some overridable methods to customize the class behavior:
+
+ protected Preference buildPreference(ResultSet rs) throws SQLException {
+ return new GenericPreference(getLongColumn(rs, 1), getLongColumn(rs, 2), rs.getFloat(3));
+ }
+
+ /**
+ * Subclasses may wish to override this if ID values in the database are not numeric. This provides a hook by
+ * which subclasses can inject an {@link org.apache.mahout.cf.taste.model.IDMigrator} to perform
+ * translation.
+ */
+ protected long getLongColumn(ResultSet rs, int position) throws SQLException {
+ return rs.getLong(position);
+ }
+
+ /**
+ * Subclasses may wish to override this if ID values in the database are not numeric. This provides a hook by
+ * which subclasses can inject an {@link org.apache.mahout.cf.taste.model.IDMigrator} to perform
+ * translation.
+ */
+ protected void setLongParameter(PreparedStatement stmt, int position, long value) throws SQLException {
+ stmt.setLong(position, value);
+ }
+
+ /**
+ * <p>
+ * An {@link java.util.Iterator} which returns items from a {@link ResultSet}. This is a useful way to
+ * iterate over all user data since it does not require all data to be read into memory at once. It does,
+ * however, require that the DB connection be held open. Note that this class will only release database
+ * resources after {@link #hasNext()} has been called and has returned {@code false}; callers should
+ * make sure to "drain" the entire set of data to avoid tying up database resources.
+ * </p>
+ */
+ private final class ResultSetIDIterator extends ResultSetIterator<Long> implements LongPrimitiveIterator {
+
+ private ResultSetIDIterator(String sql) throws SQLException {
+ super(dataSource, sql);
+ }
+
+ @Override
+ protected Long parseElement(ResultSet resultSet) throws SQLException {
+ return getLongColumn(resultSet, 1);
+ }
+
+ @Override
+ public long nextLong() {
+ return next();
+ }
+
+ /**
+ * @throws UnsupportedOperationException
+ */
+ @Override
+ public long peek() {
+ // This could be supported; is it worth it?
+ throw new UnsupportedOperationException();
+ }
+ }
+
+ private final class ItemPrefCountRetriever implements Retriever<Long,Integer> {
+ private final String getNumPreferenceForItemSQL;
+
+ private ItemPrefCountRetriever(String getNumPreferenceForItemSQL) {
+ this.getNumPreferenceForItemSQL = getNumPreferenceForItemSQL;
+ }
+
+ @Override
+ public Integer get(Long key) throws TasteException {
+ return getNumThings("user preferring item", getNumPreferenceForItemSQL, key);
+ }
+ }
+}
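The two protected hooks above (getLongColumn and setLongParameter) are the extension point the javadoc describes for non-numeric IDs. As a rough sketch only -- the subclass name, the injected IDMigrator, and the reuse of MySQLJDBCDataModel's no-argument constructor are illustrative assumptions, not part of this commit; toStringID may throw TasteException, which is re-wrapped here -- a translating subclass might look like:

package org.apache.mahout.cf.taste.impl.model.jdbc;

import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;

import org.apache.mahout.cf.taste.common.TasteException;
import org.apache.mahout.cf.taste.model.IDMigrator;

// Hypothetical subclass for a schema whose ID columns are strings
final class StringIDJDBCDataModel extends MySQLJDBCDataModel {

  private final IDMigrator migrator;

  StringIDJDBCDataModel(IDMigrator migrator) throws TasteException {
    this.migrator = migrator;
  }

  @Override
  protected long getLongColumn(ResultSet rs, int position) throws SQLException {
    // Read the string ID and translate it to a long
    return migrator.toLongID(rs.getString(position));
  }

  @Override
  protected void setLongParameter(PreparedStatement stmt, int position, long value) throws SQLException {
    try {
      // Translate the long back to the original string ID before binding
      stmt.setString(position, migrator.toStringID(value));
    } catch (TasteException te) {
      throw new SQLException(te);
    }
  }
}

Whether toStringID can recover the original string depends on the IDMigrator chosen; a one-way hashing migrator would only support the read path.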

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/ConnectionPoolDataSource.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/ConnectionPoolDataSource.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/ConnectionPoolDataSource.java
new file mode 100644
index 0000000..ff7f661
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/ConnectionPoolDataSource.java
@@ -0,0 +1,122 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.model.jdbc;
+
+import java.io.PrintWriter;
+import java.sql.Connection;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.sql.SQLFeatureNotSupportedException;
+import java.util.logging.Logger;
+
+import javax.sql.DataSource;
+
+import org.apache.commons.dbcp.ConnectionFactory;
+import org.apache.commons.dbcp.PoolableConnectionFactory;
+import org.apache.commons.dbcp.PoolingDataSource;
+import org.apache.commons.pool.impl.GenericObjectPool;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * <p>
+ * A wrapper {@link DataSource} which pools connections.
+ * </p>
+ */
+public final class ConnectionPoolDataSource implements DataSource {
+
+ private final DataSource delegate;
+
+ public ConnectionPoolDataSource(DataSource underlyingDataSource) {
+ Preconditions.checkNotNull(underlyingDataSource);
+ ConnectionFactory connectionFactory = new ConfiguringConnectionFactory(underlyingDataSource);
+ GenericObjectPool objectPool = new GenericObjectPool();
+ objectPool.setTestOnBorrow(false);
+ objectPool.setTestOnReturn(false);
+ objectPool.setTestWhileIdle(true);
+ objectPool.setTimeBetweenEvictionRunsMillis(60 * 1000L);
+ // The PoolableConnectionFactory constructor registers itself as the pool's factory, so no reference need be kept
+ new PoolableConnectionFactory(connectionFactory, objectPool, null, "SELECT 1", false, false);
+ delegate = new PoolingDataSource(objectPool);
+ }
+
+ @Override
+ public Connection getConnection() throws SQLException {
+ return delegate.getConnection();
+ }
+
+ @Override
+ public Connection getConnection(String username, String password) throws SQLException {
+ return delegate.getConnection(username, password);
+ }
+
+ @Override
+ public PrintWriter getLogWriter() throws SQLException {
+ return delegate.getLogWriter();
+ }
+
+ @Override
+ public void setLogWriter(PrintWriter printWriter) throws SQLException {
+ delegate.setLogWriter(printWriter);
+ }
+
+ @Override
+ public void setLoginTimeout(int timeout) throws SQLException {
+ delegate.setLoginTimeout(timeout);
+ }
+
+ @Override
+ public int getLoginTimeout() throws SQLException {
+ return delegate.getLoginTimeout();
+ }
+
+ @Override
+ public <T> T unwrap(Class<T> iface) throws SQLException {
+ return delegate.unwrap(iface);
+ }
+
+ @Override
+ public boolean isWrapperFor(Class<?> iface) throws SQLException {
+ return delegate.isWrapperFor(iface);
+ }
+
+ // This exists for compatibility with Java 7 / JDBC 4.1, but has no
+ // counterpart in Java 6. Under Java 7 it would carry @Override; under 6 it must not.
+ // @Override
+ public Logger getParentLogger() throws SQLFeatureNotSupportedException {
+ throw new SQLFeatureNotSupportedException();
+ }
+
+ private static class ConfiguringConnectionFactory implements ConnectionFactory {
+
+ private final DataSource underlyingDataSource;
+
+ ConfiguringConnectionFactory(DataSource underlyingDataSource) {
+ this.underlyingDataSource = underlyingDataSource;
+ }
+
+ @Override
+ public Connection createConnection() throws SQLException {
+ Connection connection = underlyingDataSource.getConnection();
+ connection.setTransactionIsolation(Connection.TRANSACTION_READ_UNCOMMITTED);
+ connection.setHoldability(ResultSet.CLOSE_CURSORS_AT_COMMIT);
+ return connection;
+ }
+ }
+
+}
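For context on how this wrapper gets used: it is meant to be dropped in front of a vendor DataSource before that source is handed to a JDBC data model. A minimal sketch, in which the BasicDataSource wiring, driver class, URL, and credentials are placeholders rather than anything from this commit:

import javax.sql.DataSource;

import org.apache.commons.dbcp.BasicDataSource;
import org.apache.mahout.cf.taste.impl.model.jdbc.ConnectionPoolDataSource;

public final class PoolingExample {
  public static void main(String[] args) throws Exception {
    // Placeholder connection details; substitute your own database
    BasicDataSource underlying = new BasicDataSource();
    underlying.setDriverClassName("com.mysql.jdbc.Driver");
    underlying.setUrl("jdbc:mysql://localhost/taste");
    underlying.setUsername("user");
    underlying.setPassword("password");

    // Each borrow now returns a pooled connection already configured by
    // ConfiguringConnectionFactory (read-uncommitted, close-cursors-at-commit)
    DataSource pooled = new ConnectionPoolDataSource(underlying);
    pooled.getConnection().close(); // borrow one connection and return it to the pool
  }
}

Note the isolation level baked into ConfiguringConnectionFactory above: read-uncommitted is the cheapest level, which suits a recommender's read-mostly query load but would be a questionable default for transactional writes.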

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/GenericJDBCDataModel.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/GenericJDBCDataModel.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/GenericJDBCDataModel.java
new file mode 100644
index 0000000..5dd0be9
--- /dev/null
+++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/GenericJDBCDataModel.java
@@ -0,0 +1,146 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.model.jdbc;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Properties;
+
+import com.google.common.io.Closeables;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.jdbc.AbstractJDBCComponent;
+
+/**
+ * <p>
+ * A generic {@link org.apache.mahout.cf.taste.model.DataModel} designed for use with other JDBC data sources;
+ * one just specifies all necessary SQL queries to the constructor here. Optionally, the queries can be
+ * specified from a {@link Properties} object, {@link File}, or {@link InputStream}. This class is most
+ * appropriate when other existing implementations of {@link AbstractJDBCDataModel} are not suitable. If you
+ * are using this class to support a major database, consider contributing a specialized implementation of
+ * {@link AbstractJDBCDataModel} to the project for this database.
+ * </p>
+ */
+public final class GenericJDBCDataModel extends AbstractJDBCDataModel {
+
+ public static final String DATA_SOURCE_KEY = "dataSource";
+ public static final String GET_PREFERENCE_SQL_KEY = "getPreferenceSQL";
+ public static final String GET_PREFERENCE_TIME_SQL_KEY = "getPreferenceTimeSQL";
+ public static final String GET_USER_SQL_KEY = "getUserSQL";
+ public static final String GET_ALL_USERS_SQL_KEY = "getAllUsersSQL";
+ public static final String GET_NUM_USERS_SQL_KEY = "getNumUsersSQL";
+ public static final String GET_NUM_ITEMS_SQL_KEY = "getNumItemsSQL";
+ public static final String SET_PREFERENCE_SQL_KEY = "setPreferenceSQL";
+ public static final String REMOVE_PREFERENCE_SQL_KEY = "removePreferenceSQL";
+ public static final String GET_USERS_SQL_KEY = "getUsersSQL";
+ public static final String GET_ITEMS_SQL_KEY = "getItemsSQL";
+ public static final String GET_PREFS_FOR_ITEM_SQL_KEY = "getPrefsForItemSQL";
+ public static final String GET_NUM_PREFERENCE_FOR_ITEM_KEY = "getNumPreferenceForItemSQL";
+ public static final String GET_NUM_PREFERENCE_FOR_ITEMS_KEY = "getNumPreferenceForItemsSQL";
+ public static final String GET_MAX_PREFERENCE_KEY = "getMaxPreferenceSQL";
+ public static final String GET_MIN_PREFERENCE_KEY = "getMinPreferenceSQL";
+
+ /**
+ * <p>
+ * Specifies all SQL queries in a {@link Properties} object. See the {@code *_KEY} constants in this
+ * class (e.g. {@link #GET_USER_SQL_KEY}) for a list of all keys which must map to a value in this object.
+ * </p>
+ *
+ * @param props
+ * {@link Properties} object containing values
+ * @throws TasteException
+ * if anything goes wrong during initialization
+ */
+ public GenericJDBCDataModel(Properties props) throws TasteException {
+ super(AbstractJDBCComponent.lookupDataSource(props.getProperty(DATA_SOURCE_KEY)),
+ props.getProperty(GET_PREFERENCE_SQL_KEY),
+ props.getProperty(GET_PREFERENCE_TIME_SQL_KEY),
+ props.getProperty(GET_USER_SQL_KEY),
+ props.getProperty(GET_ALL_USERS_SQL_KEY),
+ props.getProperty(GET_NUM_ITEMS_SQL_KEY),
+ props.getProperty(GET_NUM_USERS_SQL_KEY),
+ props.getProperty(SET_PREFERENCE_SQL_KEY),
+ props.getProperty(REMOVE_PREFERENCE_SQL_KEY),
+ props.getProperty(GET_USERS_SQL_KEY),
+ props.getProperty(GET_ITEMS_SQL_KEY),
+ props.getProperty(GET_PREFS_FOR_ITEM_SQL_KEY),
+ props.getProperty(GET_NUM_PREFERENCE_FOR_ITEM_KEY),
+ props.getProperty(GET_NUM_PREFERENCE_FOR_ITEMS_KEY),
+ props.getProperty(GET_MAX_PREFERENCE_KEY),
+ props.getProperty(GET_MIN_PREFERENCE_KEY));
+ }
+
+ /**
+ * <p>
+ * See {@link #GenericJDBCDataModel(Properties)}. This constructor reads values from a file
+ * instead, as if with {@link Properties#load(InputStream)}. So, the file should be in standard Java
+ * properties file format -- containing {@code key=value} pairs, one per line.
+ * </p>
+ *
+ * @param propertiesFile
+ * properties file
+ * @throws TasteException
+ * if anything goes wrong during initialization
+ */
+ public GenericJDBCDataModel(File propertiesFile) throws TasteException {
+ this(getPropertiesFromFile(propertiesFile));
+ }
+
+ /**
+ * <p>
+ * See {@link #GenericJDBCDataModel(Properties)}. This constructor reads values from a resource available in
+ * the classpath, as if with {@link Class#getResourceAsStream(String)} and
+ * {@link Properties#load(InputStream)}. This is useful if your configuration file is, for example, packaged
+ * in a JAR file that is in the classpath.
+ * </p>
+ *
+ * @param resourcePath
+ * path to resource in classpath (e.g. "/com/foo/TasteSQLQueries.properties")
+ * @throws TasteException
+ * if anything goes wrong during initialization
+ */
+ public GenericJDBCDataModel(String resourcePath) throws TasteException {
+ this(getPropertiesFromStream(GenericJDBCDataModel.class
+ .getResourceAsStream(resourcePath)));
+ }
+
+ private static Properties getPropertiesFromFile(File file) throws TasteException {
+ try {
+ return getPropertiesFromStream(new FileInputStream(file));
+ } catch (FileNotFoundException fnfe) {
+ throw new TasteException(fnfe);
+ }
+ }
+
+ private static Properties getPropertiesFromStream(InputStream is) throws TasteException {
+ try {
+ try {
+ Properties props = new Properties();
+ props.load(is);
+ return props;
+ } finally {
+ Closeables.close(is, true);
+ }
+ } catch (IOException ioe) {
+ throw new TasteException(ioe);
+ }
+ }
+
+}
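To make the Properties-based constructor concrete: each *_KEY constant maps to one SQL string. A partial sketch -- the table and column names are illustrative, the dataSource value assumes the JNDI-style lookup performed by AbstractJDBCComponent.lookupDataSource, and the upsert syntax shown is MySQL-flavored:

import java.util.Properties;

import org.apache.mahout.cf.taste.common.TasteException;
import org.apache.mahout.cf.taste.impl.model.jdbc.GenericJDBCDataModel;
import org.apache.mahout.cf.taste.model.DataModel;

public final class GenericModelExample {
  public static DataModel build() throws TasteException {
    Properties props = new Properties();
    props.setProperty(GenericJDBCDataModel.DATA_SOURCE_KEY, "jdbc/taste"); // placeholder JNDI name
    props.setProperty(GenericJDBCDataModel.GET_PREFERENCE_SQL_KEY,
        "SELECT preference FROM taste_preferences WHERE user_id=? AND item_id=?");
    // setPreference binds four parameters (user, item, value, value), so the
    // statement must consume all four; an insert-or-update does exactly that
    props.setProperty(GenericJDBCDataModel.SET_PREFERENCE_SQL_KEY,
        "INSERT INTO taste_preferences (user_id, item_id, preference) VALUES (?,?,?)"
            + " ON DUPLICATE KEY UPDATE preference=?");
    // ... the remaining *_KEY entries are supplied the same way ...
    return new GenericJDBCDataModel(props);
  }
}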
